In [2]:
import pandas as pd
import numpy as np
import pickle # Used for saving the model as a .pkl file
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [3]:
# --- 1. Data Loading and Cleaning ---
# Reads the raw CSV into `df`, normalizes the column headers, and removes rows
# that contain infinities / missing values as well as exact duplicate rows.
print("1. Loading and Cleaning Data...")
# --- Configuration ---
FILE_PATH = 'data/currentDataset.csv'
MODEL_FILENAME = 'Random_forest_traffic_classifier.pkl'
TARGET_COLUMN = 'Prediction'

# Raise instead of calling exit(): inside a notebook exit() asks the kernel to
# shut down, and in a plain script it terminates with exit status 0 even though
# loading failed. An exception stops the run and keeps the kernel usable.
try:
    df = pd.read_csv(FILE_PATH)
except FileNotFoundError as err:
    raise FileNotFoundError(
        f"ERROR: File not found at {FILE_PATH}. Please check the path and ensure the file is present."
    ) from err

# Header names may carry stray whitespace; strip it before any lookup by name.
df.columns = df.columns.str.strip()

# Validate the schema before spending time on cleaning.
if TARGET_COLUMN not in df.columns:
    raise ValueError(f"ERROR: Target column '{TARGET_COLUMN}' not found in the CSV file.")

# Initial cleaning steps: treat +/-inf as missing, then drop incomplete rows
# and exact duplicates. Chained (not in-place) so the lineage reads top-down.
df = (
    df.replace([np.inf, -np.inf], np.nan)
      .dropna()
      .drop_duplicates()
)



1. Loading and Cleaning Data...


In [4]:
# --- 2. Feature and Target Preparation ---
# Splits the cleaned frame into a feature matrix X and the target series y.
X = df.drop(TARGET_COLUMN, axis=1)
y = df[TARGET_COLUMN]

# Handle non-numeric features in X (dropping them for simplicity).
# Everything that is not numeric or boolean is removed, not only `object`
# columns, so other non-numeric dtypes (e.g. category, string, datetime)
# cannot reach the classifier and make fit() fail.
non_numeric_cols = X.select_dtypes(exclude=['number', 'bool']).columns
if len(non_numeric_cols) > 0:
    X = X.drop(columns=non_numeric_cols)

# Encode the target variable (y) as integer class ids.
# NOTE(review): only the model is pickled below; `le` (and therefore the
# mapping from integer ids back to the original labels in `le.classes_`) is
# not saved. Confirm that whatever loads the .pkl has another way to decode
# predictions.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Split the data: 70% train / 30% test, stratified so both parts keep the
# class proportions; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.3, random_state=42, stratify=y_encoded
)

# --- 3. Model Training ---
print("\n2. Training Random Forest Model...")
# Initialize and train the Random Forest Classifier (100 trees, all CPU cores).
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)
print("   Model training complete.")

# Quick validation check on the held-out 30%.
# NOTE(review): the recorded run of this notebook reported 100.00% here; a
# perfect score may point to a feature that leaks the target — TODO confirm.
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"   Test Set Accuracy: {accuracy * 100:.2f}%")

# --- 4. Creating the PKL File (Saving the Model) ---
print(f"\n3. Saving the trained model to '{MODEL_FILENAME}'...")
# Saving is best-effort: a failure is reported but does not discard the
# trained model that is still held in memory.
try:
    with open(MODEL_FILENAME, 'wb') as file:
        pickle.dump(rf_model, file)
    print(f"   ✅ Success! The model has been saved as {MODEL_FILENAME}")
except Exception as e:
    print(f"   ❌ ERROR during saving: {e}")



2. Training Random Forest Model...
   Model training complete.
   Test Set Accuracy: 100.00%

3. Saving the trained model to 'Random_forest_traffic_classifier.pkl'...
   ✅ Success! The model has been saved as Random_forest_traffic_classifier.pkl
