In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

import joblib
import warnings
warnings.filterwarnings("ignore")


In [2]:
df = pd.read_csv("mobility_dataset.csv")
df.head()


Unnamed: 0,Timestamp,Latitude,Longitude,Vehicle_Count,Traffic_Speed_kmh,Road_Occupancy_%,Traffic_Light_State,Weather_Condition,Accident_Report,Sentiment_Score,Ride_Sharing_Demand,Parking_Availability,Emission_Levels_g_km,Energy_Consumption_L_h,Traffic_Condition
0,2024-03-01 00:00:00,40.842275,-73.703149,205,49.893435,82.65278,Yellow,Clear,0,-0.609199,2,45,450.760055,19.574337,High
1,2024-03-01 00:05:00,40.831119,-73.987354,202,22.383965,45.829298,Green,Clear,0,0.965442,16,1,321.800341,5.385554,High
2,2024-03-01 00:10:00,40.819549,-73.732462,252,46.889699,82.772465,Green,Rain,0,0.28966,16,49,231.152655,10.277477,High
3,2024-03-01 00:15:00,40.725849,-73.980134,37,5.730536,37.695567,Red,Fog,0,-0.271965,66,10,410.384292,29.243279,High
4,2024-03-01 00:20:00,40.813265,-73.961631,64,61.348034,22.313358,Red,Snow,0,-0.797606,3,5,364.466342,16.801459,Low


In [3]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Timestamp               5000 non-null   object 
 1   Latitude                5000 non-null   float64
 2   Longitude               5000 non-null   float64
 3   Vehicle_Count           5000 non-null   int64  
 4   Traffic_Speed_kmh       5000 non-null   float64
 5   Road_Occupancy_%        5000 non-null   float64
 6   Traffic_Light_State     5000 non-null   object 
 7   Weather_Condition       5000 non-null   object 
 8   Accident_Report         5000 non-null   int64  
 9   Sentiment_Score         5000 non-null   float64
 10  Ride_Sharing_Demand     5000 non-null   int64  
 11  Parking_Availability    5000 non-null   int64  
 12  Emission_Levels_g_km    5000 non-null   float64
 13  Energy_Consumption_L_h  5000 non-null   float64
 14  Traffic_Condition       5000 non-null   

In [4]:
if 'Timestamp' in df.columns:
    df['Timestamp'] = pd.to_datetime(df['Timestamp'])
    df['Hour'] = df['Timestamp'].dt.hour
    df['Day'] = df['Timestamp'].dt.day
    df['Month'] = df['Timestamp'].dt.month
    df['Weekday'] = df['Timestamp'].dt.weekday
    df.drop('Timestamp', axis=1, inplace=True)

df.head()


Unnamed: 0,Latitude,Longitude,Vehicle_Count,Traffic_Speed_kmh,Road_Occupancy_%,Traffic_Light_State,Weather_Condition,Accident_Report,Sentiment_Score,Ride_Sharing_Demand,Parking_Availability,Emission_Levels_g_km,Energy_Consumption_L_h,Traffic_Condition,Hour,Day,Month,Weekday
0,40.842275,-73.703149,205,49.893435,82.65278,Yellow,Clear,0,-0.609199,2,45,450.760055,19.574337,High,0,1,3,4
1,40.831119,-73.987354,202,22.383965,45.829298,Green,Clear,0,0.965442,16,1,321.800341,5.385554,High,0,1,3,4
2,40.819549,-73.732462,252,46.889699,82.772465,Green,Rain,0,0.28966,16,49,231.152655,10.277477,High,0,1,3,4
3,40.725849,-73.980134,37,5.730536,37.695567,Red,Fog,0,-0.271965,66,10,410.384292,29.243279,High,0,1,3,4
4,40.813265,-73.961631,64,61.348034,22.313358,Red,Snow,0,-0.797606,3,5,364.466342,16.801459,Low,0,1,3,4


In [5]:
categorical_cols = [
    'Traffic_Light_State',
    'Weather_Condition',
    'Accident_Report'
]

le = LabelEncoder()
for col in categorical_cols:
    df[col] = le.fit_transform(df[col])


In [6]:
X = df.drop("Traffic_Condition", axis=1)
y = LabelEncoder().fit_transform(df["Traffic_Condition"])


In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [8]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(),
    "Gradient Boosting": GradientBoostingClassifier()
}


In [9]:
results = {}

for name, model in models.items():
    pipeline = Pipeline([
        ("scaler", StandardScaler()),
        ("model", model)
    ])
    
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    results[name] = acc
    
    print("\n", name)
    print("Accuracy:", acc)
    print(classification_report(y_test, y_pred))



 Logistic Regression
Accuracy: 0.77
              precision    recall  f1-score   support

           0       0.83      0.87      0.85       633
           1       0.73      0.75      0.74        60
           2       0.64      0.58      0.61       307

    accuracy                           0.77      1000
   macro avg       0.73      0.73      0.73      1000
weighted avg       0.76      0.77      0.77      1000


 KNN
Accuracy: 0.735
              precision    recall  f1-score   support

           0       0.81      0.87      0.84       633
           1       0.57      0.47      0.51        60
           2       0.58      0.52      0.55       307

    accuracy                           0.73      1000
   macro avg       0.65      0.62      0.63      1000
weighted avg       0.73      0.73      0.73      1000


 Naive Bayes
Accuracy: 0.438
              precision    recall  f1-score   support

           0       1.00      0.14      0.24       633
           1       0.96      0.75      0

In [10]:
accuracy_df = pd.DataFrame(
    results.items(),
    columns=["Algorithm", "Accuracy"]
).sort_values(by="Accuracy", ascending=False)

accuracy_df


Unnamed: 0,Algorithm,Accuracy
3,Decision Tree,0.999
6,Gradient Boosting,0.999
4,Random Forest,0.998
5,SVM,0.878
0,Logistic Regression,0.77
1,KNN,0.735
2,Naive Bayes,0.438


In [11]:
best_model_name = accuracy_df.iloc[0]["Algorithm"]
print("Best Model:", best_model_name)

best_model = models[best_model_name]

best_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", best_model)
])

best_pipeline.fit(X_train, y_train)

joblib.dump(best_pipeline, "traffic_classification_model.pkl")

print("✅ Model saved successfully")


Best Model: Decision Tree
✅ Model saved successfully


In [12]:
loaded_model = joblib.load("traffic_classification_model.pkl")
loaded_model.predict(X_test[:1])


array([1])