# Importing the Dataset

In [15]:
import numpy as np
import pandas as pd
# Show every column when a DataFrame is rendered instead of eliding the middle ones.
pd.options.display.max_columns = None

# Read the dataset from the CSV in the working directory and preview its first five rows.
data = pd.read_csv(filepath_or_buffer="Updated_data.csv")
data.head(n=5)

Unnamed: 0,record_id,timestamp,intersection_id,intersection_name,latitude,longitude,vehicle_count,average_speed,signal_cycle_time,green_time,yellow_time,red_time,weather_condition,day_of_week,is_holiday,event,pollution_level,car_count,bus_count,truck_count,motorcycle_count,is_weekend,hour_of_day,is_holiday_or_special_event
0,9926.0,2023-01-07 21:20:00,6.0,Allahabad Junction,25.4358,81.8307,75.0,39.72,120.0,63.0,4.0,53.0,0,2,0.0,4,1.21,48.0,4.0,6.0,17.0,0,21,False
1,8595.0,2023-01-06 23:10:00,5.0,Naini Bridge,25.4088,25.4088,139.0,30.33,120.0,51.0,4.0,65.0,0,0,0.0,4,2.86,89.0,13.0,17.0,20.0,0,23,False
2,673.0,2023-01-01 11:10:00,3.0,Chowk,25.4314,81.8437,20.0,53.28,120.0,69.0,5.0,46.0,0,3,1.0,4,0.34,15.0,1.0,2.0,2.0,0,11,True
3,8744.0,2023-01-07 01:40:00,4.0,Tagore Town,25.4675,81.8867,21.0,36.06,120.0,59.0,4.0,57.0,4,2,0.0,4,0.54,12.0,4.0,2.0,3.0,0,1,False
4,316.0,2023-01-01 05:10:00,6.0,Allahabad Junction,25.4358,81.8307,41.0,47.21,120.0,63.852391,5.0,54.0,0,3,1.0,4,0.86,26.0,6.0,2.0,7.0,0,5,True


In [16]:
# Narrow the frame to the pollution reading and the per-vehicle-type counts.
selected_columns = [
    'pollution_level',
    'car_count',
    'bus_count',
    'truck_count',
    'motorcycle_count',
]
act_data = data[selected_columns]

In [17]:
# Display the reduced frame (pollution_level plus the four vehicle-count columns);
# pandas elides the middle rows of a long frame when rendering it.
act_data

Unnamed: 0,pollution_level,car_count,bus_count,truck_count,motorcycle_count
0,1.21,48.0,4.0,6.0,17.0
1,2.86,89.0,13.0,17.0,20.0
2,0.34,15.0,1.0,2.0,2.0
3,0.54,12.0,4.0,2.0,3.0
4,0.86,26.0,6.0,2.0,7.0
...,...,...,...,...,...
9624,0.43,14.0,2.0,2.0,5.0
9625,0.57,20.0,4.0,0.0,10.0
9626,0.41,13.0,0.0,5.0,4.0
9627,5.58,167.0,26.0,32.0,48.0


# Splitting data into training and test data

In [39]:
# The pollution level is the regression target; every other column of
# act_data is used as an input feature.
Y = act_data['pollution_level']
X = act_data.drop(columns=['pollution_level'])

In [40]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
split = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, Y_train, Y_test = split

# Model Selection

In [41]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# Candidate regressors to compare, keyed by the name printed in the results.
# The tree-based estimators are stochastic (random tie-breaking between splits,
# bootstrap sampling), so each gets a fixed seed; without it the reported errors
# change on every Restart & Run All.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    "XG Boost Regressor" : XGBRegressor(random_state=42)
}

In [42]:
from sklearn.metrics import mean_squared_error
for model_name in models:
    model = models[model_name]

    # Train on the training split, then predict the held-out test rows.
    y_pred = model.fit(X_train, Y_train).predict(X_test)

    # Mean squared error between the true and predicted pollution levels.
    mse = mean_squared_error(Y_test, y_pred)

    print(model_name)
    print("Mean Squarred Error : " ,mse , "\n")

Linear Regression
Mean Squarred Error :  0.023764638324855612 

Decision Tree
Mean Squarred Error :  0.033652468150851274 

Random Forest
Mean Squarred Error :  0.025131544248681748 

XG Boost Regressor
Mean Squarred Error :  0.026012848435282062 



# Cross Validation

In [22]:
from sklearn.model_selection import cross_val_score, KFold
# 10-fold cross-validation of XGBoost on the full dataset (shuffled, fixed seed).
xgb_model = XGBRegressor()
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Score `xgb_model` explicitly. The previous call passed `model`, which is only
# the loop variable left behind by the model-comparison cell; it happened to be
# the XGBoost instance solely because that entry is last in the `models` dict.
# With no `scoring` argument the values are the estimator's default score (R^2
# for regressors).
scores = cross_val_score(xgb_model, X, Y, cv=kf)
print(scores)
print(scores.mean())

[0.97435773 0.9812955  0.98620038 0.9856709  0.9796491  0.97728829
 0.9840786  0.98807808 0.98133705 0.97169218]
0.9809647803809434


In [23]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of a decision tree on the full dataset.
# The seed keeps the tree's random tie-breaking, and therefore the scores, repeatable.
model1 = DecisionTreeRegressor(random_state=42)

kf = KFold(n_splits=10, shuffle=True, random_state=42)

# With no `scoring` argument, cross_val_score uses the estimator's own score(),
# which for a regressor is the coefficient of determination (R^2), not accuracy.
scores = cross_val_score(model1, X, Y, cv=kf)

print("Cross-validation scores:", scores)
print("Average R^2 score:", scores.mean())

Cross-validation scores: [0.95964877 0.96515202 0.97614943 0.97234012 0.96383215 0.96833812
 0.95370499 0.98159281 0.96490112 0.96958926]
Average accuracy: 0.9675248790408911


# Saving the Model

In [17]:
import pickle
# cross_val_score only fits internal clones of `model1`, so the object itself is
# still unfitted at this point. Train it on the full dataset before persisting;
# otherwise the saved artifact cannot predict after being loaded.
model1.fit(X, Y)

# NOTE(review): the file name says "LinearModel" but `model1` is a
# DecisionTreeRegressor — confirm which estimator is meant to be shipped.
# The context manager ensures the file handle is flushed and closed.
with open('LinearModelVehicleVSPollution.pkl', 'wb') as model_file:
    pickle.dump(model1, model_file)