In [40]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import linear_model, tree, ensemble, svm, metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, scale
import pickle


In [41]:
data = pd.read_csv("traffic volume.csv")
data.head()


Unnamed: 0,holiday,temp,rain,snow,weather,date,Time,traffic_volume
0,,288.28,0.0,0.0,Clouds,02-10-2012,09:00:00,5545
1,,289.36,0.0,0.0,Clouds,02-10-2012,10:00:00,4516
2,,289.58,0.0,0.0,Clouds,02-10-2012,11:00:00,4767
3,,290.13,0.0,0.0,Clouds,02-10-2012,12:00:00,5026
4,,291.14,0.0,0.0,Clouds,02-10-2012,13:00:00,4918


In [42]:
data['temp'].fillna(data['temp'].mean(), inplace=True)
data['rain'].fillna(data['rain'].mean(), inplace=True)
data['snow'].fillna(data['snow'].mean(), inplace=True)
data['weather'].fillna('Clouds', inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['temp'].fillna(data['temp'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['rain'].fillna(data['rain'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we ar

In [43]:
le = LabelEncoder()
data['weather'] = le.fit_transform(data['weather'])
data['holiday'] = le.fit_transform(data['holiday'].astype(str))


In [44]:
data[['day', 'month', 'year']] = data['date'].str.split('-', expand=True)
data[['hours', 'minutes', 'seconds']] = data['Time'].str.split(':', expand=True)
data.drop(columns=['date', 'Time'], inplace=True)

# Convert all new columns to numeric
cols_to_convert = ['day', 'month', 'year', 'hours', 'minutes', 'seconds']
data[cols_to_convert] = data[cols_to_convert].apply(pd.to_numeric)


In [45]:
y = data['traffic_volume']
x = data.drop(columns=['traffic_volume'])


In [46]:
x_scaled = pd.DataFrame(scale(x), columns=x.columns)


In [47]:
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.2, random_state=42)


In [48]:
lin_reg = linear_model.LinearRegression()
dtree = tree.DecisionTreeRegressor(max_depth=10)
rand = ensemble.RandomForestRegressor(n_estimators=50, max_depth=15, n_jobs=-1)
svr = svm.SVR(kernel='rbf', C=1.0)

lin_reg.fit(x_train, y_train)
dtree.fit(x_train, y_train)
rand.fit(x_train, y_train)
svr.fit(x_train[:5000], y_train[:5000])  # SVR is slow — using smaller subset


0,1,2
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,tol,0.001
,C,1.0
,epsilon,0.1
,shrinking,True
,cache_size,200
,verbose,False


In [49]:
p1 = lin_reg.predict(x_train)
p2 = dtree.predict(x_train)
p3 = rand.predict(x_train)
p4 = svr.predict(x_train[:5000])

print("Linear Regression R²:", metrics.r2_score(y_train, p1))
print("Decision Tree R²:", metrics.r2_score(y_train, p2))
print("Random Forest R²:", metrics.r2_score(y_train, p3))
print("SVR R² (subset):", metrics.r2_score(y_train[:5000], p4))


Linear Regression R²: 0.13255490303952955
Decision Tree R²: 0.8055719438834772
Random Forest R²: 0.8960953866009955
SVR R² (subset): 0.051269164650461496


In [50]:
test_pred = rand.predict(x_test)
mse = metrics.mean_squared_error(y_test, test_pred)
rmse = np.sqrt(mse)
print("Random Forest Test RMSE:", rmse)


Random Forest Test RMSE: 823.2198028275693


In [51]:
with open("Flask/model.pkl", 'wb') as f:
    pickle.dump(rand, f)

with open("Flask/encoder.pkl", 'wb') as f:
    pickle.dump(le, f)
