In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [2]:
df1 = pd.read_csv('kolkata2017-2022.csv')
df1.head()

Unnamed: 0.1,Unnamed: 0,Address,Date time,Minimum Temperature,Maximum Temperature,Temperature,Dew Point,Relative Humidity,Heat Index,Wind Speed,...,Visibility,Cloud Cover,Sea Level Pressure,Weather Type,Latitude,Longitude,Resolved Address,Name,Info,Conditions
0,0,kolkata,01/01/2017 00:00:00,62.6,62.6,62.6,59.0,88.01,,0.0,...,1.2,0.0,1015.0,Mist,22.5705,88.3712,"Kolkata, WB, India","Kolkata, WB, India",,Clear
1,1,kolkata,01/01/2017 01:00:00,60.8,60.8,60.8,59.0,93.79,,0.0,...,1.2,0.0,1015.0,Mist,22.5705,88.3712,"Kolkata, WB, India","Kolkata, WB, India",,Clear
2,2,kolkata,01/01/2017 02:00:00,63.1,63.1,63.1,60.4,90.99,,0.0,...,0.6,0.0,1015.4,"Mist, Fog",22.5705,88.3712,"Kolkata, WB, India","Kolkata, WB, India",,Clear
3,3,kolkata,01/01/2017 03:00:00,60.8,60.8,60.8,59.0,93.79,,5.8,...,0.6,0.0,1014.0,Mist,22.5705,88.3712,"Kolkata, WB, India","Kolkata, WB, India",,Clear
4,4,kolkata,01/01/2017 04:00:00,59.0,59.0,59.0,57.2,93.75,,0.0,...,1.2,0.0,1014.0,Mist,22.5705,88.3712,"Kolkata, WB, India","Kolkata, WB, India",,Clear


In [3]:
# Split the 'Date time' column into two separate columns for date and time
df1[['Date', 'Time']] = df1['Date time'].str.split(' ', expand=True)

# Print the first 5 rows (optional) to verify splitting
df1.head()

Unnamed: 0.1,Unnamed: 0,Address,Date time,Minimum Temperature,Maximum Temperature,Temperature,Dew Point,Relative Humidity,Heat Index,Wind Speed,...,Sea Level Pressure,Weather Type,Latitude,Longitude,Resolved Address,Name,Info,Conditions,Date,Time
0,0,kolkata,01/01/2017 00:00:00,62.6,62.6,62.6,59.0,88.01,,0.0,...,1015.0,Mist,22.5705,88.3712,"Kolkata, WB, India","Kolkata, WB, India",,Clear,01/01/2017,00:00:00
1,1,kolkata,01/01/2017 01:00:00,60.8,60.8,60.8,59.0,93.79,,0.0,...,1015.0,Mist,22.5705,88.3712,"Kolkata, WB, India","Kolkata, WB, India",,Clear,01/01/2017,01:00:00
2,2,kolkata,01/01/2017 02:00:00,63.1,63.1,63.1,60.4,90.99,,0.0,...,1015.4,"Mist, Fog",22.5705,88.3712,"Kolkata, WB, India","Kolkata, WB, India",,Clear,01/01/2017,02:00:00
3,3,kolkata,01/01/2017 03:00:00,60.8,60.8,60.8,59.0,93.79,,5.8,...,1014.0,Mist,22.5705,88.3712,"Kolkata, WB, India","Kolkata, WB, India",,Clear,01/01/2017,03:00:00
4,4,kolkata,01/01/2017 04:00:00,59.0,59.0,59.0,57.2,93.75,,0.0,...,1014.0,Mist,22.5705,88.3712,"Kolkata, WB, India","Kolkata, WB, India",,Clear,01/01/2017,04:00:00


In [4]:
# Assuming your DataFrame is named 'df1' and the date column is named 'Date time'
df1['Date time'] = pd.to_datetime(df1['Date time'])  # Convert strings to datetime objects
print(df1.dtypes)  # Verify the data type has changed

Unnamed: 0                      int64
Address                        object
Date time              datetime64[ns]
Minimum Temperature           float64
Maximum Temperature           float64
Temperature                   float64
Dew Point                     float64
Relative Humidity             float64
Heat Index                    float64
Wind Speed                    float64
Wind Gust                     float64
Wind Direction                float64
Wind Chill                    float64
Precipitation                 float64
Precipitation Cover           float64
Snow Depth                    float64
Visibility                    float64
Cloud Cover                   float64
Sea Level Pressure            float64
Weather Type                   object
Latitude                      float64
Longitude                     float64
Resolved Address               object
Name                           object
Info                          float64
Conditions                     object
Date        

In [5]:
df1['Day of Week'] = df1['Date time'].dt.strftime('%A')  # Full day name (Monday, Tuesday, etc.)
# Alternatively, use '%a' for abbreviated day names (Mon, Tue, etc.)
df1.head()

Unnamed: 0.1,Unnamed: 0,Address,Date time,Minimum Temperature,Maximum Temperature,Temperature,Dew Point,Relative Humidity,Heat Index,Wind Speed,...,Weather Type,Latitude,Longitude,Resolved Address,Name,Info,Conditions,Date,Time,Day of Week
0,0,kolkata,2017-01-01 00:00:00,62.6,62.6,62.6,59.0,88.01,,0.0,...,Mist,22.5705,88.3712,"Kolkata, WB, India","Kolkata, WB, India",,Clear,01/01/2017,00:00:00,Sunday
1,1,kolkata,2017-01-01 01:00:00,60.8,60.8,60.8,59.0,93.79,,0.0,...,Mist,22.5705,88.3712,"Kolkata, WB, India","Kolkata, WB, India",,Clear,01/01/2017,01:00:00,Sunday
2,2,kolkata,2017-01-01 02:00:00,63.1,63.1,63.1,60.4,90.99,,0.0,...,"Mist, Fog",22.5705,88.3712,"Kolkata, WB, India","Kolkata, WB, India",,Clear,01/01/2017,02:00:00,Sunday
3,3,kolkata,2017-01-01 03:00:00,60.8,60.8,60.8,59.0,93.79,,5.8,...,Mist,22.5705,88.3712,"Kolkata, WB, India","Kolkata, WB, India",,Clear,01/01/2017,03:00:00,Sunday
4,4,kolkata,2017-01-01 04:00:00,59.0,59.0,59.0,57.2,93.75,,0.0,...,Mist,22.5705,88.3712,"Kolkata, WB, India","Kolkata, WB, India",,Clear,01/01/2017,04:00:00,Sunday


In [6]:
data_full=df1[['Unnamed: 0','Date','Time','Day of Week','Temperature','Dew Point','Relative Humidity','Wind Speed','Wind Direction','Precipitation','Visibility','Cloud Cover','Weather Type','Conditions']]

In [7]:
df2=df1[['Day of Week','Time','Precipitation','Visibility','Cloud Cover','Conditions']]

In [8]:
df2.head()

Unnamed: 0,Day of Week,Time,Precipitation,Visibility,Cloud Cover,Conditions
0,Sunday,00:00:00,0.0,1.2,0.0,Clear
1,Sunday,01:00:00,0.0,1.2,0.0,Clear
2,Sunday,02:00:00,0.0,0.6,0.0,Clear
3,Sunday,03:00:00,0.0,0.6,0.0,Clear
4,Sunday,04:00:00,0.0,1.2,0.0,Clear


In [9]:
df2['Time'] = pd.to_datetime(df2['Time']).dt.hour
df2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['Time'] = pd.to_datetime(df2['Time']).dt.hour


Unnamed: 0,Day of Week,Time,Precipitation,Visibility,Cloud Cover,Conditions
0,Sunday,0,0.0,1.2,0.0,Clear
1,Sunday,1,0.0,1.2,0.0,Clear
2,Sunday,2,0.0,0.6,0.0,Clear
3,Sunday,3,0.0,0.6,0.0,Clear
4,Sunday,4,0.0,1.2,0.0,Clear


In [10]:
df2.rename(columns={'Day of Week':'day','Time':'time','Precipitation':'precipitation','Visibility':'visibility','Cloud Cover':'cloud','Conditions':'condition'},inplace= True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2.rename(columns={'Day of Week':'day','Time':'time','Precipitation':'precipitation','Visibility':'visibility','Cloud Cover':'cloud','Conditions':'condition'},inplace= True)


In [11]:
df2.head()

Unnamed: 0,day,time,precipitation,visibility,cloud,condition
0,Sunday,0,0.0,1.2,0.0,Clear
1,Sunday,1,0.0,1.2,0.0,Clear
2,Sunday,2,0.0,0.6,0.0,Clear
3,Sunday,3,0.0,0.6,0.0,Clear
4,Sunday,4,0.0,1.2,0.0,Clear


In [12]:
dummies = pd.get_dummies(df2.day)
dummies

Unnamed: 0,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
0,0,0,0,1,0,0,0
1,0,0,0,1,0,0,0
2,0,0,0,1,0,0,0
3,0,0,0,1,0,0,0
4,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...
52695,0,0,1,0,0,0,0
52696,0,0,1,0,0,0,0
52697,0,0,1,0,0,0,0
52698,0,0,1,0,0,0,0


In [13]:
merge = pd.concat([df2,dummies],axis='columns')
merge

Unnamed: 0,day,time,precipitation,visibility,cloud,condition,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
0,Sunday,0,0.0,1.2,0.0,Clear,0,0,0,1,0,0,0
1,Sunday,1,0.0,1.2,0.0,Clear,0,0,0,1,0,0,0
2,Sunday,2,0.0,0.6,0.0,Clear,0,0,0,1,0,0,0
3,Sunday,3,0.0,0.6,0.0,Clear,0,0,0,1,0,0,0
4,Sunday,4,0.0,1.2,0.0,Clear,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
52695,Saturday,20,0.0,1.2,8.2,Clear,0,0,1,0,0,0,0
52696,Saturday,21,0.0,1.2,0.0,Clear,0,0,1,0,0,0,0
52697,Saturday,22,0.0,0.6,0.0,Clear,0,0,1,0,0,0,0
52698,Saturday,23,0.0,0.6,30.0,Partially cloudy,0,0,1,0,0,0,0


In [14]:
dummies2 = pd.get_dummies(df2.condition)
dummies2

Unnamed: 0,Clear,Overcast,Partially cloudy,Rain,"Rain, Overcast","Rain, Partially cloudy"
0,1,0,0,0,0,0
1,1,0,0,0,0,0
2,1,0,0,0,0,0
3,1,0,0,0,0,0
4,1,0,0,0,0,0
...,...,...,...,...,...,...
52695,1,0,0,0,0,0
52696,1,0,0,0,0,0
52697,1,0,0,0,0,0
52698,0,0,1,0,0,0


In [15]:
merge2 = pd.concat([merge,dummies2],axis='columns')
merge2

Unnamed: 0,day,time,precipitation,visibility,cloud,condition,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday,Clear,Overcast,Partially cloudy,Rain,"Rain, Overcast","Rain, Partially cloudy"
0,Sunday,0,0.0,1.2,0.0,Clear,0,0,0,1,0,0,0,1,0,0,0,0,0
1,Sunday,1,0.0,1.2,0.0,Clear,0,0,0,1,0,0,0,1,0,0,0,0,0
2,Sunday,2,0.0,0.6,0.0,Clear,0,0,0,1,0,0,0,1,0,0,0,0,0
3,Sunday,3,0.0,0.6,0.0,Clear,0,0,0,1,0,0,0,1,0,0,0,0,0
4,Sunday,4,0.0,1.2,0.0,Clear,0,0,0,1,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52695,Saturday,20,0.0,1.2,8.2,Clear,0,0,1,0,0,0,0,1,0,0,0,0,0
52696,Saturday,21,0.0,1.2,0.0,Clear,0,0,1,0,0,0,0,1,0,0,0,0,0
52697,Saturday,22,0.0,0.6,0.0,Clear,0,0,1,0,0,0,0,1,0,0,0,0,0
52698,Saturday,23,0.0,0.6,30.0,Partially cloudy,0,0,1,0,0,0,0,0,0,1,0,0,0


In [16]:
df4=merge2.drop(['day','Saturday','condition','Rain'],axis='columns')

In [17]:
df4

Unnamed: 0,time,precipitation,visibility,cloud,Friday,Monday,Sunday,Thursday,Tuesday,Wednesday,Clear,Overcast,Partially cloudy,"Rain, Overcast","Rain, Partially cloudy"
0,0,0.0,1.2,0.0,0,0,1,0,0,0,1,0,0,0,0
1,1,0.0,1.2,0.0,0,0,1,0,0,0,1,0,0,0,0
2,2,0.0,0.6,0.0,0,0,1,0,0,0,1,0,0,0,0
3,3,0.0,0.6,0.0,0,0,1,0,0,0,1,0,0,0,0
4,4,0.0,1.2,0.0,0,0,1,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52695,20,0.0,1.2,8.2,0,0,0,0,0,0,1,0,0,0,0
52696,21,0.0,1.2,0.0,0,0,0,0,0,0,1,0,0,0,0
52697,22,0.0,0.6,0.0,0,0,0,0,0,0,1,0,0,0,0
52698,23,0.0,0.6,30.0,0,0,0,0,0,0,0,0,1,0,0


In [18]:
df5=df4[['Sunday','Monday','Tuesday','Wednesday','Thursday','Friday','time','precipitation','visibility','cloud','Clear','Overcast','Partially cloudy','Rain, Overcast','Rain, Partially cloudy']]

In [19]:
df5

Unnamed: 0,Sunday,Monday,Tuesday,Wednesday,Thursday,Friday,time,precipitation,visibility,cloud,Clear,Overcast,Partially cloudy,"Rain, Overcast","Rain, Partially cloudy"
0,1,0,0,0,0,0,0,0.0,1.2,0.0,1,0,0,0,0
1,1,0,0,0,0,0,1,0.0,1.2,0.0,1,0,0,0,0
2,1,0,0,0,0,0,2,0.0,0.6,0.0,1,0,0,0,0
3,1,0,0,0,0,0,3,0.0,0.6,0.0,1,0,0,0,0
4,1,0,0,0,0,0,4,0.0,1.2,0.0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52695,0,0,0,0,0,0,20,0.0,1.2,8.2,1,0,0,0,0
52696,0,0,0,0,0,0,21,0.0,1.2,0.0,1,0,0,0,0
52697,0,0,0,0,0,0,22,0.0,0.6,0.0,1,0,0,0,0
52698,0,0,0,0,0,0,23,0.0,0.6,30.0,0,0,1,0,0


In [20]:
traffic_categories = [0, 1, 2] #Here we have considered 0=low ,1= medium ,2=heigh traffic
weights = [0.3, 0.4, 0.3]
traffic_data = random.choices(traffic_categories, weights=weights, k=52700)

df_traffic = pd.DataFrame({'Traffic': traffic_data})
df_traffic.index = range(len(df_traffic))
df_traffic

Unnamed: 0,Traffic
0,1
1,2
2,2
3,1
4,2
...,...
52695,2
52696,0
52697,2
52698,0


In [21]:
df = pd.merge(df5,df_traffic,left_index=True,right_index=True)
df

Unnamed: 0,Sunday,Monday,Tuesday,Wednesday,Thursday,Friday,time,precipitation,visibility,cloud,Clear,Overcast,Partially cloudy,"Rain, Overcast","Rain, Partially cloudy",Traffic
0,1,0,0,0,0,0,0,0.0,1.2,0.0,1,0,0,0,0,1
1,1,0,0,0,0,0,1,0.0,1.2,0.0,1,0,0,0,0,2
2,1,0,0,0,0,0,2,0.0,0.6,0.0,1,0,0,0,0,2
3,1,0,0,0,0,0,3,0.0,0.6,0.0,1,0,0,0,0,1
4,1,0,0,0,0,0,4,0.0,1.2,0.0,1,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52695,0,0,0,0,0,0,20,0.0,1.2,8.2,1,0,0,0,0,2
52696,0,0,0,0,0,0,21,0.0,1.2,0.0,1,0,0,0,0,0
52697,0,0,0,0,0,0,22,0.0,0.6,0.0,1,0,0,0,0,2
52698,0,0,0,0,0,0,23,0.0,0.6,30.0,0,0,1,0,0,0


In [22]:
df.dropna(inplace=True)

Data prep is done now we will start model building

In [23]:
from sklearn.preprocessing import StandardScaler

In [24]:
scaler = StandardScaler()

In [25]:
scaler.fit(df.drop('Traffic',axis=1))

In [26]:
scaled_features = scaler.transform(df.drop('Traffic',axis=1))

In [27]:
X = df.drop(['Traffic'], axis='columns')
y = df.Traffic

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [29]:
len(X_train)

41458

In [30]:
len(X_test)

10365

In [31]:
knn = KNeighborsClassifier(n_neighbors=50)

In [32]:
knn.fit(X_train, y_train)

In [33]:
knn.score(X_test, y_test)

0.3718282682103232

In [34]:
acc=[]
for i in range(1,300,2):
    knn = KNeighborsClassifier(n_neighbors=i)
    new_score=knn.fit(X_train, y_train)
    acc.append(knn.score(X_test, y_test))
    print( i,"------------>", knn.score(X_test, y_test))


1 ------------> 0.33815726000964785
3 ------------> 0.33535938253738545
5 ------------> 0.3414375301495417
7 ------------> 0.34819102749638203
9 ------------> 0.3429811866859624
11 ------------> 0.349831162566329
13 ------------> 0.3525325615050651
15 ------------> 0.3536903039073806
17 ------------> 0.35918958031837916
19 ------------> 0.3580318379160637
21 ------------> 0.3627592860588519
23 ------------> 0.3598649300530632
25 ------------> 0.361794500723589
27 ------------> 0.362469850458273
29 ------------> 0.3616015436565364
31 ------------> 0.3616980221900627
33 ------------> 0.3649782923299566
35 ------------> 0.3625663289917993
37 ------------> 0.3633381572600097
39 ------------> 0.3688374336710082
41 ------------> 0.3686444766039556
43 ------------> 0.368451519536903
45 ------------> 0.36739025566811384
47 ------------> 0.3668113844669561
49 ------------> 0.36999517607332366
51 ------------> 0.3729860106126387
53 ------------> 0.3739507959479016
55 ------------> 0.371442354076

In [35]:
acc

[0.33815726000964785,
 0.33535938253738545,
 0.3414375301495417,
 0.34819102749638203,
 0.3429811866859624,
 0.349831162566329,
 0.3525325615050651,
 0.3536903039073806,
 0.35918958031837916,
 0.3580318379160637,
 0.3627592860588519,
 0.3598649300530632,
 0.361794500723589,
 0.362469850458273,
 0.3616015436565364,
 0.3616980221900627,
 0.3649782923299566,
 0.3625663289917993,
 0.3633381572600097,
 0.3688374336710082,
 0.3686444766039556,
 0.368451519536903,
 0.36739025566811384,
 0.3668113844669561,
 0.36999517607332366,
 0.3729860106126387,
 0.3739507959479016,
 0.37144235407621806,
 0.37076700434153403,
 0.37028461167390253,
 0.37250361794500725,
 0.3750120598166908,
 0.3736613603473227,
 0.3712493970091655,
 0.36999517607332366,
 0.373082489146165,
 0.37260009647853354,
 0.3718282682103232,
 0.3712493970091655,
 0.373757838880849,
 0.3728895320791124,
 0.37250361794500725,
 0.3766521948866377,
 0.37433671008200675,
 0.3765557163531114,
 0.376748673420164,
 0.3780028943560058,
 0.377

In [36]:
for i in range(0,len(acc)):
    if acc[i]==max(acc):
        print("Maximum Accuracy is appearing when k =",i,"------>",max(acc))

Maximum Accuracy is appearing when k = 149 ------> 0.4000964785335263
