In [2]:

import os

import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Load dataset
# The CSV location can be overridden with the UBER_TRAFFIC_CSV environment
# variable so the notebook can run on another machine without editing this
# cell. When the variable is unset, the original hard-coded location is used.
file_path = os.environ.get(
    "UBER_TRAFFIC_CSV",
    r"C:\Users\Sandeep Gowda\Dataset_Uber Traffic.csv",  # Update path if needed
)
df = pd.read_csv(file_path)
df.head()


Unnamed: 0,DateTime,Junction,Vehicles,ID
0,01/11/15 0:00,1,15,20151101001
1,01/11/15 1:00,1,13,20151101011
2,01/11/15 2:00,1,10,20151101021
3,01/11/15 3:00,1,7,20151101031
4,01/11/15 4:00,1,9,20151101041


In [3]:

# Convert DateTime column to datetime type.
# The raw strings are day-first with a two-digit year (e.g. "01/11/15 0:00");
# values that do not match the format become NaT instead of raising.
parsed_datetime = pd.to_datetime(df['DateTime'], format="%d/%m/%y %H:%M", errors='coerce')

# Drop rows without a usable timestamp, then exact duplicate rows.
# This is done as one chain ending in reset_index so that `df` is a fresh
# frame with a contiguous index: the column assignments below then never
# write into a filtered slice of the previous frame.
df = (
    df.assign(DateTime=parsed_datetime)
    .dropna(subset=['DateTime'])
    .drop_duplicates()
    .reset_index(drop=True)
)

# Handle missing values: fill missing vehicle counts with the column mean.
# The mean is taken after the row clean-up above, so discarded rows do not
# influence it. The filled values are rounded before the integer cast because
# a bare astype(int) would truncate the fractional mean toward zero.
df['Vehicles'] = df['Vehicles'].fillna(df['Vehicles'].mean()).round().astype(int)

# Correct datatypes
df['Junction'] = df['Junction'].astype(int)

df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48120 entries, 0 to 48119
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   DateTime  48120 non-null  datetime64[ns]
 1   Junction  48120 non-null  int32         
 2   Vehicles  48120 non-null  int32         
 3   ID        48120 non-null  int64         
dtypes: datetime64[ns](1), int32(2), int64(1)
memory usage: 1.1 MB


In [4]:

# Aggregate traffic data into hourly intervals for each junction.
# The bucket width is passed as an offset object rather than the string alias
# "H": pandas 2.2 deprecated that alias in favour of "h", whereas the object
# form is accepted unchanged by versions on both sides of that change.
hourly_buckets = pd.Grouper(key="DateTime", freq=pd.offsets.Hour())

# Sum the vehicle counts inside each (hour bucket, junction) group and flatten
# the group keys back into ordinary columns next to the Vehicle_Count total.
traffic_hourly = (
    df.groupby([hourly_buckets, "Junction"])["Vehicles"]
    .sum()
    .reset_index(name="Vehicle_Count")
)

traffic_hourly.head()


Unnamed: 0,DateTime,Junction,Vehicle_Count
0,2015-11-01 00:00:00,1,15
1,2015-11-01 00:00:00,2,6
2,2015-11-01 00:00:00,3,9
3,2015-11-01 01:00:00,1,13
4,2015-11-01 01:00:00,2,6


In [5]:

# Min-Max scaling of the hourly totals.
# The scaler is fitted once on the Vehicle_Count column of the whole table
# (all junctions together), and the scaled values are stored next to the raw
# counts in a new column.
scaler = MinMaxScaler()
vehicle_count_column = traffic_hourly[['Vehicle_Count']]  # 2-D selection, as the scaler expects
traffic_hourly['Vehicle_Count_Normalized'] = scaler.fit_transform(vehicle_count_column)

traffic_hourly.head()


Unnamed: 0,DateTime,Junction,Vehicle_Count,Vehicle_Count_Normalized
0,2015-11-01 00:00:00,1,15,0.078212
1,2015-11-01 00:00:00,2,6,0.027933
2,2015-11-01 00:00:00,3,9,0.044693
3,2015-11-01 01:00:00,1,13,0.067039
4,2015-11-01 01:00:00,2,6,0.027933


In [6]:
# Derive calendar features from the hourly timestamp
timestamp_parts = traffic_hourly['DateTime'].dt

traffic_hourly['Hour'] = timestamp_parts.hour
traffic_hourly['DayOfWeek'] = timestamp_parts.dayofweek  # Monday=0, Sunday=6
traffic_hourly['Month'] = timestamp_parts.month

# Preview the timestamp alongside the three derived columns
calendar_columns = ['DateTime', 'Hour', 'DayOfWeek', 'Month']
traffic_hourly[calendar_columns].head()


Unnamed: 0,DateTime,Hour,DayOfWeek,Month
0,2015-11-01 00:00:00,0,6,11
1,2015-11-01 00:00:00,0,6,11
2,2015-11-01 00:00:00,0,6,11
3,2015-11-01 01:00:00,1,6,11
4,2015-11-01 01:00:00,1,6,11


In [7]:
# Create lag features for previous 1 hour and 24 hours.
# The lagged value is looked up by timestamp (same junction, DateTime minus
# the lag) rather than by row offset. A positional shift would silently pick
# up the wrong hour whenever a junction has a missing hourly row; with the
# look-up, a missing source hour yields NaN instead.
counts_by_slot = traffic_hourly.set_index(['Junction', 'DateTime'])['Vehicle_Count']

for lag_column, lag_hours in (('Lag_1hr', 1), ('Lag_24hr', 24)):
    lagged_slots = pd.MultiIndex.from_arrays(
        [
            traffic_hourly['Junction'],
            traffic_hourly['DateTime'] - pd.Timedelta(hours=lag_hours),
        ]
    )
    # reindex needs each (Junction, DateTime) pair to be unique and raises
    # otherwise. to_numpy() drops the look-up index so the values are assigned
    # to the rows of traffic_hourly by position.
    traffic_hourly[lag_column] = counts_by_slot.reindex(lagged_slots).to_numpy()

traffic_hourly[['DateTime','Junction','Vehicle_Count','Lag_1hr','Lag_24hr']].head(30)


Unnamed: 0,DateTime,Junction,Vehicle_Count,Lag_1hr,Lag_24hr
0,2015-11-01 00:00:00,1,15,,
1,2015-11-01 00:00:00,2,6,,
2,2015-11-01 00:00:00,3,9,,
3,2015-11-01 01:00:00,1,13,15.0,
4,2015-11-01 01:00:00,2,6,6.0,
5,2015-11-01 01:00:00,3,7,9.0,
6,2015-11-01 02:00:00,1,10,13.0,
7,2015-11-01 02:00:00,2,5,6.0,
8,2015-11-01 02:00:00,3,5,7.0,
9,2015-11-01 03:00:00,1,7,10.0,


In [8]:
# Weekend indicator: 1 when DayOfWeek is 5 (Saturday) or 6 (Sunday), else 0
weekend_days = [5,6]
is_weekend = traffic_hourly['DayOfWeek'].isin(weekend_days)
traffic_hourly['IsWeekend'] = is_weekend.astype(int)

# Special events example: November 1st is treated as a special event.
# The timestamp is rendered as "month-day", so the flag is 1 for that
# calendar date in any year.
month_day = traffic_hourly['DateTime'].dt.strftime("%m-%d")
traffic_hourly['IsSpecialEvent'] = (month_day == "11-01").astype(int)

traffic_hourly[['DateTime','DayOfWeek','IsWeekend','IsSpecialEvent']].head(30)


Unnamed: 0,DateTime,DayOfWeek,IsWeekend,IsSpecialEvent
0,2015-11-01 00:00:00,6,1,1
1,2015-11-01 00:00:00,6,1,1
2,2015-11-01 00:00:00,6,1,1
3,2015-11-01 01:00:00,6,1,1
4,2015-11-01 01:00:00,6,1,1
5,2015-11-01 01:00:00,6,1,1
6,2015-11-01 02:00:00,6,1,1
7,2015-11-01 02:00:00,6,1,1
8,2015-11-01 02:00:00,6,1,1
9,2015-11-01 03:00:00,6,1,1


In [9]:
# Pairwise correlation matrix; non-numeric columns are left out
corr = traffic_hourly.corr(numeric_only=True)

# Take the Vehicle_Count column of the matrix and list every feature's
# correlation with it, from the most positive to the most negative
vehicle_count_corr = corr['Vehicle_Count']
vehicle_count_corr.sort_values(ascending=False)


Vehicle_Count               1.000000
Vehicle_Count_Normalized    1.000000
Lag_1hr                     0.970143
Lag_24hr                    0.905146
Hour                        0.219938
IsSpecialEvent             -0.013457
Month                      -0.022723
DayOfWeek                  -0.126027
IsWeekend                  -0.150550
Junction                   -0.613787
Name: Vehicle_Count, dtype: float64

In [10]:
from sklearn.ensemble import RandomForestRegressor

# Keep only complete rows: any row with a missing value (such as the NaNs in
# the lag columns) is removed before fitting
train_data = traffic_hourly.dropna()

# Model inputs and target. Junction is not part of the feature list.
features = [
    'Hour', 'DayOfWeek', 'Month',
    'Lag_1hr', 'Lag_24hr',
    'IsWeekend', 'IsSpecialEvent',
]
X, y = train_data[features], train_data['Vehicle_Count']

# Fit a simple Random Forest on every remaining row; the fixed random_state
# makes the fit repeatable
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X, y)

# Label each importance score with its feature name, largest first
importances = pd.Series(rf.feature_importances_, index=features)
importances = importances.sort_values(ascending=False)
print(importances)


Lag_1hr           0.944637
Lag_24hr          0.020305
Hour              0.018811
Month             0.008406
DayOfWeek         0.006686
IsWeekend         0.001130
IsSpecialEvent    0.000025
dtype: float64
