In [75]:
# Importing Libraries:
import os
import warnings

import pandas as pd

warnings.filterwarnings('ignore')

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Data Cleaning & Pre-processing:

In [78]:
# 1. Loading dataset:
# The CSV location can be overridden with the UBER_TRAFFIC_CSV environment
# variable so the notebook runs on other machines; when the variable is not
# set, the original local path is used unchanged.
DATA_PATH = os.environ.get(
    "UBER_TRAFFIC_CSV",
    r"C:\Users\hp\Desktop\Dataset_Uber Traffic.csv",
)
uber_data = pd.read_csv(DATA_PATH)
uber_data.head()

Unnamed: 0,DateTime,Junction,Vehicles,ID
0,01-11-2015 0.00,1,15,20151101001
1,01-11-2015 1.00,1,13,20151101011
2,01-11-2015 2.00,1,10,20151101021
3,01-11-2015 3.00,1,7,20151101031
4,01-11-2015 4.00,1,9,20151101041


In [80]:
# Initial Inspections:
# Show the (rows, columns) dimensions of the loaded frame.
uber_data.shape

(48120, 4)

In [82]:
# Summarise column names, non-null counts and dtypes.
uber_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48120 entries, 0 to 48119
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   DateTime  48120 non-null  object
 1   Junction  48120 non-null  int64 
 2   Vehicles  48120 non-null  int64 
 3   ID        48120 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 1.5+ MB


In [84]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
uber_data.describe()

Unnamed: 0,Junction,Vehicles,ID
count,48120.0,48120.0,48120.0
mean,2.180549,22.791334,20163300000.0
std,0.966955,20.750063,5944854.0
min,1.0,1.0,20151100000.0
25%,1.0,9.0,20160420000.0
50%,2.0,15.0,20160930000.0
75%,3.0,29.0,20170230000.0
max,4.0,180.0,20170630000.0


In [86]:
# 2. Clean Data:
# Count the missing entries in every column (isna is the canonical
# spelling of isnull; both return the same boolean mask).
uber_data.isna().sum()

DateTime    0
Junction    0
Vehicles    0
ID          0
dtype: int64

In [88]:
# Dropping duplicate values:
# Rebind the name to the de-duplicated frame instead of mutating in place;
# inplace=True offers no performance benefit and prevents method chaining.
uber_data = uber_data.drop_duplicates()

In [90]:
# Re-check the (rows, columns) dimensions after removing duplicates.
uber_data.shape

(48120, 4)

In [92]:
# Correct Data types:
# The raw strings look like "01-11-2015 0.00": day-month-year, then the hour
# and minute separated by a dot, hence the explicit format below.
parsed_timestamps = pd.to_datetime(
    uber_data['DateTime'],
    format='%d-%m-%Y %H.%M',
)
uber_data['DateTime'] = parsed_timestamps

# Confirm that the column now has a datetime dtype.
uber_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48120 entries, 0 to 48119
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   DateTime  48120 non-null  datetime64[ns]
 1   Junction  48120 non-null  int64         
 2   Vehicles  48120 non-null  int64         
 3   ID        48120 non-null  int64         
dtypes: datetime64[ns](1), int64(3)
memory usage: 1.5 MB


In [93]:
# 3. Aggregate Traffic Data (Hourly per Junction):
# Bucket the timestamps into hourly bins, group by (hour, junction) and sum
# the vehicle counts, then flatten the group keys back into ordinary columns.
# The lowercase 'h' alias is used because the uppercase 'H' alias is
# deprecated since pandas 2.2 (the warning was being hidden by the global
# warnings filter); 'h' is accepted by older pandas versions as well.
traffic_hourly = (
    uber_data
    .groupby([pd.Grouper(key='DateTime', freq='h'), 'Junction'])['Vehicles']
    .sum()
    .reset_index()
)

In [94]:
# Preview the first rows of the hourly per-junction aggregate.
traffic_hourly.head()

Unnamed: 0,DateTime,Junction,Vehicles
0,2015-11-01 00:00:00,1,15
1,2015-11-01 00:00:00,2,6
2,2015-11-01 00:00:00,3,9
3,2015-11-01 01:00:00,1,13
4,2015-11-01 01:00:00,2,6


In [112]:
# Pre-process the data:
# Standardize the vehicle counts (zero mean, unit variance) to facilitate
# comparison across different time periods and junctions.
# A single scaler is fitted on the whole 'Vehicles' column, i.e. the mean and
# standard deviation are computed over all junctions and all hours together.
scaler = StandardScaler()
vehicle_counts = traffic_hourly[['Vehicles']]  # 2-D selection, as the scaler expects
traffic_hourly['Vehicles_scaled'] = scaler.fit(vehicle_counts).transform(vehicle_counts)
traffic_hourly.head()

Unnamed: 0,DateTime,Junction,Vehicles,Vehicles_scaled
0,2015-11-01 00:00:00,1,15,-0.375489
1,2015-11-01 00:00:00,2,6,-0.809227
2,2015-11-01 00:00:00,3,9,-0.664648
3,2015-11-01 01:00:00,1,13,-0.471875
4,2015-11-01 01:00:00,2,6,-0.809227


# Feature Engineering and Selection:

In [116]:
# Time-based features derived from the timestamp column
timestamp_parts = traffic_hourly['DateTime'].dt
traffic_hourly['Hour'] = timestamp_parts.hour
traffic_hourly['DayOfWeek'] = timestamp_parts.dayofweek  # Monday=0 ... Sunday=6
traffic_hourly['Month'] = timestamp_parts.month
# Saturday (5) and Sunday (6) are flagged as weekend days
traffic_hourly['IsWeekend'] = traffic_hourly['DayOfWeek'].isin([5, 6]).astype(int)

# Order rows chronologically within each junction so that shifting by rows
# walks backwards in time
traffic_hourly = traffic_hourly.sort_values(['Junction', 'DateTime'])

# Lag features: the vehicle count observed N rows earlier at the same junction.
# NOTE(review): shift() counts rows, not hours, so Lag_24 equals "same hour
# yesterday" only if no hourly rows are missing for a junction - TODO confirm.
vehicles_by_junction = traffic_hourly.groupby('Junction')['Vehicles']
lag_steps = {'Lag_1': 1, 'Lag_2': 2, 'Lag_24': 24}
for lag_column, steps in lag_steps.items():
    traffic_hourly[lag_column] = vehicles_by_junction.shift(steps)

traffic_hourly.head()

Unnamed: 0,DateTime,Junction,Vehicles,Vehicles_scaled,Hour,DayOfWeek,Month,IsWeekend,Lag_1,Lag_2,Lag_24
0,2015-11-01 00:00:00,1,15,-0.375489,0,6,11,1,,,
3,2015-11-01 01:00:00,1,13,-0.471875,1,6,11,1,15.0,,
6,2015-11-01 02:00:00,1,10,-0.616454,2,6,11,1,13.0,15.0,
9,2015-11-01 03:00:00,1,7,-0.761034,3,6,11,1,10.0,13.0,
12,2015-11-01 04:00:00,1,9,-0.664648,4,6,11,1,7.0,10.0,


In [124]:
# RandomForestRegressor and train_test_split are imported in the notebook's
# top import cell so that all dependencies are declared in one place.

# Drop rows with missing values; the lag columns are NaN at the start of each
# junction's series because there is no earlier observation to shift from.
df_model = traffic_hourly.dropna()

# Feature columns
feature_cols = ['Hour', 'DayOfWeek', 'Month', 'IsWeekend', 'Lag_1', 'Lag_2', 'Lag_24']
X = df_model[feature_cols]
y = df_model['Vehicles']

# Train-test split (80% train / 20% test, fixed seed for reproducibility).
# NOTE(review): train_test_split shuffles rows by default, so hours adjacent to
# test rows end up in the training set and their values reappear as lag
# features. Confirm whether a chronological split is wanted before
# X_test / y_test are used for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit RandomForest to get feature importance (fitted on the training rows only)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Create feature importance DataFrame, most important feature first
feature_importance = pd.DataFrame({
    'Feature': feature_cols,
    'Importance': rf.feature_importances_
}).sort_values(by='Importance', ascending=False)


# There are seven features, so head(8) displays the full ranking.
feature_importance.head(8)

Unnamed: 0,Feature,Importance
4,Lag_1,0.942443
6,Lag_24,0.017474
0,Hour,0.016546
5,Lag_2,0.011259
2,Month,0.006071
1,DayOfWeek,0.005165
3,IsWeekend,0.001043
