# Data Cleaning and Pre-processing

In [42]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
import warnings
warnings.filterwarnings('ignore')

## 1. Load Raw Data
* Read the collected data into pandas DataFrames.

In [43]:
# Load raw data
df = pd.read_csv("Dataset_Uber Traffic.csv")
df.head()

Unnamed: 0,DateTime,Junction,Vehicles,ID
0,01/11/15 0:00,1,15,20151101001
1,01/11/15 1:00,1,13,20151101011
2,01/11/15 2:00,1,10,20151101021
3,01/11/15 3:00,1,7,20151101031
4,01/11/15 4:00,1,9,20151101041


In [44]:
# Check the shape of the DataFrame
df.shape

(48120, 4)

In [45]:
# Check the data type of the columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48120 entries, 0 to 48119
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   DateTime  48120 non-null  object
 1   Junction  48120 non-null  int64 
 2   Vehicles  48120 non-null  int64 
 3   ID        48120 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 1.5+ MB


## 2. Clean Data:
* Missing values using methods like imputation or removal (df.fillna(), df.dropna()).
* Remove duplicates (df.drop_duplicates()).
* Correct data types (df.astype()).


In [46]:
# Hadle missing values
df.fillna(method = "ffill", inplace = True)          # Forward fill missing values
df.dropna(inplace = True)                            # Dropping NaN (null) values

In [47]:
# Remove duplicates
df.drop_duplicates(inplace = True)

In [48]:
# Correct data types
# Convert DateTime column to datetime data type
df["DateTime"] = pd.to_datetime(df["DateTime"])

## 3. Aggregate traffic data:
- Compile traffic data into hourly intervals for each junction.
- Ensure data includes relevant details such as vehicle counts.

In [49]:
# Aggregate data into hourly intervals
df.set_index('DateTime', inplace = True)
df = df.resample('H').sum().reset_index()
df.head()

Unnamed: 0,DateTime,Junction,Vehicles,ID
0,2015-01-11 00:00:00,6,30,60453303006
1,2015-01-11 01:00:00,6,26,60453303036
2,2015-01-11 02:00:00,6,20,60453303066
3,2015-01-11 03:00:00,6,14,60453303096
4,2015-01-11 04:00:00,6,18,60453303126


## 4. Feature Engineering:
* Create new features from raw data:
  - Generate time-based features such as hour of the day, day of the week, and month.
  - Develop lag features by including traffic data from previous hours or days to capture temporal dependencies.
  - Create binary indicators for weekends and special events to account for their impact on traffic.

In [50]:
# Generate time-based features such as hour of the day, day of the week, and month
df["hour"] = df['DateTime'].dt.hour
df["day_of_week"] = df['DateTime'].dt.dayofweek
df['month'] = df['DateTime'].dt.month

In [53]:
# Create lalg features 
df['traffic_lag_1'] = df['Vehicles'].shift(1)
df['traffic_lag_24'] = df['Vehicles'].shift(24)

In [56]:
# Create binary indicators for weekends
df['weekend'] = df['day_of_week'].apply(lambda x : 1 if x >= 5 else 0)

In [57]:
# Fill NaN values for shifting
df.fillna(0, inplace = True)

## 5. Preprocess the data:
* Normalize or standardize the data to facilitate comparison across different time periods and junctions.

In [58]:
scaler = StandardScaler()
df[['Vehicles', 'traffic_lag_1', 'traffic_lag_24']] = scaler.fit_transform(
    df[['Vehicles', 'traffic_lag_1', 'traffic_lag_24']])

## 6. Feature Selection using Linear Regreesion (OLS):

In [66]:
# Feature Selection using Linear Regression (OLS)
X = df[['hour', 'day_of_week', 'month', 'weekend', 'traffic_lag_1', 'traffic_lag_24']]
y = df['Vehicles']

In [67]:
# Add constant for intercept in OLS
X = sm.add_constant(X)

In [68]:
# Fit the model
model = sm.OLS(y, X).fit()

In [69]:
# Print feature selection results
print("\nFeature Selection Results (p-values): ")
print(model.summary())


Feature Selection Results (p-values): 
                            OLS Regression Results                            
Dep. Variable:               Vehicles   R-squared:                       0.962
Model:                            OLS   Adj. R-squared:                  0.962
Method:                 Least Squares   F-statistic:                 1.073e+05
Date:                Mon, 03 Mar 2025   Prob (F-statistic):               0.00
Time:                        11:49:51   Log-Likelihood:                 5490.8
No. Observations:               25464   AIC:                        -1.097e+04
Df Residuals:                   25457   BIC:                        -1.091e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
cons

In [73]:
# Filter out features with p-values > 0.05 (not statistically significant)
selected_features = X.columns[model.pvalues < 0.05].tolist()
if 'const' in selected_features:
    selected_features.remove('const')        # Remove intercept

In [74]:
print("\nSelected Features: ", selected_features)


Selected Features:  ['hour', 'day_of_week', 'traffic_lag_1', 'traffic_lag_24']
