In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA

# For handling imbalanced data
from imblearn.over_sampling import SMOTE


In [3]:
# Colab-specific helper module; this import is only available when the notebook
# runs inside Google Colab.
from google.colab import files


# Prompt for a file upload through the Colab helper. The recorded output shows
# the file being saved as 'traffic_volume_data.csv', which the loading cell reads.
uploaded = files.upload()

Saving traffic_volume_data.csv to traffic_volume_data.csv


In [5]:
# Read the uploaded CSV into the working DataFrame.
csv_path = 'traffic_volume_data.csv'
data = pd.read_csv(csv_path)

# Preview the first five rows as a quick sanity check on the parse.
data.head()

Unnamed: 0,date_time,is_holiday,air_pollution_index,humidity,wind_speed,wind_direction,visibility_in_miles,dew_point,temperature,rain_p_h,...,last_2_hour_traffic,last_3_hour_traffic,last_4_hour_traffic,last_5_hour_traffic,last_6_hour_traffic,hour,month_day,weekday,month,year
0,2012-10-02 15:00:00,0,184,64,3,328,7,7,293.17,0.0,...,4918.0,5026.0,4767.0,4516.0,5545.0,15,2,2,10,2012
1,2012-10-02 16:00:00,0,167,64,3,327,7,7,293.86,0.0,...,5181.0,4918.0,5026.0,4767.0,4516.0,16,2,2,10,2012
2,2012-10-02 17:00:00,0,119,63,3,327,6,6,294.14,0.0,...,5584.0,5181.0,4918.0,5026.0,4767.0,17,2,2,10,2012
3,2012-10-02 18:00:00,0,161,63,3,326,3,3,293.1,0.0,...,6015.0,5584.0,5181.0,4918.0,5026.0,18,2,2,10,2012
4,2012-10-02 19:00:00,0,243,62,3,326,8,8,290.97,0.0,...,5791.0,6015.0,5584.0,5181.0,4918.0,19,2,2,10,2012


In [19]:
# Summarise the frame: column names, non-null counts and dtypes.
# NOTE(review): the recorded output lists `date_time` as datetime64[ns], so this
# cell was last executed after the `pd.to_datetime` conversion further down; on a
# top-to-bottom run the column would presumably still be `object` here - confirm.
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33744 entries, 0 to 33743
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   date_time            33744 non-null  datetime64[ns]
 1   is_holiday           33744 non-null  int64         
 2   air_pollution_index  33744 non-null  int64         
 3   humidity             33744 non-null  int64         
 4   wind_speed           33744 non-null  int64         
 5   wind_direction       33744 non-null  int64         
 6   visibility_in_miles  33744 non-null  int64         
 7   dew_point            33744 non-null  int64         
 8   temperature          33744 non-null  float64       
 9   rain_p_h             33744 non-null  float64       
 10  snow_p_h             33744 non-null  float64       
 11  clouds_all           33744 non-null  int64         
 12  weather_type         33744 non-null  object        
 13  weather_description  33744 non-

In [21]:
# Handle missing values: impute each numeric column with that column's mean.
# Non-numeric columns are not touched by this step.
numeric_columns = data.select_dtypes(include=np.number).columns
numeric_part = data[numeric_columns]
data[numeric_columns] = numeric_part.fillna(numeric_part.mean())

In [22]:
# Count the remaining missing values per column; all zeros means nothing is left to fill.
print(data.isna().sum())

date_time              0
is_holiday             0
air_pollution_index    0
humidity               0
wind_speed             0
wind_direction         0
visibility_in_miles    0
dew_point              0
temperature            0
rain_p_h               0
snow_p_h               0
clouds_all             0
weather_type           0
weather_description    0
traffic_volume         0
last_1_hour_traffic    0
last_2_hour_traffic    0
last_3_hour_traffic    0
last_4_hour_traffic    0
last_5_hour_traffic    0
last_6_hour_traffic    0
hour                   0
month_day              0
weekday                0
month                  0
year                   0
day                    0
day_of_week            0
is_weekend             0
dtype: int64


In [23]:
# Descriptive statistics (count, mean, min, quartiles, max, std) for each column.
# NOTE(review): the recorded output shows a `rain_p_h` maximum of 9831.3 and a
# `temperature` minimum of 0.0 - these look like outliers or placeholder values
# worth checking before modelling.
data.describe()

Unnamed: 0,date_time,is_holiday,air_pollution_index,humidity,wind_speed,wind_direction,visibility_in_miles,dew_point,temperature,rain_p_h,...,last_5_hour_traffic,last_6_hour_traffic,hour,month_day,weekday,month,year,day,day_of_week,is_weekend
count,33744,33744.0,33744.0,33744.0,33744.0,33744.0,33744.0,33744.0,33744.0,33744.0,...,33744.0,33744.0,33744.0,33744.0,33744.0,33744.0,33744.0,33744.0,33744.0,33744.0
mean,2015-02-17 20:15:46.194879232,0.001274,154.847143,71.209282,3.378289,199.448821,4.989983,4.989983,280.067816,0.448819,...,3240.25163,3240.346373,11.404368,15.649301,3.983019,6.407035,2014.640292,15.649301,2.983019,0.28325
min,2012-10-02 15:00:00,0.0,10.0,13.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,1.0,2012.0,1.0,0.0,0.0
25%,2013-08-16 08:45:00,0.0,83.0,60.0,2.0,130.0,3.0,3.0,271.72,0.0,...,1165.0,1165.0,5.0,8.0,2.0,3.0,2013.0,8.0,1.0,0.0
50%,2015-07-26 10:30:00,0.0,155.0,72.0,3.0,200.0,5.0,5.0,280.15,0.0,...,3335.5,3336.0,11.0,16.0,4.0,6.0,2015.0,16.0,3.0,0.0
75%,2016-07-28 11:15:00,0.0,228.0,85.0,5.0,290.0,7.0,7.0,290.62,0.0,...,4926.0,4926.0,17.0,23.0,6.0,10.0,2016.0,23.0,5.0,1.0
max,2017-05-17 23:00:00,1.0,299.0,100.0,16.0,360.0,9.0,9.0,308.24,9831.3,...,7280.0,7280.0,23.0,31.0,7.0,12.0,2017.0,31.0,6.0,1.0
std,,0.035675,83.733242,16.853337,2.055956,99.835021,2.569985,2.569985,13.415782,53.531259,...,1991.548197,1991.581794,6.951659,8.699438,2.005721,3.543232,1.572591,8.699438,2.005721,0.450584


In [10]:
# Parse the 'date_time' column into pandas datetime values so the `.dt` accessor
# can be used on it afterwards.
data = data.assign(date_time=pd.to_datetime(data['date_time']))


In [11]:
# Derive calendar features from the parsed 'date_time' column via the `.dt` accessor.
# NOTE(review): the dataset already ships `hour`, `month`, `year`, `month_day` and
# `weekday` columns (see the head()/describe() output); `day` appears to duplicate
# `month_day`, and `day_of_week` looks like `weekday` shifted by one - confirm
# whether both copies are wanted as model features.
timestamps = data['date_time']
data['year'] = timestamps.dt.year
data['month'] = timestamps.dt.month
data['day'] = timestamps.dt.day
data['hour'] = timestamps.dt.hour
data['day_of_week'] = timestamps.dt.dayofweek

# dayofweek counts Monday as 0 and Sunday as 6, so values >= 5 mark the weekend.
# A vectorised comparison replaces the per-row Python lambda; the explicit int64
# cast keeps the 0/1 integer encoding the lambda produced.
data['is_weekend'] = (data['day_of_week'] >= 5).astype('int64')


In [26]:
# List the current column labels, e.g. to confirm the engineered date features exist.
print(data.columns)


Index(['date_time', 'is_holiday', 'air_pollution_index', 'humidity',
       'wind_speed', 'wind_direction', 'visibility_in_miles', 'dew_point',
       'temperature', 'rain_p_h', 'snow_p_h', 'clouds_all', 'weather_type',
       'weather_description', 'traffic_volume', 'last_1_hour_traffic',
       'last_2_hour_traffic', 'last_3_hour_traffic', 'last_4_hour_traffic',
       'last_5_hour_traffic', 'last_6_hour_traffic', 'hour', 'month_day',
       'weekday', 'month', 'year', 'day', 'day_of_week', 'is_weekend'],
      dtype='object')


In [27]:
# Encode categorical variables.
# 'weather_type' and 'weather_description' are the two text columns; one-hot
# encode them, with drop_first=True omitting the first level of each column.
categorical_columns = ['weather_type', 'weather_description']
data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)


In [29]:
# Collect the labels of every numeric-dtype column and show them.
numerical_cols = data.select_dtypes(include='number').columns

print(numerical_cols)

Index(['is_holiday', 'air_pollution_index', 'humidity', 'wind_speed',
       'wind_direction', 'visibility_in_miles', 'dew_point', 'temperature',
       'rain_p_h', 'snow_p_h', 'clouds_all', 'traffic_volume',
       'last_1_hour_traffic', 'last_2_hour_traffic', 'last_3_hour_traffic',
       'last_4_hour_traffic', 'last_5_hour_traffic', 'last_6_hour_traffic',
       'hour', 'month_day', 'weekday', 'month', 'year', 'day', 'day_of_week',
       'is_weekend'],
      dtype='object')


In [31]:
# Scale numerical features.
# Weather measurements to standardise.
numerical_features = ['temperature', 'rain_p_h', 'snow_p_h', 'clouds_all']

# Fit the scaler on these columns, then overwrite them with the scaled values.
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split performed later in the notebook, so test rows contribute to the fitted
# statistics - confirm this is acceptable.
scaler = StandardScaler()
scaler.fit(data[numerical_features])
data[numerical_features] = scaler.transform(data[numerical_features])


In [32]:
# Outlier detection and treatment for 'traffic_volume' using the 1.5 * IQR rule.
Q1 = data['traffic_volume'].quantile(0.25)
Q3 = data['traffic_volume'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep only rows whose volume lies inside [lower_bound, upper_bound]; `between`
# is inclusive on both ends, matching the original >= / <= comparison.
# .copy() turns the filtered result into an independent frame, so the column
# assignments and drops performed on `data` in later cells do not trigger
# SettingWithCopyWarning.
data = data[data['traffic_volume'].between(lower_bound, upper_bound)].copy()


In [37]:
# Feature interaction: element-wise product of 'temperature' and 'hour'.
# NOTE(review): on a top-to-bottom run 'temperature' has already been standardised
# by the scaling cell, so this multiplies a scaled value by the raw hour - confirm
# that is the intended interaction.
data['temp_hour_interaction'] = data['temperature'].mul(data['hour'])


In [43]:
# Dimensionality reduction: PCA over the weather measurement columns.
features_for_pca = ['temperature', 'rain_p_h', 'snow_p_h', 'clouds_all']

# Keep the PCA input under its own name. Binding it to `X` would overwrite the
# model feature matrix that other cells in this notebook refer to as `X`.
X_weather = data[features_for_pca]

# NOTE(review): n_components equals the number of input columns, so the data is
# rotated onto its principal axes without discarding any dimension; choose a
# smaller value if an actual reduction is intended.
pca = PCA(n_components=4)

# Fit the projection and transform the data in one step.
X_pca = pca.fit_transform(X_weather)

In [58]:
# Handling imbalanced data: random oversampling of the training split.
from imblearn.over_sampling import RandomOverSampler

# X_train / y_train come from the train/test split cell further down; that cell
# has to be executed before this one.
# The split is built from `data` after 'date_time' has been dropped, so the column
# may already be absent from X_train; errors='ignore' prevents a KeyError then.
X_train_no_date = X_train.drop(columns=['date_time'], errors='ignore')

# Fixed seed so the resampled rows are reproducible.
ros = RandomOverSampler(random_state=42)

# NOTE(review): y_train is 'traffic_volume', a numeric regression target, while
# oversampling is a class-balancing technique - confirm this step is intended.
X_train_resampled, y_train_resampled = ros.fit_resample(X_train_no_date, y_train)


In [60]:
# Feature selection with recursive feature elimination (RFE).

# Build the inputs straight from `data` instead of relying on `X` / `y`: those
# names are only assigned as the model features / target in the train/test split
# cell further down, so using them here depends on out-of-order execution.
# errors='ignore' covers the case where 'date_time' has already been removed.
X_no_date = data.drop(columns=['traffic_volume', 'date_time'], errors='ignore')
rfe_target = data['traffic_volume']

# Use a linear regression as the estimator inside RFE and keep 10 features.
model = LinearRegression()
rfe = RFE(model, n_features_to_select=10)

# Fit the selector and reduce the feature matrix to the selected columns.
X_rfe = rfe.fit_transform(X_no_date, rfe_target)

In [61]:
# Drop the raw 'date_time' column before modelling.
# Reassigning the result (instead of inplace=True) together with errors='ignore'
# makes the cell safe to re-run: a second execution no longer raises KeyError
# because the column is already gone.
data = data.drop(columns=['date_time'], errors='ignore')


In [62]:
# Split the dataset into training and testing sets.

# 'traffic_volume' is the target; every remaining column is used as a feature.
target_column = 'traffic_volume'
X = data.drop(columns=[target_column])
y = data[target_column]

# Hold out 20% of the rows for testing; the fixed seed keeps the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [63]:
# Report the (rows, columns) shape of the feature matrix and the length of the
# target for each split, as a check that the 80/20 split produced matching sizes.
print(f"Training set: {X_train.shape}, {y_train.shape}")
print(f"Testing set: {X_test.shape}, {y_test.shape}")


Training set: (26995, 73), (26995,)
Testing set: (6749, 73), (6749,)
