In [25]:
import pandas as pd
import numpy as np

from ucimlrepo import fetch_ucirepo
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split

In [26]:
metro_interstate_traffic_volume = fetch_ucirepo(id=492) 
traffic_data = metro_interstate_traffic_volume.data.original
traffic_data

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,date_time,traffic_volume
0,,288.28,0.0,0.0,40,Clouds,scattered clouds,2012-10-02 09:00:00,5545
1,,289.36,0.0,0.0,75,Clouds,broken clouds,2012-10-02 10:00:00,4516
2,,289.58,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 11:00:00,4767
3,,290.13,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 12:00:00,5026
4,,291.14,0.0,0.0,75,Clouds,broken clouds,2012-10-02 13:00:00,4918
...,...,...,...,...,...,...,...,...,...
48199,,283.45,0.0,0.0,75,Clouds,broken clouds,2018-09-30 19:00:00,3543
48200,,282.76,0.0,0.0,90,Clouds,overcast clouds,2018-09-30 20:00:00,2781
48201,,282.73,0.0,0.0,90,Thunderstorm,proximity thunderstorm,2018-09-30 21:00:00,2159
48202,,282.09,0.0,0.0,90,Clouds,overcast clouds,2018-09-30 22:00:00,1450


In [27]:
traffic_data.head(5)

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,date_time,traffic_volume
0,,288.28,0.0,0.0,40,Clouds,scattered clouds,2012-10-02 09:00:00,5545
1,,289.36,0.0,0.0,75,Clouds,broken clouds,2012-10-02 10:00:00,4516
2,,289.58,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 11:00:00,4767
3,,290.13,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 12:00:00,5026
4,,291.14,0.0,0.0,75,Clouds,broken clouds,2012-10-02 13:00:00,4918


In [28]:
traffic_data.tail(5)

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,date_time,traffic_volume
48199,,283.45,0.0,0.0,75,Clouds,broken clouds,2018-09-30 19:00:00,3543
48200,,282.76,0.0,0.0,90,Clouds,overcast clouds,2018-09-30 20:00:00,2781
48201,,282.73,0.0,0.0,90,Thunderstorm,proximity thunderstorm,2018-09-30 21:00:00,2159
48202,,282.09,0.0,0.0,90,Clouds,overcast clouds,2018-09-30 22:00:00,1450
48203,,282.12,0.0,0.0,90,Clouds,overcast clouds,2018-09-30 23:00:00,954


In [29]:
traffic_data.sample(5)

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,date_time,traffic_volume
19638,,271.86,0.0,0.0,90,Clouds,overcast clouds,2015-11-23 06:00:00,5651
18228,,290.07,0.0,0.0,1,Clear,sky is clear,2015-09-13 10:00:00,3069
20218,,270.69,0.0,0.0,90,Snow,snow,2015-12-26 06:00:00,843
29356,,275.77,0.0,0.0,40,Clouds,scattered clouds,2016-12-22 15:00:00,5496
14774,,299.04,0.0,0.0,40,Clouds,scattered clouds,2014-05-31 09:00:00,4155


In [30]:
# check for Data types
traffic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48204 entries, 0 to 48203
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   holiday              61 non-null     object 
 1   temp                 48204 non-null  float64
 2   rain_1h              48204 non-null  float64
 3   snow_1h              48204 non-null  float64
 4   clouds_all           48204 non-null  int64  
 5   weather_main         48204 non-null  object 
 6   weather_description  48204 non-null  object 
 7   date_time            48204 non-null  object 
 8   traffic_volume       48204 non-null  int64  
dtypes: float64(3), int64(2), object(4)
memory usage: 3.3+ MB


In [31]:
# check for basic statistics (numeric columns and categorical columns)
traffic_data.describe(include='all')

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,date_time,traffic_volume
count,61,48204.0,48204.0,48204.0,48204.0,48204,48204,48204,48204.0
unique,11,,,,,11,38,40575,
top,Labor Day,,,,,Clouds,sky is clear,2013-05-19 10:00:00,
freq,7,,,,,15164,11665,6,
mean,,281.20587,0.334264,0.000222,49.362231,,,,3259.818355
std,,13.338232,44.789133,0.008168,39.01575,,,,1986.86067
min,,0.0,0.0,0.0,0.0,,,,0.0
25%,,272.16,0.0,0.0,1.0,,,,1193.0
50%,,282.45,0.0,0.0,64.0,,,,3380.0
75%,,291.806,0.0,0.0,90.0,,,,4933.0


In [32]:
traffic_data.isna().sum()

holiday                48143
temp                       0
rain_1h                    0
snow_1h                    0
clouds_all                 0
weather_main               0
weather_description        0
date_time                  0
traffic_volume             0
dtype: int64

In [34]:
traffic_data['date_time'] = pd.to_datetime(traffic_data['date_time'])
traffic_data.sample(5)

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,date_time,traffic_volume
27137,,284.001,0.0,0.0,36,Clouds,scattered clouds,2016-10-06 08:00:00,5694
30888,,256.29,0.0,0.0,1,Clear,sky is clear,2017-02-09 07:00:00,6630
4303,,269.42,0.0,0.0,90,Mist,mist,2013-03-15 09:00:00,4631
46488,,298.5,0.0,0.0,40,Clouds,scattered clouds,2018-08-06 14:00:00,4814
14347,,280.92,0.0,0.0,32,Clouds,scattered clouds,2014-05-15 11:00:00,5977


In [35]:
print({traffic_data['date_time'].min(), traffic_data['date_time'].max()})
print(len(traffic_data))

{Timestamp('2018-09-30 23:00:00'), Timestamp('2012-10-02 09:00:00')}
48204


In [38]:
len(pd.date_range(start=traffic_data['date_time'].min(), end=traffic_data['date_time'].max(), freq='h'))

52551

In [39]:
skeleton = pd.date_range(start = traffic_data['date_time'].min(), end = traffic_data['date_time'].max(), freq='h')
len(skeleton)

52551

In [40]:
skeleton_df = pd.DataFrame(skeleton, columns = ['date_time'])

In [41]:
df_full = pd.merge(skeleton_df, traffic_data, on = 'date_time', how = 'left')
df_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60180 entries, 0 to 60179
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   date_time            60180 non-null  datetime64[ns]
 1   holiday              61 non-null     object        
 2   temp                 48204 non-null  float64       
 3   rain_1h              48204 non-null  float64       
 4   snow_1h              48204 non-null  float64       
 5   clouds_all           48204 non-null  float64       
 6   weather_main         48204 non-null  object        
 7   weather_description  48204 non-null  object        
 8   traffic_volume       48204 non-null  float64       
 9   temp_C               48204 non-null  float64       
 10  hour                 48204 non-null  float64       
 11  hour_sin             48204 non-null  float64       
 12  hour_cos             48204 non-null  float64       
dtypes: datetime64[ns](1), float64(9

In [None]:
#encoding categorical data into binary or one-hot encoded variables, if you want to use them in you GLM
traffic_data_encoded = pd.get_dummies(traffic_data, drop_first=True)
traffic_data_encoded.head(5)

# Define the formula for the GLM model

Unnamed: 0,temp,rain_1h,snow_1h,clouds_all,traffic_volume,holiday_Columbus Day,holiday_Independence Day,holiday_Labor Day,holiday_Martin Luther King Jr Day,holiday_Memorial Day,...,date_time_2018-09-30 14:00:00,date_time_2018-09-30 15:00:00,date_time_2018-09-30 16:00:00,date_time_2018-09-30 17:00:00,date_time_2018-09-30 18:00:00,date_time_2018-09-30 19:00:00,date_time_2018-09-30 20:00:00,date_time_2018-09-30 21:00:00,date_time_2018-09-30 22:00:00,date_time_2018-09-30 23:00:00
0,288.28,0.0,0.0,40,5545,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,289.36,0.0,0.0,75,4516,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,289.58,0.0,0.0,90,4767,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,290.13,0.0,0.0,90,5026,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,291.14,0.0,0.0,75,4918,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [36]:
#convert temperature from Kelvin to °Celcius
traffic_data['temp_C'] = traffic_data['temp'] - 273.15
traffic_data[['temp', 'temp_C']].head(5)


Unnamed: 0,temp,temp_C
0,288.28,15.13
1,289.36,16.21
2,289.58,16.43
3,290.13,16.98
4,291.14,17.99


In [37]:
#encode time data from linear to cyclic representation
traffic_data['hour'] = traffic_data['date_time'].dt.hour
traffic_data['hour_sin'] = np.sin(2 * np.pi * traffic_data['hour'] / 24)
traffic_data['hour_cos'] = np.cos(2 * np.pi * traffic_data['hour'] / 24)
traffic_data[['hour', 'hour_sin', 'hour_cos']].head(5)  


Unnamed: 0,hour,hour_sin,hour_cos
0,9,0.7071068,-0.707107
1,10,0.5,-0.866025
2,11,0.258819,-0.965926
3,12,1.224647e-16,-1.0
4,13,-0.258819,-0.965926
