# Context

JanataHackaton: Machine Learning for IoT


IoT devices are becoming popular nowadays. The widespread use of IoT yields huge amounts of raw data. This data can be effectively processed by using machine learning to derive many useful insights that can become game changers and affect our lives deeply.


## Problem statement 

This analysis is part of the JanataHack. 
We're working with our government to transform our city into a smart city. the vision is to convert it into a digital and intelligent city to improve the efficiency of services for the citizens.

One of the problems faced by the government is traffic. Through this analysis, we need to manage the traffic of the city better and provide input infrasctructure planning for the future.

The gov wants to implement a robust traffic system by being prepared for peaks. They would like to understand the traffic patterns of the 4 junctions, on holidays, yearly occasions and their difference with normal working days.

## Mission

Our mission is to predict traffic in each of the 4 junctions for the next 4 months based on historical data of the past 20 months.

# Environment setting & data loading 

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import time

''' Data visualisation'''
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

''' Scikit-Learn'''
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn import set_config

set_config(display='diagram')
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from sklearn.metrics import confusion_matrix

''' pipeline, preproc and models '''
from imblearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.ensemble import RandomForestRegressor , BaggingRegressor , GradientBoostingRegressor, VotingRegressor
from xgboost import XGBRegressor


In [2]:
# load data 
data= pd.read_csv('../IOTTimeSeries/data/train_ML_IOT.csv')
test_data= pd.read_csv('../IOTTimeSeries/data/test_ML_IOT.csv')

In [3]:
data.head()

Unnamed: 0,DateTime,Junction,Vehicles,ID
0,2015-11-01 00:00:00,1,15,20151101001
1,2015-11-01 01:00:00,1,13,20151101011
2,2015-11-01 02:00:00,1,10,20151101021
3,2015-11-01 03:00:00,1,7,20151101031
4,2015-11-01 04:00:00,1,9,20151101041


In [4]:
test_data.tail()

Unnamed: 0,DateTime,Junction,ID
11803,2017-10-31 19:00:00,4,20171031194
11804,2017-10-31 20:00:00,4,20171031204
11805,2017-10-31 21:00:00,4,20171031214
11806,2017-10-31 22:00:00,4,20171031224
11807,2017-10-31 23:00:00,4,20171031234


# EDA 

In [5]:
# data shape
print(data.shape , test_data.shape)

(48120, 4) (11808, 3)


In [6]:
#train info 
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48120 entries, 0 to 48119
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   DateTime  48120 non-null  object
 1   Junction  48120 non-null  int64 
 2   Vehicles  48120 non-null  int64 
 3   ID        48120 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 1.5+ MB


In [7]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11808 entries, 0 to 11807
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   DateTime  11808 non-null  object
 1   Junction  11808 non-null  int64 
 2   ID        11808 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 276.9+ KB


In [8]:
# train data description
data.describe()

Unnamed: 0,Junction,Vehicles,ID
count,48120.0,48120.0,48120.0
mean,2.180549,22.791334,20163300000.0
std,0.966955,20.750063,5944854.0
min,1.0,1.0,20151100000.0
25%,1.0,9.0,20160420000.0
50%,2.0,15.0,20160930000.0
75%,3.0,29.0,20170230000.0
max,4.0,180.0,20170630000.0


In [9]:
# test data description
test_data.describe()

Unnamed: 0,Junction,ID
count,11808.0,11808.0
mean,2.5,20170870000.0
std,1.118081,112466.5
min,1.0,20170700000.0
25%,1.75,20170730000.0
50%,2.5,20170830000.0
75%,3.25,20171000000.0
max,4.0,20171030000.0


In [10]:
#check for data leakage
np.intersect1d(data['ID'], test_data['ID']).shape[0]/data['ID'].nunique()

0.0

In [11]:
# Dropping ID column 
data.drop(['ID'], axis=1, inplace=True)
test_ID= test_data['ID']
test_data.drop(['ID'], axis=1, inplace=True)

## duplicates 

In [12]:
print('before removing duplicates - No of rows :', len(data))
duplicates= data.duplicated()
print('duplicated rows in train data:', duplicates.sum())
data.drop_duplicates(inplace=True)
print('No of rows after removing duplicates:', len(data))

before removing duplicates - No of rows : 48120
duplicated rows in train data: 0
No of rows after removing duplicates: 48120


In [13]:
print('before removing duplicates - No of rows :', len(test_data))
duplicates_test= test_data.duplicated()
print('duplicated rows in train data:', duplicates_test.sum())
test_data.drop_duplicates(inplace=True)
print('No of rows after removing duplicates:', len(test_data))

before removing duplicates - No of rows : 11808
duplicated rows in train data: 0
No of rows after removing duplicates: 11808


## Missing values 

In [14]:
print(data.isnull().sum())
print('\n')
print(test_data.isnull().sum())

DateTime    0
Junction    0
Vehicles    0
dtype: int64


DateTime    0
Junction    0
dtype: int64


## Dtypes

In [15]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48120 entries, 0 to 48119
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   DateTime  48120 non-null  object
 1   Junction  48120 non-null  int64 
 2   Vehicles  48120 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.5+ MB


In [16]:
print(data.Junction.nunique())
print(test_data.Junction.nunique())

4
4


In [17]:
# converting Datetime column in datetime object 
data['DateTime']= pd.to_datetime(data.DateTime)
test_data['DateTime']= pd.to_datetime(test_data.DateTime)

In [18]:
print(data.info())
print('\n')
print(test_data.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48120 entries, 0 to 48119
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   DateTime  48120 non-null  datetime64[ns]
 1   Junction  48120 non-null  int64         
 2   Vehicles  48120 non-null  int64         
dtypes: datetime64[ns](1), int64(2)
memory usage: 1.5 MB
None


<class 'pandas.core.frame.DataFrame'>
Int64Index: 11808 entries, 0 to 11807
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   DateTime  11808 non-null  datetime64[ns]
 1   Junction  11808 non-null  int64         
dtypes: datetime64[ns](1), int64(1)
memory usage: 276.8 KB
None


## Features Engineering 

In [19]:
import warnings
warnings.filterwarnings("ignore")

## Features creation 

In [20]:
#creating features from the timestamp

data['Time'] = [((date.hour*60+(date.minute))*60)+date.second for date in data.DateTime]
data['dayofweek'] = data['DateTime'].dt.dayofweek
data['dayofmonth'] = data['DateTime'].dt.day
data['weekofyear'] = data['DateTime'].dt.weekofyear


In [21]:
#creating features from the timestamp in the test data 

test_data['Time'] = [((date.hour*60+(date.minute))*60)+date.second for date in test_data.DateTime]
test_data['dayofweek'] = test_data['DateTime'].dt.dayofweek
test_data['dayofmonth'] = test_data['DateTime'].dt.day
test_data['weekofyear'] = test_data['DateTime'].dt.weekofyear

In [22]:
# train data info with the newly created features
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48120 entries, 0 to 48119
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   DateTime    48120 non-null  datetime64[ns]
 1   Junction    48120 non-null  int64         
 2   Vehicles    48120 non-null  int64         
 3   Time        48120 non-null  int64         
 4   dayofweek   48120 non-null  int64         
 5   dayofmonth  48120 non-null  int64         
 6   weekofyear  48120 non-null  int64         
dtypes: datetime64[ns](1), int64(6)
memory usage: 2.9 MB


In [23]:
#checking if there are anhy potential categorical variables
data.nunique()

DateTime      14592
Junction          4
Vehicles        141
Time             24
dayofweek         7
dayofmonth       31
weekofyear       53
dtype: int64

In [24]:
data.head()

Unnamed: 0,DateTime,Junction,Vehicles,Time,dayofweek,dayofmonth,weekofyear
0,2015-11-01 00:00:00,1,15,0,6,1,44
1,2015-11-01 01:00:00,1,13,3600,6,1,44
2,2015-11-01 02:00:00,1,10,7200,6,1,44
3,2015-11-01 03:00:00,1,7,10800,6,1,44
4,2015-11-01 04:00:00,1,9,14400,6,1,44


## Features  selection and preproc 

In [25]:
# convert DateTime timestamp to seconds
data['DateTime']= [time.mktime(date.timetuple()) for date in data.DateTime]
test_data['DateTime']= [time.mktime(date.timetuple()) for date in test_data.DateTime]

In [26]:
def display_head_tail(data, head_rows, tail_rows):
    display("Data Head & Tail :")
    display(data.head(head_rows).append(data.tail(tail_rows)))
    
display_head_tail(data, head_rows=3, tail_rows=2)

'Data Head & Tail :'

Unnamed: 0,DateTime,Junction,Vehicles,Time,dayofweek,dayofmonth,weekofyear
0,1446332000.0,1,15,0,6,1,44
1,1446336000.0,1,13,3600,6,1,44
2,1446340000.0,1,10,7200,6,1,44
48118,1498853000.0,4,22,79200,4,30,26
48119,1498856000.0,4,12,82800,4,30,26


In [27]:
#convert Junction , dayofweek, dayofmonth into string for onehotencoding
''' train columns '''
data['Junction']= data.Junction.astype('str') 
data['dayofweek']= data.dayofweek.astype('str') 
data['dayofmonth']= data.dayofmonth.astype('str') 

''' test columns '''
test_data['Junction']= test_data.Junction.astype('str') 
test_data['dayofweek']= test_data.dayofweek.astype('str') 
test_data['dayofmonth']= test_data.dayofmonth.astype('str') 

In [28]:
print(data.columns)
print(data.info())

Index(['DateTime', 'Junction', 'Vehicles', 'Time', 'dayofweek', 'dayofmonth',
       'weekofyear'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
Int64Index: 48120 entries, 0 to 48119
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   DateTime    48120 non-null  float64
 1   Junction    48120 non-null  object 
 2   Vehicles    48120 non-null  int64  
 3   Time        48120 non-null  int64  
 4   dayofweek   48120 non-null  object 
 5   dayofmonth  48120 non-null  object 
 6   weekofyear  48120 non-null  int64  
dtypes: float64(1), int64(3), object(3)
memory usage: 2.9+ MB
None


# Model

In [29]:
# train test  split 
X= data.drop(['Vehicles'], axis=1)
y= data.Vehicles

X_train, X_test, y_train, y_test= train_test_split(X, y , test_size=0.2, random_state=42)
print(X_train.shape , X_test.shape, y_train.shape, y_test.shape)

(38496, 6) (9624, 6) (38496,) (9624,)


## Baseline model 

In [30]:
#baseline model 
from sklearn.dummy import DummyRegressor
dummy_regr = DummyRegressor(strategy="mean")
dummy_regr.fit(X, y)
dummy_regr.score(X, y)

0.0

In [31]:
dummy = DummyRegressor(strategy="mean")
result_dummy = cross_validate(
    dummy, X_train, y_train, cv=5, scoring=["r2", "neg_mean_squared_error"], n_jobs=2
)
results= pd.DataFrame(result_dummy)
results.head()


Unnamed: 0,fit_time,score_time,test_r2,test_neg_mean_squared_error
0,0.002666,0.000499,-6.680369e-07,-429.113603
1,0.002886,0.000495,-2.917894e-06,-436.540854
2,0.002815,0.000469,-9.692621e-05,-450.63299
3,0.002266,0.000437,-0.0003116473,-424.468212
4,0.001899,0.00039,-6.82603e-05,-440.649658


In [32]:
print(results.test_r2.mean())
print(results.test_neg_mean_squared_error.mean())

-9.60839520705914e-05
-436.28106359790144


In [33]:
#preproc pipeline
preproc= ColumnTransformer([
    ('ohe', OneHotEncoder(), make_column_selector(dtype_include=['object','bool'])),
    ])

#model pipeline
pipe= Pipeline([
    ('preproc', preproc),
    ('lr', LinearRegression())
])
pipe

In [34]:
#scoring on train data 
pipe.fit(X_train, y_train)
pipe.score(X_train, y_train)

0.5361716459690211

In [35]:
# XGboost
#model pipeline
pipe_xgb= Pipeline([
    ('preproc', preproc),
    ('xgb', XGBRegressor())
])
pipe_xgb

In [36]:
pipe_xgb.fit(X_train, y_train)
pipe_xgb.score(X_train, y_train)

0.6561893790162324

In [37]:
# Random Forest Regressor
#model pipeline
pipe_rfr= Pipeline([
    ('preproc', preproc),
    ('rfr', RandomForestRegressor())
])
pipe_rfr

In [38]:
pipe_rfr.fit(X_train, y_train)
pipe_rfr.score(X_train, y_train)

0.6575137826351484

In [39]:
lr= LinearRegression()
rf= RandomForestRegressor()
sgd= SGDRegressor()
ensemble = VotingRegressor(estimators=[('lr', lr),
                            ('rf', rf),
                            ('sgd',sgd)], 
                         weights=[1,1,1])

In [40]:
ensemble

In [41]:
# Voting regressor
#model pipeline

pipe_vote= Pipeline([
    ('preproc', preproc),
    ('ensemble', ensemble)
])
pipe_vote

In [42]:
pipe_vote.fit(X_train, y_train)

In [43]:
pipe_vote.score(X_test, y_test)

0.5919463605899684

In [47]:
import lightgbm as ltb

In [48]:
# LGTBM regressor
#model pipeline
pipe_ltb = Pipeline([
    ('preproc', preproc),
    ('lgtbm', ltb.LGBMRegressor())
])
pipe_ltb

In [50]:
pipe_ltb.fit(X_train, y_train)
pipe_ltb.score(X_train, y_train)

0.646835924812113

# Tuning 

In [55]:
 pipe_rfr.get_params()

{'memory': None,
 'steps': [('preproc',
   ColumnTransformer(transformers=[('ohe', OneHotEncoder(),
                                    <sklearn.compose._column_transformer.make_column_selector object at 0x17c4a5100>)])),
  ('rfr', RandomForestRegressor())],
 'verbose': False,
 'preproc': ColumnTransformer(transformers=[('ohe', OneHotEncoder(),
                                  <sklearn.compose._column_transformer.make_column_selector object at 0x17c4a5100>)]),
 'rfr': RandomForestRegressor(),
 'preproc__n_jobs': None,
 'preproc__remainder': 'drop',
 'preproc__sparse_threshold': 0.3,
 'preproc__transformer_weights': None,
 'preproc__transformers': [('ohe',
   OneHotEncoder(),
   <sklearn.compose._column_transformer.make_column_selector at 0x17c4a5100>)],
 'preproc__verbose': False,
 'preproc__verbose_feature_names_out': True,
 'preproc__ohe': OneHotEncoder(),
 'preproc__ohe__categories': 'auto',
 'preproc__ohe__drop': None,
 'preproc__ohe__dtype': numpy.float64,
 'preproc__ohe__handle_

In [66]:
# grid search 
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 4)]
# Number of features to consider at every split
#max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 5)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
#bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf
               }
random_grid

{'n_estimators': [200, 800, 1400, 2000],
 'max_depth': [10, 35, 60, 85, 110, None],
 'min_samples_split': [2, 5, 10],
 'min_samples_leaf': [1, 2, 4]}

In [65]:
search = GridSearchCV(estimator = rf, 
                            param_grid = random_grid, 
                            scoring="r2",
                            cv = 3, 
                            verbose=2, 
                            n_jobs = -1)
search.fit(X_train, y_train)

Fitting 3 folds for each of 1080 candidates, totalling 3240 fits


KeyboardInterrupt: 

# Forecasting 