# 1. Hourly Traffic Volume Prediction

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import datetime 
import heapq
import time
from sklearn.metrics import mean_squared_log_error

## <font color='darkblue'>Data Preparation</font> 

In [2]:
# Load the raw hourly traffic records directly from the hosted CSV file.
DATA_URL = 'https://aisgaiap.blob.core.windows.net/aiap5-assessment-data/traffic_data.csv'
df = pd.read_csv(DATA_URL)


In [3]:
# Keep only the first row seen for each date_time value; later repeats are discarded.
df = df.drop_duplicates(subset='date_time', keep='first')

In [4]:
# Remove snow_1h: per the original analysis note, every value in this column is zero,
# so it carries no information for the model.
df = df.drop(columns=['snow_1h'])

In [5]:
# Collapse rain_1h into a binary flag: 0 where the reading equals 0, 1 for anything else.
df['rain_1h'] = df['rain_1h'].ne(0).astype('int64')

In [6]:
# Convert clouds_all from continuous to discrete variable.
def cloud_all_binning(value):
    """Map a clouds_all reading to an ordinal bin index between 0 and 5.

    Bins are closed on the right: value <= 20 -> 0, <= 40 -> 1, <= 60 -> 2,
    <= 70 -> 3, <= 80 -> 4, and everything else -> 5.
    """
    upper_bounds = (20, 40, 60, 70, 80)
    for bin_index, upper in enumerate(upper_bounds):
        if value <= upper:
            return bin_index
    # Nothing matched: the value exceeds every bound (or never compares as <=, e.g. NaN).
    return len(upper_bounds)

In [7]:
# Replace each clouds_all reading with its ordinal bin index from cloud_all_binning.
df['clouds_all'] = df['clouds_all'].apply(cloud_all_binning)

In [8]:
# Remove temp: per the original analysis note, it shows little or no correlation
# with the target variable.
df = df.drop(columns=['temp'])

## <font color='darkblue'>Feature Engineering</font> 

In [9]:
# Take the hour token from each timestamp string: the text after the first space,
# up to the first ':'.
hours = df['date_time'].apply(lambda stamp: stamp.split(' ')[1].partition(':')[0])
# One-hot encode the hour and drop the first indicator column to avoid a redundant,
# perfectly collinear dummy.
hours = pd.get_dummies(hours, drop_first=True)

In [10]:
def getDayOfWeek(date):
    """Return the weekday label for a date string formatted as 'YYYY MM DD'.

    Labels are 'Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Saturday' and 'Sunday'.
    Raises ValueError (from strptime) if the string does not match the format.
    """
    # Position in the tuple matches datetime.weekday(): Monday is 0, Sunday is 6.
    day_labels = ('Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Saturday', 'Sunday')
    parsed = datetime.datetime.strptime(date, '%Y %m %d')
    return day_labels[parsed.weekday()]

# Take the date part of each timestamp (text before the first space) and swap the
# '-' separators for spaces so it matches the format getDayOfWeek parses.
date = df['date_time'].apply(lambda stamp: ' '.join(stamp.split(' ')[0].split('-')))
# Translate every date into its weekday label.
day_of_week = date.apply(getDayOfWeek)
# One-hot encode the weekday and drop the first indicator column to avoid a redundant,
# perfectly collinear dummy.
day_of_week = pd.get_dummies(day_of_week, drop_first=True)


## <font color='darkblue'>Model</font> 

In [11]:
# Base predictors are selected by position (columns 1 and 2 of the trimmed frame).
# NOTE(review): presumably rain_1h and clouds_all — confirm against the CSV column order.
# The one-hot hour and weekday indicators are appended column-wise (aligned on the index).
X = pd.concat([df.iloc[:, [1, 2]], hours, day_of_week], axis=1)
y = df['traffic_volume'].values

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [12]:
# Disabled experiment, kept as a bare string literal: running this cell executes none of
# the code below and only echoes the text back as the cell's value.
# The code inside sweeps n_estimators over np.arange(1, 280, 10), scores each forest as
# 100 minus the mean absolute percentage error, and prints the ten best scores together
# with their tree counts.
# NOTE(review): the sweep scores each candidate on X_test, so the chosen n_estimators is
# tuned against the same split that is later reported as the test result — consider a
# separate validation split or cross-validation.
"""
#Model engineering
#Tuning of n_estimators parameter

acc_scores = []
num_esti = []
num_trees = np.arange(1, 280, 10) 

for k in num_trees:
    rfc = RandomForestRegressor(n_estimators=k, random_state=42)
    predictions = rfc.fit(X_train, y_train).predict(X_test)
    errors = abs(y_test - predictions)
    mean_perc_error = 100 * (errors / y_test)
    accuracy = 100 - np.mean(mean_perc_error)
    acc_scores.append(accuracy)
    num_esti.append(k)
    
indexes = heapq.nlargest(10, range(len(acc_scores)), acc_scores.__getitem__)
n_max_score = [acc_scores[i] for i in indexes]
n_max_estimators = [num_esti[i] for i in indexes]
print(n_max_score)
print(n_max_estimators)
"""

'\n#Model engineering\n#Tuning of n_estimators parameter\n\nacc_scores = []\nnum_esti = []\nnum_trees = np.arange(1, 280, 10) \n\nfor k in num_trees:\n    rfc = RandomForestRegressor(n_estimators=k, random_state=42)\n    predictions = rfc.fit(X_train, y_train).predict(X_test)\n    errors = abs(y_test - predictions)\n    mean_perc_error = 100 * (errors / y_test)\n    accuracy = 100 - np.mean(mean_perc_error)\n    acc_scores.append(accuracy)\n    num_esti.append(k)\n    \nindexes = heapq.nlargest(10, range(len(acc_scores)), acc_scores.__getitem__)\nn_max_score = [acc_scores[i] for i in indexes]\nn_max_estimators = [num_esti[i] for i in indexes]\nprint(n_max_score)\nprint(n_max_estimators)\n'

In [13]:
# Train the final random forest.
# n_estimators = 151 is used because it ranked among the top few accuracy scores in the
# n_estimators sweep from the model-engineering step.
randForest = RandomForestRegressor(n_estimators = 151, random_state = 42)

# Time only the fit call. perf_counter() is a monotonic, high-resolution clock intended
# for measuring intervals; time.time() follows the wall clock and can jump if the system
# time is adjusted mid-run.
time_start = time.perf_counter()
randForest.fit(X_train, y_train)
time_end = time.perf_counter() - time_start  # elapsed seconds for the fit, despite the name
print('Execution Time:', round(time_end, 3), 's')

# Predictions on both splits, used by the evaluation step that follows.
preds = randForest.predict(X_train)
ypreds = randForest.predict(X_test)


Execution Time: 3.22 s


In [14]:
# Report two metrics per split: root mean squared log error, and an "accuracy" figure
# defined here as 100 minus the mean absolute percentage error.
print('Training set')
train_rmsle = np.sqrt(mean_squared_log_error(y_train, preds))
print('Root Mean Squared Log Error =', round(train_rmsle, 3))
mean_perc_error = np.abs(y_train - preds) / y_train * 100
accuracy = 100 - mean_perc_error.mean()
print('Accuracy Score:', round(accuracy, 3), '%')

print('Test set')
errors = np.abs(y_test - ypreds)
test_rmsle = np.sqrt(mean_squared_log_error(y_test, ypreds))
print('Root Mean Squared Log Error =', round(test_rmsle, 3))
mean_perc_error1 = errors / y_test * 100
# accuracy is deliberately rebound: after this cell it holds the test-set figure.
accuracy = 100 - mean_perc_error1.mean()
print('Accuracy Score:', round(accuracy, 3), '%')

Training set
Root Mean Squared Log Error = 0.172
Accuracy Score: 89.164 %
Test set
Root Mean Squared Log Error = 0.177
Accuracy Score: 88.223 %


In [15]:
# Get numerical feature importances (this cell is disabled; uncomment the code lines below to run it)
#feature_list = list(X.columns)
#importances = list(randForest.feature_importances_)
# List of tuples with variable and importance
#feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
# Sort the feature importances by most important first
#feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances 
#[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];