# Linear Regression Model to Predict Hourly Threshold

### Importing required modules

In [11]:
import pandas as pd
import numpy as np
from numpy import mean
from numpy import absolute
from numpy import sqrt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import sys

Linear regression model that predicts whether or not there is congestion. This model uses the dataset created by Fahad.

In [12]:
# Direct-download URL (Google Drive) of the hourly samples file.
hourly_samples_path = 'https://drive.google.com/uc?export=download&id=1bkbxmNhTE0i096XbdgmSb29Q--vkxwGq'
# Parse the remote file as CSV into the DataFrame used by the rest of the notebook.
df = pd.read_csv(hourly_samples_path)

In [None]:
# Preview the first rows to sanity-check the loaded columns.
df.head()

In [13]:
def pre_processing(dataframe):
    """Split the hourly samples into model inputs and a prediction target.

    A binary ``congested`` flag is derived (1 where ``traffic_hourly_count``
    is greater than or equal to ``threshold``, otherwise 0) and kept among the
    returned features. The flag is added to a copy, so the caller's DataFrame
    is not modified.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``traffic_hourly_count``, ``threshold``,
        ``full_name``, ``latitude`` and ``longitude``, and must have a column
        at positional index 10 once ``congested`` has been appended.

    Returns
    -------
    features : pandas.DataFrame
        All columns, including ``congested``, except ``full_name``,
        ``threshold``, ``latitude`` and ``longitude``.
    target : pandas.Series
        The column found at positional index 10.
    """
    # assign() returns a new frame, so the input is never altered and the
    # function gives the same result no matter how often the cell is re-run.
    frame = dataframe.assign(
        congested=np.where(
            dataframe['traffic_hourly_count'] >= dataframe['threshold'], 1, 0
        )
    )

    # NOTE(review): the target is selected by position, which silently breaks
    # if the CSV column order changes. Presumably index 10 is `threshold`;
    # confirm against the file and select it by name. If it is `threshold`,
    # `congested` (derived from it) leaks target information into the features.
    target = frame.iloc[:, 10]

    columns_drop = ['full_name', 'threshold', 'latitude', 'longitude']
    features = frame.drop(columns=columns_drop)
    return features, target

In [14]:
def evaluate(model, x_train, y_train, x_test, y_test):
    """Print the model's score on the training split and on the test split.

    Parameters
    ----------
    model : object or None
        Fitted estimator exposing ``score(X, y)``. When ``None``, a notice is
        printed and nothing is scored.
    x_train, y_train
        Training inputs and targets, passed to ``model.score``.
    x_test, y_test
        Held-out inputs and targets, passed to ``model.score``.

    Returns
    -------
    None
        Results are only printed.
    """
    # Identity check: `== None` would be routed through the model's own
    # __eq__, which may be overridden and give a misleading answer.
    if model is None:
        print("Model does not exist")
        return
    # NOTE(review): the label says "accuracy", but the number is whatever
    # model.score returns; for scikit-learn regressors such as
    # LinearRegression that is R^2, not classification accuracy.
    print(f"Train accuracy {model.score(x_train,y_train)} ")
    print(f"Test accuracy {model.score(x_test,y_test)} ")
    return

In [31]:
# Derive the feature matrix and the target column from the raw samples.
x, y = pre_processing(df)

# Keep 30% of the rows aside for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=10
)

# Fit a multiple linear regression on the training portion (fit() returns the estimator).
model = LinearRegression().fit(x_train, y_train)

# Report the score on both splits.
evaluate(model, x_train, y_train, x_test, y_test)

Train accuracy 0.6137394170973683 
Test accuracy 0.5314140549095809 


## Evaluating Model performance using MAE (Mean Absolute Error)
Lower is better.

In [38]:
# Shuffled 10-fold split; the seed pins which rows land in which fold.
cv = KFold(n_splits=10, shuffle=True, random_state=1)

# Score the model on every fold. The scorer yields the negated MAE,
# hence the absolute value taken below.
scores = cross_val_score(
    estimator=model,
    X=x,
    y=y,
    scoring='neg_mean_absolute_error',
    cv=cv,
    n_jobs=-1,
)

# Average the per-fold magnitudes to get the mean absolute error.
print('Mean Absolute Error is ')
absolute(scores).mean()

Mean Absolute Error is 


174.5583066921808

## Evaluating Model performance using RMSE (Root Mean Squared Error)
Lower is better.

In [39]:
# Shuffled 5-fold split; the seed pins which rows land in which fold.
cv = KFold(n_splits=5, shuffle=True, random_state=1)

# Score the model on every fold with k-fold cross-validation. The scorer
# yields the negated MSE, hence the absolute value taken below.
scores = cross_val_score(
    estimator=model,
    X=x,
    y=y,
    scoring='neg_mean_squared_error',
    cv=cv,
    n_jobs=-1,
)

# Square root of the fold-averaged squared error.
print('Root Mean Squared Error is ')
sqrt(absolute(scores).mean())

Root Mean Squared Error is 


225.2045912288194