In [None]:
# Mount Google Drive into the Colab filesystem at /content/gdrive so that the
# files stored on Drive can be reached from this notebook.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Switch the working directory to the project folder on the mounted Drive, so
# that the relative path 'train.csv' read below resolves against it.
%cd /content/gdrive/MyDrive/Colab Notebooks/Hands-On Predictive Analysis with Python Workshop/Week 2 - Project/Restaurant Revenue Prediction

/content/gdrive/MyDrive/Colab Notebooks/Hands-On Predictive Analysis with Python Workshop/Week 2 - Project/Restaurant Revenue Prediction


# **Import the necessary Libraries**

In [None]:
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Import the data

In [None]:
# Read the training set from the current working directory.
restaurants = pd.read_csv('train.csv')

# The target is the last column; every column before it is taken as a feature.
# NOTE(review): the first feature column is `Id`, which looks like a plain row
# identifier -- confirm whether it should really be fed to the model.
X = restaurants.iloc[:, :-1].to_numpy()
y = restaurants.iloc[:, -1].to_numpy()

# Preview the first rows of the raw frame.
restaurants.head()

Unnamed: 0,Id,Open Date,City,City Group,Type,P1,P2,P3,P4,P5,P6,P7,P8,P9,P10,P11,P12,P13,P14,P15,P16,P17,P18,P19,P20,P21,P22,P23,P24,P25,P26,P27,P28,P29,P30,P31,P32,P33,P34,P35,P36,P37,revenue
0,0,07/17/1999,İstanbul,Big Cities,IL,4,5.0,4.0,4.0,2,2,5,4,5,5,3,5,5.0,1,2,2,2,4,5,4,1,3,3,1,1,1.0,4.0,2.0,3.0,5,3,4,5,5,4,3,4,5653753.0
1,1,02/14/2008,Ankara,Big Cities,FC,4,5.0,4.0,4.0,1,2,5,5,5,5,1,5,5.0,0,0,0,0,0,3,2,1,3,2,0,0,0.0,0.0,3.0,3.0,0,0,0,0,0,0,0,0,6923131.0
2,2,03/09/2013,Diyarbakır,Other,IL,2,4.0,2.0,5.0,2,3,5,5,5,5,2,5,5.0,0,0,0,0,0,1,1,1,1,1,0,0,0.0,0.0,1.0,3.0,0,0,0,0,0,0,0,0,2055379.0
3,3,02/02/2012,Tokat,Other,IL,6,4.5,6.0,6.0,4,4,10,8,10,10,8,10,7.5,6,4,9,3,12,20,12,6,1,10,2,2,2.5,2.5,2.5,7.5,25,12,10,6,18,12,12,6,2675511.0
4,4,05/09/2009,Gaziantep,Other,IL,3,4.0,3.0,4.0,2,2,5,5,5,5,2,5,5.0,2,1,2,1,4,2,2,1,2,1,2,3,3.0,5.0,1.0,3.0,5,1,3,2,3,4,3,3,4316715.0


# Preprocessing the data
* Convert *Open Date* to the number of days that have elapsed since the restaurant opened

In [None]:
# Replace the 'Open Date' column of X (column 1) with the number of whole days
# between the opening date and today.
#
# The dates are read from the untouched `restaurants` frame rather than from
# X[:, 1] itself, so this cell is idempotent: re-running it after X[:, 1] has
# already been turned into integers would otherwise make pandas interpret those
# integers as nanosecond epoch timestamps and silently corrupt the column.
today = pd.to_datetime('today').normalize()  # midnight, so differences are whole days

# The dates are written month/day/year (e.g. 07/17/1999); spell the format out
# instead of relying on pandas' inference.
open_dates = pd.to_datetime(restaurants['Open Date'], format='%m/%d/%Y')

# abs() keeps the original behaviour for any opening date lying in the future.
X[:, 1] = (today - open_dates).dt.days.abs().to_numpy()

* Use One-Hot Encoding to convert the categorical variables to numeric types

In [None]:
# One-hot encode the categorical columns of X (positions 2, 3 and 4) and pass
# every other column through unchanged.
#
# drop='first' removes one dummy column per categorical feature. Without it the
# dummies of each feature sum to 1 and are perfectly collinear with the
# intercept of the unregularised linear regression trained later (the
# "dummy-variable trap"), which makes the fitted coefficients unstable.
#
# sparse_threshold=0 forces a dense result. ColumnTransformer may otherwise
# return a scipy sparse matrix, and wrapping that in np.array() would give a
# useless 0-d object array instead of a 2-D feature matrix.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(drop='first'), [2, 3, 4])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = np.asarray(ct.fit_transform(X))

* Normalize the data.

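* Generally we don't normalize the target values. Here I normalized them because the difference in the mean squared error was drastic! Keep in mind that the MSE is then expressed in normalized units, so it is not directly comparable with an MSE computed on the raw revenue values: the smaller number reflects the change of scale, not a better fit.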

In [None]:
# Rescale every feature column to the [0, 1] range.
# NOTE(review): the scaler is fitted on the full data set, i.e. before the
# train/test split below, so test-set statistics leak into the training
# features. Fitting it on the training rows only would avoid that.
mms = MinMaxScaler()
X = mms.fit_transform(X)

# Min-max normalise the target as well. The bounds are computed once (the
# original recomputed min(y) and max(y) for every element) and kept around so
# that predictions can be mapped back to revenue units with
#   revenue = y_norm * (y_max - y_min) + y_min
y = np.asarray(y, dtype=float)
y_min, y_max = y.min(), y.max()
y = (y - y_min) / (y_max - y_min)

# Split the data

In [None]:
# Hold out 20% of the rows for testing. Fixing random_state = 0 ensures that
# the same split is produced each time the program is run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Train the **Multiple Linear Regression** model on the training set

In [None]:
# Fit an ordinary least-squares model on the training split; fit() returns the
# estimator itself, so construction and fitting can be chained.
regressor = LinearRegression().fit(X_train, y_train)

# Display the fitted estimator as the cell output.
regressor

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

# Use the regressor to predict the output on the test set

In [None]:
# Predict the target for every row of the held-out test features.
y_pred = regressor.predict(X_test)

# Compute the mean squared error

In [None]:
# Per-sample squared error between the true and the predicted test targets.
residuals = np.asarray(y_test) - y_pred
se = np.square(residuals)
se

array([1.13010250e-03, 5.35989024e+23, 1.79547759e-03, 1.89445315e-02,
       1.25463846e-02, 6.76700460e-04, 1.50346525e-02, 2.64187634e-03,
       1.78103987e-01, 7.13002010e-02, 1.29767983e-03, 3.81782232e-02,
       8.14414253e-03, 1.16892130e-02, 1.40470636e-03, 9.71890149e-02,
       1.91797853e-02, 9.01880527e-03, 1.73267607e-01, 3.31278878e-02,
       2.99217542e-02, 6.47248619e-02, 5.37646713e-03, 1.43711127e-02,
       2.13117657e-02, 4.34987227e-02, 4.89251619e-03, 5.11176124e-01])

In [None]:
# Mean squared error over all test samples.
mse = np.mean(se)
mse

1.9142465154380156e+22

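Looking carefully, exactly one of the squared errors is abnormally large ($\sim 10^{24}$); all the others are below 1. A blow-up of this size usually points to a numerically unstable fit rather than to a genuinely hard sample. A likely cause (not verified here) is that the one-hot encoded columns are collinear with the intercept, or that this test row belongs to a city that never appears in the training split, so its coefficient is not determined by the training data.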

Just to get a feel of how huge the difference is in the mean squared error without that single value...

In [None]:
# Keep only the squared errors below 1, which discards the single exploding
# value, and recompute the mean over what is left.
below_one = se < 1
se = se[below_one]
mse = np.mean(se)
mse

0.05147941863372108

In [None]:
# Relative error of every test prediction: (actual - predicted) / actual.
residuals = y_test - y_pred
residuals / np.asarray(y_test)

array([-5.57911481e-01, -3.57083274e+12, -3.56980117e-01,  9.38022019e-01,
        1.31254863e+00,  7.60373026e-02,  8.10402717e-01, -9.47403822e-01,
        3.00701391e+00,  6.38679361e-01, -2.72638490e-01, -3.82616296e+00,
        4.77949845e-01, -4.67924678e-01, -1.68055071e-01,  1.54886123e+00,
        8.54616029e-01,  5.61521546e-01,  2.32689176e+00,  1.08871647e+00,
        1.59210527e+00,  1.12678188e+00, -2.41445071e-01, -1.82048351e+00,
       -1.03984604e+00,  6.39176489e-01,  3.88212179e-01,  7.14965820e-01])