In [1]:
# Import libraries 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.style as style
import seaborn as sns
import scipy.stats as stats
import plotly.express as px
%matplotlib inline

# machine learning 
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error

In [2]:
# import each csv file
test_raw = pd.read_csv('Test.csv')
train_raw = pd.read_csv('Train.csv')
riders_raw = pd.read_csv('Riders.csv')

# Join riders to test & train data and initialise working dataframes
train = pd.merge(train_raw, riders_raw, on = "Rider Id", how="left") 
test = pd.merge(test_raw, riders_raw, how="left", on = "Rider Id")

In [3]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7068 entries, 0 to 7067
Data columns (total 29 columns):
Order No                                7068 non-null object
User Id                                 7068 non-null object
Vehicle Type                            7068 non-null object
Platform Type                           7068 non-null int64
Personal or Business                    7068 non-null object
Placement - Day of Month                7068 non-null int64
Placement - Weekday (Mo = 1)            7068 non-null int64
Placement - Time                        7068 non-null object
Confirmation - Day of Month             7068 non-null int64
Confirmation - Weekday (Mo = 1)         7068 non-null int64
Confirmation - Time                     7068 non-null object
Arrival at Pickup - Day of Month        7068 non-null int64
Arrival at Pickup - Weekday (Mo = 1)    7068 non-null int64
Arrival at Pickup - Time                7068 non-null object
Pickup - Day of Month                   7068 n

In [4]:
test.columns

Index(['Order No', 'User Id', 'Vehicle Type', 'Platform Type',
       'Personal or Business', 'Placement - Day of Month',
       'Placement - Weekday (Mo = 1)', 'Placement - Time',
       'Confirmation - Day of Month', 'Confirmation - Weekday (Mo = 1)',
       'Confirmation - Time', 'Arrival at Pickup - Day of Month',
       'Arrival at Pickup - Weekday (Mo = 1)', 'Arrival at Pickup - Time',
       'Pickup - Day of Month', 'Pickup - Weekday (Mo = 1)', 'Pickup - Time',
       'Distance (KM)', 'Temperature', 'Precipitation in millimeters',
       'Pickup Lat', 'Pickup Long', 'Destination Lat', 'Destination Long',
       'Rider Id', 'No_Of_Orders', 'Age', 'Average_Rating', 'No_of_Ratings'],
      dtype='object')

In [5]:
# drop 14 unnecessary variables
to_drop = ['Order No', 
           'User Id', 
           'Vehicle Type', 
           'Platform Type',
           'Placement - Day of Month',
           'Placement - Weekday (Mo = 1)', 
           'Placement - Time',
           'Confirmation - Day of Month', 
           'Confirmation - Weekday (Mo = 1)',
           'Confirmation - Time', 
           'Arrival at Pickup - Day of Month',
           'Arrival at Pickup - Weekday (Mo = 1)', 
           'Arrival at Pickup - Time',
           'Rider Id']

test.drop(to_drop, axis = 1, inplace = True)
train.drop(to_drop, axis = 1, inplace = True)

In [6]:
# columns that should be dropped from the train data only
train_drop = ['Arrival at Destination - Time', 
              'Arrival at Destination - Weekday (Mo = 1)', 
              'Arrival at Destination - Day of Month']

train.drop(train_drop, axis = 1, inplace = True)

In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21201 entries, 0 to 21200
Data columns (total 16 columns):
Personal or Business            21201 non-null object
Pickup - Day of Month           21201 non-null int64
Pickup - Weekday (Mo = 1)       21201 non-null int64
Pickup - Time                   21201 non-null object
Distance (KM)                   21201 non-null int64
Temperature                     16835 non-null float64
Precipitation in millimeters    552 non-null float64
Pickup Lat                      21201 non-null float64
Pickup Long                     21201 non-null float64
Destination Lat                 21201 non-null float64
Destination Long                21201 non-null float64
Time from Pickup to Arrival     21201 non-null int64
No_Of_Orders                    21201 non-null int64
Age                             21201 non-null int64
Average_Rating                  21201 non-null float64
No_of_Ratings                   21201 non-null int64
dtypes: float64(7), int64(7),

In [8]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7068 entries, 0 to 7067
Data columns (total 15 columns):
Personal or Business            7068 non-null object
Pickup - Day of Month           7068 non-null int64
Pickup - Weekday (Mo = 1)       7068 non-null int64
Pickup - Time                   7068 non-null object
Distance (KM)                   7068 non-null int64
Temperature                     5631 non-null float64
Precipitation in millimeters    199 non-null float64
Pickup Lat                      7068 non-null float64
Pickup Long                     7068 non-null float64
Destination Lat                 7068 non-null float64
Destination Long                7068 non-null float64
No_Of_Orders                    7068 non-null int64
Age                             7068 non-null int64
Average_Rating                  7068 non-null float64
No_of_Ratings                   7068 non-null int64
dtypes: float64(7), int64(6), object(2)
memory usage: 883.5+ KB


In [9]:
train.loc[train['Personal or Business'] == 'Personal', 'Personal or Business'] = 1
train.loc[train['Personal or Business'] == 'Business', 'Personal or Business'] = 0

In [10]:
test.loc[test['Personal or Business'] == 'Personal', 'Personal or Business'] = 1
test.loc[test['Personal or Business'] == 'Business', 'Personal or Business'] = 0

In [11]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7068 entries, 0 to 7067
Data columns (total 15 columns):
Personal or Business            7068 non-null int64
Pickup - Day of Month           7068 non-null int64
Pickup - Weekday (Mo = 1)       7068 non-null int64
Pickup - Time                   7068 non-null object
Distance (KM)                   7068 non-null int64
Temperature                     5631 non-null float64
Precipitation in millimeters    199 non-null float64
Pickup Lat                      7068 non-null float64
Pickup Long                     7068 non-null float64
Destination Lat                 7068 non-null float64
Destination Long                7068 non-null float64
No_Of_Orders                    7068 non-null int64
Age                             7068 non-null int64
Average_Rating                  7068 non-null float64
No_of_Ratings                   7068 non-null int64
dtypes: float64(7), int64(7), object(1)
memory usage: 883.5+ KB


In [12]:
# converting object data types for Pickup Times to date_time 
train['Pickup - Time'] = pd.to_datetime(train['Pickup - Time'])
test['Pickup - Time'] = pd.to_datetime(test['Pickup - Time'])

train['Pickup - Time'] = train['Pickup - Time'].apply(lambda time: time.hour)
test['Pickup - Time'] = test['Pickup - Time'].apply(lambda time: time.hour)

In [13]:
# fill precipitation null values with 0
train['Precipitation in millimeters'].fillna(0,inplace=True)
test['Precipitation in millimeters'].fillna(0,inplace=True)

In [14]:
# impute temperature values by mean
train.Temperature.fillna(23.25,inplace=True)
test.Temperature.fillna(23.25,inplace=True)

In [15]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7068 entries, 0 to 7067
Data columns (total 15 columns):
Personal or Business            7068 non-null int64
Pickup - Day of Month           7068 non-null int64
Pickup - Weekday (Mo = 1)       7068 non-null int64
Pickup - Time                   7068 non-null int64
Distance (KM)                   7068 non-null int64
Temperature                     7068 non-null float64
Precipitation in millimeters    7068 non-null float64
Pickup Lat                      7068 non-null float64
Pickup Long                     7068 non-null float64
Destination Lat                 7068 non-null float64
Destination Long                7068 non-null float64
No_Of_Orders                    7068 non-null int64
Age                             7068 non-null int64
Average_Rating                  7068 non-null float64
No_of_Ratings                   7068 non-null int64
dtypes: float64(7), int64(8)
memory usage: 883.5 KB


In [16]:
# Seperating the features and target variables
X = train.copy()
X.drop('Time from Pickup to Arrival', axis = 1, inplace = True)
y = train['Time from Pickup to Arrival'].ravel()

# Splitting the data in a test and train set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 101 )

In [18]:
# Initialise variables to be scaled 
X_Train_s = X_train.copy()
X_Test_s = X_test.copy()
Test_s = test.copy()

print(Test_s.columns)

Index(['Personal or Business', 'Pickup - Day of Month',
       'Pickup - Weekday (Mo = 1)', 'Pickup - Time', 'Distance (KM)',
       'Temperature', 'Precipitation in millimeters', 'Pickup Lat',
       'Pickup Long', 'Destination Lat', 'Destination Long', 'No_Of_Orders',
       'Age', 'Average_Rating', 'No_of_Ratings'],
      dtype='object')


In [19]:
# extract the column names of all the variables that should be scaled
col_names = ['Pickup - Day of Month',
       'Pickup - Weekday (Mo = 1)', 'Pickup - Time', 'Distance (KM)',
       'Temperature', 'Precipitation in millimeters', 'Pickup Lat',
       'Pickup Long', 'Destination Lat', 'Destination Long', 'No_Of_Orders',
       'Age', 'Average_Rating', 'No_of_Ratings']

# X_Train data
train_features = X_Train_s[col_names]
train_scaler = StandardScaler().fit(train_features.values)
train_features = train_scaler.transform(train_features.values)
X_Train_s[col_names] = train_features

# X_Test data
X_test_features = X_Test_s[col_names]
X_test_features = train_scaler.fit_transform(X_test_features.values)
X_Test_s[col_names] = X_test_features

#Test data
test_features = Test_s[col_names]
test_features = train_scaler.fit_transform(test_features.values)
Test_s[col_names] = test_features

In [25]:
Test_s.head()

Unnamed: 0,Personal or Business,Pickup - Day of Month,Pickup - Weekday (Mo = 1),Pickup - Time,Distance (KM),Temperature,Precipitation in millimeters,Pickup Lat,Pickup Long,Destination Lat,Destination Long,No_Of_Orders,Age,Average_Rating,No_of_Ratings
0,0,1.296507,-0.158057,1.632092,-0.261685,0.000965,-0.071986,-1.705347,1.571875,-0.671685,0.254311,-0.784681,0.76263,-0.689113,-0.419027
1,0,0.15704,1.107114,0.063793,-0.799773,0.000965,-0.071986,0.313017,-0.432695,0.153311,0.289179,-0.754597,-1.079228,0.610704,-0.726317
2,0,1.296507,0.474528,-0.720357,-0.799773,-0.139437,-0.071986,-0.29463,0.311474,0.16596,0.920333,-0.542775,-0.801015,-0.925443,-0.672663
3,0,0.15704,-1.423227,0.455867,-0.799773,0.390969,-0.071986,-0.281625,-0.039556,-0.617144,-0.475089,-0.755211,-0.652238,-0.216452,-0.728756
4,0,-0.526641,-0.790642,-0.720357,-0.62041,0.359768,-0.071986,0.032021,0.086282,0.461188,-0.440536,1.868924,0.180913,0.256208,1.627138


# Building a random forest regression model 
forest = RandomForestRegressor(n_estimators=500, random_state=101, max_depth=10, max_leaf_nodes=200)
forest.fit(X_train, y_train)

# Random forest regression 
y_pred_test_forest = forest.predict(X_test)
y_pred_train_forest = forest.predict(X_train)
rmse_test_forest = round(np.sqrt(mean_squared_error(y_test, y_pred_test_forest)),2)
rmse_train_forest = round(np.sqrt(mean_squared_error(y_train, y_pred_train_forest)),2)

In [20]:
# Building Lasso regression model
lasso = Lasso(alpha = 0.1)
lasso.fit(X_Train_s, y_train)

Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [21]:
# Lasso regression
y_pred_lasso_train= lasso.predict(X_Train_s)
y_pred_lasso_test= lasso.predict(X_Test_s)
rmse_test_lasso = round(np.sqrt(mean_squared_error(y_test, y_pred_lasso_test)),2)
rmse_train_lasso = round(np.sqrt(mean_squared_error(y_train, y_pred_lasso_train)),2)

In [22]:
rmse_train_lasso

799.77

In [23]:
rmse_test_lasso

782.71

In [24]:
import pickle

model_save_path = "lasso.pkl"
with open(model_save_path,'wb') as file:
    pickle.dump(lasso,file)

In [68]:
pip install -U flask 

Collecting flask
  Downloading https://files.pythonhosted.org/packages/f2/28/2a03252dfb9ebf377f40fba6a7841b47083260bf8bd8e737b0c6952df83f/Flask-1.1.2-py2.py3-none-any.whl (94kB)
Installing collected packages: flask
  Found existing installation: Flask 1.1.1
    Uninstalling Flask-1.1.1:
      Successfully uninstalled Flask-1.1.1
Successfully installed flask-1.1.2
Note: you may need to restart the kernel to use updated packages.
