# Mount the Google Drive and Import Packages

In [None]:
from google.colab import drive
import sys

# Make the user's Google Drive visible under /content/drive
drive.mount('/content/drive')

# Folder of this notebook; the data is read from it and the results are written below it
abspath_curr = '/content/drive/My Drive/Colab Notebooks/'

# The shallow utilities notebook and the shallow models notebook live in the same folder
abspath_util_shallow = abspath_model_shallow = '/content/drive/My Drive/Colab Notebooks/'

Mounted at /content/drive


In [None]:
import warnings

# Ignore warnings
# (no category is given, so every warning is silenced for the rest of the session,
# including pandas chained-assignment warnings and library deprecation notices)
warnings.filterwarnings('ignore')

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Use one font size (20) for every text element so figures stay readable
plt.rcParams.update({'font.size': 20,
                     'axes.titlesize': 20,
                     'axes.labelsize': 20,
                     'xtick.labelsize': 20,
                     'ytick.labelsize': 20,
                     'legend.fontsize': 20,
                     'figure.titlesize': 20})

In [None]:
# The magic below allows us to use tensorflow version 2.x
%tensorflow_version 2.x
import tensorflow as tf
from tensorflow import keras

Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.


In [None]:
import numpy as np

# One seed shared by every source of randomness in this notebook
random_seed = 42

# Seed numpy's global random generator
np.random.seed(random_seed)

# Seed tensorflow's global random generator
tf.random.set_seed(random_seed)

In [None]:

# Switch to the folder that holds the helper notebooks so that %run can find them
%cd $abspath_util_shallow

# Import the shallow utilities
# (presumably the source of common_var_checker, id_checker, nan_checker, etc. used below — verify)
%run pmlm_utilities_shallow.ipynb

# Import the shallow models
%run pmlm_models_shallow.ipynb

/content/drive/My Drive/Colab Notebooks


# Loading the data

In [None]:
import pandas as pd

# Name of the column to predict
target = 'trip_duration'

# Read the raw training and test files (first line of each file is the header)
df_raw_train = pd.read_csv(abspath_curr + 'nyc-taxi-trip-duration/train.csv', header=0)
df_raw_test = pd.read_csv(abspath_curr + 'nyc-taxi-trip-duration/test.csv', header=0)

# Work on deep copies so that the raw frames stay untouched
# (df_raw_test is needed again when the submission file is written)
df_train = df_raw_train.copy(deep=True)
df_test = df_raw_test.copy(deep=True)

In [None]:
# Print the dimension of df_train
pd.DataFrame([list(df_train.shape)], columns=['# rows', '# columns'])

Unnamed: 0,# rows,# columns
0,1458644,11


In [None]:
# Show the number of rows and columns of the test data as a one-row table
pd.DataFrame([list(df_test.shape)], columns=['# rows', '# columns'])

Unnamed: 0,# rows,# columns
0,625134,9


In [None]:
# Print the first 5 rows of df_train
df_train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [None]:
# Print the first 5 rows of df_test
df_test.head()

Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag
0,id3004672,1,2016-06-30 23:59:58,1,-73.988129,40.732029,-73.990173,40.75668,N
1,id3505355,1,2016-06-30 23:59:53,1,-73.964203,40.679993,-73.959808,40.655403,N
2,id1217141,1,2016-06-30 23:59:47,1,-73.997437,40.737583,-73.98616,40.729523,N
3,id2150126,2,2016-06-30 23:59:41,1,-73.95607,40.7719,-73.986427,40.730469,N
4,id1598245,1,2016-06-30 23:59:33,1,-73.970215,40.761475,-73.96151,40.75589,N


# Splitting the data

In [None]:
from sklearn.model_selection import train_test_split

# Keep 80% of the labelled rows for training and hold out the rest for validation;
# the fixed seed makes the split reproducible
df_train, df_val = train_test_split(df_train,
                                    train_size=0.8,
                                    random_state=random_seed)

# Give both parts a fresh 0..n-1 index
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)

In [None]:
# Show the number of rows and columns of the training part as a one-row table
pd.DataFrame([list(df_train.shape)], columns=['# rows', '# columns'])

Unnamed: 0,# rows,# columns
0,1166915,11


In [None]:
# Show the number of rows and columns of the validation part as a one-row table
pd.DataFrame([list(df_val.shape)], columns=['# rows', '# columns'])

Unnamed: 0,# rows,# columns
0,291729,11


# Data Pre-Processing


## Handling uncommon features




In [None]:
# Get the variables shared by df_train, df_val and df_test
# (common_var_checker is not defined in this notebook; judging by the output it returns
# a one-column frame 'common var' that also lists the target passed in — verify against the helper)
df_common_var = common_var_checker(df_train, df_val, df_test, target)

# Print df_common_var
df_common_var

Unnamed: 0,common var
0,dropoff_latitude
1,dropoff_longitude
2,id
3,passenger_count
4,pickup_datetime
5,pickup_latitude
6,pickup_longitude
7,store_and_fwd_flag
8,trip_duration
9,vendor_id


In [None]:
# Columns of the training data that are not in the list of common variables
uncommon_feature_train_not_val_test = np.setdiff1d(df_train.columns,
                                                   df_common_var['common var'])

# Show them as a one-column table
pd.DataFrame({'uncommon feature': uncommon_feature_train_not_val_test})

Unnamed: 0,uncommon feature
0,dropoff_datetime


In [None]:
# Columns of the validation data that are not in the list of common variables
uncommon_feature_val_not_train_test = np.setdiff1d(df_val.columns,
                                                   df_common_var['common var'])

# Show them as a one-column table
pd.DataFrame({'uncommon feature': uncommon_feature_val_not_train_test})

Unnamed: 0,uncommon feature
0,dropoff_datetime


In [None]:
# Columns of the test data that are not in the list of common variables
uncommon_feature_test_not_train_val = np.setdiff1d(df_test.columns,
                                                   df_common_var['common var'])

# Show them as a one-column table
pd.DataFrame({'uncommon feature': uncommon_feature_test_not_train_val})

Unnamed: 0,uncommon feature


### Removing the uncommon feature: dropoff_datetime

In [None]:
# Drop the columns that only the labelled data has
df_train = df_train.drop(labels=uncommon_feature_train_not_val_test, axis='columns')

# Preview the result
df_train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id0458976,2,2016-06-29 18:21:02,1,-73.862762,40.768822,-73.891701,40.746689,N,1133
1,id0434613,2,2016-04-25 13:03:26,1,-73.958038,40.783237,-73.97551,40.760853,N,887
2,id3809234,2,2016-05-07 12:36:09,1,-73.96946,40.785519,-73.989243,40.771748,N,686
3,id1203705,1,2016-05-14 18:44:17,1,-73.981743,40.736549,-73.998352,40.72644,N,818
4,id1896645,2,2016-04-10 22:51:25,1,-73.977913,40.752609,-73.975647,40.733139,N,951


In [None]:
# Drop the same uncommon columns from the validation part
df_val = df_val.drop(labels=uncommon_feature_val_not_train_test, axis='columns')

# Preview the result
df_val.head()

Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2793718,2,2016-06-08 07:36:19,1,-73.985611,40.735943,-73.980331,40.760468,N,1040
1,id3485529,2,2016-04-03 12:58:11,1,-73.978394,40.764351,-73.991623,40.749859,N,827
2,id1816614,2,2016-06-05 02:49:13,5,-73.989059,40.744389,-73.973381,40.748692,N,614
3,id1050851,2,2016-05-05 17:18:27,2,-73.990326,40.731136,-73.991264,40.748917,N,867
4,id0140657,1,2016-05-12 17:43:38,4,-73.789497,40.646675,-73.987137,40.759232,N,4967


In [None]:
# Drop the uncommon columns from the test data (the list is empty here, so nothing changes)
df_test = df_test.drop(labels=uncommon_feature_test_not_train_val, axis='columns')

# Preview the result
df_test.head()

Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag
0,id3004672,1,2016-06-30 23:59:58,1,-73.988129,40.732029,-73.990173,40.75668,N
1,id3505355,1,2016-06-30 23:59:53,1,-73.964203,40.679993,-73.959808,40.655403,N
2,id1217141,1,2016-06-30 23:59:47,1,-73.997437,40.737583,-73.98616,40.729523,N
3,id2150126,2,2016-06-30 23:59:41,1,-73.95607,40.7719,-73.986427,40.730469,N
4,id1598245,1,2016-06-30 23:59:33,1,-73.970215,40.761475,-73.96151,40.75589,N


## Handling Identifiers

### Combining the three datasets

In [None]:
# Stack the three parts row-wise (train, then validation, then test) without sorting the columns
df = pd.concat([df_train, df_val, df_test], axis=0, sort=False)

### Identify the identifier

In [None]:
# Call id_checker on df
# (id_checker is not defined in this notebook; presumably it comes from the
# pmlm_utilities_shallow.ipynb notebook run above — verify. Judging by the output it
# returns the identifier columns of df)
df_id = id_checker(df)

# Print the first 5 rows of df_id
df_id.head()

Unnamed: 0,id
0,id0458976
1,id0434613
2,id3809234
3,id1203705
4,id1896645


### Removing the identifier

In [None]:
import numpy as np

# Drop every identifier column reported by id_checker from each part;
# intersect1d keeps only the identifiers a part actually has, so re-running is harmless
df_train = df_train.drop(columns=np.intersect1d(df_id.columns, df_train.columns))
df_val = df_val.drop(columns=np.intersect1d(df_id.columns, df_val.columns))
df_test = df_test.drop(columns=np.intersect1d(df_id.columns, df_test.columns))

In [None]:
# Print the first 5 rows of df_train (identifier removed)
df_train.head()

Unnamed: 0,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,2,2016-06-29 18:21:02,1,-73.862762,40.768822,-73.891701,40.746689,N,1133
1,2,2016-04-25 13:03:26,1,-73.958038,40.783237,-73.97551,40.760853,N,887
2,2,2016-05-07 12:36:09,1,-73.96946,40.785519,-73.989243,40.771748,N,686
3,1,2016-05-14 18:44:17,1,-73.981743,40.736549,-73.998352,40.72644,N,818
4,2,2016-04-10 22:51:25,1,-73.977913,40.752609,-73.975647,40.733139,N,951


In [None]:
# Print the first 5 rows of df_val (identifier removed)
df_val.head()

Unnamed: 0,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,2,2016-06-08 07:36:19,1,-73.985611,40.735943,-73.980331,40.760468,N,1040
1,2,2016-04-03 12:58:11,1,-73.978394,40.764351,-73.991623,40.749859,N,827
2,2,2016-06-05 02:49:13,5,-73.989059,40.744389,-73.973381,40.748692,N,614
3,2,2016-05-05 17:18:27,2,-73.990326,40.731136,-73.991264,40.748917,N,867
4,1,2016-05-12 17:43:38,4,-73.789497,40.646675,-73.987137,40.759232,N,4967


In [None]:
# Print the first 5 rows of df_test (identifier removed)
df_test.head()

Unnamed: 0,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag
0,1,2016-06-30 23:59:58,1,-73.988129,40.732029,-73.990173,40.75668,N
1,1,2016-06-30 23:59:53,1,-73.964203,40.679993,-73.959808,40.655403,N
2,1,2016-06-30 23:59:47,1,-73.997437,40.737583,-73.98616,40.729523,N
3,2,2016-06-30 23:59:41,1,-73.95607,40.7719,-73.986427,40.730469,N
4,1,2016-06-30 23:59:33,1,-73.970215,40.761475,-73.96151,40.75589,N


## Handling Date Time Variable

### Converting the DateTime variable into 6 different variables

In [None]:
# Get the date time variables
# (only the pickup time remains; dropoff_datetime was removed as an uncommon feature)
datetime_vars = ['pickup_datetime']

In [None]:
# Call datetime_transformer on df_train
# (helper from the utilities notebook; judging by the output it replaces each date time
# variable with year, month, day, hour, minute and second columns)
df_train = datetime_transformer(df_train, datetime_vars)

# Print the first 5 rows of df_train
df_train.head()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_datetime_year,pickup_datetime_month,pickup_datetime_day,pickup_datetime_hour,pickup_datetime_minute,pickup_datetime_second
0,2,1,-73.862762,40.768822,-73.891701,40.746689,N,1133,2016,6,29,18,21,2
1,2,1,-73.958038,40.783237,-73.97551,40.760853,N,887,2016,4,25,13,3,26
2,2,1,-73.96946,40.785519,-73.989243,40.771748,N,686,2016,5,7,12,36,9
3,1,1,-73.981743,40.736549,-73.998352,40.72644,N,818,2016,5,14,18,44,17
4,2,1,-73.977913,40.752609,-73.975647,40.733139,N,951,2016,4,10,22,51,25


In [None]:
# Call datetime_transformer on df_val
# (helper from the utilities notebook; judging by the output it replaces each date time
# variable with year, month, day, hour, minute and second columns)
df_val = datetime_transformer(df_val, datetime_vars)

# Print the first 5 rows of df_val
df_val.head()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_datetime_year,pickup_datetime_month,pickup_datetime_day,pickup_datetime_hour,pickup_datetime_minute,pickup_datetime_second
0,2,1,-73.985611,40.735943,-73.980331,40.760468,N,1040,2016,6,8,7,36,19
1,2,1,-73.978394,40.764351,-73.991623,40.749859,N,827,2016,4,3,12,58,11
2,2,5,-73.989059,40.744389,-73.973381,40.748692,N,614,2016,6,5,2,49,13
3,2,2,-73.990326,40.731136,-73.991264,40.748917,N,867,2016,5,5,17,18,27
4,1,4,-73.789497,40.646675,-73.987137,40.759232,N,4967,2016,5,12,17,43,38


In [None]:
# Call datetime_transformer on df_test
# (helper from the utilities notebook; judging by the output it replaces each date time
# variable with year, month, day, hour, minute and second columns)
df_test = datetime_transformer(df_test, datetime_vars)

# Print the first 5 rows of df_test
df_test.head()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,pickup_datetime_year,pickup_datetime_month,pickup_datetime_day,pickup_datetime_hour,pickup_datetime_minute,pickup_datetime_second
0,1,1,-73.988129,40.732029,-73.990173,40.75668,N,2016,6,30,23,59,58
1,1,1,-73.964203,40.679993,-73.959808,40.655403,N,2016,6,30,23,59,53
2,1,1,-73.997437,40.737583,-73.98616,40.729523,N,2016,6,30,23,59,47
3,2,1,-73.95607,40.7719,-73.986427,40.730469,N,2016,6,30,23,59,41
4,1,1,-73.970215,40.761475,-73.96151,40.75589,N,2016,6,30,23,59,33


## Handling Missing Data

### Identifying the columns with missing data

In [None]:
# Stack the three parts row-wise (train, then validation, then test) without sorting the columns;
# the test rows have no target column, so the target becomes NaN for them
df = pd.concat([df_train, df_val, df_test], axis=0, sort=False)

In [None]:
# Call nan_checker on df
# (helper from the utilities notebook; judging by the output it returns one row per
# variable with missing values: its name, the proportion missing and its dtype)
df_nan = nan_checker(df)

# Print df_nan
df_nan

Unnamed: 0,var,proportion,dtype
0,trip_duration,0.3,float64


In [None]:
# List the distinct dtypes among the variables that contain NaN
pd.DataFrame({'dtype': df_nan['dtype'].unique()})

Unnamed: 0,dtype
0,float64


In [None]:
# Keep the float64 variables that contain NaN (name, proportion missing, dtype)
is_float = df_nan['dtype'] == 'float64'
df_miss = df_nan.loc[is_float].reset_index(drop=True)

# Print df_miss
df_miss

Unnamed: 0,var,proportion,dtype
0,trip_duration,0.3,float64


### Splitting the data into train, val, and test

In [None]:
# Row counts of the training and validation parts, read once before the names are rebound
n_train, n_val = df_train.shape[0], df_val.shape[0]

# df was built as [train, validation, test], so the parts are recovered by position.
# Explicit copies are taken because the parts are modified later (imputation); writing
# into a slice of df would be chained assignment on a view.

# Separating the training data
df_train = df.iloc[:n_train, :].copy()

# Separating the validation data
df_val = df.iloc[n_train:n_train + n_val, :].copy()

# Separating the test data
df_test = df.iloc[n_train + n_val:, :].copy()

In [None]:
# Show the number of rows and columns of the training part as a one-row table
pd.DataFrame([list(df_train.shape)], columns=['# rows', '# columns'])

Unnamed: 0,# rows,# columns
0,1166915,14


In [None]:
# Show the number of rows and columns of the validation part as a one-row table
pd.DataFrame([list(df_val.shape)], columns=['# rows', '# columns'])

Unnamed: 0,# rows,# columns
0,291729,14


In [None]:
# Show the number of rows and columns of the test part as a one-row table
pd.DataFrame([list(df_test.shape)], columns=['# rows', '# columns'])

Unnamed: 0,# rows,# columns
0,625134,14


### Imputing the missing values with 'Mean' values of train data

In [None]:
from sklearn.impute import SimpleImputer

# NOTE(review): per the nan_checker output the only variable with missing values is the
# target, which is missing for the test rows because the test file has no trip_duration.
# This cell therefore fills the test target with the training mean; those values are
# placeholders, not ground truth.

# Only impute when at least one variable was reported
if df_miss.shape[0] > 0:
    # Mean imputer; the means are learned from the training part only
    si = SimpleImputer(missing_values=np.nan, strategy='mean')
    si.fit(df_train[df_miss['var']])

    # Fill the reported variables in each part with the training means
    df_train[df_miss['var']] = si.transform(df_train[df_miss['var']])
    df_val[df_miss['var']] = si.transform(df_val[df_miss['var']])
    df_test[df_miss['var']] = si.transform(df_test[df_miss['var']])

## Encoding the data

In [None]:
# Stack the three parts row-wise (train, then validation, then test) without sorting the columns
df = pd.concat([df_train, df_val, df_test], axis=0, sort=False)

# List the distinct dtypes that occur in df
pd.DataFrame({'dtype': df.dtypes.unique()})

Unnamed: 0,dtype
0,int64
1,float64
2,object


### Identify the categorical variables

In [None]:
# Call cat_var_checker on df
# (helper from the utilities notebook; judging by the output it returns one row per
# categorical variable with its number of unique values)
df_cat = cat_var_checker(df)

# Print the dataframe
df_cat

Unnamed: 0,var,nunique
0,store_and_fwd_flag,2


### One hot encoding

In [None]:
# Categorical variables to encode; the target is excluded in case it is categorical
categorical_features = np.setdiff1d(df_cat['var'], [target])

# Replace each categorical feature by one indicator column per category
df = pd.get_dummies(df, columns=categorical_features)

# Print the first 5 rows of df
df.head()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration,pickup_datetime_year,pickup_datetime_month,pickup_datetime_day,pickup_datetime_hour,pickup_datetime_minute,pickup_datetime_second,store_and_fwd_flag_N,store_and_fwd_flag_Y
0,2,1,-73.862762,40.768822,-73.891701,40.746689,1133.0,2016,6,29,18,21,2,1,0
1,2,1,-73.958038,40.783237,-73.97551,40.760853,887.0,2016,4,25,13,3,26,1,0
2,2,1,-73.96946,40.785519,-73.989243,40.771748,686.0,2016,5,7,12,36,9,1,0
3,1,1,-73.981743,40.736549,-73.998352,40.72644,818.0,2016,5,14,18,44,17,1,0
4,2,1,-73.977913,40.752609,-73.975647,40.733139,951.0,2016,4,10,22,51,25,1,0


### Splitting the three datasets

In [None]:
# Row counts of the training and validation parts, read once before the names are rebound
n_train, n_val = df_train.shape[0], df_val.shape[0]

# df was built as [train, validation, test], so the parts are recovered by position.
# Explicit copies are taken because a 'distance' column is added to each part later;
# adding a column to a slice of df would be chained assignment on a view.

# Separating the training data
df_train = df.iloc[:n_train, :].copy()

# Separating the validation data
df_val = df.iloc[n_train:n_train + n_val, :].copy()

# Separating the test data
df_test = df.iloc[n_train + n_val:, :].copy()

In [None]:
# Show the number of rows and columns of the training part as a one-row table
pd.DataFrame([list(df_train.shape)], columns=['# rows', '# columns'])

Unnamed: 0,# rows,# columns
0,1166915,15


In [None]:

# Show the number of rows and columns of the validation part as a one-row table
pd.DataFrame([list(df_val.shape)], columns=['# rows', '# columns'])

Unnamed: 0,# rows,# columns
0,291729,15


In [None]:
# Show the number of rows and columns of the test part as a one-row table
pd.DataFrame([list(df_test.shape)], columns=['# rows', '# columns'])

Unnamed: 0,# rows,# columns
0,625134,15


# Feature Engineering

## Calculating Distance by using the pickup and dropoff coordinates

###### It will take a few minutes to run this cell.

In [None]:
import pandas as pd
import geopy.distance
from geopy.distance import distance

# Distance between the pickup and dropoff points of a single trip
def vincenty_distance(row):
    """Return the pickup-to-dropoff distance of one trip in kilometers.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'pickup_latitude', 'pickup_longitude',
        'dropoff_latitude' and 'dropoff_longitude'.

    Returns
    -------
    The ``.kilometers`` value of geopy's ``distance`` between the two
    (latitude, longitude) points.

    NOTE(review): despite the function name, recent geopy releases implement
    ``distance`` as the geodesic distance rather than Vincenty's formula —
    confirm for the installed version.
    """
    start = (row['pickup_latitude'], row['pickup_longitude'])
    end = (row['dropoff_latitude'], row['dropoff_longitude'])
    return distance(start, end).kilometers

# Apply the function to create a new 'distance' column
# (vincenty_distance is called once per row, which is why this cell takes minutes)
df_train['distance'] = df_train.apply(vincenty_distance, axis=1)

# Display the updated DataFrame
df_train.head()


Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration,pickup_datetime_year,pickup_datetime_month,pickup_datetime_day,pickup_datetime_hour,pickup_datetime_minute,pickup_datetime_second,store_and_fwd_flag_N,store_and_fwd_flag_Y,distance
0,2,1,-73.862762,40.768822,-73.891701,40.746689,1133.0,2016,6,29,18,21,2,1,0,3.46587
1,2,1,-73.958038,40.783237,-73.97551,40.760853,887.0,2016,4,25,13,3,26,1,0,2.890477
2,2,1,-73.96946,40.785519,-73.989243,40.771748,686.0,2016,5,7,12,36,9,1,0,2.264417
3,1,1,-73.981743,40.736549,-73.998352,40.72644,818.0,2016,5,14,18,44,17,1,0,1.79689
4,2,1,-73.977913,40.752609,-73.975647,40.733139,951.0,2016,4,10,22,51,25,1,0,2.1706


In [None]:
# Add the same 'distance' column (kilometers, one call per row) to the validation data
df_val['distance'] = df_val.apply(vincenty_distance, axis=1)
# Print the first 5 rows of df_val
df_val.head()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration,pickup_datetime_year,pickup_datetime_month,pickup_datetime_day,pickup_datetime_hour,pickup_datetime_minute,pickup_datetime_second,store_and_fwd_flag_N,store_and_fwd_flag_Y,distance
0,2,1,-73.985611,40.735943,-73.980331,40.760468,1040.0,2016,6,8,7,36,19,1,0,2.759702
1,2,1,-73.978394,40.764351,-73.991623,40.749859,827.0,2016,4,3,12,58,11,1,0,1.959062
2,2,5,-73.989059,40.744389,-73.973381,40.748692,614.0,2016,6,5,2,49,13,1,0,1.407727
3,2,2,-73.990326,40.731136,-73.991264,40.748917,867.0,2016,5,5,17,18,27,1,0,1.976073
4,1,4,-73.789497,40.646675,-73.987137,40.759232,4967.0,2016,5,12,17,43,38,1,0,20.861805


In [None]:
# Add the same 'distance' column (kilometers, one call per row) to the test data
df_test['distance'] = df_test.apply(vincenty_distance, axis=1)
# Print the first 5 rows of df_test
# (the trip_duration shown for the test rows is the imputed training mean, not a real label)
df_test.head()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration,pickup_datetime_year,pickup_datetime_month,pickup_datetime_day,pickup_datetime_hour,pickup_datetime_minute,pickup_datetime_second,store_and_fwd_flag_N,store_and_fwd_flag_Y,distance
0,1,1,-73.988129,40.732029,-73.990173,40.75668,959.273585,2016,6,30,23,59,58,1,0,2.742863
1,1,1,-73.964203,40.679993,-73.959808,40.655403,959.273585,2016,6,30,23,59,53,1,0,2.755774
2,1,1,-73.997437,40.737583,-73.98616,40.729523,959.273585,2016,6,30,23,59,47,1,0,1.307112
3,2,1,-73.95607,40.7719,-73.986427,40.730469,959.273585,2016,6,30,23,59,41,1,0,5.266978
4,1,1,-73.970215,40.761475,-73.96151,40.75589,959.273585,2016,6,30,23,59,33,1,0,0.961745


#### Removing the unnecessary columns

In [None]:
# Columns left out of the feature matrix: the four raw coordinates (summarized by the
# 'distance' column) and the two one-hot store_and_fwd_flag indicators.
# The original list misspelled 'dropoff_latitude', so that column was silently kept as a
# feature (np.setdiff1d ignores names that do not exist).
exclude = ['pickup_longitude',
           'pickup_latitude',
           'dropoff_longitude',
           'dropoff_latitude',
           'store_and_fwd_flag_N',
           'store_and_fwd_flag_Y']

## Splitting the feature and target

In [None]:
# Columns that must not enter the feature matrix: the target plus the excluded columns
non_feature_cols = [target] + exclude

# Feature matrices; np.setdiff1d returns the remaining column names sorted, so all
# three matrices share the same column order
X_train = df_train.loc[:, np.setdiff1d(df_train.columns, non_feature_cols)].to_numpy()
X_val = df_val.loc[:, np.setdiff1d(df_val.columns, non_feature_cols)].to_numpy()
X_test = df_test.loc[:, np.setdiff1d(df_test.columns, non_feature_cols)].to_numpy()

# Target vectors
# NOTE(review): the test file has no trip_duration, so y_test only contains the value
# filled in by the imputation step
y_train = df_train[target].to_numpy()
y_val = df_val[target].to_numpy()
y_test = df_test[target].to_numpy()

## Scaling the data

### Since it's a regression problem, we will scale the features and the target by standardization

In [None]:
from sklearn.preprocessing import StandardScaler

# The StandardScaler
# (this single instance is fit on the features first and refit on the target afterwards,
# so after the target cell it only holds the target's mean and scale)
ss = StandardScaler()

In [None]:
# Learn each feature's mean and standard deviation from the training data only,
# then apply that same transformation to all three feature matrices
X_train = ss.fit(X_train).transform(X_train)
X_val = ss.transform(X_val)
X_test = ss.transform(X_test)

In [None]:
# Standardize the target with the mean and standard deviation of the training target.
# The scaler expects a 2-D input, so each vector is turned into a single column and
# flattened again afterwards.
# NOTE: this refits ss, so from here on ss describes the target, not the features.
y_train = ss.fit_transform(y_train[:, np.newaxis]).ravel()
y_val = ss.transform(y_val[:, np.newaxis]).ravel()
y_test = ss.transform(y_test[:, np.newaxis]).ravel()

# Hyperparameter Tuning

#### Model Dictionary

In [None]:
# Switch to the folder that holds the shallow models notebook
%cd $abspath_model_shallow

# Import the shallow models
%run pmlm_models_shallow.ipynb

from sklearn.linear_model import SGDRegressor

# Candidate models keyed by acronym; the acronym is reused as the key of pipes and of
# the parameter grids, and as the name of the cv_results file
models = {'sgd': SGDRegressor(random_state=random_seed),
           #'lr_mbgd':LinearRegression_MBGD(random_state=random_seed)

}

/content/drive/My Drive/Colab Notebooks


In [None]:
from sklearn.pipeline import Pipeline

# One single-step pipeline per candidate model; the step is named 'model', which is
# why the hyperparameter names below carry the 'model__' prefix
pipes = dict()

for acronym in models:
    model = models[acronym]
    pipes[acronym] = Pipeline(steps=[('model', model)])

#### Parameters for SGDRegressor

In [None]:
# Get the:
# feature matrix in the combined training and validation data
# target vector in the combined training and validation data
# PredefinedSplit
# See the implementation in pmlm_utilities.ipynb
# (ps is passed as cv to the searches below; presumably it marks the validation rows
# as the single test fold — verify against the helper)
X_train_val, y_train_val, ps = get_train_val_ps(X_train, y_train, X_val, y_val)

In [None]:
# Parameter grids for GridSearchCV, keyed by model acronym
param_grids = dict()

In [None]:
# Candidate values for eta0 of SGDRegressor
eta_grid = [0.0119, 0.01, 0.002, 0.067, 0.00189]

# Candidate values for alpha of SGDRegressor
alpha_grid = [0.075, 0.1, 0.09, 0.081]

# Register the grid for the 'sgd' pipeline; the 'model__' prefix addresses the
# pipeline step named 'model'
sgd_grid = {'model__eta0': eta_grid, 'model__alpha': alpha_grid}
param_grids['sgd'] = [sgd_grid]

#### Parameter grid for LinearRegressionMBGD

In [None]:
# # The parameter grid of eta
# eta_grid = [0.01, 0.2]

# # The parameter grid of alpha
# alpha_grid = [0.075, 0.1]

# # Update param_grids
# param_grids['lr_mbgd'] = [{'model__eta': eta_grid,
#                            'model__alpha': alpha_grid}]

In [None]:
# Print the parameter grids that GridSearchCV will search
param_grids

{'sgd': [{'model__eta0': [0.0119, 0.01, 0.002, 0.067, 0.00189],
   'model__alpha': [0.075, 0.1, 0.09, 0.081]}]}

In [None]:
# Print the pipelines that will be tuned
pipes

{'sgd': Pipeline(steps=[('model', SGDRegressor(random_state=42))])}

In [None]:
# os is not imported anywhere else in this notebook, so import it where it is first used
import os

# Make the directory for the GridSearchCV results (dirname strips the trailing slash);
# exist_ok=True makes the cell safe to re-run
directory = os.path.dirname(abspath_curr + 'result/cv_results/GridSearchCV/')
os.makedirs(directory, exist_ok=True)

#### Tuning with GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

# Columns of cv_results_ that are moved to the front of the saved table
important_columns = ['rank_test_score',
                     'mean_test_score',
                     'std_test_score',
                     'mean_train_score',
                     'std_train_score',
                     'mean_fit_time',
                     'std_fit_time',
                     'mean_score_time',
                     'std_score_time']

# One [best_score_, best_params_, best_estimator_] entry per tuned pipeline
best_score_params_estimator_gs = []

for acronym, pipe in pipes.items():
    # Exhaustive search over the grid, scored by negative MSE on the split given by ps
    gs = GridSearchCV(estimator=pipe,
                      param_grid=param_grids[acronym],
                      scoring='neg_mean_squared_error',
                      n_jobs=2,
                      cv=ps,
                      return_train_score=True)
    gs = gs.fit(X_train_val, y_train_val)

    # Remember the winner of this search
    best_score_params_estimator_gs.append([gs.best_score_, gs.best_params_, gs.best_estimator_])

    # Full result table, best rank first (ties broken by the smaller std_test_score)
    cv_results = pd.DataFrame.from_dict(gs.cv_results_)
    cv_results = cv_results.sort_values(by=['rank_test_score', 'std_test_score'])

    # Important columns first, all remaining columns after them in alphabetical order
    remaining_columns = sorted(set(cv_results.columns) - set(important_columns))
    cv_results = cv_results[important_columns + remaining_columns]

    # One csv file per pipeline
    cv_results.to_csv(path_or_buf=abspath_curr + 'result/cv_results/GridSearchCV/' + acronym + '.csv', index=False)

# Highest (least negative) score first
best_score_params_estimator_gs.sort(key=lambda x : x[0], reverse=True)

# Print best_score_params_estimator_gs
pd.DataFrame(best_score_params_estimator_gs, columns=['best_score', 'best_param', 'best_estimator'])

Unnamed: 0,best_score,best_param,best_estimator
0,-0.326057,"{'model__alpha': 0.075, 'model__eta0': 0.002}","(SGDRegressor(alpha=0.075, eta0=0.002, random_..."


In [None]:
# mean_squared_error is only imported near the end of the notebook, so import it here,
# where it is first used, to keep the notebook runnable from top to bottom
from sklearn.metrics import mean_squared_error

# Best pipeline found by GridSearchCV (the list is sorted with the best score first)
best_model = best_score_params_estimator_gs[0][2]

# Make predictions on training, validation, and test sets
y_train_pred = best_model.predict(X_train)
y_val_pred = best_model.predict(X_val)
y_test_pred = best_model.predict(X_test)

# Calculate Mean Squared Error for each set; the targets were standardized, so the
# errors are in standardized units
train_error = mean_squared_error(y_train, y_train_pred)
val_error = mean_squared_error(y_val, y_val_pred)
# NOTE(review): the test file has no trip_duration; y_test holds the imputed training
# mean, so this value does not measure generalization and must not be reported as one
test_error = mean_squared_error(y_test, y_test_pred)

# Print or use the errors as needed
print(f"Training Error: {train_error:.6f}")
print(f"Validation Error: {val_error:.6f}")
print(f"Test Error: {test_error:.6f}")

Training Error: 0.992077
Validation Error: 0.326058
Test Error: 0.009377


###### I ran 3 models: SGD, LinearRegression_BGD, and LinearRegression_MBGD. I have commented out the linear regression models because they performed worse than the SGD model and also took a long time to run.

#### Tuning with RandomizedSearchCV

In [None]:
# Parameter distributions for RandomizedSearchCV, keyed by model acronym
param_dists = dict()

### Parameters for SGD Regressor

In [None]:
from scipy.stats import uniform, norm,poisson,randint, expon, loguniform

# eta0 is drawn uniformly from [0.001, 0.001 + 0.019]
eta_dist = uniform(loc=0.001, scale=0.019)

# alpha is drawn uniformly from [0.02, 0.02 + 0.09]
alpha_dist = uniform(loc=0.02, scale=0.09)

# Register both distributions for the 'sgd' pipeline; the 'model__' prefix addresses
# the pipeline step named 'model'
sgd_dists = {'model__eta0': eta_dist, 'model__alpha': alpha_dist}
param_dists['sgd'] = [sgd_dists]

In [None]:
# # The distribution for eta: a uniform distribution over [loc, loc + scale]
# eta_dist = uniform(loc=0.001, scale=0.019)

# # The distribution for alpha: a uniform distribution over [loc, loc + scale]
# alpha_dist = uniform(loc=0.01, scale=0.09)

# # Update param_dists
# param_dists['lr_mbgd'] = [{'model__eta': eta_dist,
#                            'model__alpha': alpha_dist}]

In [None]:
# os is not imported anywhere else in this notebook, so import it where it is used
import os

# Make the directory for the RandomizedSearchCV results (dirname strips the trailing
# slash); exist_ok=True makes the cell safe to re-run
directory = os.path.dirname(abspath_curr + 'result/cv_results/RandomizedSearchCV/')
os.makedirs(directory, exist_ok=True)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Columns of cv_results_ that are moved to the front of the saved table
important_columns = ['rank_test_score',
                     'mean_test_score',
                     'std_test_score',
                     'mean_train_score',
                     'std_train_score',
                     'mean_fit_time',
                     'std_fit_time',
                     'mean_score_time',
                     'std_score_time']

# One [best_score_, best_params_, best_estimator_] entry per tuned pipeline
best_score_params_estimator_rs = []

for acronym, pipe in pipes.items():
    # Four random draws from the distributions, scored by negative MSE on the split given by ps
    rs = RandomizedSearchCV(estimator=pipe,
                            param_distributions=param_dists[acronym],
                            n_iter=4,
                            scoring='neg_mean_squared_error',
                            n_jobs=2,
                            cv=ps,
                            random_state=random_seed,
                            return_train_score=True)
    rs = rs.fit(X_train_val, y_train_val)

    # Remember the winner of this search
    best_score_params_estimator_rs.append([rs.best_score_, rs.best_params_, rs.best_estimator_])

    # Full result table, best rank first (ties broken by the smaller std_test_score)
    cv_results = pd.DataFrame.from_dict(rs.cv_results_)
    cv_results = cv_results.sort_values(by=['rank_test_score', 'std_test_score'])

    # Important columns first, all remaining columns after them in alphabetical order
    remaining_columns = sorted(set(cv_results.columns) - set(important_columns))
    cv_results = cv_results[important_columns + remaining_columns]

    # One csv file per pipeline
    cv_results.to_csv(path_or_buf=abspath_curr + 'result/cv_results/RandomizedSearchCV/' + acronym + '.csv', index=False)

# Highest (least negative) score first
best_score_params_estimator_rs.sort(key=lambda x : x[0], reverse=True)

# Print best_score_params_estimator_rs
pd.DataFrame(best_score_params_estimator_rs, columns=['best_score', 'best_param', 'best_estimator'])


Unnamed: 0,best_score,best_param,best_estimator
0,-0.326088,"{'model__alpha': 0.03404167763981929, 'model__...","(SGDRegressor(alpha=0.03404167763981929, eta0=..."


##### RandomizedSearchCV gives a slightly worse model than GridSearchCV, so our best model is the SGDRegressor found by GridSearchCV.

# Model Selection

In [None]:

# The GridSearchCV list is sorted with the best score first, so its first entry is the winner
top_entry_gs = best_score_params_estimator_gs[0]

# Unpack its score, hyperparameters and fitted pipeline
best_score_gs, best_params_gs, best_estimator_gs = top_entry_gs


# Generating Submission File

In [None]:
# os is not imported anywhere else in this notebook, so import it where it is used
import os

# Make the directory for the submission file (dirname strips the trailing slash);
# exist_ok=True makes the cell safe to re-run
directory = os.path.dirname(abspath_curr + 'result/submission/')
os.makedirs(directory, exist_ok=True)

In [None]:
# Get the prediction on the testing data using the best estimator
# (the model was trained on the standardized target, so this is on the standardized scale)
y_test_pred = best_estimator_gs.predict(X_test)

# Transform the prediction back to the original scale.
# ss was last fit on the training target (a single column), so the prediction is passed
# as a single column too, instead of being tiled across df_train.shape[1] columns and
# relying on broadcasting inside the scaler.
df_y_test_pred = pd.DataFrame(ss.inverse_transform(y_test_pred.reshape(-1, 1)),
                              columns=[target])

# Get the submission dataframe: the ids of the raw test file next to the predictions
# (both are in the original test row order)
df_submit = pd.DataFrame({'id': df_raw_test['id'].values,
                          target: df_y_test_pred[target].values})

# Generate the submission file
df_submit.to_csv(abspath_curr + 'result/submission/submission.csv', index=False)

In [None]:
from sklearn.metrics import mean_squared_error

# NOTE(review): no true test labels are available; y_test is the standardized imputed
# training mean produced earlier in the notebook, so the value below is not a real
# test error

# Calculate Mean Squared Error for the test set
test_error = mean_squared_error(y_test, y_test_pred)

# Print or use the test error as needed
print(f"Test Error: {test_error:.6f}")


Test Error: 0.009377


# Conclusion


##### 1.  I have done the required data pre-processing to clean the data and make it ready for training.

##### 2. I have added a column called distance to improve the prediction of the trip duration.

##### 3. I have dropped the coordinate columns and the store_and_fwd_flag columns from the feature matrix as they don't provide much information.

##### 4. I have performed 2 methods of hyperparameter tuning: GridSearchCV and RandomizedSearchCV. GridSearchCV gave better results.

##### 5. I also used 3 models of which SGDRegressor gave the best result.

##### 6. I have also changed the alpha and eta values in the grid to get the least error.