In [112]:
import pandas as pd
import sys
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

In [80]:
df= pd.read_csv(r"D:\MLOps\MLOps-Projects\New_York_City_Taxi_Trip_Duration\data\raw\train.csv")
df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


### Dataset Description
The dataset is based on the 2016 NYC Yellow Cab trip record data made available in Big Query on Google Cloud Platform. The data was originally published by the NYC Taxi and Limousine Commission (TLC). Based on individual trip attributes, predict the duration of each trip in the test set.

### Data fields
id - a unique identifier for each trip

vendor_id - a code indicating the provider associated with the trip record

pickup_datetime - date and time when the meter was engaged

dropoff_datetime - date and time when the meter was disengaged

passenger_count - the number of passengers in the vehicle (driver entered value)

pickup_longitude - the longitude where the meter was engaged

pickup_latitude - the latitude where the meter was engaged

dropoff_longitude - the longitude where the meter was disengaged

dropoff_latitude - the latitude where the meter was disengaged

store_and_fwd_flag - This flag indicates whether the trip record was held in vehicle memory before sending to the vendor because the vehicle did not have a connection to the server - Y=store and forward; N=not a store and forward trip

trip_duration - duration of the trip in seconds

In [81]:
df.shape

(1458644, 11)

In [82]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458644 entries, 0 to 1458643
Data columns (total 11 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   id                  1458644 non-null  object 
 1   vendor_id           1458644 non-null  int64  
 2   pickup_datetime     1458644 non-null  object 
 3   dropoff_datetime    1458644 non-null  object 
 4   passenger_count     1458644 non-null  int64  
 5   pickup_longitude    1458644 non-null  float64
 6   pickup_latitude     1458644 non-null  float64
 7   dropoff_longitude   1458644 non-null  float64
 8   dropoff_latitude    1458644 non-null  float64
 9   store_and_fwd_flag  1458644 non-null  object 
 10  trip_duration       1458644 non-null  int64  
dtypes: float64(4), int64(3), object(4)
memory usage: 122.4+ MB


In [83]:
df.isnull().sum()

id                    0
vendor_id             0
pickup_datetime       0
dropoff_datetime      0
passenger_count       0
pickup_longitude      0
pickup_latitude       0
dropoff_longitude     0
dropoff_latitude      0
store_and_fwd_flag    0
trip_duration         0
dtype: int64

In [84]:
df.duplicated().sum()

0

In [85]:
df.describe()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration
count,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0
mean,1.53495,1.66453,-73.97349,40.75092,-73.97342,40.7518,959.4923
std,0.4987772,1.314242,0.07090186,0.03288119,0.07064327,0.03589056,5237.432
min,1.0,0.0,-121.9333,34.3597,-121.9333,32.18114,1.0
25%,1.0,1.0,-73.99187,40.73735,-73.99133,40.73588,397.0
50%,2.0,1.0,-73.98174,40.7541,-73.97975,40.75452,662.0
75%,2.0,2.0,-73.96733,40.76836,-73.96301,40.76981,1075.0
max,2.0,9.0,-61.33553,51.88108,-61.33553,43.92103,3526282.0


In [86]:
df.drop(columns='id', axis=1, inplace=True)

In [87]:
df.sample(2)

Unnamed: 0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
518607,1,2016-06-05 20:47:47,2016-06-05 20:56:15,1,-73.982872,40.767677,-73.976273,40.751259,N,508
1037198,2,2016-05-07 13:31:22,2016-05-07 14:06:10,3,-74.00666,40.744202,-73.967941,40.688381,N,2088


In [88]:
df.vendor_id.value_counts()

vendor_id
2    780302
1    678342
Name: count, dtype: int64

In [89]:
df.passenger_count.value_counts()

passenger_count
1    1033540
2     210318
5      78088
3      59896
6      48333
4      28404
0         60
7          3
9          1
8          1
Name: count, dtype: int64

In [90]:
df["store_and_fwd_flag"].value_counts()

store_and_fwd_flag
N    1450599
Y       8045
Name: count, dtype: int64

In [91]:
df["pickup_datetime"] = pd.to_datetime(df["pickup_datetime"])
df["pickup_hour"] = df["pickup_datetime"].dt.hour
df["pickup_minute"] = df["pickup_datetime"].dt.minute
df["pickup_second"] = df["pickup_datetime"].dt.second/100
df["pickup_minute_of_the_day"] = df["pickup_hour"] * 60 + df["pickup_minute"]
df["pickup_day_week"] =df["pickup_datetime"].dt.dayofweek
df["pickup_month"] = df["pickup_datetime"].dt.month


df["dropoff_datetime"] = pd.to_datetime(df["dropoff_datetime"])
df["dropoff_hour"] = df["dropoff_datetime"].dt.hour
df["dropoff_minute"] = df["dropoff_datetime"].dt.minute
df["dropoff_second"] = df["dropoff_datetime"].dt.second/100
df["dropoff_minute_of_the_day"] = df["dropoff_hour"] * 60 + df["dropoff_minute"]
df["dropoff_day_week"] =df["dropoff_datetime"].dt.dayofweek
df["dropoff_month"] = df["dropoff_datetime"].dt.month

In [92]:
df.head()

Unnamed: 0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,...,pickup_second,pickup_minute_of_the_day,pickup_day_week,pickup_month,dropoff_hour,dropoff_minute,dropoff_second,dropoff_minute_of_the_day,dropoff_day_week,dropoff_month
0,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455,...,0.55,1044,0,3,17,32,0.3,1052,0,3
1,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663,...,0.35,43,6,6,0,54,0.38,54,6,6
2,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124,...,0.24,695,1,1,12,10,0.48,730,1,1
3,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429,...,0.31,1172,2,4,19,39,0.4,1179,2,4
4,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435,...,0.55,810,5,3,13,38,0.1,818,5,3


In [93]:
df.drop(columns=["pickup_datetime", "dropoff_datetime"], axis=1, inplace=True)

In [94]:
df["trip_duration"].value_counts()

trip_duration
368      1624
408      1584
348      1582
367      1581
358      1577
         ... 
7378        1
83250       1
6615        1
34014       1
6124        1
Name: count, Length: 7417, dtype: int64

In [95]:
df.head()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_hour,pickup_minute,pickup_second,pickup_minute_of_the_day,pickup_day_week,pickup_month,dropoff_hour,dropoff_minute,dropoff_second,dropoff_minute_of_the_day,dropoff_day_week,dropoff_month
0,2,1,-73.982155,40.767937,-73.96463,40.765602,N,455,17,24,0.55,1044,0,3,17,32,0.3,1052,0,3
1,1,1,-73.980415,40.738564,-73.999481,40.731152,N,663,0,43,0.35,43,6,6,0,54,0.38,54,6,6
2,2,1,-73.979027,40.763939,-74.005333,40.710087,N,2124,11,35,0.24,695,1,1,12,10,0.48,730,1,1
3,2,1,-74.01004,40.719971,-74.012268,40.706718,N,429,19,32,0.31,1172,2,4,19,39,0.4,1179,2,4
4,2,1,-73.973053,40.793209,-73.972923,40.78252,N,435,13,30,0.55,810,5,3,13,38,0.1,818,5,3


In [96]:
((df["dropoff_minute_of_the_day"]+df["dropoff_second"]) - (df["pickup_minute_of_the_day"]+df["pickup_second"]))*60

0           465.0
1           661.8
2          2114.4
3           425.4
4           453.0
            ...  
1458639     778.8
1458640     657.0
1458641     770.4
1458642     367.8
1458643     190.8
Length: 1458644, dtype: float64

In [97]:
Cate_columns=df.columns[df.dtypes=="object"]
Num_columns=df.columns[(df.dtypes == float) | (df.dtypes == int)]

In [98]:
Num_columns

Index(['pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
       'dropoff_latitude', 'pickup_hour', 'pickup_minute', 'pickup_second',
       'pickup_minute_of_the_day', 'pickup_day_week', 'pickup_month',
       'dropoff_hour', 'dropoff_minute', 'dropoff_second',
       'dropoff_minute_of_the_day', 'dropoff_day_week', 'dropoff_month'],
      dtype='object')

In [99]:
df.columns[(df.dtypes == float) | (df.dtypes == int)]

Index(['pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
       'dropoff_latitude', 'pickup_hour', 'pickup_minute', 'pickup_second',
       'pickup_minute_of_the_day', 'pickup_day_week', 'pickup_month',
       'dropoff_hour', 'dropoff_minute', 'dropoff_second',
       'dropoff_minute_of_the_day', 'dropoff_day_week', 'dropoff_month'],
      dtype='object')

In [100]:
X= df.drop(columns="trip_duration", axis=1)
y= df["trip_duration"]
X.head()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,pickup_hour,pickup_minute,pickup_second,pickup_minute_of_the_day,pickup_day_week,pickup_month,dropoff_hour,dropoff_minute,dropoff_second,dropoff_minute_of_the_day,dropoff_day_week,dropoff_month
0,2,1,-73.982155,40.767937,-73.96463,40.765602,N,17,24,0.55,1044,0,3,17,32,0.3,1052,0,3
1,1,1,-73.980415,40.738564,-73.999481,40.731152,N,0,43,0.35,43,6,6,0,54,0.38,54,6,6
2,2,1,-73.979027,40.763939,-74.005333,40.710087,N,11,35,0.24,695,1,1,12,10,0.48,730,1,1
3,2,1,-74.01004,40.719971,-74.012268,40.706718,N,19,32,0.31,1172,2,4,19,39,0.4,1179,2,4
4,2,1,-73.973053,40.793209,-73.972923,40.78252,N,13,30,0.55,810,5,3,13,38,0.1,818,5,3


In [101]:
y.head()

0     455
1     663
2    2124
3     429
4     435
Name: trip_duration, dtype: int64

In [102]:
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((1166915, 19), (1166915,), (291729, 19), (291729,))

In [103]:
X_train.head()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,pickup_hour,pickup_minute,pickup_second,pickup_minute_of_the_day,pickup_day_week,pickup_month,dropoff_hour,dropoff_minute,dropoff_second,dropoff_minute_of_the_day,dropoff_day_week,dropoff_month
1053743,2,1,-73.862762,40.768822,-73.891701,40.746689,N,18,21,0.02,1101,2,6,18,39,0.55,1119,2,6
273748,2,1,-73.958038,40.783237,-73.97551,40.760853,N,13,3,0.26,783,0,4,13,18,0.13,798,0,4
433988,2,1,-73.96946,40.785519,-73.989243,40.771748,N,12,36,0.09,756,5,5,12,47,0.35,767,5,5
1442481,1,1,-73.981743,40.736549,-73.998352,40.72644,N,18,44,0.17,1124,5,5,18,57,0.55,1137,5,5
1025834,2,1,-73.977913,40.752609,-73.975647,40.733139,N,22,51,0.25,1371,6,4,23,7,0.16,1387,6,4


In [104]:
X_train.columns

Index(['vendor_id', 'passenger_count', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'store_and_fwd_flag',
       'pickup_hour', 'pickup_minute', 'pickup_second',
       'pickup_minute_of_the_day', 'pickup_day_week', 'pickup_month',
       'dropoff_hour', 'dropoff_minute', 'dropoff_second',
       'dropoff_minute_of_the_day', 'dropoff_day_week', 'dropoff_month'],
      dtype='object')

In [105]:

trf1= Pipeline(steps=[
    ("ohe",OneHotEncoder(drop='first', dtype=np.int32))
])


trf2= Pipeline(steps=[
    ("SS", StandardScaler())
])

trf3= Pipeline(steps=[SelectKBest(score_func=chi2,k=15)])


In [106]:
store_and_fwd_flag=df[["store_and_fwd_flag"]].columns
store_and_fwd_flag

Index(['store_and_fwd_flag'], dtype='object')

In [107]:
preprocessor= ColumnTransformer(
    [
        ("ohe", trf1, store_and_fwd_flag),
        ("SS", trf2, Num_columns)
    ], remainder="passthrough"
)

In [108]:
X_train=preprocessor.fit_transform(X_train)
X_test=preprocessor.transform(X_test)

In [109]:
preprocessor.get_feature_names_out()

array(['ohe__store_and_fwd_flag_Y', 'SS__pickup_longitude',
       'SS__pickup_latitude', 'SS__dropoff_longitude',
       'SS__dropoff_latitude', 'SS__pickup_hour', 'SS__pickup_minute',
       'SS__pickup_second', 'SS__pickup_minute_of_the_day',
       'SS__pickup_day_week', 'SS__pickup_month', 'SS__dropoff_hour',
       'SS__dropoff_minute', 'SS__dropoff_second',
       'SS__dropoff_minute_of_the_day', 'SS__dropoff_day_week',
       'SS__dropoff_month', 'remainder__vendor_id',
       'remainder__passenger_count'], dtype=object)

In [120]:
def evaluate_model(true, predict):
    r2score= r2_score(true, predict)
    mse= mean_squared_error(true, predict)
    mae= mean_absolute_error(true, predict)
    rmse= np.sqrt(mean_squared_error(true, predict))

    return r2score, mse, mae, rmse




In [123]:
models= {
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(max_iter=1000),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet(),
    "BaggingRegressor": BaggingRegressor(base_estimator=LinearRegression(),
                                         n_estimators=500,
                                         max_samples=0.25,
                                         bootstrap=True,
                                         max_features=0.5,
                                         bootstrap_features=True,
                                         random_state=42),
    
    
}

In [118]:
model_list=[]
r2_list=[]
trained_model_list=[]


In [124]:
for i in range(len(list(models))):
    
    model=list(models.values())[i]
    pipe=Pipeline([
        ("model", model)
    ])
    pipe.fit(X_train,y_train)

    #Make Predictions
    y_pred=pipe.predict(X_test)

    #this is a validation(test) score
    r2score, mse, mae, rmse=evaluate_model(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score",r2score*100)

    r2_list.append(r2score)

    print('='*35)
    print('\n')

LinearRegression
Model Training Performance
RMSE: 3239.4287739074957
MAE: 616.2325779574609
R2 score 0.9247008407282764




  model = cd_fast.enet_coordinate_descent(


Lasso
Model Training Performance
RMSE: 3242.582142146731
MAE: 609.9851379429075
R2 score 0.7317205445395825


Ridge
Model Training Performance
RMSE: 3239.035427575189
MAE: 615.8983432802441
R2 score 0.9487597364491118


Elasticnet
Model Training Performance
RMSE: 3246.927032644574
MAE: 608.7894289225056
R2 score 0.4655137059683345






BaggingRegressor
Model Training Performance
RMSE: 3241.0602187903564
MAE: 606.9354974223371
R2 score 0.8248828772892347


