In [10]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
import mlflow
import datetime
import pytz
import glob
from sklearn.pipeline import Pipeline
random_seed = 99
thailand_timezone = pytz.timezone('Asia/Bangkok')

mlflow.set_tracking_uri('http://127.0.0.1:50000')
mlflow.set_experiment('NYC-TAXI_TRIP-DURATION-PREDICTION')

<Experiment: artifact_location='/Users/bossthanison/Documents/Portfoilio/project/NYC-Taxi-Trip-Duration-Prediction/artifacts/1', creation_time=1687501755554, experiment_id='1', last_update_time=1687501755554, lifecycle_stage='active', name='NYC-TAXI_TRIP-DURATION-PREDICTION', tags={}>

In [2]:
file_path = "../data/raw/2020/*.parquet"
df = pd.concat(map(pd.read_parquet, glob.glob(file_path)))
df['duration'] = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
df['duration'] = df['duration'].apply(lambda x: round(x.total_seconds() / 60, 2))
df[['PULocationID', 'DOLocationID']] = df[['PULocationID', 'DOLocationID']].astype(str)
df = df[(df['duration'] >= 0) & (df['duration'] <= 70)]
print(df.shape)
df.head()

(1717365, 21)


Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,...,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge,duration
0,2,2020-03-01 00:20:18,2020-03-01 00:45:29,N,1.0,41,13,1.0,8.24,26.5,...,0.5,7.64,0.0,,0.3,38.19,1.0,1.0,2.75,25.18
1,2,2020-03-01 00:15:42,2020-03-01 00:44:36,N,1.0,181,107,1.0,4.87,21.0,...,0.5,0.0,0.0,,0.3,25.05,2.0,1.0,2.75,28.9
2,2,2020-03-01 00:36:18,2020-03-01 00:41:03,N,1.0,41,166,1.0,0.69,5.0,...,0.5,0.0,0.0,,0.3,6.3,2.0,1.0,0.0,4.75
3,1,2020-03-01 00:22:14,2020-03-01 00:32:57,N,1.0,129,7,1.0,1.8,9.0,...,0.5,0.0,0.0,,0.3,10.3,2.0,1.0,0.0,10.72
4,2,2020-03-01 00:07:22,2020-03-01 00:14:16,N,1.0,74,152,1.0,1.25,7.0,...,0.5,2.49,0.0,,0.3,10.79,1.0,1.0,0.0,6.9


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1717365 entries, 0 to 63109
Data columns (total 21 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   lpep_pickup_datetime   datetime64[ns]
 2   lpep_dropoff_datetime  datetime64[ns]
 3   store_and_fwd_flag     object        
 4   RatecodeID             float64       
 5   PULocationID           object        
 6   DOLocationID           object        
 7   passenger_count        float64       
 8   trip_distance          float64       
 9   fare_amount            float64       
 10  extra                  float64       
 11  mta_tax                float64       
 12  tip_amount             float64       
 13  tolls_amount           float64       
 14  ehail_fee              object        
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  payment_type           float64       
 18  trip_type              float6

In [4]:
X = df[['PULocationID', 'DOLocationID', 'trip_distance']]
y = df['duration']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_seed)
print(f"Train data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")

Train data shape: (1373892, 3)
Test data shape: (343473, 3)


In [11]:
# Define the regression models
models = [
    ("LinearRegression", LinearRegression()),
    ("Ridge", Ridge(random_state=random_seed)),
    ("Lasso", Lasso(random_state=random_seed)),
    ("ElasticNet", ElasticNet(random_state=random_seed)),
    ("DecisionTreeRegressor", DecisionTreeRegressor(random_state=random_seed)),
    ("RandomForestRegressor", RandomForestRegressor(random_state=random_seed)),
    ("XGBRegressor", XGBRegressor(random_state=random_seed)),
    ("LGBMRegressor", LGBMRegressor(random_state=random_seed)),
    ("CatBoostRegressor", CatBoostRegressor(random_state=random_seed)),
    ("SVR", SVR()),
    ("KNeighborsRegressor", KNeighborsRegressor()),
    ("AdaBoostRegressor", AdaBoostRegressor(random_state=random_seed)),
    ("GradientBoostingRegressor", GradientBoostingRegressor(random_state=random_seed)),
]

for name, model in models:
    current_datetime = datetime.datetime.now(thailand_timezone).strftime("%Y%m%dT%H%M%S")
    run_name = name + current_datetime
    with mlflow.start_run(run_name=run_name) as run:
        params = model.get_params()
        pipeline = Pipeline([('convert_to_dict', FunctionTransformer(lambda x: x.to_dict(orient='records'))),
                             ('vectorizer', DictVectorizer()),
                             (name, model)])
        pipeline.fit(X_train, y_train)
        train_predictions = pipeline.predict(X_train)
        train_rmse = mean_squared_error(y_train, train_predictions, squared=False)

        test_predictions = pipeline.predict(X_test)
        test_rmse = mean_squared_error(y_test, test_predictions, squared=False)

        mlflow.set_tags({"stage": "model_selection", "model": name})
        mlflow.log_params(params)
        mlflow.log_metric("Train RMSE", train_rmse)
        mlflow.log_metric("Test RMSE", test_rmse)

        final_model = pipeline.fit(X, y)
        mlflow.sklearn.log_model(final_model, "model")