In [27]:
import os
import pickle
import argparse
import mlflow
import numpy as np
import pandas as pd
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from hyperopt.pyll import scope
from sklearn.ensemble import ExtraTreesRegressor  # Change: Using ExtraTreesRegressor
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction import DictVectorizer

In [28]:
# Input files, one month of yellow-taxi trip data per split
# (July = train, August = validation, September = test),
# plus the directory that pickled models are written to.
# All paths are relative to the notebook's working directory.
filename_jul_train = './dataset/yellow_tripdata_2023_07.parquet'
filename_aug_validate = './dataset/yellow_tripdata_2023_08.parquet'
filename_sept_test = './dataset/yellow_tripdata_2023_09.parquet'
model_output_filepath = './models'

In [13]:
# Send all runs to the MLflow tracking server at 127.0.0.1:5000
# (assumes that server is already running locally — TODO confirm how it is started).
mlflow.set_tracking_uri("http://127.0.0.1:5000")
# Select the experiment that subsequent runs are grouped under.
mlflow.set_experiment("nyc-extra-trees-hyperopt-v1")  # Change: Updated experiment name

2023/12/09 02:19:53 INFO mlflow.tracking.fluent: Experiment with name 'nyc-extra-trees-hyperopt-v1' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1702106393966, experiment_id='1', last_update_time=1702106393966, lifecycle_stage='active', name='nyc-extra-trees-hyperopt-v1', tags={}>

In [29]:
def data_preprocess(filename):
    """Load a taxi trip file and derive the columns used for modelling.

    Parameters
    ----------
    filename : str
        Path to a ``.csv`` or ``.parquet`` file that contains at least the
        columns ``tpep_pickup_datetime``, ``tpep_dropoff_datetime``,
        ``PULocationID`` and ``DOLocationID``.

    Returns
    -------
    pandas.DataFrame
        Only the trips lasting between 1 and 60 minutes (inclusive), with:
        ``duration`` in minutes, ``PULocationID``/``DOLocationID`` cast to
        strings, and ``PU_DO`` holding ``"<pickup>_<dropoff>"``.

    Raises
    ------
    ValueError
        If ``filename`` ends with neither ``.csv`` nor ``.parquet``.
    """
    if filename.endswith('.csv'):
        df = pd.read_csv(filename)
        # CSV stores timestamps as text, so parse them before doing arithmetic.
        df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])
        df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
    elif filename.endswith('.parquet'):
        df = pd.read_parquet(filename)
    else:
        # Fail with a clear message instead of an UnboundLocalError on `df` below.
        raise ValueError(
            f"Unsupported file format for {filename!r}: expected a .csv or .parquet file"
        )

    # Trip duration in minutes, computed column-wise rather than row by row.
    trip_time = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    df['duration'] = trip_time.dt.total_seconds() / 60

    # Keep trips between 1 and 60 minutes; copy so the column assignments
    # below operate on an independent frame, not a slice of the original.
    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()

    # Location IDs are identifiers, not quantities: treat them as strings.
    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    # Single combined pickup/dropoff feature.
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']

    return df


In [30]:
# Preprocess each monthly file and keep only a small leading sample so the
# rest of the notebook runs quickly (1000 rows for train/validation, 100 for test).
df_train = data_preprocess(filename_jul_train).head(1000)
df_validate = data_preprocess(filename_aug_validate).head(1000)
df_test = data_preprocess(filename_sept_test).head(100)

# Report the sample sizes. Jupyter only displays a bare expression when it is
# the last statement of a cell, so print explicitly.
print(len(df_train), len(df_validate), len(df_test))

# Independent features: the combined pickup/dropoff pair (categorical)
# and the trip distance (numerical).
categorical = ['PU_DO']
numerical = ['trip_distance']

# One vectorizer for all splits: it is fitted on the training records only
# and then reused, unchanged, to transform the validation and test records.
dv = DictVectorizer()

# Each split becomes a list of {column: value} records ...
train_dicts = df_train[categorical + numerical].to_dict(orient='records')
val_dicts = df_validate[categorical + numerical].to_dict(orient='records')
test_dicts = df_test[categorical + numerical].to_dict(orient='records')

# ... which the vectorizer turns into feature matrices.
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)
X_test = dv.transform(test_dicts)

# Dependent variable: trip duration, taken from each split as an array.
target = 'duration'
y_train, y_val, y_test = (
    frame[target].values for frame in (df_train, df_validate, df_test)
)

In [32]:
# Preview the feature and target columns of the training sample.
# NOTE: this rebinds `target` from the string 'duration' to a one-element list
# so it can be concatenated with the feature lists.
target = ['duration']
df_train.loc[:, categorical + numerical + target]

Unnamed: 0,PU_DO,trip_distance,duration
0,140_263,1.80,10.266667
1,163_163,2.31,20.316667
2,142_262,2.36,10.400000
3,68_24,4.36,14.950000
4,161_107,1.60,9.533333
...,...,...,...
1015,100_168,7.33,33.333333
1016,264_264,1.63,7.266667
1017,264_264,1.64,15.266667
1018,264_264,11.72,26.083333


In [19]:
# Baseline ExtraTreesRegressor: 100 trees, fixed seed for repeatable results.
et_model = ExtraTreesRegressor(n_estimators=100, random_state=42)

# Fit on the training matrix and durations.
et_model.fit(X_train, y_train)

# Predict durations for the validation set.
y_pred = et_model.predict(X_val)

# Validation RMSE. Taking the square root of the MSE explicitly avoids the
# `squared=False` argument, which scikit-learn deprecated and later removed.
rmse = np.sqrt(mean_squared_error(y_val, y_pred))


In [20]:
# Record this training run in MLflow: tag, hyperparameters, validation RMSE,
# the fitted estimator, and the pickled (vectorizer, model) bundle.
with mlflow.start_run():
    mlflow.set_tag("model", "ExtraTreesRegressor")
    mlflow.log_params({'n_estimators': 100, 'random_state': 42})
    # Add other hyperparameters as needed
    mlflow.log_metric("rmse", rmse)

    # Save the vectorizer together with the model so the same feature encoding
    # can be restored alongside the estimator. Create the output directory
    # first so a fresh checkout does not fail on open().
    os.makedirs(model_output_filepath, exist_ok=True)
    bundle_path = f"{model_output_filepath}/extra_trees_model_v1.pkl"
    with open(bundle_path, "wb") as f:
        pickle.dump((dv, et_model), f)

    # Log the fitted estimator as an MLflow model under the "et_model" path.
    mlflow.sklearn.log_model(et_model, "et_model")
    # Attach the pickle that was just written. The previous code pointed at
    # "extra_trees_model.pkl", a file this cell never creates.
    mlflow.log_artifact(bundle_path)

    print("Training sucessfully completed")

Training sucessfully completed


In [25]:
# Location of the pickled (vectorizer, model) bundle written by the training
# cell. Built from `model_output_filepath` so it resolves relative to the
# notebook's working directory instead of one user's absolute home path.
model_path = f"{model_output_filepath}/extra_trees_model_v1.pkl"


In [26]:
# Restore the pickled pair from `model_path`; this rebinds `dv` and defines `model`.
# Only unpickle files you produced yourself: pickle.load can execute arbitrary code.
with open(model_path, 'rb') as f_in:
    dv, model = pickle.load(f_in)

In [24]:
# Display the unpickled estimator to confirm it loaded.
model

In [52]:
# Non-null value count for every column of the test sample.
print(df_test.count())

VendorID                 100
tpep_pickup_datetime     100
tpep_dropoff_datetime    100
passenger_count          100
trip_distance            100
RatecodeID               100
store_and_fwd_flag       100
PULocationID             100
DOLocationID             100
payment_type             100
fare_amount              100
extra                    100
mta_tax                  100
tip_amount               100
tolls_amount             100
improvement_surcharge    100
total_amount             100
congestion_surcharge     100
Airport_fee              100
duration                 100
PU_DO                    100
dtype: int64


In [53]:
# Load the trained model from the pickle written by the training cell.
# That file is "extra_trees_model_v1.pkl" and holds a (vectorizer, model)
# tuple, so unpack it rather than treating the whole object as the model.
with open(f"{model_output_filepath}/extra_trees_model_v1.pkl", "rb") as f:
    loaded_dv, loaded_model = pickle.load(f)


In [54]:
# Predict trip durations for the test feature matrix with the reloaded model.
y_test_pred = loaded_model.predict(X_test)

In [55]:
# Show the predicted durations (minutes) for the test sample.
print(y_test_pred)

[ 4.81016667  9.8025      5.60216667 12.67816667 19.26716667 26.50233333
 23.81616667 16.04633333 11.01166667  5.721      29.1455      7.02366667
 15.10125    10.83966667  5.48666667 24.45716667 10.43733333 13.01566667
 11.65816667 15.98916667  5.29233333  6.60466667  6.74116667  6.196
 15.50116667  9.00533333 16.9425      6.50183333  8.84233333  8.34766667
 11.936       6.58633333  4.478       4.59533333  4.00322222  8.811
 12.93533333  4.56033333  5.98666667 16.62366667 36.652       6.52433333
  5.59566667  6.77466667 14.9685      4.62877778 15.81216667  8.631
 24.644      16.9545      5.59566667 11.29333333 19.84516667 11.08933333
  5.1375     15.81216667 14.58316667  9.62216667  2.28383333  5.28616667
  9.62216667 10.06583333  6.64866667  5.59566667 25.10066667  3.53333333
 11.15266667  4.81016667  4.81016667 13.02866667  5.76066667 11.4265
 37.179      11.5945      4.851      15.17333333 11.07583333  5.132
 14.85633333  5.18466667  6.64866667  4.59388889 12.0555      5.71933333
  

In [57]:
# RMSE on the held-out test sample. The square root is taken explicitly
# because scikit-learn deprecated and later removed the `squared=False` argument.
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
print(rmse_test)

6.056218767811148
