In [1]:
import os
import pickle
import argparse
import mlflow
import numpy as np
import pandas as pd
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from hyperopt.pyll import scope
from sklearn.ensemble import ExtraTreesRegressor  # Change: Using ExtraTreesRegressor
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction import DictVectorizer

In [2]:
# Configure MLflow: send tracking data to the server at 127.0.0.1:5000 and
# select the experiment that subsequent runs are recorded under. The recorded
# output of this cell shows MLflow creating the experiment when it does not
# exist yet. Keep set_experiment as the last expression so the Experiment
# object is displayed as the cell output.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("nyc-extra-trees-hyperopt-v1")

2023/12/20 21:55:06 INFO mlflow.tracking.fluent: Experiment with name 'nyc-extra-trees-hyperopt-v1' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/529822013619408985', creation_time=1703127306605, experiment_id='529822013619408985', last_update_time=1703127306605, lifecycle_stage='active', name='nyc-extra-trees-hyperopt-v1', tags={}>

In [3]:
def data_preprocess(filename):
    """Load a trip-record file and derive the duration and PU_DO columns.

    The file must provide the columns ``tpep_pickup_datetime``,
    ``tpep_dropoff_datetime``, ``PULocationID`` and ``DOLocationID``.

    Parameters
    ----------
    filename : str
        Path to the input file. The reader is chosen from the extension:
        ``.csv`` is read with ``pd.read_csv`` (and the two timestamp columns
        are parsed to datetimes), ``.parquet`` with ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        The rows whose trip duration lies between 1 and 60 minutes
        (inclusive), with the original row index retained and with:

        * ``duration`` - drop-off minus pick-up time, in minutes (float);
        * ``PULocationID`` / ``DOLocationID`` - converted to ``str``;
        * ``PU_DO`` - ``"<PULocationID>_<DOLocationID>"``.

    Raises
    ------
    ValueError
        If ``filename`` ends with neither ``.csv`` nor ``.parquet``.
    """
    # Check file format and read DataFrame accordingly
    if filename.endswith('.csv'):
        df = pd.read_csv(filename)

        # CSV carries the timestamps as text, so parse them explicitly.
        df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])
        df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
    elif filename.endswith('.parquet'):
        df = pd.read_parquet(filename)
    else:
        # Without this branch `df` would be unbound and the failure would
        # surface later as a confusing UnboundLocalError.
        raise ValueError(
            f"Unsupported file format for {filename!r}: expected a .csv or .parquet file"
        )

    # Calculate trip duration in minutes (vectorised instead of a per-row apply)
    trip_time = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    df['duration'] = trip_time.dt.total_seconds() / 60

    # Filter trips based on duration (between 1 and 60 minutes, inclusive).
    # Take an explicit copy so the column assignments below write to an
    # independent frame rather than to a slice of the unfiltered one.
    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()

    # Convert selected columns to string type for categorical representation
    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    # Create a new column 'PU_DO' by combining pickup and dropoff location IDs
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']

    # Return the processed DataFrame
    return df
