In [1]:
import pandas as pd
import mlflow
import seaborn
import matplotlib.pyplot as plt

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

In [49]:
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("yellow-taxi-nyc-artifact")

<Experiment: artifact_location='/Users/sohaibarshid/Desktop/MLOps-Zoomcamp-learning/02-tracking/artifacts', creation_time=1716981348011, experiment_id='3', last_update_time=1716981348011, lifecycle_stage='active', name='yellow-taxi-nyc-artifact', tags={}>

In [3]:
taxi_data_1 = pd.read_parquet('../01-intro/data/yellow_tripdata_2024-01.parquet')
taxi_data_2 = pd.read_parquet('../01-intro/data/yellow_tripdata_2024-02.parquet')

In [4]:
print('Number of columns in the Jan data {}'.format(len(taxi_data_1.columns)))

Number of columns in the Jan data 19


### Homework Question 1: Number of Columns

In [5]:
"There are [{}] columns in the yellow trip data for the month of January.".format(len(taxi_data_1.columns))

'There are [19] columns in the yellow trip data for the month of January.'

In [6]:
'''
Counting the non-NA cells for each column or row
'''
print("Total number of rows {}".format(len(taxi_data_1)))
print()
print('Number of non zero values in each column')
print(taxi_data_1.count())

Total number of rows 2964624

Number of non zero values in each column
VendorID                 2964624
tpep_pickup_datetime     2964624
tpep_dropoff_datetime    2964624
passenger_count          2824462
trip_distance            2964624
RatecodeID               2824462
store_and_fwd_flag       2824462
PULocationID             2964624
DOLocationID             2964624
payment_type             2964624
fare_amount              2964624
extra                    2964624
mta_tax                  2964624
tip_amount               2964624
tolls_amount             2964624
improvement_surcharge    2964624
total_amount             2964624
congestion_surcharge     2824462
Airport_fee              2824462
dtype: int64


### Homework Question 2: Computing Duration

In [7]:
# Convert the columns to datetime if they are not already
taxi_data_1['tpep_pickup_datetime'] = pd.to_datetime(taxi_data_1['tpep_pickup_datetime'])
taxi_data_1['tpep_dropoff_datetime'] = pd.to_datetime(taxi_data_1['tpep_dropoff_datetime'])

# Calculate the duration in minutes
taxi_data_1['duration_in_minutes'] = (taxi_data_1['tpep_dropoff_datetime'] - taxi_data_1['tpep_pickup_datetime']).dt.total_seconds() / 60

# Display the DataFrame with the new duration column
print(taxi_data_1[['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'duration_in_minutes']].head())

  tpep_pickup_datetime tpep_dropoff_datetime  duration_in_minutes
0  2024-01-01 00:57:55   2024-01-01 01:17:43            19.800000
1  2024-01-01 00:03:00   2024-01-01 00:09:36             6.600000
2  2024-01-01 00:17:06   2024-01-01 00:35:01            17.916667
3  2024-01-01 00:36:38   2024-01-01 00:44:56             8.300000
4  2024-01-01 00:46:51   2024-01-01 00:52:57             6.100000


In [8]:
taxi_data_1.duration_in_minutes.describe()["std"]

34.851053592212814

### Homework Question 3: Dropping Outliers

In [9]:
taxi_data_1.duration_in_minutes.describe()

count    2.964624e+06
mean     1.561295e+01
std      3.485105e+01
min     -1.356667e+01
25%      7.150000e+00
50%      1.163333e+01
75%      1.868333e+01
max      9.455400e+03
Name: duration_in_minutes, dtype: float64

In [10]:
# Remove outliers (keep records where the duration was between 1 and 60 minutes inclusive)
taxi_filtered = taxi_data_1[(taxi_data_1['duration_in_minutes'] >= 1) & (taxi_data_1['duration_in_minutes'] <= 60)]

In [11]:
print("The number of records left after filtering {}%".format(round(len(taxi_filtered) / len(taxi_data_1) * 100), 4))

The number of records left after filtering 98%


### Homework Question 4: One Hot Encoding

In [12]:
taxi_data_1.PULocationID.dtype

dtype('int32')

In [13]:
print(len(taxi_data_1.PULocationID.unique()))
print(len(taxi_data_1.DOLocationID.unique()))

260
261


In order to do one-hot encoding on the pickup/dropoff location ID, we need to first convert the variable into a string and then proceed with the next steps

In [14]:
def data_transformations(df: pd.DataFrame, datatype: str, vectorizer: DictVectorizer):
    # Convert pickup and dropoff location IDs to strings
    df['PULocationID'] = df['PULocationID'].astype(str)
    df['DOLocationID'] = df['DOLocationID'].astype(str)

    # Create a list of dictionaries from the DataFrame
    records = df[['PULocationID', 'DOLocationID']].to_dict(orient='records')
    
    
    if datatype == 'train':
        X = vectorizer.fit_transform(records)
    else:
        X = vectorizer.transform(records)
        
    # Print the feature names and the feature matrix
    print('The dimensionality of the DictVectorizer for the {} data is {}'.format(datatype, X.shape))
    return X    

In [15]:
# Fit a dictionary vectorizer
vectorizer = DictVectorizer(sparse=True)
X = data_transformations(taxi_filtered, 'train', vectorizer)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['PULocationID'] = df['PULocationID'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['DOLocationID'] = df['DOLocationID'].astype(str)


The dimensionality of the DictVectorizer for the train data is (2898906, 518)


### Homework Question 5: Training a Model. RMSE on train

In [16]:
# label is obviously the duration of the trip in minutes (we created this in the previous steps)
y = taxi_filtered['duration_in_minutes'].values

In [17]:
print(y.shape, X.shape)

(2898906,) (2898906, 518)


In [21]:
from sklearn.linear_model import Lasso, Ridge
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
import xgboost as xgb

In [22]:
# Applying the same data transformations on the 'taxi dataset from feb 2023'
# Convert the columns to datetime if they are not already
taxi_data_2['tpep_pickup_datetime'] = pd.to_datetime(taxi_data_2['tpep_pickup_datetime'])
taxi_data_2['tpep_dropoff_datetime'] = pd.to_datetime(taxi_data_2['tpep_dropoff_datetime'])

# Calculate the duration in minutes
taxi_data_2['duration_in_minutes'] = (taxi_data_2['tpep_dropoff_datetime'] - taxi_data_2['tpep_pickup_datetime']).dt.total_seconds() / 60
# Remove outliers (keep records where the duration was between 1 and 60 minutes inclusive)
taxi_2_filtered = taxi_data_2[(taxi_data_2['duration_in_minutes'] >= 1) & (taxi_data_2['duration_in_minutes'] <= 60)]

X_test = data_transformations(taxi_2_filtered, 'test', vectorizer)
y_test = taxi_2_filtered.duration_in_minutes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['PULocationID'] = df['PULocationID'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['DOLocationID'] = df['DOLocationID'].astype(str)


The dimensionality of the DictVectorizer for the test data is (2938060, 518)


In [23]:
train = xgb.DMatrix(X, label=y)
valid = xgb.DMatrix(X_test, label=y_test)

In [26]:
def objective(params):
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=100,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50
        )
        y_pred = booster.predict(valid)
        rmse = mean_squared_error(y_test, y_pred, squared=False)
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}

In [30]:
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 20, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'objective': 'reg:linear',
    'seed': 42
}

best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=10,
    trials=Trials()
)

[0]	validation-rmse:8.67855                                                                                                                                                                                         
[1]	validation-rmse:8.30225                                                                                                                                                                                         
  0%|                                                                                                                                                                        | 0/10 [00:00<?, ?trial/s, best loss=?]




[2]	validation-rmse:8.17295                                                                                                                                                                                         
[3]	validation-rmse:8.05487                                                                                                                                                                                         
[4]	validation-rmse:7.93329                                                                                                                                                                                         
[5]	validation-rmse:7.84536                                                                                                                                                                                         
[6]	validation-rmse:7.75091                                                                                                                         




[1]	validation-rmse:8.03216                                                                                                                                                                                         
[2]	validation-rmse:7.78160                                                                                                                                                                                         
[3]	validation-rmse:7.52678                                                                                                                                                                                         
[4]	validation-rmse:7.40833                                                                                                                                                                                         
[5]	validation-rmse:7.31798                                                                                                                         




[1]	validation-rmse:9.30936                                                                                                                                                                                         
[2]	validation-rmse:9.00912                                                                                                                                                                                         
[3]	validation-rmse:8.78815                                                                                                                                                                                         
[4]	validation-rmse:8.62052                                                                                                                                                                                         
[5]	validation-rmse:8.47332                                                                                                                         




[1]	validation-rmse:9.76823                                                                                                                                                                                         
[2]	validation-rmse:9.57046                                                                                                                                                                                         
[3]	validation-rmse:9.38663                                                                                                                                                                                         
[4]	validation-rmse:9.22506                                                                                                                                                                                         
[5]	validation-rmse:9.08836                                                                                                                         




[1]	validation-rmse:8.61547                                                                                                                                                                                         
[2]	validation-rmse:8.30859                                                                                                                                                                                         
[3]	validation-rmse:8.10390                                                                                                                                                                                         
[4]	validation-rmse:7.99593                                                                                                                                                                                         
[5]	validation-rmse:7.79291                                                                                                                         




[2]	validation-rmse:8.21588                                                                                                                                                                                         
[3]	validation-rmse:8.14739                                                                                                                                                                                         
[4]	validation-rmse:8.07578                                                                                                                                                                                         
[5]	validation-rmse:8.03031                                                                                                                                                                                         
[6]	validation-rmse:7.95248                                                                                                                         




[1]	validation-rmse:8.01909                                                                                                                                                                                         
[2]	validation-rmse:7.79031                                                                                                                                                                                         
[3]	validation-rmse:7.51896                                                                                                                                                                                         
[4]	validation-rmse:7.43064                                                                                                                                                                                         
[5]	validation-rmse:7.20353                                                                                                                         




[1]	validation-rmse:8.04847                                                                                                                                                                                         
[2]	validation-rmse:7.81150                                                                                                                                                                                         
[3]	validation-rmse:7.59574                                                                                                                                                                                         
[4]	validation-rmse:7.47599                                                                                                                                                                                         
[5]	validation-rmse:7.39759                                                                                                                         




[2]	validation-rmse:9.80600                                                                                                                                                                                         
[3]	validation-rmse:9.68280                                                                                                                                                                                         
[4]	validation-rmse:9.57054                                                                                                                                                                                         
[5]	validation-rmse:9.46847                                                                                                                                                                                         
[6]	validation-rmse:9.37544                                                                                                                         




[1]	validation-rmse:9.68558                                                                                                                                                                                         
[2]	validation-rmse:9.45510                                                                                                                                                                                         
[3]	validation-rmse:9.26463                                                                                                                                                                                         
[4]	validation-rmse:9.09437                                                                                                                                                                                         
[5]	validation-rmse:8.95267                                                                                                                         

In [46]:
mlflow.xgboost.autolog(disable=True)
import pickle
import tempfile

temp_dir = tempfile.mkdtemp()
preprocessor_path = f"{temp_dir}/preprocessor.b"

In [57]:
with mlflow.start_run():
    
    train = xgb.DMatrix(X, label=y)
    valid = xgb.DMatrix(X_test, label=y_test)

    best_params = {
        'learning_rate': 0.92,
        'max_depth': 18,
        'min_child_weight': 1.20,
        'objective': 'reg:linear',
        'reg_alpha': 0.007,
        'reg_lambda': 0.003,
        'seed': 42
    }

    mlflow.log_params(best_params)

    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=100,
        evals=[(valid, 'validation')],
        early_stopping_rounds=50
    )

    y_pred = booster.predict(valid)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    mlflow.log_metric("rmse", rmse)

    with open("models/preprocessor.b", "wb") as f_out:
        pickle.dump(vectorizer, f_out)
    mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

    mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")

[0]	validation-rmse:8.16939




[1]	validation-rmse:7.91863
[2]	validation-rmse:7.52303
[3]	validation-rmse:7.41591
[4]	validation-rmse:7.31637
[5]	validation-rmse:7.18933
[6]	validation-rmse:7.12456
[7]	validation-rmse:6.95096
[8]	validation-rmse:6.73493
[9]	validation-rmse:6.70369
[10]	validation-rmse:6.66031
[11]	validation-rmse:6.63719
[12]	validation-rmse:6.61687
[13]	validation-rmse:6.47887
[14]	validation-rmse:6.45333
[15]	validation-rmse:6.41888
[16]	validation-rmse:6.40409
[17]	validation-rmse:6.35825
[18]	validation-rmse:6.26222
[19]	validation-rmse:6.23335
[20]	validation-rmse:6.22013
[21]	validation-rmse:6.17762
[22]	validation-rmse:6.07482
[23]	validation-rmse:6.01231
[24]	validation-rmse:5.99922
[25]	validation-rmse:5.98920
[26]	validation-rmse:5.95301
[27]	validation-rmse:5.94783
[28]	validation-rmse:5.89962
[29]	validation-rmse:5.88239
[30]	validation-rmse:5.86725
[31]	validation-rmse:5.83676
[32]	validation-rmse:5.80953
[33]	validation-rmse:5.77331
[34]	validation-rmse:5.74959
[35]	validation-rmse:5.



In [59]:
from mlflow.models import infer_signature

signature = infer_signature(X_test, y_pred)

mlflow.xgboost.log_model(
        xgb_model=booster,
        artifact_path="xgboost-model",
        signature=signature,
        registered_model_name="xgb-learn-model-booster",
    )

Successfully registered model 'xgb-learn-model-booster'.
Created version '1' of model 'xgb-learn-model-booster'.


<mlflow.models.model.ModelInfo at 0x3b485a2d0>

In [None]:
!pip install --upgrade setuptools

In [32]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='distutils')

In [28]:
X = X.todense() 

### Homework Question 6: Evaluating the model (Feb). RMSE on validation

In [19]:
# Applying the same data transformations on the 'taxi dataset from feb 2023'
# Convert the columns to datetime if they are not already
taxi_data_2['tpep_pickup_datetime'] = pd.to_datetime(taxi_data_2['tpep_pickup_datetime'])
taxi_data_2['tpep_dropoff_datetime'] = pd.to_datetime(taxi_data_2['tpep_dropoff_datetime'])

# Calculate the duration in minutes
taxi_data_2['duration_in_minutes'] = (taxi_data_2['tpep_dropoff_datetime'] - taxi_data_2['tpep_pickup_datetime']).dt.total_seconds() / 60

In [20]:
taxi_data_2.duration_in_minutes.describe()

count    3.007526e+06
mean     1.598243e+01
std      3.428955e+01
min     -5.206667e+01
25%      7.366667e+00
50%      1.200000e+01
75%      1.920000e+01
max      5.671983e+03
Name: duration_in_minutes, dtype: float64

In [21]:
# Remove outliers (keep records where the duration was between 1 and 60 minutes inclusive)
taxi_2_filtered = taxi_data_2[(taxi_data_2['duration_in_minutes'] >= 1) & (taxi_data_2['duration_in_minutes'] <= 60)]

In [22]:
print("The number of records left after filtering {}%".format(round(len(taxi_2_filtered) / len(taxi_data_2) * 100), 4))

The number of records left after filtering 98%


In [23]:
X_test = data_transformations(taxi_2_filtered, 'test', vectorizer)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['PULocationID'] = df['PULocationID'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['DOLocationID'] = df['DOLocationID'].astype(str)


The dimensionality of the DictVectorizer for the test data is (2938060, 518)


In [24]:
y_test = taxi_2_filtered.duration_in_minutes
y_test.shape

(2938060,)

In [25]:
y_pred = model.predict(X_test)

# Calculate the RMSE
mse = mean_squared_error(y_test, y_pred, squared=False)

print("MSE on validation (feb) data:", mse)

MSE on validation (feb) data: 8.123388013175115


In [26]:
import pickle

In [27]:
with open("../models/lin_reg.bin", "wb") as file:
    pickle.dump((vectorizer, model), file)