In [1]:
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import root_mean_squared_error

In [2]:
# Backend store for run metadata (metrics, parameters, and tags): a SQLite
# database addressed by this URI.
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"

# Named experiment under which the runs started in this notebook are grouped.
EXPERIMENT_NAME = "taxi-duration-experiment"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

# Kept as the last expression so the cell displays the active Experiment.
mlflow.set_experiment(EXPERIMENT_NAME)

2026/01/04 16:26:26 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2026/01/04 16:26:26 INFO mlflow.store.db.utils: Updating database tables
2026/01/04 16:26:26 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2026/01/04 16:26:26 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2026/01/04 16:26:26 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2026/01/04 16:26:26 INFO alembic.runtime.migration: Will assume non-transactional DDL.


<Experiment: artifact_location='/workspaces/everything-about-mlops/02-experiments-tracking/mlruns/1', creation_time=1767531867279, experiment_id='1', last_update_time=1767531867279, lifecycle_stage='active', name='taxi-duration-experiment', tags={}>

In [3]:
def read_dataset(url):
    """Load a trip-record parquet file and prepare it for duration modelling.

    Parameters
    ----------
    url : str
        Location handed unchanged to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        The rows whose ``duration`` (drop-off minus pick-up, in minutes) lies
        in the closed interval [1, 60], with an added ``duration`` column and
        with ``PULocationID`` / ``DOLocationID`` cast to ``str``.
    """
    df = pd.read_parquet(url)

    df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
    df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])

    # Vectorised timedelta -> minutes; avoids a Python-level call per row.
    trip_time = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    df['duration'] = trip_time.dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the original frame, so the
    # column assignment below writes to an independent DataFrame instead of
    # raising SettingWithCopyWarning on a slice.
    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']

    # Cast the location IDs to strings so they are no longer numeric values.
    df[categorical] = df[categorical].astype(str)

    return df

In [4]:
# Source files for the two splits: the first is used for fitting, the second
# for validation.
TRAIN_URL = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet"
VAL_URL = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet"

df_train = read_dataset(TRAIN_URL)
df_val = read_dataset(VAL_URL)

In [5]:
# Columns fed to the model as features.
categorical = ['PULocationID', 'DOLocationID']

# One {column: value} record per trip, for each split.
train_dicts = df_train[categorical].to_dict(orient='records')
val_dicts = df_val[categorical].to_dict(orient='records')

# The vectorizer is fitted on the training records only; the validation
# records are transformed with that same fitted mapping.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [6]:
# Targets for both splits: trip duration in minutes.
y_train = df_train['duration'].values
y_val = df_val['duration'].values

# Baseline: ordinary least squares on the vectorized features.
model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_val)

# scikit-learn metrics take (y_true, y_pred) in that order. RMSE itself is
# symmetric, so the value is unchanged, but the documented order keeps the
# call correct if the metric or its options are ever swapped.
rmse = root_mean_squared_error(y_val, y_pred)
print(rmse)

7.811821356741418


In [7]:
# Persist the fitted vectorizer together with the model as a single tuple.
model_path = Path('models/line_reg.bin')

# open(..., 'wb') does not create missing parent directories, so make sure
# the target directory exists before writing.
model_path.parent.mkdir(parents=True, exist_ok=True)

with open(model_path, 'wb') as f:
    pickle.dump((dv, model), f)

In [None]:
# Train a Lasso model inside an MLflow run so its tag, parameters and metric
# are recorded together.
with mlflow.start_run():
    mlflow.set_tag("developer", "shri")

    # Record which data files this run was trained and evaluated on.
    mlflow.log_param("train-data", "yellow_tripdata_2023-01")
    mlflow.log_param("test-data", "yellow_tripdata_2023-02")

    alpha = 0.1
    mlflow.log_param("alpha", alpha)

    lasso = Lasso(alpha=alpha)
    lasso.fit(X_train, y_train)

    # NOTE: y_pred and rmse rebind the names set by the linear-regression cell.
    y_pred = lasso.predict(X_val)

    # scikit-learn metrics take (y_true, y_pred) in that order.
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

: 

In [None]:
import xgboost