In [6]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
import mlflow
import mlflow.sklearn
import pickle
from mlflow.models.signature import infer_signature

In [2]:
# Record the installed scikit-learn version for reproducibility.
# The other imports in this notebook use `from sklearn.<module> import ...` and
# `import mlflow.sklearn`, neither of which binds the bare name `sklearn`,
# so it is imported explicitly here to keep this cell from raising NameError
# on a fresh kernel.
import sklearn

sklearn.__version__

'1.6.1'

In [None]:
# Read March 2023 Yellow taxi trip data.
# `pd` comes from the import cell at the top of the notebook; it is not
# re-imported here so that all dependencies stay declared in one place.
df = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet')

# Print the number of rows
print(f"Number of records loaded: {len(df)}")


In [2]:
def read_dataframe(filename):
    """Load a taxi-trip parquet file and prepare it for modelling.

    Parameters
    ----------
    filename
        Passed straight to ``pd.read_parquet`` (a local path or a URL).

    Returns
    -------
    pandas.DataFrame
        Only the trips whose duration lies between 1 and 60 minutes
        (both inclusive), with an added ``duration`` column in minutes and
        the ``PULocationID`` / ``DOLocationID`` columns converted to strings.
    """
    df = pd.read_parquet(filename)

    # Trip duration in minutes, derived from the pickup/drop-off timestamps.
    df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df.duration = df.duration.dt.total_seconds() / 60

    # Keep trips of 1-60 minutes. The explicit .copy() detaches the filtered
    # frame from the original, so the column assignment below is not a write
    # into a slice (which triggers SettingWithCopyWarning on pandas versions
    # that emit it).
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    # Location IDs are cast to strings so they are handled as categories
    # rather than as numeric values.
    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

# Fetch the March 2023 yellow-taxi trips and run them through
# read_dataframe, which adds the duration column and filters the rows.
url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet'
df = read_dataframe(filename=url)

# Report how many trips survived the cleaning step.
print(f"Cleaned data size: {len(df)}")


Cleaned data size: 3316216


In [3]:
# Build the training matrix: each trip becomes a dict of its pickup and
# drop-off location IDs, which the DictVectorizer then encodes.
categorical = ['PULocationID', 'DOLocationID']
train_dicts = df[categorical].to_dict(orient='records')

dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

# Regression target: the trip duration column as a NumPy array.
y_train = df.duration.values


In [8]:
# Fit an ordinary least-squares model on the vectorized features;
# `fit` returns the estimator itself, so it can be chained.
lr = LinearRegression().fit(X_train, y_train)

# Show the fitted intercept, rounded to two decimals.
print(f"Intercept: {lr.intercept_:.2f}")


Intercept: 24.77


In [9]:
mlflow.set_tracking_uri("http://localhost:5000")  # Adjust URI if needed

# A small slice of the training matrix plus an inferred signature are stored
# with the model so its expected inputs/outputs are recorded alongside it.
input_example = X_train[:5]
signature = infer_signature(X_train, y_train)

# Persist the fitted DictVectorizer before logging it. Nothing earlier in the
# notebook writes this file, so without this step `log_artifact` only works
# if a stale copy happens to be in the working directory.
vectorizer_path = "dict_vectorizer.pkl"
with open(vectorizer_path, "wb") as f_out:
    pickle.dump(dv, f_out)

# Log and register the model, then attach the vectorizer to the same run.
with mlflow.start_run():
    mlflow.sklearn.log_model(
        lr,
        artifact_path="model",
        registered_model_name="lin_reg_yellow_taxi",
        signature=signature,
        input_example=input_example
    )
    mlflow.log_artifact(vectorizer_path)


Registered model 'lin_reg_yellow_taxi' already exists. Creating a new version of this model...
2025/06/08 17:45:50 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: lin_reg_yellow_taxi, version 2


🏃 View run unleashed-boar-647 at: http://localhost:5000/#/experiments/0/runs/93bff16307ab4f008e09fbc1367f3554
🧪 View experiment at: http://localhost:5000/#/experiments/0


Created version '2' of model 'lin_reg_yellow_taxi'.
