In [1]:
import pandas as pd

In [3]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

In [4]:
def load_and_preprocess_data(url, categorical):
    """Read a trip parquet file and prepare it for modelling.

    Steps performed:
      1. Read the parquet file at ``url`` with ``pd.read_parquet``.
      2. Add a ``duration`` column: drop-off minus pick-up time, in minutes.
      3. Keep only rows whose duration is between 1 and 60 minutes (inclusive).
      4. Cast the ``categorical`` columns to ``str``.

    Parameters
    ----------
    url : str
        Location passed straight to ``pd.read_parquet``.
    categorical : list of str
        Names of the columns to convert to strings.

    Returns
    -------
    pandas.DataFrame
        An independent (copied) frame containing the filtered rows, the
        ``duration`` column and string-typed categorical columns.
    """
    df = pd.read_parquet(url)

    # Vectorised timedelta -> minutes conversion; avoids a Python-level call
    # per row and also yields a float column when the frame is empty.
    trip_time = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df["duration"] = trip_time.dt.total_seconds() / 60

    # Take an explicit copy of the filtered rows so the column assignment
    # below writes to an independent frame rather than to a slice of `df`.
    in_range = (df.duration >= 1) & (df.duration <= 60)
    df = df[in_range].copy()

    df[categorical] = df[categorical].astype(str)
    return df

In [5]:
def prepare_features(df, categorical):
    """Turn the selected columns of ``df`` into a list of per-row dicts.

    Each returned dict maps a column name from ``categorical`` to that
    row's value, i.e. the ``orient="records"`` layout of ``to_dict``.
    """
    feature_frame = df[categorical]
    return feature_frame.to_dict(orient="records")

In [6]:
def train_model(x_train, y_train):
    """Fit a ``LinearRegression`` on the given features and targets.

    Returns the fitted estimator.
    """
    # `fit` hands back the estimator itself, so construct-and-fit can be
    # a single expression.
    return LinearRegression().fit(x_train, y_train)

In [7]:
def evaluate_model(model, x, y):
    """Return the root-mean-squared error of ``model`` on ``(x, y)``.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(x)``.
    x : array-like or sparse matrix
        Features handed to ``model.predict``.
    y : array-like
        True target values.

    Returns
    -------
    float
        RMSE between ``y`` and the model's predictions. With several
        target columns, the per-column RMSE values are averaged.
    """
    y_pred = model.predict(x)

    # The `squared=False` keyword of `mean_squared_error` was deprecated and
    # later removed from scikit-learn, so take the square root ourselves.
    # Requesting per-output MSE first keeps the "mean of per-output RMSE"
    # behaviour that `squared=False` provided.
    per_output_mse = mean_squared_error(y, y_pred, multioutput="raw_values")
    rmse = (per_output_mse ** 0.5).mean()
    return rmse

In [8]:
# Pipeline configuration: January 2023 trips are used for fitting,
# February 2023 trips for validation.
train_url = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet"
val_url = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet"
categorical = ["PULocationID", "DOLocationID"]
target = "duration"

# --- Training split -------------------------------------------------------
train_df = load_and_preprocess_data(train_url, categorical)
train_dicts = prepare_features(train_df, categorical)
y_train = train_df[target].values

# The vectorizer is fitted on the training records only; the same fitted
# instance is reused for the validation records further down.
dv = DictVectorizer()
x_train = dv.fit_transform(train_dicts)

model = train_model(x_train, y_train)

train_rmse = evaluate_model(model, x_train, y_train)
print(f"Training RMSE: {train_rmse}")

# --- Validation split -----------------------------------------------------
val_df = load_and_preprocess_data(val_url, categorical)
val_dicts = prepare_features(val_df, categorical)
y_val = val_df[target].values

# transform (not fit_transform): reuse the feature mapping learned above.
x_val = dv.transform(val_dicts)

val_rmse = evaluate_model(model, x_val, y_val)
print(f"Validation RMSE: {val_rmse}")

Training RMSE: 7.6492610279057605
Validation RMSE: 7.81183265470218
