In [None]:
import pickle

import altair as alt
import polars as pl
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.metrics import root_mean_squared_error

In [None]:
# Load one month of trip records from parquet and derive the trip duration,
# which later serves as the regression target.
data_path = "../data/green_tripdata_2021-01.parquet"
df = pl.read_parquet(data_path)

# Column groups used as model features.
categorical = ["PULocationID", "DOLocationID"]
numerical = ["trip_distance"]

# Trip length: drop-off timestamp minus pickup timestamp.
trip_duration = pl.col("lpep_dropoff_datetime") - pl.col("lpep_pickup_datetime")

# Keep only trips lasting between 1 and 60 minutes (both bounds inclusive).
duration_in_range = (pl.col("duration") >= pl.duration(minutes=1)) & (
    pl.col("duration") <= pl.duration(minutes=60)
)

df = (
    df.with_columns(
        # Location IDs are cast to strings so they are handled as categories
        # rather than as numeric magnitudes.
        pl.col(categorical).cast(pl.String),
        trip_duration.alias("duration"),
    )
    .filter(duration_in_range)
    .with_columns(
        # Whole seconds divided by 60 -> fractional minutes.
        duration_minutes=pl.col("duration").dt.total_seconds() / 60,
    )
)

df.glimpse()

Rows: 76518
Columns: 20
$ VendorID                       <i64> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
$ lpep_pickup_datetime  <datetime[ns]> 2021-01-01 00:15:56, 2021-01-01 00:25:59, 2021-01-01 00:45:57, 2020-12-31 23:57:51, 2021-01-01 00:16:36, 2021-01-01 00:16:36, 2021-01-01 00:19:14, 2021-01-01 00:26:31, 2021-01-01 00:57:46, 2021-01-01 00:58:32
$ lpep_dropoff_datetime <datetime[ns]> 2021-01-01 00:19:52, 2021-01-01 00:34:44, 2021-01-01 00:51:55, 2021-01-01 00:04:56, 2021-01-01 00:16:40, 2021-01-01 00:16:40, 2021-01-01 00:19:21, 2021-01-01 00:28:50, 2021-01-01 00:57:57, 2021-01-01 01:32:34
$ store_and_fwd_flag             <str> 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N'
$ RatecodeID                     <f64> 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 5.0, 1.0, 1.0, 1.0
$ PULocationID                   <i64> 43, 166, 41, 168, 265, 265, 265, 75, 225, 225
$ DOLocationID                   <i64> 151, 239, 42, 75, 265, 265, 265, 75, 225, 265
$ passenger_count                <f64> 1.0, 1.0, 1.0, 1.0, 3.0, 3.

In [None]:
# Baseline model: linear regression on the vectorized feature dicts.

# One dict per trip, holding only the selected feature columns.
feature_columns = categorical + numerical
train_dicts = df.select(feature_columns).to_dicts()

# DictVectorizer turns the list of dicts into a feature matrix; string-valued
# entries are one-hot encoded and numeric entries are passed through.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

target = "duration_minutes"
y_train = df.get_column(target).to_numpy()

# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_train)

# RMSE on the training data itself (in minutes) -- not a held-out estimate.
root_mean_squared_error(y_train, y_pred)