In [1]:
import pickle

import altair as alt
import polars as pl
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.metrics import root_mean_squared_error

# Switch Altair to the "vegafusion" data transformer for every chart below.
# NOTE(review): presumably needed because the plotted frames exceed Altair's
# default row limit — confirm; requires the vegafusion package to be installed.
alt.data_transformers.enable("vegafusion")

DataTransformerRegistry.enable('vegafusion')

In [2]:
# Trip records used for training; the path is relative to the notebook's
# working directory.
data_path = "../data/green_tripdata_2021-01.parquet"

# Feature columns: location IDs are handled as categories (cast to strings
# below), trip distance as a plain number.
categorical = ["PULocationID", "DOLocationID"]
numerical = ["trip_distance"]

df = pl.read_parquet(data_path)

df = (
    df.with_columns(
        # String-typed IDs so they are later treated as categorical values.
        pl.col(categorical).cast(pl.String),
        # Trip length as a Duration: dropoff timestamp minus pickup timestamp.
        (pl.col("lpep_dropoff_datetime") - pl.col("lpep_pickup_datetime")).alias(
            "duration"
        ),
    )
    # Keep only trips lasting between 1 and 60 minutes (both bounds inclusive).
    .filter(
        (pl.col("duration") >= pl.duration(minutes=1))
        & (pl.col("duration") <= pl.duration(minutes=60))
    )
    # Float-valued duration in minutes; this is the regression target.
    .with_columns(
        (pl.col("duration").dt.total_seconds() / 60).alias("duration_minutes"),
    )
)

df.glimpse()

Rows: 73908
Columns: 22
$ VendorID                       <i64> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
$ lpep_pickup_datetime  <datetime[ns]> 2021-01-01 00:15:56, 2021-01-01 00:25:59, 2021-01-01 00:45:57, 2020-12-31 23:57:51, 2021-01-01 00:26:31, 2021-01-01 00:58:32, 2021-01-01 00:31:14, 2021-01-01 00:08:50, 2021-01-01 00:35:13, 2021-01-01 00:39:57
$ lpep_dropoff_datetime <datetime[ns]> 2021-01-01 00:19:52, 2021-01-01 00:34:44, 2021-01-01 00:51:55, 2021-01-01 00:04:56, 2021-01-01 00:28:50, 2021-01-01 01:32:34, 2021-01-01 00:55:07, 2021-01-01 00:21:56, 2021-01-01 00:44:44, 2021-01-01 00:55:25
$ store_and_fwd_flag             <str> 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N'
$ RatecodeID                     <f64> 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
$ PULocationID                   <str> '43', '166', '41', '168', '75', '225', '244', '75', '74', '74'
$ DOLocationID                   <str> '151', '239', '42', '75', '75', '265', '244', '213', '238', '60'
$ passenger_count             

In [3]:
target = "duration_minutes"

# One feature dict per trip. DictVectorizer turns the string-valued location
# IDs into indicator columns and passes the numeric distance through.
train_dicts = df.select(categorical + numerical).to_dicts()
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
y_train = df.get_column(target).to_numpy()

# Plain least-squares baseline (`fit` returns the fitted estimator itself).
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_train)

# RMSE on the same rows the model was fitted on, i.e. the training error.
root_mean_squared_error(y_train, y_pred)

9.838799799829626

In [4]:
# Long-format view of predictions and targets: one row per value, with the
# source column name ("y_pred" / "y_train") in `variable`.
pl.DataFrame(dict(y_pred=y_pred, y_train=y_train)).unpivot()

variable,value
str,f64
"""y_pred""",7.666992
"""y_pred""",13.742974
"""y_pred""",8.400937
"""y_pred""",15.991386
"""y_pred""",10.154144
…,…
"""y_train""",38.0
"""y_train""",38.0
"""y_train""",11.0
"""y_train""",27.0


In [5]:
# Density estimate of predicted vs. actual trip durations, one line per series.
# The frame is unpivoted to long format so `variable` ("y_pred" / "y_train")
# can drive both the density grouping and the colour encoding.
alt.Chart(
    pl.DataFrame({"y_pred": y_pred, "y_train": y_train}).unpivot()
).transform_density(
    "value",
    groupby=["variable"],
).mark_line(opacity=0.5).encode(
    # Explicit titles: the default labels ("value", "density", "variable")
    # do not tell the reader that the x-axis is a duration in minutes.
    alt.X("value:Q", title="Trip duration (minutes)"),
    alt.Y("density:Q", title="Density"),
    alt.Color("variable:N", title="Series"),
).properties(
    title="Predicted vs. actual trip duration (training data)"
)

In [6]:
# Overlaid histograms of predicted vs. actual trip durations.
# `stack=None` draws the two series on top of each other instead of stacking
# them, and the 0.5 opacity keeps both visible where they overlap.
alt.Chart(pl.DataFrame({"y_pred": y_pred, "y_train": y_train}).unpivot()).mark_bar(
    opacity=0.5
).encode(
    # Explicit titles replace the default labels, which do not convey that the
    # x-axis is a duration in minutes.
    alt.X(
        "value",
        bin=alt.BinParams(maxbins=50),
        title="Trip duration (minutes)",
    ),
    alt.Y("count()", stack=None, title="Number of trips"),
    alt.Color("variable", title="Series"),
).properties(
    title="Predicted vs. actual trip duration (training data)"
)