In [3]:
import pandas as pd
import pyarrow
import argparse
import pathlib
import urllib.request
import pyarrow.dataset as ds
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

#!/usr/bin/env python3
import argparse
import pathlib
import glob
import pyarrow.dataset as ds
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

In [4]:
class TaxiDataset(Dataset):
    """Map-style dataset yielding ``(continuous, categorical_codes, target)`` per row.

    Args:
        df: DataFrame containing every column named in ``cont_cols`` and
            ``cat_cols`` plus the regression target column ``"tips"``.
        cont_cols: Names of the continuous feature columns.
        cat_cols: Names of the categorical feature columns.
        scaler: Optional object exposing ``transform(array)``; when given it is
            applied to the continuous block.

    Attributes:
        cont: float32 array, one row per sample and one column per entry of
            ``cont_cols``.
        cats: The categorical columns converted to pandas ``category`` dtype
            (kept for callers that inspect the category labels).
        cat_codes: int64 array of integer codes, one column per entry of
            ``cat_cols``. Code ``0`` is reserved for missing values; the k-th
            category of a column is encoded as ``k + 1``.
        y: float32 array with the ``"tips"`` target.
        cardinalities: Number of distinct codes per categorical column,
            including the reserved ``0`` slot (i.e. the embedding table size).

    Note:
        Categories are derived from ``df`` unless a column already has a
        ``category`` dtype, in which case its existing categories are reused.
    """

    def __init__(self, df, cont_cols, cat_cols, scaler=None):
        cont = df[cont_cols].values.astype(np.float32)
        if scaler is not None:
            cont = scaler.transform(cont)
        # A scaler is free to return float64 (or a non-ndarray container);
        # normalise so the tensors always match float32 model weights.
        self.cont = np.asarray(cont, dtype=np.float32)

        self.cats = df[cat_cols].astype("category")
        codes = np.empty((len(df), len(cat_cols)), dtype=np.int64)
        for j, col in enumerate(cat_cols):
            # pandas encodes a missing value as -1, which is not a valid
            # embedding index; shifting by one maps "missing" to slot 0.
            codes[:, j] = self.cats[col].cat.codes.to_numpy(dtype=np.int64) + 1
        self.cat_codes = codes

        self.y = df["tips"].values.astype(np.float32)
        # +1 accounts for the reserved "missing" slot.
        self.cardinalities = [len(self.cats[col].cat.categories) + 1 for col in cat_cols]

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return ``(continuous_row, categorical_codes_row, target)`` for ``idx``."""
        return self.cont[idx], self.cat_codes[idx], self.y[idx]

In [5]:
class TipModel(nn.Module):
    """MLP regressor over embedded categorical features and continuous features.

    Each categorical column gets its own ``nn.Embedding``; the embedding
    outputs are concatenated with the continuous inputs and passed through a
    stack of ``Linear -> ReLU -> BatchNorm1d -> Dropout(0.2)`` blocks followed
    by a single-output linear layer.

    Args:
        cat_cardinalities: Number of embedding rows for each categorical column.
        cont_dim: Number of continuous input features.
        hidden_sizes: Output width of each hidden block, in order.
        emb_rule: Callable mapping a cardinality to its embedding width.
    """

    def __init__(self, cat_cardinalities, cont_dim, hidden_sizes=(128, 64, 32), emb_rule=lambda c: min(50, (c + 1) // 2)):
        super().__init__()

        # One lookup table per categorical column.
        tables = []
        for n_levels in cat_cardinalities:
            tables.append(nn.Embedding(n_levels, emb_rule(n_levels)))
        self.embeddings = nn.ModuleList(tables)

        # Width of the concatenated input: continuous features plus every embedding.
        width = cont_dim
        for table in self.embeddings:
            width += table.embedding_dim

        stack = []
        for out_width in hidden_sizes:
            stack.append(nn.Linear(width, out_width))
            stack.append(nn.ReLU())
            stack.append(nn.BatchNorm1d(out_width))
            stack.append(nn.Dropout(0.2))
            width = out_width
        # Regression head producing one value per sample.
        stack.append(nn.Linear(width, 1))
        self.mlp = nn.Sequential(*stack)

    def forward(self, cont, cats):
        """Return one prediction per row.

        ``cats[:, i]`` is looked up in the i-th embedding table; the results
        are concatenated with ``cont`` along dimension 1 and the trailing
        singleton dimension of the MLP output is squeezed away.
        """
        pieces = []
        for col, table in enumerate(self.embeddings):
            pieces.append(table(cats[:, col]))
        pieces.append(cont)
        hidden = torch.cat(pieces, dim=1)
        return self.mlp(hidden).squeeze(1)

In [None]:
#!/usr/bin/env python3
import argparse
import glob
import pathlib

import numpy as np
import pandas as pd
import pyarrow.dataset as ds
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch import nn
from torch.utils.data import DataLoader, Dataset


class TaxiDataset(Dataset):
    """Map-style dataset yielding ``(continuous, categorical_codes, target)`` per row.

    Args:
        df: DataFrame containing every column named in ``cont_cols`` and
            ``cat_cols`` plus the regression target column ``"tips"``.
        cont_cols: Names of the continuous feature columns.
        cat_cols: Names of the categorical feature columns.
        scaler: Optional object exposing ``transform(array)``; when given it is
            applied to the continuous block.

    Attributes:
        cont: float32 array, one row per sample and one column per entry of
            ``cont_cols``.
        cat_codes: int64 array of integer codes, one column per entry of
            ``cat_cols``. Code ``0`` is reserved for missing values; the k-th
            category of a column is encoded as ``k + 1``.
        y: float32 array with the ``"tips"`` target.
        cardinalities: Number of distinct codes per categorical column,
            including the reserved ``0`` slot (i.e. the embedding table size).

    Note:
        Categories are derived from ``df`` unless a column already has a
        ``category`` dtype, in which case its existing categories are reused.
    """

    def __init__(self, df, cont_cols, cat_cols, scaler=None):
        cont = df[cont_cols].values.astype(np.float32)
        if scaler is not None:
            cont = scaler.transform(cont)
        # A scaler is free to return float64 (or a non-ndarray container);
        # normalise so the tensors always match float32 model weights.
        self.cont = np.asarray(cont, dtype=np.float32)

        cats = df[cat_cols].astype("category")
        codes = np.empty((len(df), len(cat_cols)), dtype=np.int64)
        for j, c in enumerate(cat_cols):
            # pandas encodes a missing value as -1, which is not a valid
            # embedding index; shifting by one maps "missing" to slot 0.
            codes[:, j] = cats[c].cat.codes.to_numpy(dtype=np.int64) + 1
        self.cat_codes = codes

        self.y = df["tips"].values.astype(np.float32)
        # +1 accounts for the reserved "missing" slot.
        self.cardinalities = [len(cats[c].cat.categories) + 1 for c in cat_cols]

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return ``(continuous_row, categorical_codes_row, target)`` for ``idx``."""
        return self.cont[idx], self.cat_codes[idx], self.y[idx]


class TipModel(nn.Module):
    """MLP regressor over embedded categorical features and continuous features.

    Every categorical column is embedded separately, the embeddings are
    concatenated with the continuous inputs, and the result goes through
    ``Linear -> ReLU -> BatchNorm1d -> Dropout(0.2)`` blocks and a final
    single-output linear layer.

    Args:
        cat_cardinalities: Number of embedding rows for each categorical column.
        cont_dim: Number of continuous input features.
        hidden: Output width of each hidden block, in order.
        emb_rule: Callable mapping a cardinality to its embedding width.
    """

    def __init__(self, cat_cardinalities, cont_dim, hidden=(128, 64, 32), emb_rule=lambda c: min(50, (c + 1) // 2)):
        super().__init__()
        tables = [nn.Embedding(size, emb_rule(size)) for size in cat_cardinalities]
        self.emb = nn.ModuleList(tables)

        # widths[0] is the concatenated input width; the rest are hidden widths.
        widths = [cont_dim + sum(table.embedding_dim for table in tables), *hidden]
        blocks = []
        for k in range(1, len(widths)):
            fan_in, fan_out = widths[k - 1], widths[k]
            blocks.extend((nn.Linear(fan_in, fan_out), nn.ReLU(), nn.BatchNorm1d(fan_out), nn.Dropout(0.2)))
        # Regression head producing one value per sample.
        blocks.append(nn.Linear(widths[-1], 1))
        self.net = nn.Sequential(*blocks)

    def forward(self, cont, cats):
        """Return one prediction per row from ``cont`` and the code matrix ``cats``."""
        embedded = [self.emb[i](cats[:, i]) for i in range(len(self.emb))]
        features = torch.cat([*embedded, cont], 1)
        return self.net(features).squeeze(1)


def list_files(data_dir: pathlib.Path) -> list[pathlib.Path]:
    """Return the ``fhvhv_tripdata_*.parquet`` files directly inside ``data_dir``.

    Args:
        data_dir: Directory to search (a ``pathlib.Path``; a plain string is
            accepted as well). The search is not recursive.

    Returns:
        Matching paths sorted by their string form; an empty list when the
        directory does not exist or contains no matching file.
    """
    root = pathlib.Path(data_dir)
    # Escape the directory part so characters such as "[", "]", "*" or "?" in
    # the directory name are matched literally instead of as glob syntax.
    pattern = str(pathlib.Path(glob.escape(str(root))) / "fhvhv_tripdata_*.parquet")
    return [pathlib.Path(match) for match in sorted(glob.glob(pattern))]


def preprocess(paths, sample=0.2):
    """Load trip Parquet files, optionally subsample, derive features and filter rows.

    Args:
        paths: Iterable of Parquet file paths; each is converted with ``str``.
        sample: Fraction of rows to keep (sampled with ``random_state=42``).
            Values ``>= 1`` keep every row. Must be greater than zero.

    Returns:
        DataFrame with the loaded columns plus ``pickup_dt``,
        ``trip_time_min`` (``trip_time`` divided by 60), ``pickup_hour`` and
        ``pickup_dow``, restricted to rows where

        * ``0 < trip_miles < 100``
        * ``0 < trip_time_min < 240``
        * ``base_passenger_fare > 0``
        * ``tips`` and the surcharge columns (``tolls``, ``bcf``,
          ``sales_tax``, ``congestion_surcharge``) are not missing.

        The original row index is preserved (not reset).

    Raises:
        ValueError: If ``sample`` is not greater than zero.
    """
    if not sample > 0:
        raise ValueError(f"sample must be a fraction greater than 0, got {sample!r}")

    cols = [
        "pickup_datetime",
        "dropoff_datetime",
        "trip_miles",
        "trip_time",
        "PULocationID",
        "DOLocationID",
        "hvfhs_license_num",
        "dispatching_base_num",
        "shared_request_flag",
        "wav_request_flag",
        "base_passenger_fare",
        "tolls",
        "bcf",
        "sales_tax",
        "congestion_surcharge",
        "tips",
    ]
    table = ds.dataset([str(p) for p in paths], format="parquet").to_table(columns=cols)
    df = table.to_pandas()
    if sample < 1:
        df = df.sample(frac=sample, random_state=42)

    df["pickup_dt"] = pd.to_datetime(df["pickup_datetime"])
    df["trip_time_min"] = df["trip_time"].astype(float) / 60
    df["pickup_hour"] = df["pickup_dt"].dt.hour.astype("int8")
    df["pickup_dow"] = df["pickup_dt"].dt.dayofweek.astype("int8")

    # These columns are used as continuous model inputs by cli(); a single
    # NaN in any of them would propagate into the loss, so such rows are
    # dropped together with rows lacking a target.
    required = ["tolls", "bcf", "sales_tax", "congestion_surcharge", "tips"]

    # Build one combined mask so the frame is filtered (and copied) only once.
    # Comparisons against NaN are False, so missing miles/time/fare rows are
    # dropped by the range checks themselves.
    keep = (
        (df["trip_miles"] > 0)
        & (df["trip_miles"] < 100)
        & (df["trip_time_min"] > 0)
        & (df["trip_time_min"] < 240)
        & (df["base_passenger_fare"] > 0)
        & df[required].notna().all(axis=1)
    )
    return df[keep]


def train(df, cont, cat, device, epochs=5, batch=4096, lr=1e-3):
    """Train a ``TipModel`` on ``df`` and checkpoint the best epoch.

    The frame is split 80/20 (``random_state=42``) into train and validation
    parts, continuous features are standardised with statistics from the
    training part only, and the model is optimised with AdamW on an L1 (MAE)
    loss. Whenever the validation MAE improves, the model ``state_dict`` is
    written to ``"tip_model.pt"`` in the current working directory.

    Args:
        df: DataFrame containing the ``cont`` and ``cat`` columns and ``"tips"``.
        cont: Names of the continuous feature columns.
        cat: Names of the categorical feature columns.
        device: ``torch.device`` (or device string) to train on.
        epochs: Number of passes over the training split.
        batch: Training batch size; validation uses ``batch * 2``.
        lr: AdamW learning rate.

    Returns:
        The best (lowest) validation MAE observed.
    """
    # Work on a private copy limited to the columns that are actually used,
    # so the caller's frame is never modified.
    frame = df[list(cont) + list(cat) + ["tips"]].copy()
    # Fix the category -> code mapping on the full frame *before* splitting.
    # Deriving categories separately per split would assign different integer
    # codes to the same value in train and validation, so validation rows
    # would look up the wrong embedding vectors.
    for col in cat:
        frame[col] = frame[col].astype("category")

    tr, va = train_test_split(frame, test_size=0.2, random_state=42)
    # Fit on a plain array: the datasets pass arrays to transform(), and
    # fitting on a DataFrame would trigger a feature-name mismatch warning.
    scaler = StandardScaler().fit(tr[cont].to_numpy())
    dtr = TaxiDataset(tr, cont, cat, scaler)
    dva = TaxiDataset(va, cont, cat, scaler)

    # Pinned host memory only helps (and only avoids a warning) on CUDA.
    use_cuda = torch.device(device).type == "cuda"
    # BatchNorm1d cannot run in training mode on a single-row batch; drop the
    # trailing batch only in the case where it would contain exactly one row.
    drop_tail = len(dtr) > 1 and len(dtr) % batch == 1
    tldr = DataLoader(
        dtr, batch_size=batch, shuffle=True, num_workers=4,
        pin_memory=use_cuda, drop_last=drop_tail,
    )
    vldr = DataLoader(dva, batch_size=batch * 2, shuffle=False, num_workers=4, pin_memory=use_cuda)

    model = TipModel(dtr.cardinalities, len(cont)).to(device)
    opt = torch.optim.AdamW(model.parameters(), lr=lr)
    loss = nn.L1Loss()
    best = float("inf")
    for ep in range(epochs):
        model.train()
        total, seen = 0.0, 0
        for x, c, y in tldr:
            x, c, y = x.to(device), c.to(device), y.to(device)
            opt.zero_grad()
            batch_loss = loss(model(x, c), y)
            batch_loss.backward()
            opt.step()
            # Weight by batch size so the epoch figure is a per-sample mean.
            total += batch_loss.item() * len(y)
            seen += len(y)
        tr_mae = total / max(seen, 1)

        model.eval()
        total = 0.0
        with torch.no_grad():
            for x, c, y in vldr:
                x, c, y = x.to(device), c.to(device), y.to(device)
                total += loss(model(x, c), y).item() * len(y)
        va_mae = total / len(dva)

        print(f"{ep+1}/{epochs} train {tr_mae:.2f} val {va_mae:.2f}")
        if va_mae < best:
            best = va_mae
            torch.save(model.state_dict(), "tip_model.pt")
    print("best", best)
    return best


def cli(argv=None):
    """Parse command-line options and run the full load -> preprocess -> train pipeline.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read ``sys.argv``.

    Raises:
        SystemExit: If no matching Parquet file is found in ``--data-dir``.
    """
    p = argparse.ArgumentParser(description="Train a tip-prediction model on FHVHV trip data.")
    p.add_argument("--data-dir", default="data",
                   help="directory containing fhvhv_tripdata_*.parquet files")
    p.add_argument("--sample-frac", type=float, default=0.2,
                   help="fraction of rows to keep; values >= 1 keep everything")
    p.add_argument("--epochs", type=int, default=5, help="number of training epochs")
    # "--batch-size" is accepted as an alias: with parse_known_args an
    # unknown spelling would otherwise be dropped silently and the default used.
    p.add_argument("--batch", "--batch-size", dest="batch", type=int, default=4096,
                   help="training batch size")
    p.add_argument("--lr", type=float, default=1e-3, help="AdamW learning rate")
    # parse_known_args (not parse_args): a Jupyter kernel passes its own
    # connection-file argument on the command line, which must be ignored.
    args, _ = p.parse_known_args(argv)

    paths = list_files(pathlib.Path(args.data_dir))
    if not paths:
        raise SystemExit("No Parquet files found in " + args.data_dir)

    df = preprocess(paths, args.sample_frac)

    # Continuous model inputs (standardised during training).
    cont = [
        "trip_miles",
        "trip_time_min",
        "base_passenger_fare",
        "tolls",
        "bcf",
        "sales_tax",
        "congestion_surcharge",
    ]
    # Categorical model inputs (each gets its own embedding table).
    cat = [
        "PULocationID",
        "DOLocationID",
        "hvfhs_license_num",
        "dispatching_base_num",
        "shared_request_flag",
        "wav_request_flag",
        "pickup_hour",
        "pickup_dow",
    ]
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    train(df, cont, cat, device, epochs=args.epochs, batch=args.batch, lr=args.lr)


# Entry point: run the pipeline when this code is executed directly.
# NOTE: the guard is also true inside a notebook kernel, so executing this
# cell starts loading data and training with the default options.
if __name__ == "__main__":
    cli()


usage: ipykernel_launcher.py [-h] [--data-dir DATA_DIR]
                             [--sample-frac SAMPLE_FRAC] [--epochs EPOCHS]
                             [--batch-size BATCH_SIZE] [--lr LR]
ipykernel_launcher.py: error: unrecognized arguments: --f=/run/user/1000/jupyter/runtime/kernel-v328e87f0fff34ba946b6f85f751444f2a033c68ce.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
