First SVM model: linear SVC

For testing purposes: to reduce training time, we only test the initial model on a subset of the original data

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

In [None]:
df_test_no_null = pd.read_csv("testing_no_null.csv")
df_test = df_test_no_null.iloc[1:1000]

In [9]:
df_test.columns

Index(['Taxi ID', 'Trip Start Timestamp', 'Trip End Timestamp', 'Trip Seconds',
       'Trip Miles', 'Pickup Census Tract', 'Dropoff Census Tract',
       'Pickup Community Area', 'Dropoff Community Area',
       'Pickup Centroid Latitude', 'Pickup Centroid Longitude',
       'Pickup Centroid Location', 'Dropoff Centroid Latitude',
       'Dropoff Centroid Longitude', 'Dropoff Centroid  Location'],
      dtype='object')

In [20]:
df_test["Trip Start Timestamp"] = pd.to_datetime(df_test["Trip Start Timestamp"], errors="coerce")

# Create time buckets
df_test["hour"] = df_test["Trip Start Timestamp"].dt.hour
df_test["time_bucket"] = pd.cut(
    df_test["hour"],
    bins=[0, 8, 12, 16, 20, 24],
    labels=["0-8", "8-12", "12-16", "16-20", "20-24"],
    right=False
)

# Rename spatial unit
df = df_test.rename(columns={"Pickup Census Tract": "spatial_unit"})
df = df.dropna(subset=["time_bucket"])  # remove rows with missing bucket

# ⬇️ Aggregate demand by spatial unit and time bucket
agg_df = df.groupby(["spatial_unit", "time_bucket"]).size().reset_index(name="demand")

# Rename for model
agg_df.rename(columns={"spatial_unit": "hex_id"}, inplace=True)  # pretend census tract = hex_id
X = agg_df[["hex_id", "time_bucket"]]
y = agg_df["demand"]

# Preprocessing pipeline
preprocessor = ColumnTransformer(transformers=[
    ("cat", OneHotEncoder(handle_unknown="ignore"), ["hex_id", "time_bucket"])
])

pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("scaler", StandardScaler(with_mean=False)),
    ("svm", SVR(kernel="linear"))
])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train and evaluate
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# print("R^2 Score:", r2_score(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))


RMSE: 9.478178825047339


  agg_df = df.groupby(["spatial_unit", "time_bucket"]).size().reset_index(name="demand")
