In [16]:
import pandas as pd
import numpy as np


# Location of the raw trip export. NOTE(review): this is a machine-specific
# absolute path; adjust it when running the notebook elsewhere.
RAW_TRIPS_CSV = '/Users/chris/Downloads/testing_no_null.csv'

# Read the CSV into a DataFrame and preview its first rows as the cell output.
df_raw = pd.read_csv(RAW_TRIPS_CSV)
df_raw.head()

Unnamed: 0,Trip ID,Taxi ID,Trip Start Timestamp,Trip End Timestamp,Trip Seconds,Trip Miles,Pickup Census Tract,Dropoff Census Tract,Pickup Community Area,Dropoff Community Area,...,Extras,Trip Total,Payment Type,Company,Pickup Centroid Latitude,Pickup Centroid Longitude,Pickup Centroid Location,Dropoff Centroid Latitude,Dropoff Centroid Longitude,Dropoff Centroid Location
0,0000184e7cd53cee95af32eba49c44e4d20adcd8,f538e6b729d1aaad4230e9dcd9dc2fd9a168826ddadbd6...,01/19/2024 05:00:00 PM,01/19/2024 06:00:00 PM,4051.0,17.12,17031980000.0,17031320000.0,76.0,32.0,...,4.0,60.0,Credit Card,Flash Cab,41.979071,-87.90304,POINT (-87.9030396611 41.9790708201),41.884987,-87.620993,POINT (-87.6209929134 41.8849871918)
1,00007c3e7546e2c7d15168586943a9c22c3856cf,8ef1056519939d511d24008e394f83e925d2539d668a00...,01/18/2024 07:15:00 PM,01/18/2024 07:30:00 PM,1004.0,1.18,17031840000.0,17031840000.0,32.0,32.0,...,0.0,19.66,Mobile,5 Star Taxi,41.880994,-87.632746,POINT (-87.6327464887 41.8809944707),41.880994,-87.632746,POINT (-87.6327464887 41.8809944707)
2,0000cf293ada965f89a98c8ccfae7b0ce3a03e41,37073e8c9e454886fe4a916f80a9a3478570e7dd3e663f...,01/04/2024 07:15:00 AM,01/04/2024 07:30:00 AM,484.0,1.59,17031280000.0,17031320000.0,28.0,32.0,...,0.0,9.49,Mobile,City Service,41.879255,-87.642649,POINT (-87.642648998 41.8792550844),41.884987,-87.620993,POINT (-87.6209929134 41.8849871918)
3,0001235258d46a21317b6691ade9386c4d7e02c4,715b091e1001d1c17938c3b5ed7e23d926c53150ee2d0f...,01/25/2024 11:15:00 AM,01/25/2024 11:30:00 AM,1686.0,13.01,17031320000.0,17031980000.0,32.0,56.0,...,5.0,47.7,Credit Card,Chicago Independents,41.884987,-87.620993,POINT (-87.6209929134 41.8849871918),41.785999,-87.750934,POINT (-87.7509342894 41.785998518)
4,00012902ec577e1a25815a527b4204782daa98c8,4628ef9dfa973bdfe877c5aa9d9738f9dc1204e54f2f1a...,01/09/2024 03:15:00 PM,01/09/2024 03:30:00 PM,1047.0,3.02,17031330000.0,17031080000.0,33.0,8.0,...,0.0,13.08,Mobile,Chicago Independents,41.85935,-87.617358,POINT (-87.6173580061 41.859349715),41.892508,-87.626215,POINT (-87.6262149064 41.8925077809)


In [17]:
# -- parse timestamps to buckets --
# The raw values look like "01/19/2024 05:00:00 PM". Passing the explicit
# format removes pandas' format-inference step (and the UserWarning it can
# emit) and pins the parse to month-first dates on a 12-hour clock.
df_raw['Trip Start Timestamp'] = pd.to_datetime(
    df_raw['Trip Start Timestamp'],
    format='%m/%d/%Y %I:%M:%S %p',
)
# Hour of day of the trip start; this is the input to time_basket below.
df_raw['hour'] = df_raw['Trip Start Timestamp'].dt.hour

# define time baskets
def time_basket(h):
    """Map an hour of the day to a coarse time-of-day label.

    Parameters
    ----------
    h : int or float
        Hour of day, e.g. a value taken from ``Series.dt.hour``.

    Returns
    -------
    str or None
        'LateNight' for 00:00–05:59, 'Morning' for 06:00–11:59,
        'Afternoon' for 12:00–17:59 and 'Evening' for everything else.
        ``None`` is returned when ``h`` is missing (None / NaN / pd.NA).
    """
    # A missing hour fails every range comparison below, so it would otherwise
    # fall through to the final branch and be mislabelled as 'Evening'.
    if pd.isna(h):
        return None
    if 0 <= h < 6:
        return 'LateNight'   # 00:00–05:59
    elif 6 <= h < 12:
        return 'Morning'     # 06:00–11:59
    elif 12 <= h < 18:
        return 'Afternoon'   # 12:00–17:59
    else:
        return 'Evening'     # 18:00–23:59

# Label every trip with the time basket of its start hour (element-wise call).
df_raw['TimeBasket'] = df_raw['hour'].map(time_basket)


  df_raw['Trip Start Timestamp'] = pd.to_datetime(df_raw['Trip Start Timestamp'])


In [18]:
# -- aggregate data on day, timestamp and area --
# Calendar day of each trip start (time-of-day component dropped).
df_raw['date'] = df_raw['Trip Start Timestamp'].dt.date

# Count trips per (date, pickup area, time basket). Only combinations that
# actually occur in df_raw produce a row.
group_keys = ['date', 'Pickup Community Area', 'TimeBasket']
trip_counts = df_raw.groupby(group_keys).size()

# Turn the counts into a flat table with a 'Demand' column and expose the
# pickup-area column under the shorter name 'Area'.
agg = trip_counts.reset_index(name='Demand').rename(
    columns={'Pickup Community Area': 'Area'}
)


In [19]:
# Day of week of each aggregated row (pandas convention: Monday=0 ... Sunday=6).
agg['weekday'] = pd.to_datetime(agg['date']).dt.weekday

# One-hot encode the three categorical predictors; the first level of each is
# dropped so it acts as the reference category.
feature_cols = ['Area', 'TimeBasket', 'weekday']
X = pd.get_dummies(agg[feature_cols], columns=feature_cols, drop_first=True)

# Regression target: number of trips per group.
y = agg['Demand']

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


# Give agg a fresh 0..n-1 index before rows are looked up by label later on.
agg = agg.reset_index(drop=True)

# Shuffled 80/20 hold-out split with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)
# Index labels of the held-out rows, kept so they can be matched back to agg.
test_idx = X_test.index

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [21]:
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

# Fit a linear-kernel support vector regressor on the scaled training data.
model = SVR(kernel='linear', C=1.0, epsilon=0.1).fit(X_train_scaled, y_train)

# Predicted demand for the held-out rows.
# NOTE(review): the predictions are used as-is (no clipping), so negative
# demand values are possible.
y_pred = model.predict(X_test_scaled)



In [22]:
# Held-out rows of agg, selected by the index labels saved at split time so
# they line up with y_test / y_pred.
test_rows = agg.loc[test_idx]

results = pd.DataFrame({
    'date': test_rows['date'].values,
    'Area': test_rows['Area'].values,
    'TimeBasket': test_rows['TimeBasket'].values,
    'ActualDemand': y_test.values,
    'PredictedDemand': y_pred,
})

# Signed error (prediction minus actual) and its magnitude.
signed_error = results['PredictedDemand'] - results['ActualDemand']
results['Error'] = signed_error
results['AbsError'] = signed_error.abs()

# Absolute percentage error; a zero actual is turned into NaN so the division
# yields NaN instead of inf.
nonzero_actual = results['ActualDemand'].replace(0, np.nan)
results['APE (%)'] = 100 * results['AbsError'] / nonzero_actual



In [None]:
# Write the actual-vs-predicted table to a CSV in the working directory,
# leaving out the DataFrame index.
results.to_csv(path_or_buf='actuals_vs_predictions.csv', index=False)

Unnamed: 0,date,Area,TimeBasket,ActualDemand,PredictedDemand,Error,AbsError,APE (%)
0,2024-05-17,76.0,Morning,754,404.899964,-349.100036,349.100036,46.299740
1,2024-01-06,32.0,Morning,52,437.899866,385.899866,385.899866,742.115128
2,2024-03-08,56.0,Evening,59,84.899784,25.899784,25.899784,43.897940
3,2025-01-07,56.0,Afternoon,51,90.899970,39.899970,39.899970,78.235235
4,2025-03-18,6.0,Evening,3,6.299813,3.299813,3.299813,109.993775
...,...,...,...,...,...,...,...,...
4049,2025-03-27,8.0,LateNight,4,92.700023,88.700023,88.700023,2217.500574
4050,2024-07-16,43.0,Evening,1,-4.700101,-5.700101,5.700101,570.010079
4051,2024-09-05,56.0,Afternoon,157,91.899899,-65.100101,65.100101,41.465032
4052,2024-08-08,7.0,Evening,3,7.099833,4.099833,4.099833,136.661107
