# Load data

In [1]:

import numpy as np
import pandas as pd

import features.config as config
import features.modelling as ml

In [2]:
# Read the taxi and weather data from the CSV path defined in the project config.
taxi_weather_raw = pd.read_csv(filepath_or_buffer=config.TAXI_WEATHER_DATA_SAVE)

## split train and test set

In [3]:
# Split the raw frame into train and test parts; the test size and the seed
# are both read from the project config.
train_set, test_set = ml.split_train_test(
  taxi_weather_raw,
  config.TEST_SIZE,
  config.RANDOM_STATE,
)

# Pipelining

In [4]:
# Separate the model inputs from the three target columns.
# The column list is declared once so features and labels cannot drift apart,
# and `drop` returns a new frame, so `train_set` is never modified and this
# cell gives the same result every time it is re-run.
label_cols = ['trip_duration', 'trip_duration_log', 'trip_duration_min']
taxi_weather = train_set.drop(columns=label_cols)
taxi_weather_labels = train_set[label_cols].copy()

In [5]:
# Parse the three timestamp columns into pandas datetimes.
for dt_col in ('pickup_datetime', 'dropoff_datetime', 'datetime_hour'):
  taxi_weather[dt_col] = pd.to_datetime(taxi_weather[dt_col])

# Add log(1 + x) of the haversine distance as a new feature column.
taxi_weather['hav_dist_km_log'] = np.log1p(taxi_weather['hav_dist_km'])

In [26]:
from sklearn.compose import ColumnTransformer

# Numeric inputs.
num_attr = [
  'passenger_count',
  'hav_dist_km_log',
  'temp_c',
  'windspeed_kph_sqrt',
  'humidity',
  'pressure_hpa',
]

# Coordinate pairs, kept as separate groups for pickup and dropoff.
pickup_attr = ['pickup_longitude', 'pickup_latitude']
dropoff_attr = ['dropoff_longitude', 'dropoff_latitude']

# Columns cast to pandas `category` dtype below.
cat_attr = [
  'pickup_weekday', 'pickup_month', 'vendor_id', 'pickup_hour',
  'hour_of_year', 'hour_of_day',
  'temp_code', 'windspeed_code', 'humidity_code', 'fog_code',
  'freezing_code', 'cloud_code', 'hazy_code', 'pressure_code',
  'rain_code', 'snow_code',
]
taxi_weather[cat_attr] = taxi_weather[cat_attr].astype('category')

# Flag columns cast to bool.
bin_attr = ['store_and_fwd_flag_bin', 'fog', 'rain', 'snow', 'cloud_missing_flag']
taxi_weather[bin_attr] = taxi_weather[bin_attr].astype(bool)

# Columns that belong to none of the groups above. The list is only declared
# in this cell; nothing is dropped from `taxi_weather` here.
attr_to_drop = [
  'pickup_datetime', 'dropoff_datetime', 'datetime_hour',
  'windspeed_outliers', 'daily_snow_outliers',
  'passenger_count_invalid', 'pickup_coord_invalid', 'dropoff_coord_invalid',
  'same_location_long_trip', 'trip_duration_outlier',
  'windspeed_kph', 'hav_dist_km',
  'rain_mm', 'snow_mm', 'daily_precip_mm', 'daily_snow_mm',
]

## Model preparation

In [27]:

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
import features.transformer as tf

# One preprocessing pipeline per column group, built by the project's
# features.transformer helpers. The geo pipelines take 5 (pickup) and
# 4 (dropoff) as their argument.
num_pipeline = tf.num_stand_pipeline()
geo_pipeline_pick = tf.geo_clustering_pipeline(5)
geo_pipeline_drop = tf.geo_clustering_pipeline(4)
cat_pipeline = tf.cat_onehot_pipeline()
bool_pipeline = tf.bool_pipeline()

# Final ColumnTransformer: routes every column group to its own pipeline.
lin_preprocessing = ColumnTransformer(
  transformers=[
    ('num', num_pipeline, num_attr),
    ('cat', cat_pipeline, cat_attr),
    ('geo_pick', geo_pipeline_pick, pickup_attr),
    ('geo_drop', geo_pipeline_drop, dropoff_attr),
    ('bool', bool_pipeline, bin_attr),
  ],
)

In [28]:
# Fit the preprocessor on the training features and keep the transformed matrix.
taxi_weather_prep = lin_preprocessing.fit_transform(taxi_weather)

In [29]:
# Show the dimensions of the transformed training matrix.
taxi_weather_prep.shape

(1166915, 36)

In [30]:
# List the output feature names produced by the fitted preprocessor.
lin_preprocessing.get_feature_names_out()

array(['num__passenger_count', 'num__hav_dist_km_log', 'num__temp_c',
       'num__windspeed_kph_sqrt', 'num__humidity', 'num__pressure_hpa',
       'cat__pickup_weekday', 'cat__pickup_month', 'cat__vendor_id',
       'cat__pickup_hour', 'cat__hour_of_year', 'cat__hour_of_day',
       'cat__temp_code', 'cat__windspeed_code', 'cat__humidity_code',
       'cat__fog_code', 'cat__freezing_code', 'cat__cloud_code',
       'cat__hazy_code', 'cat__pressure_code', 'cat__rain_code',
       'cat__snow_code', 'geo_pick__minibatchkmeans0',
       'geo_pick__minibatchkmeans1', 'geo_pick__minibatchkmeans2',
       'geo_pick__minibatchkmeans3', 'geo_pick__minibatchkmeans4',
       'geo_drop__minibatchkmeans0', 'geo_drop__minibatchkmeans1',
       'geo_drop__minibatchkmeans2', 'geo_drop__minibatchkmeans3',
       'bool__store_and_fwd_flag_bin', 'bool__fog', 'bool__rain',
       'bool__snow', 'bool__cloud_missing_flag'], dtype=object)

## Linear regression

In [31]:
from sklearn.pipeline import make_pipeline

# Model pipeline: preprocessing followed by an ordinary linear regression.
model_pipeline = make_pipeline(lin_preprocessing, LinearRegression())

# Training data: every column consumed by the preprocessor, and the
# log-transformed trip duration as the target.
X_train = taxi_weather[[*num_attr, *pickup_attr, *dropoff_attr, *cat_attr, *bin_attr]]
y_train = taxi_weather_labels['trip_duration_log']

# 5-fold cross-validation. The scorer returns negated RMSE, so the sign is
# flipped when the mean is reported.
lin_scores = cross_val_score(
  estimator=model_pipeline,
  X=X_train,
  y=y_train,
  scoring="neg_root_mean_squared_error",
  cv=5,
)
print("log-RMSE:", -lin_scores.mean())

log-RMSE: 0.501912490153978


In [45]:
# Imports first, so nothing in this cell is used before it is imported.
import numpy as np
from sklearn.metrics import mean_squared_error

# cross_val_score only fits clones of the estimator; `model_pipeline` itself
# is still unfitted at this point. Fit it explicitly so this cell does not
# depend on hidden kernel state and works after Restart & Run All.
model_pipeline.fit(X_train, y_train)

# Predictions are in log space because the target is `trip_duration_log`.
y_pred_log = model_pipeline.predict(X_train)

# Back-transform predictions and targets to the original scale.
# NOTE(review): expm1 assumes trip_duration_log was created with log1p — confirm.
y_pred = np.expm1(y_pred_log)
y_true = np.expm1(y_train)

# RMSE in the original units. This is an in-sample (training set) figure and
# is not comparable to the cross-validated log-RMSE.
# NOTE(review): the recorded run printed ~1.3e11 here, which suggests a few
# extreme log-space predictions explode under expm1 — inspect y_pred_log
# before trusting this number.
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)
print("RMSE (seconds):", rmse)

RMSE (seconds): 128990379022.81541


In [39]:
# Summary statistics of the per-fold scores, with the sign flipped to positive.
pd.Series(-lin_scores).describe()

count    5.000000
mean     0.501912
std      0.003522
min      0.498397
25%      0.499968
50%      0.500402
75%      0.503539
max      0.507256
dtype: float64

## Decision tree regressor

In [34]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

# Separate ColumnTransformer instance for the tree-based models; it uses the
# same column groups and pipelines as the linear-model preprocessor.
tree_preprocessor = ColumnTransformer(
  transformers=[
    ('num', num_pipeline, num_attr),
    ('cat', cat_pipeline, cat_attr),
    ('geo_pick', geo_pipeline_pick, pickup_attr),
    ('geo_drop', geo_pipeline_drop, dropoff_attr),
    ('bool', bool_pipeline, bin_attr),
  ],
)

# Preprocessing followed by a decision tree with a fixed seed.
tree_pipeline = Pipeline(
  steps=[
    ('pre', tree_preprocessor),
    ('model', DecisionTreeRegressor(random_state=42)),
  ],
)

# Feature matrix and log-duration target.
X_train = taxi_weather[[*num_attr, *pickup_attr, *dropoff_attr, *cat_attr, *bin_attr]]
y_train = taxi_weather_labels['trip_duration_log']

# 5-fold cross-validation; the scores are negated RMSE values.
tree_scores = cross_val_score(
  estimator=tree_pipeline,
  X=X_train,
  y=y_train,
  scoring="neg_root_mean_squared_error",
  cv=5,
)

print("log-RMSE:", -tree_scores.mean())

log-RMSE: 0.5822420252243012


In [40]:
# Summary statistics of the per-fold scores, with the sign flipped to positive.
pd.Series(-tree_scores).describe()

count    5.000000
mean     0.582242
std      0.002768
min      0.579826
25%      0.579994
50%      0.581960
75%      0.582785
max      0.586645
dtype: float64

## random forest

In [36]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Random forest with 10 trees and depth capped at 20, behind the tree preprocessor.
rf_pipeline = Pipeline(
  steps=[
    ('pre', tree_preprocessor),
    ('model', RandomForestRegressor(
      n_estimators=10,
      max_depth=20,
      n_jobs=-1,
      random_state=42,
    )),
  ],
)

# 5-fold cross-validation; the leading minus turns the negated RMSE scores
# into positive errors.
rf_scores = -cross_val_score(
  estimator=rf_pipeline,
  X=X_train,
  y=y_train,
  scoring="neg_root_mean_squared_error",
  cv=5,
)

print("Random Forest log-RMSE:", rf_scores.mean())

Random Forest log-RMSE: 0.41943863883994387


In [42]:
# Summary statistics of the per-fold scores (already sign-flipped when rf_scores was computed).
pd.Series(rf_scores).describe()

count    5.000000
mean     0.419439
std      0.002677
min      0.417135
25%      0.417806
50%      0.417997
75%      0.420689
max      0.423567
dtype: float64

## Histogram-based gradient boosting

In [37]:
from sklearn.ensemble import HistGradientBoostingRegressor

# Histogram-based gradient boosting (100 iterations, depth 10, learning rate 0.1)
# behind the tree preprocessor.
gb_pipeline = Pipeline(
  steps=[
    ('pre', tree_preprocessor),
    ('model', HistGradientBoostingRegressor(
      max_iter=100,
      max_depth=10,
      learning_rate=0.1,
      random_state=42,
    )),
  ],
)

# 5-fold cross-validation; the leading minus turns the negated RMSE scores
# into positive errors.
gb_scores = -cross_val_score(
  estimator=gb_pipeline,
  X=X_train,
  y=y_train,
  scoring="neg_root_mean_squared_error",
  cv=5,
)

print("Gradient Boosting log-RMSE:", gb_scores.mean())

Gradient Boosting log-RMSE: 0.4169309699557803


In [43]:
# Summary statistics of the per-fold scores (already sign-flipped when gb_scores was computed).
pd.Series(gb_scores).describe()

count    5.000000
mean     0.416931
std      0.003018
min      0.414050
25%      0.414696
50%      0.415543
75%      0.419996
max      0.420370
dtype: float64

## XGBoost