# Setup & Imports

In [1]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer

import preModelling.data_config as config
import preModelling.utils
from modelling.feature_config import NUM_ALL, CAT_ALL, BOOL_ALL, NUM_NO_WEATHER, \
  CAT_NO_WEATHER, BOOL_NO_WEATHER
from modelling.feature_importance import plot_lin_feature_importance, \
  plot_tree_feature_importance
from modelling.modelling_config import N_PICKUP_CLUSTERS, N_DROPOFF_CLUSTERS, \
  RANDOM_SEED, \
  KMEANS_BATCH_SIZE
from modelling.modelling_utils import feature_to_category, feature_as_bool, \
  make_all_models
from modelling.transformer import num_base_pipelining, cat_base_pipelining, \
  bool_base_pipelining, get_display_models_results, compare_models_results, \
  create_geo_clusters

In [2]:
# Load taxi and weather data from the CSV path configured as
# TAXI_WEATHER_DATA_SAVE in preModelling.data_config.
# NOTE(review): presumably this is the already-merged taxi + weather table
# written by the pre-modelling step — confirm.
taxi_weather_raw = pd.read_csv(config.TAXI_WEATHER_DATA_SAVE)

# Train/Test Split

In [3]:
# Split the raw frame into train and test partitions with the project helper,
# using the test size and random state from preModelling.data_config.
# NOTE(review): the split uses config.RANDOM_STATE while the clustering further
# down uses RANDOM_SEED from modelling_config — confirm both are intended.
train_set, test_set = preModelling.utils.split_train_test(taxi_weather_raw, config.TEST_SIZE,
                                                          config.RANDOM_STATE)

In [4]:
# Feature frame: the training set without the trip-duration columns.
# `drop(columns=...)` returns a new DataFrame, so `train_set` is left untouched
# and no copy-then-mutate (`inplace=True`) step is needed; re-running this cell
# always rebuilds `taxi_weather` from `train_set`.
# NOTE(review): 'trip_duration' and 'trip_duration_min' are presumably other
# encodings of the target and are removed to avoid label leakage — confirm.
taxi_weather = train_set.drop(
    columns=['trip_duration', 'trip_duration_log', 'trip_duration_min']
)

# Label frame: only the log-transformed duration, copied so that later edits
# cannot write back into `train_set`.
taxi_weather_labels = train_set[['trip_duration_log']].copy()

In [5]:
# Add a log(1 + x) transform of 'hav_dist_km' as a new feature column;
# log1p stays finite for zero-distance trips.
taxi_weather['hav_dist_km_log'] = np.log1p(taxi_weather['hav_dist_km'])

# Preprocessing Configuration

In [6]:
# Geo clustering: one create_geo_clusters call per trip endpoint. Both calls
# share the same seed and batch size; only the coordinate columns, the prefix
# and the number of clusters differ.
geo_cluster_specs = (
    (['pickup_longitude', 'pickup_latitude'], 'pickup', N_PICKUP_CLUSTERS),
    (['dropoff_longitude', 'dropoff_latitude'], 'dropoff', N_DROPOFF_CLUSTERS),
)
for coord_columns, endpoint, cluster_count in geo_cluster_specs:
    taxi_weather = create_geo_clusters(
        taxi_weather,
        coord_columns,
        endpoint,
        n_clusters=cluster_count,
        random_state=RANDOM_SEED,
        batch_size=KMEANS_BATCH_SIZE,
    )

# Store the cluster assignments as categoricals (they are labels, not
# magnitudes). Done only after both clustering calls have run.
for cluster_column in ('pickup_cluster', 'dropoff_cluster'):
    taxi_weather[cluster_column] = taxi_weather[cluster_column].astype('category')

# Apply the project's dtype helpers: categorical features first, then booleans.
taxi_weather = feature_as_bool(
    feature_to_category(taxi_weather, CAT_ALL),
    BOOL_ALL,
)

# Base preprocessing pipelines, one per feature kind; reused by every
# ColumnTransformer built later in the notebook.
num_base_pipeline = num_base_pipelining()
cat_base_pipeline = cat_base_pipelining()
bool_base_pipeline = bool_base_pipelining()

# Training target as a Series: the log-transformed trip duration.
y_train = taxi_weather_labels['trip_duration_log']

# Model Factories

## exp1 Training – All Models – All Features

In [7]:
# exp1 uses every configured feature: numeric, categorical and boolean.
all_feature_columns = NUM_ALL + CAT_ALL + BOOL_ALL

# Prepare training data
X_train_all = taxi_weather[all_feature_columns]

# Route each feature group through its matching base pipeline.
preprocessing_base = ColumnTransformer(
    transformers=[
        ('num', num_base_pipeline, NUM_ALL),
        ('cat', cat_base_pipeline, CAT_ALL),
        ('bool', bool_base_pipeline, BOOL_ALL),
    ]
)

# Build the model collection on top of the shared preprocessing step.
models_all = make_all_models(preprocessing_base)

In [8]:
# Evaluate every exp1 model on the full feature set and keep the results for
# the comparison step.
# NOTE(review): the LightGBM log shows three fits on ~778k rows each, which is
# consistent with 3-fold cross-validation — confirm in modelling.transformer.
result_all_models_all_att = get_display_models_results(models_all, X_train_all, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031445 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1741
[LightGBM] [Info] Number of data points in the train set: 777943, number of used features: 38
[LightGBM] [Info] Start training from score 6.467198




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.030484 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1735
[LightGBM] [Info] Number of data points in the train set: 777943, number of used features: 38
[LightGBM] [Info] Start training from score 6.467330




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022077 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1729
[LightGBM] [Info] Number of data points in the train set: 777944, number of used features: 38
[LightGBM] [Info] Start training from score 6.465593




In [None]:
# Compare the exp1 (all features) results across models.
# NOTE(review): seconds=True presumably reports timings in seconds — confirm.
compare_models_results(result_all_models_all_att, seconds=True)

## exp2 Training – All Models – No Weather Features

In [None]:
# Same setup as the all-features experiment, restricted to the feature lists
# that exclude the weather columns.
no_weather_feature_columns = NUM_NO_WEATHER + CAT_NO_WEATHER + BOOL_NO_WEATHER
X_train_no_weather = taxi_weather[no_weather_feature_columns]

# Route each reduced feature group through its matching base pipeline.
preprocessing_no_weather = ColumnTransformer(
    transformers=[
        ('num', num_base_pipeline, NUM_NO_WEATHER),
        ('cat', cat_base_pipeline, CAT_NO_WEATHER),
        ('bool', bool_base_pipeline, BOOL_NO_WEATHER),
    ]
)

# Build the model collection on top of the no-weather preprocessing step.
models_no_weather = make_all_models(preprocessing_no_weather)

In [None]:
# Evaluate every model on the no-weather feature set, against the same target
# as the all-features experiment so the two runs are comparable.
result_all_models_no_weather = get_display_models_results(models_no_weather, X_train_no_weather,
                                                          y_train)

In [None]:
# Compare the no-weather results across models.
# NOTE(review): seconds=True presumably reports timings in seconds — confirm.
compare_models_results(result_all_models_no_weather, seconds=True)

## Top Features

In [None]:
# Feature importance for the all-features LinearRegression model.
# NOTE(review): the helper receives X and y, so it presumably fits the model
# before plotting — confirm in modelling.feature_importance.
linreg_model = models_all['LinearRegression']
plot_lin_feature_importance(linreg_model, X_train_all, y_train)

In [None]:
# Feature importance for the all-features Lasso model (linear-model helper).
lasso_model = models_all['Lasso']
plot_lin_feature_importance(lasso_model, X_train_all, y_train)

In [None]:
# Feature importance for the all-features Ridge model (linear-model helper).
ridge_model = models_all['Ridge']
plot_lin_feature_importance(ridge_model, X_train_all, y_train)

In [None]:
# Feature importance for the all-features XGBoost model (tree-model helper).
# NOTE(review): the helper receives X and y, so it presumably fits the model
# before plotting — confirm in modelling.feature_importance.
xgb_model = models_all['XGBoost']
plot_tree_feature_importance(xgb_model, X_train_all, y_train)

In [None]:
# Feature importance for the all-features LightGBM model (tree-model helper).
lgbm_model = models_all['LightGBM']
plot_tree_feature_importance(lgbm_model, X_train_all, y_train)

In [None]:
# Feature importance for the all-features DecisionTree model (tree-model helper).
dt_model = models_all['DecisionTree']
plot_tree_feature_importance(dt_model, X_train_all, y_train)

In [None]:
# Feature importance for the all-features RandomForest model (tree-model helper).
rf_model = models_all['RandomForest']
plot_tree_feature_importance(rf_model, X_train_all, y_train)