# Setup & Imports

In [1]:
import importlib

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

import constants.features_c as features_c
import pipelines.training as training
from constants.modelling_c import TEST_SIZE, RANDOM_STATE
from pipelines import preprocessing
from pipelines.preprocessing import build_preprocessor, feature_to_category, \
  feature_to_fp32
from viz import plot_residual_heatmap
from viz import plot_residuals, plot_residual_scatter

# Reload the project modules so that edits saved since the kernel started take effect.
importlib.reload(features_c)
importlib.reload(preprocessing)
importlib.reload(training)

# importlib.reload() re-executes a module but does not update names that were bound
# earlier with `from module import name`; those keep pointing at the pre-reload objects.
# Rebind the preprocessing helpers so the cells below use the reloaded versions.
# Ending the cell on assignments also keeps the module repr (which embeds an absolute
# local path) out of the saved cell output.
# NOTE(review): TEST_SIZE / RANDOM_STATE (constants.modelling_c) and the viz helpers are
# imported with `from ... import ...` as well and are not reloaded by this cell.
build_preprocessor = preprocessing.build_preprocessor
feature_to_category = preprocessing.feature_to_category
feature_to_fp32 = preprocessing.feature_to_fp32

<module 'pipelines.training' from '/Users/Wendo99/Documents/50-Apps/PyCharm/NYC_Taxi/src/pipelines/training.py'>

In [2]:
# Load the modelling frame through the project's training pipeline helper.
# Presumably taxi trips joined with weather data, judging by the helper name — confirm
# in pipelines.training.
df = training.load_taxi_weather_data()

In [3]:
# Remove every column listed in features_c.DROPPED_FEATURES. Only names that are actually
# present in the frame are passed to drop(), so listed-but-missing columns do not raise
# and re-running the cell is harmless.
columns_to_drop = [
    column for column in df.columns
    if column in features_c.DROPPED_FEATURES
]
df = df.drop(columns=columns_to_drop)

In [4]:
# Apply the two project dtype helpers in sequence: first to the columns in NUM_ALL, then
# to the columns in CAT_ALL (float32 / category casts judging by the helper names —
# confirm in pipelines.preprocessing).
df = (
    df
    .pipe(feature_to_fp32, features_c.NUM_ALL)
    .pipe(feature_to_category, features_c.CAT_ALL)
)

In [5]:
# Separate the prediction target from the feature columns.
y_dataset = df[features_c.TARGET]
x_dataset = df.drop(columns=[features_c.TARGET])

# Hold out a test partition; its size and the shuffle seed come from
# constants.modelling_c (TEST_SIZE, RANDOM_STATE).
x_train, x_test, y_train, y_test = train_test_split(
    x_dataset,
    y_dataset,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

# Preprocessing Configuration

In [6]:
# Build the preprocessing object that is handed to training.fit_save_model() in the
# training section below.
preprocessor = build_preprocessor()

## Hyperparameter Search

In [7]:
# Gate for the hyperparameter-search cells below: each search call only runs when this
# flag is True.
NEED_SEARCH = False

Ridge

In [8]:
# Hyperparameter search for the "Ridge" model on the training split; skipped unless
# NEED_SEARCH is True.
# NOTE(review): the meaning of the trailing positional argument (2) is not visible from
# this notebook — consider passing it by keyword or naming it as a constant.
if NEED_SEARCH:
  training.search_hyperparameters("Ridge", x_train, y_train, 2)

Random Forest

In [9]:
# Hyperparameter search for the "RandomForest" model on the training split; skipped
# unless NEED_SEARCH is True.
# NOTE(review): the meaning of the trailing positional argument (4) is not visible from
# this notebook — consider passing it by keyword or naming it as a constant.
if NEED_SEARCH:
  training.search_hyperparameters("RandomForest", x_train, y_train, 4)

XGBoost

In [10]:
# Hyperparameter search for the "XGBoost" model on the training split; skipped unless
# NEED_SEARCH is True.
# NOTE(review): the meaning of the trailing positional argument (14) is not visible from
# this notebook — consider passing it by keyword or naming it as a constant.
if NEED_SEARCH:
  training.search_hyperparameters("XGBoost", x_train, y_train, 14)

Bayes

In [11]:
# Hyperparameter search for the "Bayes" model on the training split; skipped unless
# NEED_SEARCH is True.
# NOTE(review): the meaning of the trailing positional argument (6) is not visible from
# this notebook — consider passing it by keyword or naming it as a constant.
if NEED_SEARCH:
  training.search_hyperparameters("Bayes", x_train, y_train, 6)

SVM

In [12]:
# Hyperparameter search for the "SVM" model on the training split; skipped unless
# NEED_SEARCH is True.
# NOTE(review): the meaning of the trailing positional argument (10) is not visible from
# this notebook — consider passing it by keyword or naming it as a constant.
if NEED_SEARCH:
  training.search_hyperparameters("SVM", x_train, y_train, 10)

# First Train

In [13]:
# Forwarded as `retrain=` to training.fit_save_model() below.
# Presumably True forces a fresh fit instead of reusing a saved model — confirm in
# pipelines.training.
RETRAIN = True

In [14]:
# linReg = training.fit_save_model("LinearRegression", preprocessor, x_train, y_train,
#                                  retrain=RETRAIN)

In [15]:
# ridge = training.fit_save_model("Ridge", preprocessor, x_train, y_train, retrain=RETRAIN)

In [16]:
# random_forest = training.fit_save_model("RandomForest", preprocessor, x_train, y_train,
#                                         retrain=RETRAIN)

In [17]:
# Fit the "XGBoost" model on the training split with the shared preprocessor. The
# returned object is reused by the CV, feature-importance and error-analysis cells below.
xgboost = training.fit_save_model("XGBoost", preprocessor, x_train, y_train, retrain=RETRAIN)

In [18]:
# bayes = training.fit_save_model("Bayes", preprocessor, x_train, y_train, retrain=RETRAIN)

# CV Performance Search

In [19]:
# Gate for the cross-validation cells below: training.cv_train() only runs when True.
CV_TRAIN = True

Linear Regression

In [20]:
# if CV_TRAIN:
#   training.cv_train('LinearRegression', linReg, x_train, y_train)

Ridge

In [21]:
# if CV_TRAIN:
#   training.cv_train('Ridge', ridge, x_train, y_train)

Random Forest

In [22]:
# if CV_TRAIN:
#   training.cv_train('RandomForest', random_forest, x_train, y_train)

XGBoost

In [23]:
# Cross-validate the fitted XGBoost model on the training split; skipped unless CV_TRAIN
# is True. Per the recorded output, the helper prints the mean and standard deviation of
# the Log-RMSE.
if CV_TRAIN:
  training.cv_train('XGBoost', xgboost, x_train, y_train)

XGBoost Log-RMSE (mean): 0.319698
XGBoost Log-RMSE (std): 0.000834


XGBoost Log-RMSE (mean): 0.338292
XGBoost Log-RMSE (std): 0.000219

XGBoost Log-RMSE (mean): 0.319581
XGBoost Log-RMSE (std): 0.000313

XGBoost Log-RMSE (mean): 0.319793
XGBoost Log-RMSE (std): 0.000200

XGBoost Log-RMSE (mean): 0.319698
XGBoost Log-RMSE (std): 0.000834

Bayes

In [24]:
# if CV_TRAIN:
#   training.cv_train('Bayes', bayes, x_train, y_train)

# Top Features

In [25]:
# Gate for the feature-importance cells below: they only run when this flag is True.
SHOW_TOP_F = True

Linear Regression

In [26]:
# if SHOW_TOP_F:
#   top20 = training.top_linreg_features(linReg, x_train=x_train, top_n=75)
#   print(top20)

Ridge

In [27]:
# if SHOW_TOP_F:
#   top20 = training.top_linreg_features(ridge, x_train=x_train, top_n=60)
#   print(top20)

Random Forest

In [28]:
# if SHOW_TOP_F:
#   top20_rf = training.top_tree_features(random_forest, x_train=x_train, top_n=60)
#   print(top20_rf)

XGBoost

In [29]:
# Print the feature-importance table for the fitted XGBoost model; skipped unless
# SHOW_TOP_F is True. Per the recorded output the table has the columns feature,
# importance, rel_importance and cum_importance.
# NOTE(review): the variable is called `top20_xgb`, but top_n=60 rows are requested.
if SHOW_TOP_F:
  top20_xgb = training.top_tree_features(xgboost, x_train=x_train, top_n=60)
  print(top20_xgb)

                  feature  importance  rel_importance  cum_importance
0   route_distance_log_km    0.461758           46.18       46.180000
1   trip_duration_outlier    0.097892            9.79       55.959999
2         hav_dist_km_log    0.083911            8.39       64.360001
3             hour_of_day    0.043096            4.31       68.669998
4          pickup_weekday    0.029452            2.95       71.610001
5       dropoff_cluster_3    0.027809            2.78       74.389999
6       dropoff_cluster_0    0.021012            2.10       76.489998
7              is_holiday    0.017584            1.76       78.250000
8       dropoff_cluster_4    0.017333            1.73       79.980003
9                    rain    0.016871            1.69       81.669998
10       dropoff_latitude    0.015339            1.53       83.209999
11       is_early_morning    0.014408            1.44       84.650002
12             is_rush_pm    0.013034            1.30       85.949997
13           pickup_

Bayes

In [None]:
# if SHOW_TOP_F:
#   top20_bayes = training.top_generic_features(bayes, x_train=x_train, y_train=y_train, top_n=75
#                                               )
#   print(top20_bayes)

# Error Analysis

In [None]:
# Model under inspection in the error-analysis cells below: `model_name` is passed to the
# plotting/listing helpers, `model` to training.get_res_errors().
model_name = "XGBoost"
model = xgboost

In [None]:
# Build the residual/error frame consumed by the plots and listings below.
# NOTE(review): the errors are computed on the training split (x_train / y_train are
# passed); x_test / y_test from the split above are never used in this notebook —
# confirm whether a held-out evaluation was intended.
df_err = training.get_res_errors(modell=model, x_train=x_train,
                                 y_train=y_train)

In [None]:
# Residual plot for the selected model (project helper from viz).
plot_residuals(df_err, model_name)

In [None]:
# Residual scatter plot for the selected model (project helper from viz).
plot_residual_scatter(df_err, model_name)

In [None]:
# Listing of the residual errors for the selected model (project helper).
training.list_res_errors(df_err, model_name)

In [None]:
# Bucket the log route distance into (up to) ten quantile bins; labels=False stores the
# integer bin index. Missing distances are filled with 0 before binning.
# duplicates='drop' merges identical quantile edges instead of raising "Bin edges must be
# unique", which can happen when many rows share one value (e.g. the filled-in 0). When
# all edges are distinct the result is unchanged.
df_err['dist_bin'] = pd.qcut(
    df_err['route_distance_log_km'].fillna(0),
    q=10,
    labels=False,
    duplicates='drop',
)

# One residual heatmap per (x, y) feature pair. The dist_bin x hour_of_day heatmap was
# previously requested twice with identical arguments; each pair is now drawn once.
heatmap_axes = (
    ('dist_bin', 'hour_of_day'),
    ('hour_of_day', 'pickup_cluster'),
    ('hour_of_day', 'dropoff_cluster'),
)
for x_col, y_col in heatmap_axes:
  plot_residual_heatmap(df_err, model_name, x_col=x_col, y_col=y_col)


In [None]:
# Error listing for the selected model, grouped by "hav_dist_km_log"
# (ten bins, judging by the helper name — confirm in pipelines.training).
training.list_errors_10_bins(df_err, model_name, "hav_dist_km_log")

In [None]:
# Error listing for the selected model, grouped by "route_distance_log_km"
# (ten bins, judging by the helper name — confirm in pipelines.training).
training.list_errors_10_bins(df_err, model_name, "route_distance_log_km")

# Feature engineering / selection

In [None]:
# Pairwise correlation matrix over all columns of x_train.
# NOTE(review): x_train still holds the columns passed through feature_to_category above;
# with pandas >= 2.0, DataFrame.corr() defaults to numeric_only=False and raises if such a
# column cannot be converted to float. `corr` is reassigned from numeric columns only in a
# later cell — confirm which of the two definitions is the intended one.
corr = x_train.corr()

In [None]:
# Correlations of every feature with `temp_code`. Explicit label indexing is used instead
# of attribute access so the lookup cannot be shadowed by a DataFrame attribute and a
# missing column surfaces as a KeyError naming the label.
corr["temp_code"]

In [None]:
# Correlate the numeric columns only and render the matrix as a heatmap on a fixed
# [-1, 1] diverging colour scale.
num = x_train.select_dtypes(include="number")
corr = num.corr()
sns.heatmap(data=corr, vmin=-1, vmax=1, cmap="coolwarm")
plt.show()

In [None]:
# Reshape the absolute correlation matrix into one row per (level_0, level_1) feature
# pair. The strict `level_0 < level_1` comparison drops the diagonal and keeps each
# unordered pair once; only pairs with |corr| >= 0.70 survive, strongest first.
hi_corr = (
    corr.abs()
    .stack()
    .rename("corr")
    .reset_index()
    .loc[lambda pairs: (pairs["level_0"] < pairs["level_1"]) & (pairs["corr"] >= 0.70)]
    .sort_values("corr", ascending=False)
)
print(hi_corr.head(20))

In [None]:
# Rank the numeric features by absolute correlation with the target.
# The frame is restricted to numeric columns first, matching the heatmap cell above:
# x_train also holds category columns, which DataFrame.corr() does not drop by default in
# pandas >= 2.0 and may fail to convert to float.
# The target's correlation with itself (always 1.0) is removed so it does not occupy the
# first slot of the ranking.
rank = (
  x_train.select_dtypes("number")
  .assign(target=y_train)
  .corr()
  ["target"]
  .abs()
  .drop("target")
  .sort_values(ascending=False)
)
print(rank.head(20))