In [59]:
import pandas as pd
import numpy as np
import missingno as msn
import seaborn as sns

from sklearn.model_selection import train_test_split

RSEED = 42

In [60]:
df = pd.read_csv('data/train.csv')
df_variableDefinitions = pd.read_csv('data/variabledefinitions.csv')

## Preprocessing


In [61]:
# Replace missing values in travel_with where people traveled alone
df.loc[(df.travel_with.isna()) & (df[['total_male', 'total_female']].sum(axis=1) == 1), 'travel_with'] = 'Alone'

# Drop missing values in travel_with where we can't impute the missing values
df_new = df.dropna(subset=['travel_with'])
# df[(df.travel_with.isna()) & (df[['total_male', 'total_female']].sum(axis=1) == 1)]

In [62]:
# Drop observations in total_male and total_female that dont contain values
indices_drop = df_new[(df_new['total_female'] == 0) & (df_new['total_male'] == 0)].index
df_new = df_new.drop(indices_drop, axis=0)

In [63]:
# Replace missing values in most_impressing with 'No comments'
df_new['most_impressing'] = df_new['most_impressing'].fillna('No comments')

In [64]:
# Replace missing values in most_impressing with 'No comments'
df_new['most_impressing'].replace(' Wildlife', 'Wildlife', inplace=True)

In [65]:
# Drop remaining Nan's in total_female and total_male
df_new = df_new.dropna()

In [66]:
df_new.drop(['ID'], axis=1, inplace=True)

In [67]:
df_new.shape

(4759, 22)

### F**k the  Outliers

In [68]:
# Computing Q1, Q3, and IQR
q1 = df_new['total_cost'].quantile(0.25)
q3 = df_new['total_cost'].quantile(0.75)

iqr = q3 - q1

In [69]:
# Final dataset without outliers
df_new = df_new.query('(@q1 - 1.5 * @iqr) <= @df_new["total_cost"] <= (@q3 + 1.5 * @iqr)')
#df_new.to_csv('data/the_data_we_work_with.csv')

## Dummy encoding!

In [70]:
X = df_new.drop(['total_cost'], axis=1)
y = df_new.pop('total_cost')

In [71]:
X = pd.get_dummies(X, drop_first=True)
X

Unnamed: 0,total_female,total_male,night_mainland,night_zanzibar,country_ANGOLA,country_ARGENTINA,country_AUSTRALIA,country_AUSTRIA,country_BELGIUM,country_BERMUDA,...,payment_mode_Credit Card,payment_mode_Other,payment_mode_Travellers Cheque,first_trip_tz_Yes,most_impressing_Friendly People,most_impressing_Good service,most_impressing_No comments,most_impressing_Satisfies and Hope Come Back,most_impressing_Wildlife,"most_impressing_Wonderful Country, Landscape, Nature"
0,1.0,1.0,13.0,0.0,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
1,1.0,0.0,14.0,7.0,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,True
2,0.0,1.0,1.0,31.0,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,1.0,1.0,11.0,0.0,False,False,False,False,False,False,...,False,False,False,True,True,False,False,False,False,False
4,1.0,0.0,7.0,4.0,False,False,False,False,False,False,...,False,False,False,True,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4804,0.0,1.0,2.0,0.0,False,False,False,False,False,False,...,True,False,False,False,False,False,True,False,False,False
4805,1.0,1.0,11.0,0.0,False,False,False,False,False,False,...,False,False,False,True,True,False,False,False,False,False
4806,1.0,0.0,3.0,7.0,False,False,False,False,False,False,...,False,False,False,True,False,True,False,False,False,False
4807,1.0,1.0,5.0,0.0,False,False,False,False,False,False,...,True,False,False,False,True,False,False,False,False,False


## Split Data

In [72]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=RSEED)

## Modelling

In [73]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [74]:
regressors = [LinearRegression(), KNeighborsRegressor(), DecisionTreeRegressor()]

### Simple Models with default values

In [75]:

def model_tester(X_train, y_train, X_test, regressor):
    '''
    Run multiple regression models and print evaluation metrics per model
    '''
    model = regressor
    model.fit(X_train, y_train)
    prediction = model.predict(X_test)
    print(str(regressor).strip('()'))

    if 'tree' in str(regressor).lower():
        print("Tree Depths: " , model.get_depth())
        print("Number of leaves: " , model.get_n_leaves(), "\n")

    if 'linear' in str(regressor).lower():
        pass

    if 'neighbors' in str(regressor).lower():
        pass

    print("MSE: " , np.round(mean_squared_error(y_test, prediction), 2))
    print("RMSE: " , np.round(np.sqrt(mean_squared_error(y_test, prediction))*0.00036, 2))
    print("MAE: " , np.round(mean_absolute_error(y_test, prediction), 2))
    print("RSquared: " , np.round(r2_score(y_test, prediction), 2))
    print("RSquared (adjusted): ", np.round(1 - ( 1 - r2_score(y_test, prediction) ) * ( len(y_test) - 1 ) / ( len(y_test) - X_test.shape[1] - 1 ), 2))
    
    print("---"*10)

for reg in regressors:
    model_tester(X_train, y_train, X_test, reg)

LinearRegression
MSE:  3.6090612199412876e+32
RMSE:  6839110571590.36
MAE:  1325245955417456.5
RSquared:  -1.121360279326998e+19
RSquared (adjusted):  -1.2702220806001013e+19
------------------------------
KNeighborsRegressor
MSE:  20157901784958.76
RMSE:  1616.31
MAE:  2973477.36
RSquared:  0.37
RSquared (adjusted):  0.29
------------------------------
DecisionTreeRegressor
Tree Depths:  40
Number of leaves:  2870 

MSE:  32180841120839.94
RMSE:  2042.21
MAE:  3649183.8
RSquared:  0.0
RSquared (adjusted):  -0.13
------------------------------


### Optimize LinReg Model with polynomial features

In [76]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt

In [77]:
# Anleitung: https://medium.com/@vk.viswa/demystifying-polynomial-regression-understanding-and-implementation-5f5635870b0c

lin = LinearRegression()
poly = PolynomialFeatures(degree=2)

X_poly_train = poly.fit_transform(X_train)
X_poly_test = poly.transform(X_test)

lin.fit(X_poly_train, y_train)

In [78]:
y_pred = lin.predict(X_poly_test)

In [79]:
print("RMSE in Euros: " , np.round(np.sqrt(mean_squared_error(y_test, y_pred))*0.00036, 2))
print("R2: " , np.round(r2_score(y_test, y_pred),2))

RMSE in Euros:  10325969001.68
R2:  -25562748776934.6


Learnings:
- 2nd degree leads to overfitting and huge RMSE
- 3rd degree breaks jupyter notebook

### Hyperparameter Tuning

#### Ridge Regularization

In [80]:
from scipy.stats import loguniform
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor, HistGradientBoostingRegressor

In [81]:
# Quelle code: https://machinelearningmastery.com/hyperparameter-optimization-with-random-search-and-grid-search/
# define model
model = Ridge()

# define evaluation
#cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# define search space
space = dict()
space['solver'] = ['svd', 'cholesky', 'lsqr', 'sag']
space['alpha'] = loguniform(1e-5, 10)
space['fit_intercept'] = [True, False]
#space['normalize'] = [True, False]

# define search
search = RandomizedSearchCV(model, space, n_iter=20, scoring='neg_root_mean_squared_error', n_jobs=-1, random_state=RSEED)
# execute search
result = search.fit(X_train, y_train)



In [82]:
print(f'Best Score (neg RMSE): {np.round(result.best_score_ * 0.00036, 2)}')
print(f'Best Hyperparameters: {result.best_params_}')

Best Score (neg RMSE): -1479.82
Best Hyperparameters: {'alpha': 0.046894009635376835, 'fit_intercept': False, 'solver': 'sag'}


#### Gradient Boosting Regressor (HistGradientBoostingRegressor)

In [83]:
gbr = HistGradientBoostingRegressor()

In [84]:
parameters = {'learning_rate': [0.01, 0.02, 0.03, 0.04],
              'max_depth': [4,6,8,10]}

In [85]:
grid_gbr = GridSearchCV(estimator=gbr, param_grid=parameters, scoring='neg_root_mean_squared_error' ,cv=2, n_jobs=1)
grid_gbr.fit(X_train, y_train)

In [86]:
print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n",grid_gbr.best_estimator_)
print("\n The best RMSE (EUR) across ALL searched params:\n",np.round(grid_gbr.best_score_*0.00036))
print("\n The best parameters across ALL searched params:\n",grid_gbr.best_params_)

 Results from Grid Search 

 The best estimator across ALL searched params:
 HistGradientBoostingRegressor(learning_rate=0.04, max_depth=4)

 The best RMSE (EUR) across ALL searched params:
 -1473.0

 The best parameters across ALL searched params:
 {'learning_rate': 0.04, 'max_depth': 4}


#### Gradient Boosting Regressor (GradientBoostingRegressor)

In [87]:
gbr = GradientBoostingRegressor()

In [88]:
parameters = {'learning_rate': [0.01,0.02,0.03,0.04],
                'subsample'    : [0.9, 0.5, 0.2, 0.1],
                'n_estimators' : [100,500,1000, 1500],
                'max_depth'    : [4,6,8,10]
                }

In [89]:
grid_GBR = GridSearchCV(estimator=gbr, param_grid = parameters, scoring='neg_root_mean_squared_error', cv = 2, n_jobs=-1)
grid_GBR.fit(X_train, y_train)

In [91]:
print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n",grid_GBR.best_estimator_)
print("\n The best RMSE (EUR) across ALL searched params:\n",np.round(grid_GBR.best_score_*0.00036))
print("\n The best parameters across ALL searched params:\n",grid_GBR.best_params_)

 Results from Grid Search 

 The best estimator across ALL searched params:
 GradientBoostingRegressor(learning_rate=0.02, max_depth=10, subsample=0.1)

 The best RMSE (EUR) across ALL searched params:
 -1482.0

 The best parameters across ALL searched params:
 {'learning_rate': 0.02, 'max_depth': 10, 'n_estimators': 100, 'subsample': 0.1}


#### XBG Regressor

In [95]:
from xgboost import XGBRegressor

In [108]:
xgb_model = XGBRegressor()

In [111]:
parameters = {'learning_rate': [0.01, 0.02, 0.025, 0.03, 0.035, 0.04],
                'subsample'    : [0.9, 0.5, 0.2, 0.1],
                'n_estimators' : [50, 100, 200],
                'max_depth'    : [3,4,5,6]
                }

In [112]:
grid_GBR = GridSearchCV(estimator=xgb_model, param_grid=parameters, scoring='neg_root_mean_squared_error', cv=5, n_jobs=-1)
grid_GBR.fit(X_train, y_train)

In [118]:
y_pred = grid_GBR.predict(X_test)

In [113]:
print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n",grid_GBR.best_estimator_)
print("\n The best RMSE (EUR) across ALL searched params:\n",np.round(grid_GBR.best_score_*0.00036))
print("\n The best parameters across ALL searched params:\n",grid_GBR.best_params_)

 Results from Grid Search 

 The best estimator across ALL searched params:
 XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.02, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=5, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=200, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...)

 The best RMSE (EUR) across ALL searched params:
 -1463.0

 The best parameters across ALL searched params:
 {'learning_rate': 0.02, 'max_depth': 5, 'n_estimators': 200, 'subsampl

In [115]:
mse = np.power(grid_GBR.best_score_, 2)
mse

16523048428852.076

In [117]:
def calc_mpe(y_true, y_predicted):
    mpe = np.mean(np.power((y_true-y_predicted)/y_true, 2))*100
    mpe = np.sqrt(mpe)
    return mpe

In [119]:
calc_mpe(y_test, y_pred)

110.27311110049817

##### Error Analysis