In [1]:
import pandas as pd
import numpy as np
#import missingno as msn
import seaborn as sns

from sklearn.model_selection import train_test_split

# Seed passed as random_state to train_test_split and RandomizedSearchCV below
RSEED = 42

In [2]:
# Companion table read from variabledefinitions.csv
df_variableDefinitions = pd.read_csv('data/variabledefinitions.csv')

# Raw training data; every preprocessing step below starts from this frame
df = pd.read_csv('data/train.csv')

## Preprocessing


In [3]:
# A missing travel_with combined with exactly one traveller in total
# (total_male + total_female == 1) is filled in as 'Alone'
companion_missing = df['travel_with'].isna()
single_traveller = df[['total_male', 'total_female']].sum(axis=1) == 1
df.loc[companion_missing & single_traveller, 'travel_with'] = 'Alone'

# The remaining gaps in travel_with cannot be imputed, so those rows are dropped
df_new = df.dropna(subset=['travel_with'])
df_new.shape

(4780, 23)

In [4]:
# Remove observations that report neither male nor female travellers
# (total_male and total_female are both 0)
has_no_travellers = df_new['total_male'].eq(0) & df_new['total_female'].eq(0)
indices_drop = df_new[has_no_travellers].index
df_new = df_new.drop(index=indices_drop)

In [5]:
# Missing answers in most_impressing become the explicit category 'No comments';
# all other columns are left untouched
df_new = df_new.fillna({'most_impressing': 'No comments'})

In [6]:
# Merge the category ' Wildlife' (leading space) into 'Wildlife'.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on df_new['most_impressing'] is chained assignment, which pandas warns about
# and which does not modify df_new once Copy-on-Write is active.
df_new['most_impressing'] = df_new['most_impressing'].replace(' Wildlife', 'Wildlife')


In [7]:
# Drop every row that still contains a missing value. No subset is given, so
# this applies to all columns, not only total_female and total_male.
df_new = df_new.dropna(axis=0, how='any')


In [8]:
# Remove the ID column before saving. Re-assigning instead of inplace=True,
# together with errors='ignore', keeps this cell re-runnable: the original
# in-place drop raised a KeyError on the second execution because 'ID' was
# already gone.
df_new = df_new.drop(columns=['ID'], errors='ignore')

# Persist the cleaned data (the index is written as the first CSV column)
df_new.to_csv('data/the_data_we_work_with.csv')

## Dummy encoding!

In [9]:
# Jupyter only renders the last expression of a cell, so the preview has to be
# shown explicitly with display() (available in every notebook session);
# otherwise head() is evaluated and silently discarded.
display(df_new.head(5))
df_new.shape

(4759, 22)

In [10]:
# Separate the features from the target total_cost.
# The target is selected rather than pop()-ed: pop() removes the column from
# df_new, so re-running this cell failed with a KeyError on 'total_cost'.
X = df_new.drop(columns=['total_cost'])
y = df_new['total_cost']

In [11]:
# Dummy-encode the categorical columns; drop_first=True leaves out the first
# level of each encoded feature
X = pd.get_dummies(data=X, drop_first=True)
X

Unnamed: 0,total_female,total_male,night_mainland,night_zanzibar,country_ANGOLA,country_ARGENTINA,country_AUSTRALIA,country_AUSTRIA,country_BELGIUM,country_BERMUDA,...,payment_mode_Credit Card,payment_mode_Other,payment_mode_Travellers Cheque,first_trip_tz_Yes,most_impressing_Friendly People,most_impressing_Good service,most_impressing_No comments,most_impressing_Satisfies and Hope Come Back,most_impressing_Wildlife,"most_impressing_Wonderful Country, Landscape, Nature"
0,1.0,1.0,13.0,0.0,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
1,1.0,0.0,14.0,7.0,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,True
2,0.0,1.0,1.0,31.0,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,1.0,1.0,11.0,0.0,False,False,False,False,False,False,...,False,False,False,True,True,False,False,False,False,False
4,1.0,0.0,7.0,4.0,False,False,False,False,False,False,...,False,False,False,True,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4804,0.0,1.0,2.0,0.0,False,False,False,False,False,False,...,True,False,False,False,False,False,True,False,False,False
4805,1.0,1.0,11.0,0.0,False,False,False,False,False,False,...,False,False,False,True,True,False,False,False,False,False
4806,1.0,0.0,3.0,7.0,False,False,False,False,False,False,...,False,False,False,True,False,True,False,False,False,False
4807,1.0,1.0,5.0,0.0,False,False,False,False,False,False,...,True,False,False,False,True,False,False,False,False,False


## Split Data

In [12]:
# Hold out 30 % of the rows as test set; RSEED fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=RSEED,
)

## Modelling

In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [14]:
# Baseline candidates with default hyperparameters.
# The decision tree is seeded with RSEED: without a random_state its split
# tie-breaking is random, so its metrics can change from run to run.
regressors = [
    LinearRegression(),
    KNeighborsRegressor(),
    DecisionTreeRegressor(random_state=RSEED),
]

### Simple Models with default values

In [15]:
# Number of observations in the test set
y_test.shape[0]

1428

In [16]:

def model_tester(X_train, y_train, X_test, regressor, y_true=None, eur_rate=0.00036):
    '''
    Fit a single regressor and print its evaluation metrics on the test set.

    Parameters
    ----------
    X_train, y_train : training features and target handed to ``regressor.fit``.
    X_test : features the fitted model predicts on.
    regressor : estimator with ``fit`` and ``predict``; it is fitted in place.
    y_true : true targets belonging to ``X_test``. If omitted, the notebook-level
        ``y_test`` is used, which is what this function originally relied on.
    eur_rate : factor applied to the RMSE only, so that it is reported in Euros
        (the same 0.00036 used by the other "RMSE in Euros" outputs of this
        notebook). MSE and MAE are reported in the unit of the target.

    Returns
    -------
    None. All results are printed.
    '''
    if y_true is None:
        # Backward compatible fallback to the global test target
        y_true = y_test

    model = regressor
    model.fit(X_train, y_train)
    prediction = model.predict(X_test)

    # Remove only an empty trailing "()" so that estimators created with
    # parameters keep a balanced repr, e.g. "Ridge(alpha=2)"
    label = str(regressor)
    print(label[:-2] if label.endswith('()') else label)

    # Check for the tree API instead of searching the repr for "tree": a repr
    # such as KNeighborsRegressor(algorithm='kd_tree') also contains "tree"
    # but has neither get_depth nor get_n_leaves
    if hasattr(model, 'get_depth') and hasattr(model, 'get_n_leaves'):
        print("Tree Depths: " , model.get_depth())
        print("Number of leaves: " , model.get_n_leaves(), "\n")

    mse = mean_squared_error(y_true, prediction)
    mae = mean_absolute_error(y_true, prediction)
    r2 = r2_score(y_true, prediction)

    # Adjusted R^2 is undefined unless there are more observations than
    # features + 1; report NaN in that case instead of a misleading number
    n_obs = len(y_true)
    dof = n_obs - X_test.shape[1] - 1
    r2_adjusted = 1 - (1 - r2) * (n_obs - 1) / dof if dof > 0 else np.nan

    print("MSE: " , np.round(mse, 2))
    # Labelled explicitly because this is the only metric converted to Euros
    print("RMSE in Euros: " , np.round(np.sqrt(mse) * eur_rate, 2))
    print("MAE: " , np.round(mae, 2))
    print("RSquared: " , np.round(r2, 2))
    print("RSquared (adjusted): ", np.round(r2_adjusted, 2))

    print("---"*10)

# Evaluate every baseline regressor on the same train/test split
for reg in regressors:
    model_tester(X_train=X_train, y_train=y_train, X_test=X_test, regressor=reg)

LinearRegression
MSE:  90188516589364.02
RMSE:  3418.83
MAE:  5790196.29
RSquared:  0.37
RSquared (adjusted):  0.29
------------------------------


KNeighborsRegressor
MSE:  97498203678535.08
RMSE:  3554.68
MAE:  5250360.77
RSquared:  0.32
RSquared (adjusted):  0.23
------------------------------
DecisionTreeRegressor
Tree Depths:  43
Number of leaves:  3185 

MSE:  171624443213669.2
RMSE:  4716.2
MAE:  6695596.5
RSquared:  -0.2
RSquared (adjusted):  -0.35
------------------------------


## Thoughts

### Ideas
- Residual plots for predictions
    - For a good model we expect a random distribution of residuals
    - Patterns in our residuals are a sign of underfitting

### KNN
- Manhattan distance (Minkowski p=1 / L1 norm, also called taxicab or city-block distance) is often a better choice for sparse, high-dimensional data.
    - Sparse data contains many zeros, e.g. as a result of dummy-encoded categorical data.

### Hypotheses about our models (out of the box)
- Linear Regression
    - Prone to underfitting
    - High Bias and low Variance
- Decision Trees
    - Prone to overfitting
    - Low Bias but high variance
- KNN
    - Prone to overfitting
    - Low Bias but high variance

### Optimize LinReg Model with polynomial features

In [17]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

import matplotlib.pyplot as plt

In [18]:
# Guide: https://medium.com/@vk.viswa/demystifying-polynomial-regression-understanding-and-implementation-5f5635870b0c

# Expand the features; the expansion is fitted on the training data only and
# then applied unchanged to the test data
poly = PolynomialFeatures(degree=1)
X_poly_train = poly.fit_transform(X_train)
X_poly_test = poly.transform(X_test)

# Ordinary least squares on the expanded features
lin = LinearRegression()
lin.fit(X_poly_train, y_train)

In [19]:
# Predictions of the polynomial model for the held-out test features
y_pred = lin.predict(X=X_poly_test)
#np.set_printoptions(precision=2)

In [20]:
# RMSE is scaled by 0.00036 to report it in Euros; R2 is unit-free
rmse_in_euros = np.sqrt(mean_squared_error(y_test, y_pred)) * 0.00036
r2_poly = r2_score(y_test, y_pred)

print("RMSE in Euros: " , np.round(rmse_in_euros, 2))
print("R2: " , np.round(r2_poly, 2))

RMSE in Euros:  3418.83
R2:  0.37


Learnings:
- 2nd degree leads to overfitting and huge RMSE
- 3rd degree breaks jupyter notebook

### Hyperparameter Tuning

In [21]:
from scipy.stats import loguniform
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import RandomizedSearchCV

In [22]:
# Estimator whose hyperparameters are tuned
model = Ridge()

# Evaluation scheme (not active; the search uses its default cv)
#cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# Search space: solver and intercept are picked from lists, alpha is sampled
# log-uniformly between 1e-5 and 10
space = {
    'solver': ['svd', 'cholesky', 'lsqr', 'sag'],
    'alpha': loguniform(1e-5, 10),
    'fit_intercept': [True, False],
}
#space['normalize'] = [True, False]

# 20 random draws from the space, scored by negative RMSE, on all cores
search = RandomizedSearchCV(
    estimator=model,
    param_distributions=space,
    n_iter=20,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    random_state=RSEED,
)

# Run the search on the training data
result = search.fit(X_train, y_train)



In [23]:
# best_score_ is a negative RMSE (scoring='neg_root_mean_squared_error');
# it is scaled by 0.00036 like the other Euro figures in this notebook
best_score_eur = np.round(result.best_score_ * 0.00036, 2)
best_params = result.best_params_

print(f'Best Score: {best_score_eur}')
print(f'Best Hyperparameters: {best_params}')

Best Score: -3561.41
Best Hyperparameters: {'alpha': 0.046894009635376835, 'fit_intercept': False, 'solver': 'sag'}
