# Wybór modelu

## Przygotowanie funkcji i zbiorów danych

In [1]:
import numpy as np
import pandas as pd

In [9]:
# from sklearnex import patch_sklearn
# patch_sklearn()
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
#from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, SelectFromModel, RFE
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, HistGradientBoostingClassifier, StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, mean_squared_error, precision_score, recall_score, roc_curve, auc
#from category_encoders import TargetEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.compose import ColumnTransformer
#import matplotlib.pyplot as plt
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK
from hyperopt.pyll.base import scope


In [13]:
def create_sets(df, with_tests = 0): 
    y = np.array(df.is_canceled)
    X = df.drop(['is_canceled'], axis=1)
    #X.is_canceled

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, stratify=y, test_size=0.2, random_state=42
    ) 
    #stratify - podział zbioru y, żeby był zrównoważony
    X_train, X_test, y_train, y_test = train_test_split(
        X_train, y_train, stratify=y_train, test_size=0.25, random_state=42 # 0.25 * 0.8 = 0.2
    )
    #print(X_val)
    X_train_val=pd.concat((X_train,X_val))
    y_train_val=np.concatenate((y_train,y_val), axis=0)
    print(X_train.shape, X_val.shape, X_test.shape)
    if with_tests: return X_train, X_val, y_train, y_val, X_train_val, y_train_val, X_test, y_test
    return X_train, X_val, y_train, y_val, X_train_val, y_train_val

In [4]:
df_raw = pd.read_csv("hotel_bookings.csv")

**Przygotowanie ramki do modelu** - stworzenie i wybór zmiennych na podstawie wniosków z wcześniejszych etapów projektu

In [5]:
df_raw_temp = df_raw.copy()
df_raw_temp['lead_time_std'] = -1.0
df_raw_temp.loc[df_raw_temp['lead_time'] > 0, 'lead_time_std'] = np.log(df_raw_temp.lead_time[df_raw_temp.lead_time > 0])
df_raw_temp['lead_time_std'] = (df_raw_temp['lead_time_std'] - np.mean(df_raw_temp['lead_time_std']))/np.std(df_raw_temp['lead_time_std'])

df_raw_temp['is_reserved_compatible'] = (df_raw_temp['assigned_room_type'] == df_raw_temp['reserved_room_type']).astype(int)
df_raw_temp["cancelations_proportion"] = df_raw_temp.apply(
    lambda row: 0.5 if row["is_repeated_guest"] == 0 else row["previous_cancellations"] / 
    (row["previous_cancellations"] + row["previous_bookings_not_canceled"]) 
    if (row["previous_cancellations"] + row["previous_bookings_not_canceled"]) > 0 else 0.5,
    axis=1
)
df_raw_en = pd.get_dummies(df_raw_temp, columns= df_raw_temp.drop(['market_segment'], axis = 1).select_dtypes(include=['object']).columns.to_list(), dtype='int')
df = df_raw_en.drop_duplicates().loc[
    :, ['is_canceled', 'required_car_parking_spaces', 'lead_time_std', 'is_reserved_compatible', 'total_of_special_requests', 
        'deposit_type_Non Refund', 'adr', 'cancelations_proportion', 'customer_type_Transient', 'country_PRT', 'previous_cancellations',
        'previous_bookings_not_canceled', 'hotel_Resort Hotel', 'country_GBR', 'country_FRA', 'market_segment']].dropna()

In [6]:
df

Unnamed: 0,is_canceled,required_car_parking_spaces,lead_time_std,is_reserved_compatible,total_of_special_requests,deposit_type_Non Refund,adr,cancelations_proportion,customer_type_Transient,country_PRT,previous_cancellations,previous_bookings_not_canceled,hotel_Resort Hotel,country_GBR,country_FRA,market_segment
0,0,0,1.159231,1,0,0,0.00,0.5,1,1,0,0,1,0,0,Direct
1,0,0,1.581944,1,0,0,0.00,0.5,1,1,0,0,1,0,0,Direct
2,0,0,-0.981870,0,0,0,75.00,0.5,1,0,0,0,1,1,0,Direct
3,0,0,-0.641047,1,0,0,75.00,0.5,1,0,0,0,1,1,0,Corporate
4,0,0,-0.600246,1,1,0,98.00,0.5,1,0,0,0,1,1,0,Online TA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119385,0,0,-0.326924,1,0,0,96.14,0.5,1,0,0,0,0,0,0,Offline TA/TO
119386,0,0,0.493134,1,2,0,225.43,0.5,1,0,0,0,0,0,1,Online TA
119387,0,0,-0.111726,1,4,0,157.71,0.5,1,0,0,0,0,0,0,Online TA
119388,0,0,0.529678,1,0,0,104.40,0.5,1,0,0,0,0,1,0,Online TA


In [14]:
X_train, X_val, y_train, y_val, X_train_val, y_train_val, X_test, y_test = create_sets(df, 1)
X_train

(52437, 15) (17480, 15) (17479, 15)


Unnamed: 0,required_car_parking_spaces,lead_time_std,is_reserved_compatible,total_of_special_requests,deposit_type_Non Refund,adr,cancelations_proportion,customer_type_Transient,country_PRT,previous_cancellations,previous_bookings_not_canceled,hotel_Resort Hotel,country_GBR,country_FRA,market_segment
14374,0,-1.167121,1,0,0,120.00,0.5,1,1,1,0,1,0,0,Direct
66734,0,0.529678,1,2,0,97.02,0.5,1,0,0,0,0,0,0,Online TA
7371,0,0.861096,1,0,0,262.00,0.5,1,1,0,0,1,0,0,Online TA
13898,1,-2.603792,0,2,0,33.00,0.5,1,1,1,4,1,0,0,Corporate
8552,1,1.221569,1,0,0,54.00,0.5,0,0,0,0,1,0,0,Groups
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86997,0,-1.167121,1,0,0,65.00,0.0,1,1,0,6,0,0,0,Corporate
97719,0,0.316453,1,0,0,102.68,0.5,1,0,0,0,0,0,0,Offline TA/TO
112300,0,0.293668,1,0,0,144.00,0.5,1,0,0,0,0,0,0,Online TA
17700,0,-2.603792,0,0,0,58.00,0.5,1,0,0,0,1,1,0,Groups


**Stworzenie zmiennej przy wykorzystaniu Target Encoding**

In [15]:
X_y_train = X_train.copy()
X_y_train['is_canceled'] = y_train
X_y_val = X_val.copy()
X_y_val['is_canceled'] = y_val
X_y_test = X_test.copy()
X_y_test['is_canceled'] = y_test
X_y_train_val = X_train_val.copy()
X_y_train_val['is_canceled'] = y_train_val

X_y_variables = [X_y_train, X_y_val, X_y_test, X_y_train_val]
for X_y in X_y_variables:
    print(X_y.columns)
    target_means = X_y.groupby('market_segment')['is_canceled'].mean()
    X_y['market_encoded'] = X_y['market_segment'].map(target_means)
    X_y.drop(['market_segment'], axis=1, inplace=True)

y_train, X_train = X_y_train['is_canceled'], X_y_train.drop('is_canceled', axis = 1)
y_val, X_val = X_y_val['is_canceled'], X_y_val.drop('is_canceled', axis = 1)
y_test, X_test = X_y_test['is_canceled'], X_y_test.drop('is_canceled', axis = 1)
y_train_val, X_train_val = X_y_train_val['is_canceled'], X_y_train_val.drop('is_canceled', axis = 1)


Index(['required_car_parking_spaces', 'lead_time_std',
       'is_reserved_compatible', 'total_of_special_requests',
       'deposit_type_Non Refund', 'adr', 'cancelations_proportion',
       'customer_type_Transient', 'country_PRT', 'previous_cancellations',
       'previous_bookings_not_canceled', 'hotel_Resort Hotel', 'country_GBR',
       'country_FRA', 'market_segment', 'is_canceled'],
      dtype='object')
Index(['required_car_parking_spaces', 'lead_time_std',
       'is_reserved_compatible', 'total_of_special_requests',
       'deposit_type_Non Refund', 'adr', 'cancelations_proportion',
       'customer_type_Transient', 'country_PRT', 'previous_cancellations',
       'previous_bookings_not_canceled', 'hotel_Resort Hotel', 'country_GBR',
       'country_FRA', 'market_segment', 'is_canceled'],
      dtype='object')
Index(['required_car_parking_spaces', 'lead_time_std',
       'is_reserved_compatible', 'total_of_special_requests',
       'deposit_type_Non Refund', 'adr', 'cancelati

**Funkcje pomocnicze do tworzenia i ewalucacji modeli**

In [16]:
def disp_metrics(metrics, y_val_hat, _y_val = y_val , y_train = None, y_hat_train = None):
      for metric in metrics:
        print(metric + ":")
        # print("Zestaw treningowy:")
        # print(metrics[metric](y_train, y_hat_train))
        # print("Zestaw walidacyjny:")
        print(metrics[metric](_y_val, y_val_hat))

def modeling(model, method = None, src = 'accuracy', cv = None, x_train = X_train, _y_train = y_train, x_val = X_val, _y_val = y_val, return_clf = False, **kwargs):
    clf = model(**kwargs)
    if method == "grid":
        clf = GridSearchCV(clf, param_grid, cv=cv, scoring=src) #, n_jobs=-1)
    elif method == "random":
        clf = RandomizedSearchCV(clf, param_distributions=param_grid, cv=cv, scoring=src, n_iter=10) #,n_jobs=-1)
    if method is not None:
        clf = clf.best_estimator_
    clf.fit(x_train, _y_train, eval_set=[(x_val, _y_val)]) if model == xgb.XGBClassifier else clf.fit(x_train, _y_train)
    y_hat = clf.predict(x_val)
    #y_hat_train = clf.predict(x_train)
    #y_prob = clf.predict_proba(x_val)[:, 1]
    metrics = {
        'f1': f1_score,
        'accuracy': accuracy_score,
        'precision': precision_score,
        'recall': recall_score,
        'roc_auc': roc_auc_score
    }
    disp_metrics(metrics, y_hat)

    if return_clf: return metrics, y_hat, clf
    return metrics, y_hat

def optimise(model, space, metric = accuracy_score):
    def objective(params):
        clf = model(**params)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_val)
        res = metric(y_val, y_pred)
        return {'loss': -res, 'status': STATUS_OK}

    best_params = fmin(objective, space, algo=tpe.suggest, max_evals=100)
    print("Best set of hyperparameters: ", best_params)
    return best_params


## Modele
- zmienna objaśniana is_canceled, po m.in. usunięciu duplikatów z ramki, w wykorzystywanej ramce ma ok 72% zer i ok 28% jedynek
- strojenie hiperparametrów jest przedstawione w bardziej (np poprzez komórki z wywołanie hyperopt i zapisanymi najlepszymi parametrami) lub mniej (np w sytuacjach gdy dobierano je bardziej "ręcznie) jawny sposób

### Regresja logistyczna

In [None]:
modeling(LogisticRegression, max_iter=100000, solver='newton-cholesky')

f1:
0.5742694313083545
accuracy:
0.799141876430206
precision:
0.6879721092388147
recall:
0.49281997918834547
roc_auc:
0.7040431256888473


### Zwykłe drzewa
- raczej zbyt proste i nieskuteczne modele dla tego problemu

In [None]:
modeling(DecisionTreeClassifier) # dla criterion = "entropy" identyczne wyniki

f1:
0.5908379421892935
accuracy:
0.7756864988558353
precision:
0.5925073252406865
recall:
0.5891779396462019
roc_auc:
0.7177842360952903


In [None]:
modeling(ExtraTreesClassifier)

f1:
0.612205772712749
accuracy:
0.7917048054919908
precision:
0.6269633507853403
recall:
0.5981269510926118
roc_auc:
0.7316078542445308


In [None]:
modeling(RandomForestClassifier)

f1:
0.6253263707571801
accuracy:
0.802974828375286
precision:
0.6551173922954183
recall:
0.5981269510926118
roc_auc:
0.7393790574003494


### Drzewa Gradient Boosting
- zwracanie bardzo obiecujących wyników

#### XGB

In [None]:
space = {
    'max_depth': scope.int(hp.quniform('max_depth', 0, 8, 1)),
    'learning_rate': hp.loguniform('learning_rate', -5, -0.5),
    'n_estimators': scope.int(hp.quniform('n_estimators', 10, 700, 1))
}
optimise(xgb.XGBClassifier, space)

100%|██████████| 100/100 [06:08<00:00,  3.69s/trial, best loss: -0.7763157894736842]
Best set of hyperparameters:  {'learning_rate': np.float64(0.3100696163412409), 'max_depth': np.float64(2.0), 'n_estimators': np.float64(557.0)}


In [None]:
metrics, y_hat = modeling(xgb.XGBClassifier, tree_method="exact", early_stopping_rounds=100, learning_rate = 0.3100696163412409, n_estimators = 557, max_depth = 2)
disp_metrics(metrics, y_hat)

f1:
0.6462408140192198
accuracy:
0.8209954233409611
precision:
0.7074257425742574
recall:
0.5947970863683663
roc_auc:
0.750771324249272


#### GradientBoostingClassifier

In [None]:
metrics, y_hat = modeling(GradientBoostingClassifier)
disp_metrics(metrics, y_hat)

f1:
0.6208983011403305
accuracy:
0.8136155606407323
precision:
0.7041435735022433
recall:
0.5552549427679501
roc_auc:
0.7334065640861447


#### HistGradientBoostingClassifier

In [None]:
modeling(HistGradientBoostingClassifier, verbose = 2)

f1:
0.6506564919762092
accuracy:
0.8219107551487415
precision:
0.7060399415489528
recall:
0.6033298647242455
roc_auc:
0.754051520133326


**Testowanie dla różnych konfiguracji space i różnych loss function** - otrzymywanie różnych zestawów optymalnych hiperparametrów

In [29]:
space = {
    'class_weight': hp.choice('class_weight', ['balanced', None]),
    'max_depth': scope.int(hp.quniform('max_depth', 5, 50, 1)),
    'l2_regularization': hp.loguniform('l2_regularization', -15, -0.5),
    'learning_rate': hp.uniform('learning_rate', 0.005, 0.5),
    'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 3, 20, 1)),
    'max_iter' : scope.int(hp.quniform('max_iter', 100, 500, 1))
}
best_params = optimise(HistGradientBoostingClassifier, space)

  0%|          | 0/100 [00:00<?, ?trial/s, best loss=?]

100%|██████████| 100/100 [02:39<00:00,  1.60s/trial, best loss: -0.8237986270022883]
Best set of hyperparameters:  {'class_weight': np.int64(1), 'l2_regularization': np.float64(0.13258194641807666), 'learning_rate': np.float64(0.0627604096823675), 'max_depth': np.float64(27.0), 'max_iter': np.float64(472.0), 'min_samples_leaf': np.float64(14.0)}


In [24]:
    # 'class_weight': hp.choice('class_weight', ['balanced', None]),
    # 'max_depth': scope.int(hp.quniform('max_depth', 5, 50, 1)),
    # 'l2_regularization': hp.loguniform('l2_regularization', -10, -0.5),
    # 'learning_rate': hp.uniform('learning_rate', 0.005, 0.5),
    # 'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 3, 20, 1)),
    # 'max_iter' : scope.int(hp.quniform('max_iter', 100, 500, 1))
best_params # roc_auc best loss: -0.7946477104101374
#{'class_weight': np.int64(0), 'l2_regularization': np.float64(0.0006269644876478571), 'learning_rate': np.float64(0.05793385109728729), 'max_depth': np.float64(35.0), 'max_iter': np.float64(255.0), 'min_samples_leaf': np.float64(14.0)}

{'class_weight': np.int64(0),
 'l2_regularization': np.float64(0.0006269644876478571),
 'learning_rate': np.float64(0.05793385109728729),
 'max_depth': np.float64(35.0),
 'max_iter': np.float64(255.0),
 'min_samples_leaf': np.float64(14.0)}

In [None]:
clf = HistGradientBoostingClassifier(best_params)
metrics, y_hat = modeling(HistGradientBoostingClassifier, verbose = 2)
disp_metrics(metrics, y_hat)

f1:
0.6495515695067264
accuracy:
0.8211670480549199
precision:
0.7040097205346294
recall:
0.6029136316337149
roc_auc:
0.753409478538751


In [18]:
    # 'class_weight': 'balanced',
    # 'max_depth': scope.int(hp.quniform('max_depth', 5, 50, 1)),
    # 'l2_regularization': hp.loguniform('l2_regularization', -10, -0.5),
    # 'learning_rate': hp.uniform('learning_rate', 0.005, 0.3),
    # 'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 3, 20, 1)),
    # 'max_iter' : scope.int(hp.quniform('max_iter', 100, 500, 1))
best_params # f1 best loss: -0.6739334071361662
#{'l2_regularization': np.float64(0.2624932565184252), 'learning_rate': np.float64(0.20598295811552644), 'max_depth': np.float64(37.0), 'max_iter': np.float64(170.0), 'min_samples_leaf': np.float64(13.0)}

{'l2_regularization': np.float64(0.2624932565184252),
 'learning_rate': np.float64(0.20598295811552644),
 'max_depth': np.float64(37.0),
 'max_iter': np.float64(170.0),
 'min_samples_leaf': np.float64(13.0)}

In [None]:
clf = HistGradientBoostingClassifier(best_params)
metrics, y_hat = modeling(HistGradientBoostingClassifier, verbose = 2)
disp_metrics(metrics, y_hat)

f1:
0.6484418944763191
accuracy:
0.8212242562929062
precision:
0.7056807051909892
recall:
0.5997918834547347
roc_auc:
0.7524797681573476


In [None]:
# 'class_weight': hp.choice('class_weight', ['balanced', None]),
#     'max_depth': scope.int(hp.quniform('max_depth', 5, 50, 1)),
#     'l2_regularization': hp.loguniform('l2_regularization', -15, -0.5),
#     'learning_rate': hp.uniform('learning_rate', 0.005, 0.5),
#     'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 3, 20, 1)),
#     'max_iter' : scope.int(hp.quniform('max_iter', 100, 500, 1))
best_params # acc -0.8237986270022883
#{'class_weight': np.int64(1), 'l2_regularization': np.float64(0.13258194641807666), 'learning_rate': np.float64(0.0627604096823675), 'max_depth': np.float64(27.0), 'max_iter': np.float64(472.0), 'min_samples_leaf': np.float64(14.0)}

{'class_weight': np.int64(1),
 'l2_regularization': np.float64(0.13258194641807666),
 'learning_rate': np.float64(0.0627604096823675),
 'max_depth': np.float64(27.0),
 'max_iter': np.float64(472.0),
 'min_samples_leaf': np.float64(14.0)}

In [None]:
clf = HistGradientBoostingClassifier(best_params)
metrics, y_hat = modeling(HistGradientBoostingClassifier, verbose = 2)
disp_metrics(metrics, y_hat)

f1:
0.6501965188096575
accuracy:
0.8217963386727689
precision:
0.7060975609756097
recall:
0.6024973985431842
roc_auc:
0.7537141825063061


In [None]:
best_params = {'l2_regularization': np.float64(0.13258194641807666),
 'learning_rate': np.float64(0.0627604096823675),
 'max_depth': 27,
 'max_iter': 472,
 'min_samples_leaf': 14}
metrics, y_hat = modeling(HistGradientBoostingClassifier, verbose = 2, **best_params)
disp_metrics(metrics, y_hat)

f1:
0.6532357906584131
accuracy:
0.8237414187643021
precision:
0.7112745098039216
recall:
0.6039542143600416
roc_auc:
0.7555076791721312


### AdaBoost, naive bayes
- wyniki znacznie poniżej oczekiwań

In [None]:
modeling(AdaBoostClassifier)

f1:
0.5612855007473841
accuracy:
0.798512585812357
precision:
0.6990381632019858
recall:
0.4688865764828304
roc_auc:
0.6961789884386539


In [None]:
modeling(GaussianNB)

f1:
0.5340634005763689
accuracy:
0.5375286041189932
precision:
0.36931048226385016
recall:
0.96420395421436
roc_auc:
0.6699915234582648


### SVM
- czasochłonne i dające słabe wyniki modele

In [None]:
modeling(SGDClassifier)

f1:
0.6153171907527117
accuracy:
0.799141876430206
precision:
0.6496992133271634
recall:
0.5843912591050988
roc_auc:
0.7324717636748374


In [None]:
metrics, y_hat = modeling(LinearSVC, verbose = 2)
disp_metrics(metrics, y_hat)

f1:
0.5575561902396622
accuracy:
0.7961670480549199
precision:
0.6911945812807881
recall:
0.4672216441207076
roc_auc:
0.6940447471096635


In [None]:
metrics, y_hat = modeling(SVC, kernel = 'linear', verbose = 2)
disp_metrics(metrics, y_hat)

f1:
0.5727941176470588
accuracy:
0.8005720823798627
precision:
0.6965722801788375
recall:
0.48636836628511965
roc_auc:
0.7030263922155381


In [None]:
metrics, y_hat = modeling(NuSVC, verbose = 2)
disp_metrics(metrics, y_hat)

f1:
0.5549923973644196
accuracy:
0.7990846681922197
precision:
0.7094266277939747
recall:
0.4557752341311134
roc_auc:
0.6925030016809414


### Neural network - Multi-layer Perceptron
- zwracanie dobrych wyników, aczkolwiek bez efektu "wow"

In [None]:
metrics, y_hat = modeling(MLPClassifier, verbose = 2)
disp_metrics(metrics, y_hat)

f1:
0.656127204605684
accuracy:
0.7847254004576659
precision:
0.5848810687520365
recall:
0.7471383975026015
roc_auc:
0.7730563782384802


**Testowanie dla różnych konfiguracji space i różnych loss function** - otrzymywanie różnych zestawów optymalnych hiperparametrów

In [10]:
space = {
    'alpha': hp.uniform('alpha', 0.000005, 0.03),
    'hidden_layer_sizes': (50,100,50),
    'early_stopping': True
}
best_params = optimise(MLPClassifier, space, metric=roc_auc_score)

100%|██████████| 100/100 [42:10<00:00, 25.30s/trial, best loss: -0.7534324247219468] 
Best set of hyperparameters:  {'alpha': np.float64(0.020378920097890663)}


In [11]:
# 'alpha': hp.uniform('alpha', 0.000005, 0.03),
# 'hidden_layer_sizes': (50,100,50),
# 'early_stopping': True
best_params #roc_auc best loss: -0.7534324247219468
#{'alpha': np.float64(0.020378920097890663)}


{'alpha': np.float64(0.020378920097890663)}

In [None]:
clf = MLPClassifier(best_params)
metrics, y_hat = modeling(MLPClassifier, verbose = 2)
disp_metrics(metrics, y_hat)

f1:
0.5748518423086834
accuracy:
0.8112128146453089
precision:
0.7544808927967535
recall:
0.4643080124869927
roc_auc:
0.7035149529890585


In [13]:
# 'hidden_layer_sizes': hp.choice('hidden_layer_sizes', [(50,50,50), (50,100,50), (100,), (100,100,100)]),
# 'activation': hp.choice('activation', ['relu', 'tanh']),
# 'alpha': hp.uniform('alpha', 0.00005, 0.01)
best_params #f1 best loss: -0.6592569723165586
#{'activation': np.int64(0),
#  'alpha': np.float64(0.004461189588224096),
#  'hidden_layer_sizes': np.int64(1)}

{'activation': np.int64(0),
 'alpha': np.float64(0.004461189588224096),
 'hidden_layer_sizes': np.int64(1)}

In [17]:
# clf = MLPClassifier(best_params)
# metrics, y_hat = modeling(MLPClassifier, verbose = 2)
disp_metrics(metrics, y_hat)

f1:
0.6062226543509966
accuracy:
0.8146453089244852
precision:
0.7286006427110722
recall:
0.5190426638917794
roc_auc:
0.7228743891451007


In [None]:
# 'hidden_layer_sizes': hp.choice('hidden_layer_sizes', [(50,), (50,100,50), (150,), (10,30,10)]),
#     'activation': hp.choice('activation', ['relu', 'tanh']),
#     'solver': hp.choice('solver', ['adam', 'sgd', 'lbfgs']),
#     'learning_rate': hp.choice('learning_rate', ['constant','adaptive']),
#     'alpha': hp.uniform('alpha', 0.0001, 0.5)
best_params # accuracy best loss -0.8152173913043478
#{'activation': np.int64(0),
#  'alpha': np.float64(0.0018514177742853834),
#  'hidden_layer_sizes': np.int64(1),
#  'learning_rate': np.int64(0),
#  'solver': np.int64(0)}

{'activation': np.int64(0),
 'alpha': np.float64(0.0018514177742853834),
 'hidden_layer_sizes': np.int64(1),
 'learning_rate': np.int64(0),
 'solver': np.int64(0)}

In [None]:
clf = MLPClassifier(best_params)
metrics, y_hat = modeling(MLPClassifier, verbose = 2)
disp_metrics(metrics, y_hat)

f1:
0.6438809261300992
accuracy:
0.8152173913043478
precision:
0.6846424384525205
recall:
0.6077003121748179
roc_auc:
0.7507929568763636


### Stacking Classifier
- wysoce skomplikowany model zawierający wewnątrz niektóre z wcześniej testowanych klasyfikatorów
- dobre wyniki, jednak brak poprawy względem najlepszych z wcześniej testowanych modeli

In [None]:
level0 = list()
level0.append(('lr', LogisticRegression(max_iter=100000, solver='newton-cholesky')))
level0.append(('rf', RandomForestClassifier(n_estimators=1000,
                                    max_depth=2,
                                    min_samples_split = 2,
                                    max_features = 3)))
level0.append(('mlp1', MLPClassifier()))
best_par = {'l2_regularization': np.float64(0.13258194641807666),
 'learning_rate': np.float64(0.0627604096823675),
 'max_depth': 27,
 'max_iter': 472,
 'min_samples_leaf': 14}
level0.append(('hboost1', HistGradientBoostingClassifier(**best_par)))
level0.append(('gboost', GradientBoostingClassifier()))
level0.append(('xgb', xgb.XGBClassifier(tree_method="exact",
                                        learning_rate = 0.3100696163412409, 
                                        n_estimators = 557, max_depth = 2
                    )))
level0.append(('mlp2', MLPClassifier(alpha = np.float64(0.0018514177742853834), 
                                     hidden_layer_sizes = (50,100,50))))
best_par = {'class_weight': 'balanced',
 'l2_regularization': np.float64(0.0006269644876478571),
 'learning_rate': np.float64(0.05793385109728729),
 'max_depth': 35,
 'max_iter': 255,
 'min_samples_leaf': 14}
level0.append(('hboost2', HistGradientBoostingClassifier(**best_par)))

level1 = LogisticRegression(max_iter=100000, solver='newton-cholesky')

In [39]:
metrics, y_hat = modeling(StackingClassifier, estimators=level0, final_estimator=level1, cv=5, verbose=2)
disp_metrics(metrics, y_hat)

f1:
0.6600175361683472
accuracy:
0.8225400457665903
precision:
0.6971521185459597
recall:
0.6266389177939646
roc_auc:
0.7617218257608876


#### Przykłady zastosowania walidacji krzyżowej z algorytmami grid search i random search

In [21]:
X_train, X_val, y_train, y_val, X_train_val, y_train_val, X_test, y_test = create_sets(df, 1)
def CrossValidateGrid(model, param_grid, random = False):

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    if not random:
        grid = GridSearchCV(model, param_grid, cv=cv, scoring='accuracy')
        grid.fit(X_train_val, y_train_val)
        print(grid.best_params_)
        print(grid.best_score_)
    else:
        rand = RandomizedSearchCV(model, param_grid, cv=cv, scoring = 'accuracy')
        rand.fit(X_train_val, y_train_val)
        print(rand.best_score_)
        print(rand.best_params_)

(52437, 15) (17480, 15) (17479, 15)


In [None]:
CrossValidateGrid(DecisionTreeClassifier(), param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2']
})

{'criterion': 'gini', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 4, 'min_samples_split': 2} \
0.8153667052964444

In [None]:
modeling(DecisionTreeClassifier, criterion = 'gini', max_depth = 10, max_features = None, min_samples_leaf = 4, min_samples_split = 2)

f1:
0.6294037940379403
accuracy:
0.8122425629290618
precision:
0.6879782769686498
recall:
0.5800208116545266
roc_auc:
0.7401484728884072


In [None]:
CrossValidateGrid(LogisticRegression(), param_grid = {
    'penalty': ['l1', 'l2', None],
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'newton-cholesky'],
    'l1_ratio': [0, 0.5, 1],
    'max_iter': [100, 500, 1000]
}, random= True)

0.799390590912044 \
{'solver': 'newton-cholesky', 'penalty': None, 'max_iter': 1000, 'l1_ratio': 1, 'C': 1}

In [None]:
modeling(LogisticRegression, solver = 'newton-cholesky', penalty = None, max_iter = 1000, l1_ratio = 1, C = 1)

f1:
0.5756805807622505
accuracy:
0.799370709382151
precision:
0.6875722543352601
recall:
0.4951092611862643
roc_auc:
0.7049116325655187


In [None]:
CrossValidateGrid(GradientBoostingClassifier(), param_grid = {
    'n_estimators': [50, 100],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5]
}, random= True)

0.8214310505924896 \
{'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.1}

In [None]:
metrics, y_hat = modeling(GradientBoostingClassifier, n_estimators = 100, max_depth = 5, learning_rate = 0.1)
disp_metrics(metrics, y_hat)

f1:
0.6475014037057832
accuracy:
0.8204233409610984
precision:
0.703170731707317
recall:
0.6
roc_auc:
0.7519921104536489


Udało nam się podciągnąć zarówno accuracy na, którym był skoncentrowany algorytm, a także zarazem wzrosło f1. Z uwagi na spory koszt obliczeniowy i potrzebny czas wykonania algorytmu można było wykorzystać tylko 8 unikalnych zestawów hiperparametrów. Użyto random search dla przyspieszenia tego procesu. Ostatecznie jednak wybrany został HistGradientBoosting, który jest zarówno szybszy, jak i dawał lepsze wyniki przed strojeniem hiperparametrów. Dla nieograniczonych zasobów zwykły GradientBoosting może mieć jednak większy potencjał.

# Podsumowanie:
- rozważane metryki to f1_score, accuracy, precision, recall i roc_auc
- najlepiej wypadły modele HistGradientBoostingClassifier, XGBoost, MLPClassifier (wszystkie z dostrojonymi hiperparametrami) oraz StackingClassifier
- zdecydowaliśmy się na model HistGradientBoostingClassifier z hiperparametrami: {'l2_regularization': 0.13258194641807666,
 'learning_rate': 0.0627604096823675, 'max_depth': 27, 'max_iter': 472, 'min_samples_leaf': 14}
- jest on prostszy i mniej wymagający obliczeniowo niż StackingClassifier oraz daje nieznacznie, ale jednak lepsze wyniki niż modele XGBoost i MLPClassifier
 - Wyniki dla wybranego przez nas modelu: \
f1: \
0.6532357906584131 \
accuracy:\
0.8237414187643021\
precision:\
0.7112745098039216\
recall:\
0.6039542143600416\
roc_auc:\
0.7555076791721312

## Dodatek - model dla "surowych" danych

In [None]:
df_temp = df_raw.drop_duplicates().drop(['agent', 'company', 'reservation_status', 'reservation_status_date'], axis = 1).dropna()
df_all_en = pd.get_dummies(df_temp, columns= df_temp.select_dtypes(include=['object']).columns.to_list(), drop_first=True, dtype='int')
X_train, X_val, y_train, y_val, X_train_val, y_train_val, X_test, y_test = create_sets(df_all_en, 1)
clf = xgb.XGBClassifier(early_stopping_rounds=200, learning_rate = 0.10780119865397095, n_estimators = 900, max_depth = 8)
clf.fit(X_train, y_train, eval_set=[(X_val, y_val)])
y_hat = clf.predict(X_val, iteration_range=(0, clf.best_iteration))

In [46]:
for metric in metrics:
        print(metric + ":")
        print(metrics[metric](y_val, y_hat))

f1:
0.7047366091190792
accuracy:
0.8465608465608465
precision:
0.7511205472988912
recall:
0.6637481759432979
roc_auc:
0.7899790836034495


**Wnioski** 
- model dla "surowych" danych (zawierających kilkaset, zamiast kilkunastu kolumn) z dostrojonymi hiperparametrami osiąga niewiele lepsze wyniki niż wybrany przez nas model, a jest nieporównywalnie gorszy z punktu widzenia wykorzystywanych zasobów
- wybrany przez nas model powinien być raczej uznany jako dobry, gdyż niewiele więcej można "wycisnąć" z tego zbioru danych