# Walidacja
## wystarczy odpalić cały plik

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score

In [11]:
# Load the raw bookings table; all engineered features are added to a copy so
# that df_raw stays untouched.
df_raw = pd.read_csv("hotel_bookings.csv")

df_raw_temp = df_raw.copy()

# lead_time_std: log(lead_time) where lead_time > 0 (log is undefined at 0),
# the sentinel -1.0 everywhere else, then z-scored over the whole table.
df_raw_temp['lead_time_std'] = -1.0
has_lead_time = df_raw_temp['lead_time'] > 0
df_raw_temp.loc[has_lead_time, 'lead_time_std'] = np.log(df_raw_temp.loc[has_lead_time, 'lead_time'])
df_raw_temp['lead_time_std'] = (
    (df_raw_temp['lead_time_std'] - np.mean(df_raw_temp['lead_time_std']))
    / np.std(df_raw_temp['lead_time_std'])
)

# 1 when the guest was assigned the same room type that was reserved, else 0.
df_raw_temp['is_reserved_compatible'] = (
    df_raw_temp['assigned_room_type'] == df_raw_temp['reserved_room_type']
).astype(int)

# cancelations_proportion: share of a guest's previous bookings that were
# cancelled. It is only computed for repeated guests with at least one
# previous booking on record; everyone else gets the neutral value 0.5.
# Done with a boolean mask instead of a row-wise DataFrame.apply, which
# yields the same values without a Python-level call per row and never
# evaluates a division by zero.
previous_canceled = df_raw_temp['previous_cancellations']
previous_total = previous_canceled + df_raw_temp['previous_bookings_not_canceled']
has_history = (df_raw_temp['is_repeated_guest'] != 0) & (previous_total > 0)
df_raw_temp["cancelations_proportion"] = 0.5
df_raw_temp.loc[has_history, "cancelations_proportion"] = (
    previous_canceled[has_history] / previous_total[has_history]
)
# One-hot encode every text (object dtype) column except market_segment,
# which is kept as-is in the selected columns below.
# NOTE(review): this encodes all remaining object columns even though only a
# handful of the resulting dummies are selected below - confirm the width of
# df_raw_en is acceptable memory-wise.
_dummy_source_cols = (
    df_raw_temp
    .drop(columns=['market_segment'])
    .select_dtypes(include=['object'])
    .columns
    .to_list()
)
df_raw_en = pd.get_dummies(df_raw_temp, columns=_dummy_source_cols, dtype='int')

# Target first, then the model features, with the raw market_segment last.
_model_cols = [
    'is_canceled',
    'required_car_parking_spaces',
    'lead_time_std',
    'is_reserved_compatible',
    'total_of_special_requests',
    'deposit_type_Non Refund',
    'adr',
    'cancelations_proportion',
    'customer_type_Transient',
    'country_PRT',
    'previous_cancellations',
    'previous_bookings_not_canceled',
    'hotel_Resort Hotel',
    'country_GBR',
    'country_FRA',
    'market_segment',
]

# Duplicates are removed on the full encoded frame (all columns), and only
# then are the model columns selected and rows with missing values dropped.
_deduplicated = df_raw_en.drop_duplicates()
df = _deduplicated[_model_cols].dropna()

In [None]:
def create_sets(df, with_tests = 0):
    """Split *df* into stratified train / validation / test parts.

    The target is the ``is_canceled`` column; every other column is a feature.
    20% of all rows are held out as the validation part, then 25% of the
    remaining rows (20% of the total) as the test part. The shapes of the
    three feature frames are printed.

    Args:
        df: DataFrame containing an ``is_canceled`` column.
        with_tests: when truthy, the test part is appended to the result.

    Returns:
        ``(X_train, X_val, y_train, y_val, X_train_val, y_train_val)``, with
        ``X_test, y_test`` appended when *with_tests* is truthy.
        ``X_train_val`` / ``y_train_val`` are train and validation
        concatenated, in that order.
    """
    target = np.array(df['is_canceled'])
    features = df.drop(columns='is_canceled')

    # First split: hold out 20% of all rows for validation.
    X_rest, X_val, y_rest, y_val = train_test_split(
        features, target, stratify=target, test_size=0.2, random_state=42
    )
    # Second split: 0.25 * 0.8 = 0.2 of all rows go to the test part.
    X_train, X_test, y_train, y_test = train_test_split(
        X_rest, y_rest, stratify=y_rest, test_size=0.25, random_state=42
    )

    X_train_val = pd.concat([X_train, X_val])
    y_train_val = np.concatenate([y_train, y_val])
    print(X_train.shape, X_val.shape, X_test.shape)

    splits = (X_train, X_val, y_train, y_val, X_train_val, y_train_val)
    if with_tests:
        splits += (X_test, y_test)
    return splits

In [None]:
X_train, X_val, y_train, y_val, X_train_val, y_train_val, X_test, y_test = create_sets(df, 1)

# Re-attach the target to each feature frame so the per-segment cancellation
# rate can be computed with a groupby.
X_y_train = X_train.copy()
X_y_train['is_canceled'] = y_train
X_y_val = X_val.copy()
X_y_val['is_canceled'] = y_val
X_y_test = X_test.copy()
X_y_test['is_canceled'] = y_test
X_y_train_val = X_train_val.copy()
X_y_train_val['is_canceled'] = y_train_val


def _segment_cancel_rates(frame):
    """Return (per-market_segment mean of is_canceled, overall mean) for *frame*."""
    rates = frame.groupby('market_segment')['is_canceled'].mean()
    return rates, frame['is_canceled'].mean()


# Target-encode market_segment. The encoding for a held-out split must come
# from the data the model is fitted on, never from the held-out labels
# themselves - otherwise the evaluated features contain the answer.
#   train, val       -> rates learned on train
#   train_val, test  -> rates learned on train_val (the set tester() fits on)
# All rates are computed up front, before any frame is modified.
_train_encoding = _segment_cancel_rates(X_y_train)
_train_val_encoding = _segment_cancel_rates(X_y_train_val)

X_y_variables = [X_y_train, X_y_val, X_y_test, X_y_train_val]
_encodings = [_train_encoding, _train_encoding, _train_val_encoding, _train_val_encoding]
for X_y, (segment_rates, overall_rate) in zip(X_y_variables, _encodings):
    print(X_y.columns)
    # Segments absent from the fitting data fall back to its overall rate.
    X_y['market_encoded'] = X_y['market_segment'].map(segment_rates).fillna(overall_rate)
    X_y.drop(['market_segment'], axis=1, inplace=True)

# Split each frame back into target and features.
y_train, X_train = X_y_train['is_canceled'], X_y_train.drop('is_canceled', axis = 1)
y_val, X_val = X_y_val['is_canceled'], X_y_val.drop('is_canceled', axis = 1)
y_test, X_test = X_y_test['is_canceled'], X_y_test.drop('is_canceled', axis = 1)
y_train_val, X_train_val = X_y_train_val['is_canceled'], X_y_train_val.drop('is_canceled', axis = 1)

In [None]:
# Display name -> scoring function called as scorer(y_true, y_pred).
# Insertion order is the order in which the scores are printed.
metrics = dict(
    f1=f1_score,
    accuracy=accuracy_score,
    precision=precision_score,
    recall=recall_score,
    roc_auc=roc_auc_score,
)

def disp_metrics(metrics, y_val_hat, _y_val = y_val , y_train = None, y_hat_train = None):
    """Print every metric in *metrics* for the given predictions.

    Args:
        metrics: mapping of display name to a callable ``f(y_true, y_pred)``.
        y_val_hat: predictions to score.
        _y_val: true labels; defaults to the module-level ``y_val`` as it was
            when this function was defined.
        y_train: unused.
        y_hat_train: unused.
    """
    for name, scorer in metrics.items():
        print(name + ":")
        print(scorer(_y_val, y_val_hat))

def tester(model_test):
    """Fit *model_test* on train+validation data and print its test-set metrics.

    Uses the module-level ``X_train_val`` / ``y_train_val`` for fitting and
    ``X_test`` / ``y_test`` for evaluation.

    Args:
        model_test: estimator exposing ``fit`` and ``predict``; if it also has
            ``predict_proba`` or ``decision_function``, that output is used
            for the ROC AUC.
    """
    model_test.fit(X_train_val, y_train_val)
    y_hat = model_test.predict(X_test)

    # Threshold-based metrics are computed on the hard class predictions.
    label_metrics = {name: scorer for name, scorer in metrics.items() if name != 'roc_auc'}
    disp_metrics(label_metrics, y_hat, y_test)

    # ROC AUC ranks samples by a continuous score; feeding it 0/1 predictions
    # collapses the curve to a single operating point. Prefer the
    # positive-class probability, then the decision function, and only fall
    # back to the labels when the model offers neither.
    if 'roc_auc' in metrics:
        if hasattr(model_test, 'predict_proba'):
            y_score = model_test.predict_proba(X_test)[:, 1]
        elif hasattr(model_test, 'decision_function'):
            y_score = model_test.decision_function(X_test)
        else:
            y_score = y_hat
        print('roc_auc' + ":")
        print(metrics['roc_auc'](y_test, y_score))

In [None]:
# Hyperparameters for the final HistGradientBoostingClassifier
# (presumably the result of an earlier search - verify against its source).
best_params = dict(
    l2_regularization=np.float64(0.13258194641807666),
    learning_rate=np.float64(0.0627604096823675),
    max_depth=27,
    max_iter=472,
    min_samples_leaf=14,
)

# Fit on train+validation and report the test-set metrics.
tester(
    HistGradientBoostingClassifier(random_state=42, **best_params)
)