# Milestone 2

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the raw bookings table; the path is relative to the notebook's working directory.
df = pd.read_csv("hotel_bookings.csv")

In [None]:
# from sklearnex import patch_sklearn
# patch_sklearn()
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, SelectFromModel, RFE
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

### Poniższa komórka to przetworzenie danych z plików EDA i data_preparation

In [None]:
from sklearn.preprocessing import OneHotEncoder

# NOTE(review): this option is no longer needed by this cell now that the dummy
# columns are created as ints directly; kept in case other cells rely on it — confirm.
pd.set_option('future.no_silent_downcasting', True)

# --- Missing values ---------------------------------------------------------
# The key must be "children" (it used to carry a stray colon, so the column
# was never filled).
null_replacements = {"children": 0, "country": "Unknown", "agent": 0, "company": 0}
hotels_df = df.fillna(null_replacements)

# --- Hotel type -> City / Resort indicator columns --------------------------
# Encode the *filled* frame (building the dummies from raw `df` would discard
# the fillna above) and create the indicators as ints right away.
hotels_df = pd.get_dummies(hotels_df, columns=['hotel'], dtype=int)
hotels_df = hotels_df.rename(columns={'hotel_City Hotel': 'City', 'hotel_Resort Hotel': 'Resort'})

# --- One-hot encoding of the arrival month ----------------------------------
# The resulting columns are named after the month categories found by the encoder.
encoder = OneHotEncoder(sparse_output=False)
encoded_months = encoder.fit_transform(hotels_df[['arrival_date_month']])
encoded_df = pd.DataFrame(encoded_months, columns=encoder.categories_[0], index=hotels_df.index)
hotels_df = pd.concat([hotels_df, encoded_df], axis=1)

# --- Share of previously cancelled bookings ---------------------------------
# Guests that are not repeated guests, or that have no booking history at all,
# get the neutral value 0.5. Vectorised equivalent of the former row-wise apply.
previous_total = hotels_df["previous_cancellations"] + hotels_df["previous_bookings_not_canceled"]
has_history = (hotels_df["is_repeated_guest"] != 0) & (previous_total > 0)
hotels_df["cancelations_proportion"] = (
    hotels_df["previous_cancellations"].div(previous_total.where(has_history)).fillna(0.5)
)

# 1 when the assigned room type equals the reserved one, otherwise 0.
hotels_df['is_reserved_compatible'] = (
    hotels_df['assigned_room_type'] == hotels_df['reserved_room_type']
).astype(int)

# --- Keep only the columns used further on ----------------------------------
columns_to_select = [
    'April', 'August', 'December', 'February', 'January', 'July', 'June', 'March', 'May',
    'November', 'October', 'September',
    'required_car_parking_spaces',
    'total_of_special_requests',
    'is_canceled',
    'is_reserved_compatible',
    'cancelations_proportion',
    'lead_time',
    'City', 'Resort',
    'market_segment', 'distribution_channel',
    'customer_type', 'arrival_date_month'
]
# .copy() so that the column assignments below operate on an independent frame.
hotels_df = hotels_df[columns_to_select].copy()

# Target encoding of the arrival month. The means are taken before duplicates
# are removed and before any train/validation/test split.
target_means4 = hotels_df.groupby('arrival_date_month')['is_canceled'].mean()
hotels_df['month_encoded'] = hotels_df['arrival_date_month'].map(target_means4)

# Remove duplicated rows, then treat "Undefined" categories as missing and drop them.
hotels_df = hotels_df.drop_duplicates()
hotels_df = hotels_df.replace("Undefined", np.nan)
hotels_df = hotels_df.dropna()

# Target encoding + integer one-hot columns for market_segment.
target_means = hotels_df.groupby('market_segment')['is_canceled'].mean()
hotels_df['market_encoded'] = hotels_df['market_segment'].map(target_means)
hotels_df = pd.get_dummies(hotels_df, columns=['market_segment'], drop_first=False, dtype=int)

# Target encoding + integer one-hot columns for distribution_channel.
target_means2 = hotels_df.groupby('distribution_channel')['is_canceled'].mean()
hotels_df['distribution_encoded'] = hotels_df['distribution_channel'].map(target_means2)
hotels_df = pd.get_dummies(hotels_df, columns=['distribution_channel'], drop_first=False, dtype=int)

# Target encoding + integer one-hot columns for customer_type.
target_means3 = hotels_df.groupby('customer_type')['is_canceled'].mean()
hotels_df['customer_encoded'] = hotels_df['customer_type'].map(target_means3)
hotels_df = pd.get_dummies(hotels_df, columns=['customer_type'], drop_first=False, dtype=int)

# The raw month name is now represented by the one-hot and target-encoded columns.
hotels_df = hotels_df.drop(columns=['arrival_date_month'])

In [7]:
from sklearn.model_selection import train_test_split

# Target vector and feature matrix (every column except the target).
y = hotels_df['is_canceled']
X = hotels_df.drop('is_canceled', axis=1)

# First cut: 60 % for training, 40 % held out, stratified on the target.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42, stratify=y
)

# Second cut: the held-out part is halved into test and validation sets.
X_test, X_val, y_test, y_val = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

#### Tworzymy funkcję do podziału zbioru na 4 zestawy: 
- treningowy
- walidacyjny
- treningowo-walidacyjny (połączenie obu powyżej) - do walidacji krzyżowej
- testowy – do weryfikacji wyników po zakończeniu całego procesu

In [None]:
def create_sets(df, with_tests = 0):
    """Split a frame into stratified train / validation / test parts.

    The ``is_canceled`` column is the target; every other column is a feature.
    20 % of the rows become the validation set, then 25 % of the remainder
    (20 % of the whole) becomes the test set. Both splits are stratified on
    the target and use ``random_state=42``. The shapes of the train,
    validation and test feature sets are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing an ``is_canceled`` column.
    with_tests : bool or int, optional
        When truthy, the test set is appended to the returned tuple.

    Returns
    -------
    tuple
        ``(X_train, X_val, y_train, y_val, X_train_val, y_train_val)`` and,
        if ``with_tests`` is truthy, additionally ``X_test, y_test``.
        ``X_train_val`` / ``y_train_val`` are train and validation stacked
        together (in that order).
    """
    target = np.array(df["is_canceled"])
    features = df.drop(columns=["is_canceled"])

    # Carve out the validation set first ...
    X_rest, X_val, y_rest, y_val = train_test_split(
        features, target, test_size=0.2, random_state=42, stratify=target
    )
    # ... then take the test set out of what is left: 0.25 * 0.8 = 0.2.
    X_train, X_test, y_train, y_test = train_test_split(
        X_rest, y_rest, test_size=0.25, random_state=42, stratify=y_rest
    )

    X_train_val = pd.concat([X_train, X_val])
    y_train_val = np.concatenate([y_train, y_val])
    print(X_train.shape, X_val.shape, X_test.shape)

    result = (X_train, X_val, y_train, y_val, X_train_val, y_train_val)
    if with_tests:
        result += (X_test, y_test)
    return result

##### Po podziale na zbiory możemy wykonać target encoding na każdym zbiorze osobno, aby uniknąć data leak (ten sposób pojawi się w kolejnym pliku gdzie modelujemy)

- Zmienna lead_time miała rozkład skośny, więc zastosowano na niej logarytm i dokonano standaryzacji.
- Ponadto usunięto kolumny agent i company, gdyż miały zbyt dużo braków danych i w praktyce były napisami.
- Usunięto także reservation_status i reservation_status_date, które niemal jednoznacznie pokrywały się z targetem, więc nie było sensu używać ich w modelowaniu.

In [None]:
# Log-transform the skewed lead_time and standardise it.
# Rows with lead_time == 0 cannot be logged, so they keep the placeholder -1.0.
positive_lead = df['lead_time'] > 0
df['lead_time_std'] = -1.0
df.loc[positive_lead, 'lead_time_std'] = np.log(df.loc[positive_lead, 'lead_time'])
df['lead_time_std'] = (df['lead_time_std'] - np.mean(df['lead_time_std'])) / np.std(df['lead_time_std'])

# Drop duplicated rows, the columns excluded from modelling (lead_time is
# replaced by lead_time_std) and every row that still has a missing value.
df_temp = (
    df.drop_duplicates()
    .drop(['agent', 'company', 'reservation_status', 'lead_time', 'reservation_status_date'], axis=1)
    .dropna()
)

# Target-encode market_segment with the mean cancellation rate per segment.
target_means = df_temp.groupby('market_segment')['is_canceled'].mean()
df_temp['market_encoded'] = df_temp['market_segment'].map(target_means)
df_temp = df_temp.drop(['market_segment'], axis=1)

# One-hot encode all remaining text columns (first level dropped, integer dummies).
df_all_en = pd.get_dummies(
    df_temp,
    columns=df_temp.select_dtypes(include=['object']).columns.to_list(),
    drop_first=True,
    dtype='int',
)

# Unpack the training part straight into var_X / var_y instead of overwriting
# X_train / y_train: those names still hold the split of `hotels_df`, which the
# later SelectKBest and RandomForest cells that read X_train / y_train expect
# when the notebook is run top to bottom.
var_X, X_val, var_y, y_val, X_train_val, y_train_val = create_sets(df_all_en)

(52164, 239) (17388, 239) (17388, 239)


### Select KBest dla przygotowanych tak zbiorów

In [25]:
def kbest(score_func, X=None, y=None):
    """Score every feature with a univariate test and rank the features.

    Parameters
    ----------
    score_func : callable
        Scoring function handed to ``SelectKBest`` (this notebook uses
        ``f_classif`` and ``mutual_info_classif``).
    X : pandas.DataFrame, optional
        Feature matrix. Defaults to the notebook-level ``var_X``.
    y : array-like, optional
        Target values. Defaults to the notebook-level ``var_y``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Feature`` and ``Score``, sorted by ``Score`` in descending
        order. All features are kept (``k='all'``); nothing is filtered out.
    """
    # Fall back to the notebook globals so existing one-argument calls keep working.
    if X is None:
        X = var_X
    if y is None:
        y = var_y

    selector = SelectKBest(score_func, k='all')
    selector.fit(X, y)
    ranking = pd.DataFrame({'Feature': X.columns, 'Score': selector.scores_})
    return ranking.sort_values(by='Score', ascending=False)

In [46]:
# Rank the features with f_classif and show the 25 highest-scoring ones.
feature_scores = kbest(f_classif)
print(feature_scores.head(25))

                            Feature        Score
16                    lead_time_std  3147.896104
17                   market_encoded  2687.463693
14      required_car_parking_spaces  1831.517317
234         deposit_type_Non Refund  1390.760391
212      distribution_channel_TA/TO  1210.002753
13                              adr   985.137994
237         customer_type_Transient   879.991068
15        total_of_special_requests   786.912873
168                     country_PRT   777.386441
210     distribution_channel_Direct   739.086242
238   customer_type_Transient-Party   652.567330
11                  booking_changes   447.060888
8                 is_repeated_guest   428.484690
0                 arrival_date_year   424.247422
4              stays_in_week_nights   386.321717
5                            adults   337.668224
92                      country_GBR   294.712394
6                          children   280.201506
18               hotel_Resort Hotel   239.172737
3           stays_in

In [52]:
# Rank the features with mutual_info_classif and show the 25 highest-scoring ones.
feature_scores = kbest(mutual_info_classif)
print(feature_scores.head(25))

                            Feature     Score
13                              adr  0.038859
16                    lead_time_std  0.036490
14      required_car_parking_spaces  0.030784
17                   market_encoded  0.026172
212      distribution_channel_TA/TO  0.018365
234         deposit_type_Non Refund  0.017006
237         customer_type_Transient  0.015473
15        total_of_special_requests  0.010223
9            previous_cancellations  0.009770
168                     country_PRT  0.009295
11                  booking_changes  0.008675
0                 arrival_date_year  0.008208
10   previous_bookings_not_canceled  0.008165
210     distribution_channel_Direct  0.008095
238   customer_type_Transient-Party  0.006514
4              stays_in_week_nights  0.006313
8                 is_repeated_guest  0.005398
225            assigned_room_type_D  0.005298
95                      country_GHA  0.005184
5                            adults  0.005056
3           stays_in_weekend_night

In [8]:
from sklearn.feature_selection import SelectKBest, f_classif

# Univariate f_classif scores for whatever X_train / y_train currently hold;
# k='all' keeps every feature so the full ranking can be printed.
cat_choose = SelectKBest(score_func=f_classif, k='all').fit(X_train, y_train)
feature_scores = pd.DataFrame(
    {'Feature': X_train.columns, 'Score': cat_choose.scores_}
).sort_values(by='Score', ascending=False)
print(feature_scores)

                           Feature        Score
14          is_reserved_compatible  1621.802217
12     required_car_parking_spaces  1549.252897
20                  market_encoded  1220.167029
16                       lead_time   996.152026
27        market_segment_Online TA   923.893578
15         cancelations_proportion   483.246047
28            distribution_encoded   464.601949
32      distribution_channel_TA/TO   461.422082
26    market_segment_Offline TA/TO   392.609327
33                customer_encoded   361.298493
36         customer_type_Transient   349.402222
13       total_of_special_requests   311.332470
30     distribution_channel_Direct   261.902249
24           market_segment_Direct   234.539867
37   customer_type_Transient-Party   223.780912
23        market_segment_Corporate   149.658352
29  distribution_channel_Corporate   138.657024
18                          Resort   113.046436
17                            City   113.046436
25           market_segment_Groups    70

### SelectFromModel: RandomForest i XGBoost

**RandomForest**

In [51]:
# Fit a random forest on var_X / var_y and let SelectFromModel (no explicit
# threshold given) decide which features to keep.
clf = RandomForestClassifier(random_state=42).fit(var_X, var_y)
model = SelectFromModel(clf, prefit=True)
X_train_t = model.transform(var_X)
# Map the boolean support mask back to column names and display them.
selected_features = var_X.columns[model.get_support()]
selected_features

Index(['arrival_date_year', 'arrival_date_week_number',
       'arrival_date_day_of_month', 'stays_in_weekend_nights',
       'stays_in_week_nights', 'adults', 'children', 'previous_cancellations',
       'previous_bookings_not_canceled', 'booking_changes', 'adr',
       'required_car_parking_spaces', 'total_of_special_requests',
       'lead_time_std', 'market_encoded', 'hotel_Resort Hotel',
       'arrival_date_month_August', 'arrival_date_month_July',
       'arrival_date_month_June', 'arrival_date_month_May', 'meal_HB',
       'meal_SC', 'country_DEU', 'country_ESP', 'country_FRA', 'country_GBR',
       'country_ITA', 'country_PRT', 'distribution_channel_Direct',
       'distribution_channel_TA/TO', 'reserved_room_type_D',
       'assigned_room_type_D', 'assigned_room_type_E',
       'deposit_type_Non Refund', 'customer_type_Transient',
       'customer_type_Transient-Party'],
      dtype='object')

In [None]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

# Same random-forest based selection, applied to the current X_train / y_train.
clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
model = SelectFromModel(clf, prefit=True)
X_train_t = model.transform(X_train)
# Map the boolean support mask back to column names.
selected_features = X_train.columns[model.get_support()]
print("Selected features:", selected_features)

Selected features: Index(['required_car_parking_spaces', 'total_of_special_requests',
       'is_reserved_compatible', 'lead_time'],
      dtype='object')


**XGBoost**

In [None]:
# Re-create the split of the fully encoded frame (create_sets prints the set shapes).
X_train, X_val, y_train, y_val, X_train_val, y_train_val = create_sets(df_all_en)

# Gradient-boosted trees, evaluated on the validation set while fitting.
clf = xgb.XGBClassifier(
    n_estimators=464,
    max_depth=8,
    learning_rate=0.10780119865397095,
)
clf.fit(X_train, y_train, eval_set=[(X_val, y_val)])
y_hat = clf.predict(X_val)

# Keep the features selected by SelectFromModel on the fitted booster and
# report their names using the columns of var_X.
model = SelectFromModel(clf, prefit=True)
X_train_t = model.transform(var_X)
selected_features_xgb = var_X.columns[model.get_support()]
print(selected_features_xgb)

Index(['arrival_date_year', 'stays_in_weekend_nights', 'stays_in_week_nights',
       'adults', 'children', 'is_repeated_guest', 'previous_cancellations',
       'previous_bookings_not_canceled', 'booking_changes', 'adr',
       'required_car_parking_spaces', 'total_of_special_requests',
       'market_encoded', 'hotel_Resort Hotel', 'arrival_date_month_December',
       'arrival_date_month_January', 'arrival_date_month_March',
       'arrival_date_month_May', 'arrival_date_month_November', 'meal_SC',
       'country_AGO', 'country_ARE', 'country_AUT', 'country_BEL',
       'country_BRA', 'country_CHE', 'country_CHN', 'country_DEU',
       'country_ESP', 'country_FIN', 'country_FRA', 'country_GBR',
       'country_GIB', 'country_IRL', 'country_ITA', 'country_MAR',
       'country_NLD', 'country_PRT', 'country_RUS',
       'distribution_channel_Direct', 'distribution_channel_TA/TO',
       'reserved_room_type_B', 'reserved_room_type_C', 'reserved_room_type_D',
       'reserved_room_type

### RFE

**LogisticRegression**

In [None]:
# Recursive feature elimination with logistic regression: one feature is
# removed per step until 10 remain.
log_clf = LogisticRegression(solver='newton-cholesky', C=2, max_iter=100000, verbose=1)
selector = RFE(log_clf, n_features_to_select=10, step=1, verbose=1)
selector.fit(var_X, var_y)
X_train_t = selector.transform(var_X)
# Names of the features that survived the elimination.
selected_features2 = var_X.columns[selector.get_support()]
print(selected_features2)

In [50]:
# Show again the features kept by the logistic-regression RFE (selected_features2).
print(selected_features2)

Index(['required_car_parking_spaces', 'market_encoded', 'country_ARE',
       'country_GIB', 'country_HKG', 'country_MDV', 'country_SRB',
       'assigned_room_type_I', 'assigned_room_type_K',
       'deposit_type_Non Refund'],
      dtype='object')


**XGBoost**

In [None]:
# `xgb_clf` was not defined anywhere in the notebook, so this cell raised a
# NameError on a fresh kernel; define the estimator here.
# NOTE(review): the hyperparameters are copied from the XGBoost SelectFromModel
# cell — confirm they match the model originally used for this RFE run.
xgb_clf = xgb.XGBClassifier(learning_rate=0.10780119865397095, n_estimators=464, max_depth=8)

# Recursive feature elimination with XGBoost: 5 features are removed per step
# until 10 remain.
selector = RFE(estimator=xgb_clf, n_features_to_select=10, step=5, verbose=1)
selector.fit(var_X, var_y)
X_train_t = selector.transform(var_X)
# Names of the features that survived the elimination.
selected_features3 = var_X.columns[selector.get_support()]
print(selected_features3)

0              arrival_date_year
1         previous_cancellations
2    required_car_parking_spaces
3      total_of_special_requests
4                 market_encoded
5                    country_GIB
6                    country_PRT
7    distribution_channel_Direct
8        deposit_type_Non Refund
9        customer_type_Transient
Name: 0, dtype: object


**Podsumowanie - cechy wybrane do modelu**: 
- *kluczowe:* required_car_parking_spaces, lead_time_std, is_reserved_compatible, total_of_special_requests, deposit_type_Non Refund, adr,  market_encoded
- *pozostałe:* cancelations_proportion, customer_type_Transient, country_PRT, previous_cancellations, previous_bookings_not_canceled, hotel_Resort Hotel, country_GBR, country_FRA

Z krajów wybraliśmy te, które pojawiały się najwyżej w rankingach i zarazem były dość liczne (nie ma sensu modelowanie w oparciu o kraje, z których pochodzi mniej niż 1% klientów). Większość klientów pochodziła z Portugalii.

Rozważyliśmy też zmienną is_repeated_guest, która w 1 etapie EDA wykazywała zauważalną zależność z etykietą docelową, ale żaden z algorytmów nie pokazał jej znaczącej zależności, więc odrzuciliśmy tę cechę