# HW 1 - DATA SCIENCE 2
## Adam Němec, Emma Kovalčíková

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb
from pathlib import Path

In [2]:
# set width of Jupyter notebook
from IPython.core.display import HTML
display(HTML("<style>.container { width:70% !important; }</style>"))

# set some visual properties of displaying pandas DataFrame
pd.options.display.max_columns=200
pd.options.display.max_rows=200

In [3]:
# Load data - set index column, decimal point, separator
data_file = Path("../Data/2024_DS2_HW1_data_train.csv")
data = pd.read_csv(data_file, sep = ',', decimal = '.', index_col = 'Booking_ID')

# print time of data being loaded - use strftime
print(f'Data loaded on:   {datetime.datetime.now().strftime(format="%Y-%m-%d %H:%M:%S")}')

Data loaded on:   2024-04-16 18:05:28


In [4]:
data.head()

Unnamed: 0_level_0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
Booking_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
INN10204,,,,2.0,Meal Plan 2,,Room_Type 6,,2018.0,9.0,,Online,0.0,0.0,,,1.0,0.0
INN20020,,,,2.0,Meal Plan 1,,,,,12.0,,Online,0.0,0.0,0.0,,,0.0
INN16435,1.0,,,2.0,,0.0,Room_Type 1,,2018.0,11.0,,,0.0,0.0,,,1.0,0.0
INN07143,3.0,,,3.0,,,,100.0,2018.0,5.0,,Online,0.0,0.0,,,2.0,0.0
INN20511,1.0,0.0,1.0,1.0,Meal Plan 1,0.0,,,2018.0,11.0,,,0.0,0.0,0.0,150.0,,1.0


In [5]:
# Print some numbers about data sample size
print(f'Number of rows:   {data.shape[0]:,}'.replace(',', ' '))
print(f'Number of unique indexes:   {data.index.nunique():,}'.replace(',', ' '))
print(f'Number of columns:   {data.shape[1]:,}'.replace(',', ' '))

data.booking_status.value_counts(dropna=False)

Number of rows:   32 647
Number of unique indexes:   32 647
Number of columns:   18


booking_status
0.0    21774
1.0    10521
NaN      352
Name: count, dtype: int64

Teď chceme rozdělit data na trénovací a validační + vybrat prediktory atd.

In [6]:
#name of the target column
col_target = "booking_status"
#name of the time column
col_year = "arrival_year"
#name of the month column
col_month = "arrival_month"

data['year'] = data['arrival_year'].fillna(0).astype(int).astype(str)
data['month'] = data['arrival_month'].fillna(0).astype(int).astype(str)

# Pad month with leading zero if necessary
data['month'] = data['month'].apply(lambda x: x.zfill(2))

# Combine 'year' and 'month' columns to form 'date' column
data['arrival_date'] = data['year'] + data['month']

data['arrival_date'] = data['arrival_date'].astype(int)

# Replace 'date' values with NaN where 'year' or 'month' were NaN
data.loc[data['year'] == '0', 'arrival_date'] = float('nan')

data.drop('year', axis=1, inplace=True)
data.drop('month', axis=1, inplace=True)

data.head()
#data['arrival_date']
#data.arrival_date.value_counts(dropna=False)

Unnamed: 0_level_0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
Booking_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
INN10204,,,,2.0,Meal Plan 2,,Room_Type 6,,2018.0,9.0,201809.0,Online,0.0,0.0,,,1.0,0.0
INN20020,,,,2.0,Meal Plan 1,,,,,12.0,,Online,0.0,0.0,0.0,,,0.0
INN16435,1.0,,,2.0,,0.0,Room_Type 1,,2018.0,11.0,201811.0,,0.0,0.0,,,1.0,0.0
INN07143,3.0,,,3.0,,,,100.0,2018.0,5.0,201805.0,Online,0.0,0.0,,,2.0,0.0
INN20511,1.0,0.0,1.0,1.0,Meal Plan 1,0.0,,,2018.0,11.0,201811.0,,0.0,0.0,0.0,150.0,,1.0


In [7]:
cols_pred = list(data.columns[0:8].append(data.columns[11:17]))
cols_pred

# define list of numerical predictors
cols_pred_num = [col for col in cols_pred if data[col].dtype != 'O']
# define list of categorical predictors
cols_pred_cat = [col for col in cols_pred if data[col].dtype == 'O']

# potřeba upravit 0-1čkové?
print('Numerical predictors:')
print('---------------------')
print(data[cols_pred_num].dtypes)
print()
print('Categorical predictors:')
print('-----------------------')
print(data[cols_pred_cat].dtypes)

Numerical predictors:
---------------------
no_of_adults                            float64
no_of_children                          float64
no_of_weekend_nights                    float64
no_of_week_nights                       float64
required_car_parking_space              float64
lead_time                               float64
repeated_guest                          float64
no_of_previous_cancellations            float64
no_of_previous_bookings_not_canceled    float64
avg_price_per_room                      float64
no_of_special_requests                  float64
dtype: object

Categorical predictors:
-----------------------
type_of_meal_plan      object
room_type_reserved     object
market_segment_type    object
dtype: object


In [8]:
# define function to plot default rate in time for different samples
def cancelation_rate_in_time_per_sample(dt, col_target, col_month, col_sample):
    # group by over month and sample
    dt_grp = dt.groupby([col_month, col_sample]).agg(
        def_rt = (col_target, 'mean')
    ).reset_index()
    
    # pivot sample values to columns
    dt_grp_pivot = dt_grp.pivot(index = col_month, columns = col_sample, values = 'def_rt')

    # plot default rate in time
    lines = plt.plot(range(len(dt_grp_pivot)), dt_grp_pivot, marker = 'o')
    plt.xticks(range(len(dt_grp_pivot)), dt_grp_pivot.index, rotation = 90)
    # set legend
    plt.legend(iter(lines), tuple(dt_grp_pivot.columns), loc='best', bbox_to_anchor=(1.05, 1))
    
    plt.ylim([0, 0.1])
    plt.ylabel('default rate')
    plt.xlabel('month')
    
    ax = plt.gca()
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_color('gray')
    ax.spines['bottom'].set_color('gray')
    ax.tick_params(axis='y', colors='gray')
    ax.tick_params(axis='x', colors='gray')
        
    plt.show()

In [9]:
data['sample'] = 'default'

# define 'hoot' and 'oot' sample
data.loc[data[col_month] <= 201701, 'sample'] = 'hoot'
data.loc[data[col_month] >= 201911, 'sample'] = 'oot'

# define intime mask
intime_mask = (data[col_month] > 201701) & (data[col_month] < 201911)
# use train_test_split to split the intime into train and rest (don't forget seed); use stratification
data_train, data_rest = train_test_split(data[intime_mask], test_size=0.4, random_state = 12, stratify = (data[intime_mask][[col_month, col_target]]))
data.loc[data_train.index, 'sample'] = 'train'
# use train_test_split to split the rest into valid and test (don't forget seed); use stratification
data_valid, data_test = train_test_split(data_rest, test_size=0.5, random_state = 12, stratify = (data_rest[[col_month, col_target]]))
data.loc[data_valid.index, 'sample'] = 'valid'
data.loc[data_test.index, 'sample'] = 'test'

ValueError: With n_samples=0, test_size=0.4 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.