In [None]:
# Install libraries

%%capture

! pip install git+https://github.com/microsoft/FLAML.git#egg=flaml[catboost]
! pip install optuna
! pip install ray[tune]

In [None]:
# Download the dataset

%%capture

! rm -rf *
! wget http://156.253.5.172/hotels.zip
! unzip hotels.zip
! rm hotels.zip

In [None]:
# Import libraries

%%capture

from tqdm.notebook import tqdm
import pandas as pd
import matplotlib
import numpy as np
from pathlib import Path
import pickle
import gc
import datetime

# Machine learning libraries

from flaml import AutoML
from sklearn.metrics import roc_auc_score

# Pandas settings to show more columns are rows in the jupyter notebook

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 50000)

In [None]:
# Configuration shared by the rest of the notebook: data locations,
# evaluation metric, target column and the earliest search date to keep.

data_dir = Path("hotels")
train_file, test_file = (data_dir / name for name in ("train.csv", "test.csv"))

target_name = "is_booking"
scoring = "roc_auc"
base_date = datetime.datetime(year=2020, month=10, day=1)

In [None]:
# Load the raw training CSV.
# The three date columns are parsed as datetimes, the flag columns are read as
# bool and the small count columns as int8.

raw_df = pd.read_csv(
    train_file,
    parse_dates=["search_date", "checkIn_date", "checkOut_date"],
    dtype={
        **dict.fromkeys(["is_booking", "is_package", "is_mobile"], bool),
        **dict.fromkeys(["n_adults", "n_children", "n_rooms"], "int8"),
    },
)

# Resulting column types:

raw_df.dtypes

user                      object
search_date       datetime64[ns]
channel                   object
is_mobile                   bool
is_package                  bool
destination               object
checkIn_date      datetime64[ns]
checkOut_date     datetime64[ns]
n_adults                    int8
n_children                  int8
n_rooms                     int8
hotel_category            object
is_booking                  bool
dtype: object

In [None]:
# Keep only the searches made on or after base_date, remove the user ids and
# convert the prefixed string ids to compact unsigned integers.
#
# Everything is done in one chain that returns a brand-new frame. The earlier
# version mutated a boolean-filtered slice of raw_df (inplace drop + column
# assignment), which pandas reports as SettingWithCopyWarning because it cannot
# tell whether the writes are meant for the slice or for raw_df.
#
# .str[1:] drops the first character of every value in a vectorised way.
# NOTE(review): assumes every id carries exactly one leading prefix character
# followed by digits that fit the target dtype - confirm against the raw data.

df = (
    raw_df.loc[raw_df["search_date"] >= base_date]
    .drop(columns=["user"])
    .assign(
        channel=lambda frame: frame["channel"].str[1:].astype("uint8"),
        destination=lambda frame: frame["destination"].str[1:].astype("uint32"),
        hotel_category=lambda frame: frame["hotel_category"].str[1:].astype("uint8"),
    )
)

# The full raw frame is not used again below; release its memory
del raw_df
gc.collect()

# Column types are:

df.dtypes

user                      uint32
search_date       datetime64[ns]
channel                    uint8
is_mobile                   bool
is_package                  bool
destination               uint32
checkIn_date      datetime64[ns]
checkOut_date     datetime64[ns]
n_adults                    int8
n_children                  int8
n_rooms                     int8
hotel_category             uint8
is_booking                  bool
dtype: object

In [None]:
# Report the dataset size and the share of rows that are bookings

n_rows = df.shape[0]
n_booked = int(df["is_booking"].sum())  # True counts as 1, so this is the number of booked rows
booking_pct = round(100 * n_booked / n_rows, 2)

print(f"Number of rows is : {n_rows}")
print(f"Booking percentage is : {booking_pct}%")

df.head()

Number of rows is : 5355570
Booking percentage is : 7.3%


Unnamed: 0,user,search_date,channel,is_mobile,is_package,destination,checkIn_date,checkOut_date,n_adults,n_children,n_rooms,hotel_category,is_booking
29387405,629189,2020-10-01 00:00:18,9,False,False,8744,2020-11-29,2020-11-30,1,0,1,6,False
29387406,93135,2020-10-01 00:00:21,9,False,False,4688,2020-10-27,2020-10-30,2,0,1,97,False
29387407,360774,2020-10-01 00:00:35,9,True,True,7635,2020-12-08,2020-12-13,2,0,1,41,False
29387408,107833,2020-10-01 00:00:42,9,False,False,11683,2020-10-02,2020-10-03,1,0,1,98,False
29387409,726923,2020-10-01 00:00:43,1,False,False,24689,2020-11-21,2020-11-23,2,0,1,91,False


In [None]:
# Count the missing values in every column

df.isna().sum()

user                 0
search_date          0
channel              0
is_mobile            0
is_package           0
destination          0
checkIn_date      1790
checkOut_date     1789
n_adults             0
n_children           0
n_rooms              0
hotel_category       0
is_booking           0
dtype: int64

In [None]:
# Drop the rows without a check-in or check-out date: there are only a few of
# them compared to the dataset size

df = df.dropna(subset=['checkIn_date', 'checkOut_date'])

# Verify that no missing values are left

df.isna().sum()

user              0
search_date       0
channel           0
is_mobile         0
is_package        0
destination       0
checkIn_date      0
checkOut_date     0
n_adults          0
n_children        0
n_rooms           0
hotel_category    0
is_booking        0
dtype: int64

In [None]:
# Hold-out split without shuffling: the first 80% of the rows are used for
# training, the remaining 20% for validation.


def _booking_pct(frame):
    """Return the percentage of rows in `frame` with is_booking set, rounded to 2 decimals."""
    return round(100 * int(frame["is_booking"].sum()) / frame.shape[0], 2)


df_len = len(df)
split_at = int(df_len*0.8)
data_train, data_valid = df.iloc[:split_at], df.iloc[split_at:]

print(f"Train booking percentage is : {_booking_pct(data_train)}%")
print(f"Validation Booking percentage is : {_booking_pct(data_valid)}%")

# Each intermediate frame is deleted as soon as its features/target have been
# extracted, so only xs / y / valid_xs / valid_y stay referenced.
del df

y, xs = data_train[target_name], data_train.drop(columns=target_name)
del data_train

valid_y, valid_xs = data_valid[target_name], data_valid.drop(columns=target_name)
del data_valid

Train booking percentage is : 7.35%
Validation Booking percentage is : 7.13%


We did not do any feature engineering, but the following features could have been added:

* Days to go (checkIn - searchDate)
* Number of days to stay (checkOut - checkIn)
* Number of people (number of adults + number of children)
* People per room (number of people divided by number of rooms)
* Combination of the is_mobile and is_package columns



In [None]:
# Evaluation metric function

def calc_auc_roc(y, prob_pred):
    """Return the ROC AUC of the scores `prob_pred` against the true labels `y`.

    Thin wrapper around sklearn's `roc_auc_score`.
    """
    auc = roc_auc_score(y_true=y, y_score=prob_pred)
    return auc

# Run one FLAML AutoML search per estimator, pickle each fitted AutoML object
# and collect the training / validation ROC AUC of every run.

gc.collect()

model_names = {"lgbm":"Light Gradient Boosting", "catboost": "Cat Boost"}

# AutoML goal and constraints shared by every run; only the estimator differs
base_settings = {
    "time_budget": 1800,
    "metric": scoring,
    "task": 'classification',
    "verbose": 0,
    "n_jobs": -1,
    "eval_method": "cv",
    "n_splits": 5,
    "mem_thres": 11294967296,
}

performance_list = []

for key, display_name in tqdm(model_names.items()):
  model = AutoML()
  settings = dict(base_settings, estimator_list=[key])
  model.fit(xs, y, **settings)

  # Save the model under the estimator key so it can be reloaded later

  with open(f'{key}.pkl', 'wb') as f:
    pickle.dump(model, f, protocol=pickle.HIGHEST_PROTOCOL)

  # Log the performance (column 1 of predict_proba is used as the score)

  performance = {
      "model": display_name,
      "training (auc_roc)": calc_auc_roc(y, model.predict_proba(xs)[:,1]),
      "validation (auc_roc)": calc_auc_roc(valid_y, model.predict_proba(valid_xs)[:,1]),
  }
  performance_list.append(performance)
  gc.collect()

performance_df = pd.DataFrame(performance_list).round(3)
display(performance_df)

  0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,model,training (auc_roc),validation (auc_roc)
0,Light Gradient Boosting,0.698,0.689
1,Cat Boost,0.695,0.687


In [None]:
# Reload the pickled AutoML run that is used for the final predictions.
# The pickle is read from '<best_model_name>.pkl' in the working directory;
# never point this at a pickle from an untrusted source.

best_model_name = "lgbm"

with Path(f'{best_model_name}.pkl').open('rb') as model_file:
  selected_model = pickle.load(model_file)

print(f"selected model is {best_model_name}.\n", "Its parameters are:", sep="\n")
selected_model.model.get_params()

selected model is lgbm.

Its parameters are:


{'_estimator_type': 'classifier',
 'colsample_bytree': 0.47342795260738935,
 'learning_rate': 0.024282186997032957,
 'max_bin': 1023,
 'min_child_samples': 9,
 'n_estimators': 189,
 'n_jobs': -1,
 'num_leaves': 20,
 'reg_alpha': 0.0009765625,
 'reg_lambda': 0.003241489691566038,
 'task': 'binary',
 'verbose': -1}

In [None]:
# Load the test CSV and apply the same cleaning steps as for the training data:
# drop the user ids and turn the prefixed string ids into unsigned integers.

test_df = pd.read_csv(
    test_file,
    parse_dates=["search_date", "checkIn_date", "checkOut_date"],
    dtype={
        **dict.fromkeys(["is_booking", "is_package", "is_mobile"], bool),
        **dict.fromkeys(["n_adults", "n_children", "n_rooms"], "int8"),
    },
)

# Remove user ids

test_df = test_df.drop(columns=["user"])

# Change string to int: strip the first character, then cast to the target dtype

id_dtypes = {"channel": "uint8", "destination": "uint32", "hotel_category": "uint8"}
for col, id_dtype in id_dtypes.items():
  test_df[col] = test_df[col].str[1:].astype(id_dtype)

# Column types are:

test_df.dtypes

search_date       datetime64[ns]
channel                    uint8
is_mobile                   bool
is_package                  bool
destination               uint32
checkIn_date      datetime64[ns]
checkOut_date     datetime64[ns]
n_adults                    int8
n_children                  int8
n_rooms                     int8
hotel_category             uint8
dtype: object

In [None]:
# Score the test set with the selected model (column 1 of predict_proba) and
# write the scores to output.csv as a single "prediction" column.

test_scores = selected_model.predict_proba(test_df)[:,1]
submission_df = pd.DataFrame({"prediction": test_scores})
submission_df.to_csv("output.csv", index=False)
submission_df.head()

Unnamed: 0,prediction
0,0.06578
1,0.139641
2,0.078441
3,0.013434
4,0.060074
