# Imports

In [1]:
%%capture
!pip install -U feature-engine

In [2]:
import os
import pandas as pd
import numpy as np
from copy import deepcopy
import category_encoders as ce
from feature_engine.creation import CyclicalTransformer
from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt

from utils import get_target
from features import get_time_features

# Parameters

In [3]:
USE_FULL = True
data_path = 'data'
train_filename = 'training_set_VU_DM.csv'
test_filename = 'test_set_VU_DM.csv'

CAT_FEATURES = ['site_id',
                'visitor_location_country_id',
                'prop_country_id',
                'prop_id',
                'srch_destination_id']

bool_cols = ['prop_brand_bool',
             'promotion_flag',
             'srch_saturday_night_bool',
             'random_bool']

drop_cols = ['position', 'click_bool',
             'gross_bookings_usd', 'booking_bool']

group_col = 'srch_id'
time_col = 'date_time'

# Data reading

In [4]:
if USE_FULL:
    df = pd.read_csv(os.path.join(data_path, train_filename))
else:
    import random
    p = 0.01
    df = pd.read_csv(os.path.join(data_path, train_filename),
                     header=0,
                     skiprows=lambda i: i > 0 and random.random() > p)
df.shape

(4958347, 54)

In [5]:
subm_df = pd.read_csv(os.path.join(data_path, test_filename))
subm_df.shape

(4959183, 50)

In [6]:
# split later. be very careful
df['subm'] = False
subm_df['subm'] = True
df = pd.concat([df, subm_df])

df['date_time'] = pd.to_datetime(df['date_time'])

df.shape

(9917530, 55)

# Missing values

In [None]:
df['prop_log_historical_price'].replace(0, np.nan, inplace=True)

# Time-independent feature engineering

## Decompose time

In [7]:
%%time

# various time-related columns are added to the df
df = get_time_features(deepcopy(df), 'date_time')

# time column is encoded as some algorithms don't work with datetime. 
# keep both. each specific model may load needed columns
enc = ce.OrdinalEncoder(cols=['date_time'])
df.sort_values(['date_time'], inplace=True)
df = enc.fit_transform(df)
# df.drop('date_time', axis=1, inplace=True)

CPU times: user 4min 52s, sys: 1min 8s, total: 6min
Wall time: 6min 17s


In [8]:
pd.set_option('max_columns', None)
df.head()

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,position,price_usd,promotion_flag,srch_destination_id,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,srch_query_affinity_score,orig_destination_distance,random_bool,comp1_rate,comp1_inv,comp1_rate_percent_diff,comp2_rate,comp2_inv,comp2_rate_percent_diff,comp3_rate,comp3_inv,comp3_rate_percent_diff,comp4_rate,comp4_inv,comp4_rate_percent_diff,comp5_rate,comp5_inv,comp5_rate_percent_diff,comp6_rate,comp6_inv,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool,subm,day,month,year,quarter,week,dow,doy,days_in_month,is_weekend,is_month_start,is_month_end,is_quarter_start,is_quarter_end,is_year_start,is_year_end,week_id,season_num,week_summer_index
485242,32491,1,24,216,,,225,327,4,4.5,0,3.66,0.1358,5.31,,142.71,0,14083,2,16,1,0,1,1,,,0,,,,1.0,0.0,8.0,1.0,0.0,4.0,1.0,0.0,4.0,1.0,0.0,7.0,,,,,,,,,,,,,True,1,11,2012,4,44,4,306,30,0,1,0,0,0,0,0,201244,4,10
485243,32491,1,24,216,,,225,20797,3,3.0,0,4.65,0.1955,4.97,,79.22,0,14083,2,16,1,0,1,1,,,0,,,,,,,-1.0,0.0,2.0,1.0,0.0,9.0,,0.0,,,,,,,,,,,,,,True,1,11,2012,4,44,4,306,30,0,1,0,0,0,0,0,201244,4,10
485244,32491,1,24,216,,,225,24202,3,0.0,0,4.73,0.3831,4.97,,93.07,1,14083,2,16,1,0,1,1,,,0,,,,,,,,,,1.0,0.0,9.0,,1.0,,,,,,,,,,,,,,True,1,11,2012,4,44,4,306,30,0,1,0,0,0,0,0,201244,4,10
485245,32491,1,24,216,,,225,33813,5,4.5,1,4.76,0.3524,6.13,,278.63,0,14083,2,16,1,0,1,1,,,0,,,,0.0,0.0,7.0,1.0,0.0,3.0,1.0,0.0,15.0,0.0,0.0,7.0,,,,,,,,,,,,,True,1,11,2012,4,44,4,306,30,0,1,0,0,0,0,0,201244,4,10
485246,32491,1,24,216,,,225,36599,3,4.0,0,4.64,0.3488,5.13,,100.92,1,14083,2,16,1,0,1,1,,,0,,,,1.0,0.0,6.0,0.0,0.0,8.0,1.0,0.0,11.0,1.0,0.0,2.0,,,,,,,,,,,,,True,1,11,2012,4,44,4,306,30,0,1,0,0,0,0,0,201244,4,10


## Time cyclicals

In [9]:
max_values_dict = {'day': 31,
                   'month': 12,
                   'quarter': 4,
                   'week': 52,
                   'dow': 7,
                   'doy': 366,
                   'season_num': 4}

cyclical = CyclicalTransformer(variables=list(max_values_dict.keys()), 
                               drop_original=False,
                               max_values=max_values_dict)

df = cyclical.fit_transform(df)
df.head()

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,position,price_usd,promotion_flag,srch_destination_id,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,srch_query_affinity_score,orig_destination_distance,random_bool,comp1_rate,comp1_inv,comp1_rate_percent_diff,comp2_rate,comp2_inv,comp2_rate_percent_diff,comp3_rate,comp3_inv,comp3_rate_percent_diff,comp4_rate,comp4_inv,comp4_rate_percent_diff,comp5_rate,comp5_inv,comp5_rate_percent_diff,comp6_rate,comp6_inv,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool,subm,day,month,year,quarter,week,dow,doy,days_in_month,is_weekend,is_month_start,is_month_end,is_quarter_start,is_quarter_end,is_year_start,is_year_end,week_id,season_num,week_summer_index,day_sin,day_cos,month_sin,month_cos,quarter_sin,quarter_cos,week_sin,week_cos,dow_sin,dow_cos,doy_sin,doy_cos,season_num_sin,season_num_cos
485242,32491,1,24,216,,,225,327,4,4.5,0,3.66,0.1358,5.31,,142.71,0,14083,2,16,1,0,1,1,,,0,,,,1.0,0.0,8.0,1.0,0.0,4.0,1.0,0.0,4.0,1.0,0.0,7.0,,,,,,,,,,,,,True,1,11,2012,4,44,4,306,30,0,1,0,0,0,0,0,201244,4,10,0.201299,0.97953,-0.5,0.866025,-2.449294e-16,1.0,-0.822984,0.568065,-0.433884,-0.900969,-0.857315,0.514793,-2.449294e-16,1.0
485243,32491,1,24,216,,,225,20797,3,3.0,0,4.65,0.1955,4.97,,79.22,0,14083,2,16,1,0,1,1,,,0,,,,,,,-1.0,0.0,2.0,1.0,0.0,9.0,,0.0,,,,,,,,,,,,,,True,1,11,2012,4,44,4,306,30,0,1,0,0,0,0,0,201244,4,10,0.201299,0.97953,-0.5,0.866025,-2.449294e-16,1.0,-0.822984,0.568065,-0.433884,-0.900969,-0.857315,0.514793,-2.449294e-16,1.0
485244,32491,1,24,216,,,225,24202,3,0.0,0,4.73,0.3831,4.97,,93.07,1,14083,2,16,1,0,1,1,,,0,,,,,,,,,,1.0,0.0,9.0,,1.0,,,,,,,,,,,,,,True,1,11,2012,4,44,4,306,30,0,1,0,0,0,0,0,201244,4,10,0.201299,0.97953,-0.5,0.866025,-2.449294e-16,1.0,-0.822984,0.568065,-0.433884,-0.900969,-0.857315,0.514793,-2.449294e-16,1.0
485245,32491,1,24,216,,,225,33813,5,4.5,1,4.76,0.3524,6.13,,278.63,0,14083,2,16,1,0,1,1,,,0,,,,0.0,0.0,7.0,1.0,0.0,3.0,1.0,0.0,15.0,0.0,0.0,7.0,,,,,,,,,,,,,True,1,11,2012,4,44,4,306,30,0,1,0,0,0,0,0,201244,4,10,0.201299,0.97953,-0.5,0.866025,-2.449294e-16,1.0,-0.822984,0.568065,-0.433884,-0.900969,-0.857315,0.514793,-2.449294e-16,1.0
485246,32491,1,24,216,,,225,36599,3,4.0,0,4.64,0.3488,5.13,,100.92,1,14083,2,16,1,0,1,1,,,0,,,,1.0,0.0,6.0,0.0,0.0,8.0,1.0,0.0,11.0,1.0,0.0,2.0,,,,,,,,,,,,,True,1,11,2012,4,44,4,306,30,0,1,0,0,0,0,0,201244,4,10,0.201299,0.97953,-0.5,0.866025,-2.449294e-16,1.0,-0.822984,0.568065,-0.433884,-0.900969,-0.857315,0.514793,-2.449294e-16,1.0


## Competitors

In [10]:
for i in range(1, 9):
    df[f'comp{i}_rate_percent_diff_signed'] = df[f'comp{i}_rate'] * df[f'comp{i}_rate_percent_diff']
    df.drop([f'comp{i}_rate'], axis=1, inplace=True)  # bcs it's just a sign for the new feature

In [11]:
df[sorted([c for c in df.columns if 'comp' in c and 'rate' in c])].head()

Unnamed: 0,comp1_rate_percent_diff,comp1_rate_percent_diff_signed,comp2_rate_percent_diff,comp2_rate_percent_diff_signed,comp3_rate_percent_diff,comp3_rate_percent_diff_signed,comp4_rate_percent_diff,comp4_rate_percent_diff_signed,comp5_rate_percent_diff,comp5_rate_percent_diff_signed,comp6_rate_percent_diff,comp6_rate_percent_diff_signed,comp7_rate_percent_diff,comp7_rate_percent_diff_signed,comp8_rate_percent_diff,comp8_rate_percent_diff_signed
485242,,,8.0,8.0,4.0,4.0,4.0,4.0,7.0,7.0,,,,,,
485243,,,,,2.0,-2.0,9.0,9.0,,,,,,,,
485244,,,,,,,9.0,9.0,,,,,,,,
485245,,,7.0,0.0,3.0,3.0,15.0,15.0,7.0,0.0,,,,,,
485246,,,6.0,6.0,8.0,0.0,11.0,11.0,2.0,2.0,,,,,,


## Money transformations

In [12]:
def num_transformations(df, cols, powers=[0.33, 0.5, 2, 3], log_bases=[2, 10, np.e]):
    for c in cols:
        for p in powers:
            if (p <=2) or (p > 2 and all(df[c] < 100)): 
                df[f'{c}_pow_{p}'] = df[c]**p
        
        for log_base in log_bases:
            df[f'{c}_log_{log_base}'] = np.log(df[c]) / np.log(log_base)
        
        df[c+'_reciprocal'] = 1/(df[c] + 1e-6)
        
        if all(df[c] < 10): 
            df[c+'_exp'] = np.exp(df[c])
    return df

In [13]:
df = num_transformations(df,
                         ['price_usd', 'visitor_hist_adr_usd',
                          'prop_log_historical_price'],
                         powers=[0.33, 0.5, 2, 3],
                         log_bases=[2, 10, round(np.e, 2)])

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [None]:
df = num_transformations(df,
                         [f'comp{i}_rate_percent_diff' if i in range(1, 9)],
                         powers=[],
                         log_bases=[2, 10])

##

**visitor_hist_adr_usd** - The mean price per night (in US$) of the hotels the customer has previously purchased; null signifies there is no purchase history on the customer  

**price_usd** - Displayed price of the hotel for the given search.  Note that different countries have different conventions regarding displaying taxes and fees and the value may be per night or for the whole stay

In [14]:
df['price_diff_to_usr_hist'] = df['price_usd'] - df['visitor_hist_adr_usd']
df['price_ratio_to_usr_hist'] = df['price_usd'] / df['visitor_hist_adr_usd']

**prop_log_historical_price** - The logarithm of the mean price of the hotel over the last trading period. A 0 will occur if the hotel was not sold in that period.  

In [15]:
df['price_diff_to_place_hist_price'] = df['price_usd'] - np.exp(df['prop_log_historical_price'])
df['place_hist_price_diff_usr_hist'] = np.exp(df['prop_log_historical_price']) - df['visitor_hist_adr_usd']

**gross_booking_usd** - Total value of the transaction.  This can differ from the price_usd due to taxes, fees, conventions on multiple day bookings and purchase of a room type other than the one shown in the search
ID of the destination where the hotel search was performed

In [16]:
df.head(2)

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,position,price_usd,promotion_flag,srch_destination_id,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,srch_query_affinity_score,orig_destination_distance,random_bool,comp1_inv,comp1_rate_percent_diff,comp2_inv,comp2_rate_percent_diff,comp3_inv,comp3_rate_percent_diff,comp4_inv,comp4_rate_percent_diff,comp5_inv,comp5_rate_percent_diff,comp6_inv,comp6_rate_percent_diff,comp7_inv,comp7_rate_percent_diff,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool,subm,day,month,year,quarter,week,dow,doy,days_in_month,is_weekend,is_month_start,is_month_end,is_quarter_start,is_quarter_end,is_year_start,is_year_end,week_id,season_num,week_summer_index,day_sin,day_cos,month_sin,month_cos,quarter_sin,quarter_cos,week_sin,week_cos,dow_sin,dow_cos,doy_sin,doy_cos,season_num_sin,season_num_cos,comp1_rate_percent_diff_signed,comp2_rate_percent_diff_signed,comp3_rate_percent_diff_signed,comp4_rate_percent_diff_signed,comp5_rate_percent_diff_signed,comp6_rate_percent_diff_signed,comp7_rate_percent_diff_signed,comp8_rate_percent_diff_signed,price_usd_pow_0.33,price_usd_pow_0.5,price_usd_pow_2,price_usd_log_2,price_usd_log_10,price_usd_log_2.72,price_usd_reciprocal,visitor_hist_adr_usd_pow_0.33,visitor_hist_adr_usd_pow_0.5,visitor_hist_adr_usd_pow_2,visitor_hist_adr_usd_log_2,visitor_hist_adr_usd_log_10,visitor_hist_adr_usd_log_2.72,visitor_hist_adr_usd_reciprocal,prop_log_historical_price_pow_0.33,prop_log_historical_price_pow_0.5,prop_log_historical_price_pow_2,prop_log_historical_price_pow_3,prop_log_historical_price_log_2,prop_log_historical_price_log_10,prop_log_historical_price_log_2.72,prop_log_historical_price_reciprocal,prop_log_historical_price_exp,price_diff_to_usr_hist,price_ratio_to_usr_hist,price_diff_to_place_hist_price,place_hist_price_diff_usr_hist
485242,32491,1,24,216,,,225,327,4,4.5,0,3.66,0.1358,5.31,,142.71,0,14083,2,16,1,0,1,1,,,0,,,0.0,8.0,0.0,4.0,0.0,4.0,0.0,7.0,,,,,,,,,,True,1,11,2012,4,44,4,306,30,0,1,0,0,0,0,0,201244,4,10,0.201299,0.97953,-0.5,0.866025,-2.449294e-16,1.0,-0.822984,0.568065,-0.433884,-0.900969,-0.857315,0.514793,-2.449294e-16,1.0,,8.0,4.0,4.0,7.0,,,,5.140081,11.946129,20366.1441,7.156943,2.154454,4.957682,0.007007,,,,,,,,1.734927,2.304344,28.1961,149.721291,2.408712,0.725095,1.668538,0.188324,202.350228,,,-59.640228,
485243,32491,1,24,216,,,225,20797,3,3.0,0,4.65,0.1955,4.97,,79.22,0,14083,2,16,1,0,1,1,,,0,,,,,0.0,2.0,0.0,9.0,0.0,,,,,,,,,,,True,1,11,2012,4,44,4,306,30,0,1,0,0,0,0,0,201244,4,10,0.201299,0.97953,-0.5,0.866025,-2.449294e-16,1.0,-0.822984,0.568065,-0.433884,-0.900969,-0.857315,0.514793,-2.449294e-16,1.0,,,-2.0,9.0,,,,,4.232681,8.900562,6275.8084,6.307793,1.898835,4.369468,0.012623,,,,,,,,1.697452,2.22935,24.7009,122.763473,2.313246,0.696356,1.602407,0.201207,144.026887,,,-64.806887,


In [17]:
df['usr_extra_pay'] = df['gross_bookings_usd'] - df['price_usd']

## Days of staying

**srch_length_of_stay** - Number of nights stay that was searched


**srch_booking_window** - Number of days in the future the hotel stay started from the search date

In [18]:
df['booking_weeks_ahead'] = df['srch_booking_window'] / 7
df['booking_months_ahead'] = df['srch_booking_window'] / 30

df['len_of_stay_to_booking_ahead_ratio'] = df['srch_booking_window'] / df['srch_length_of_stay']
df['len_of_stay_to_booking_ahead_diff'] = df['srch_booking_window'] - df['srch_length_of_stay']

## Family

**srch_adults_count** - The number of adults specified in the hotel room

 
**srch_children_count** - The number of (extra occupancy) children specified in the hotel room

 
**srch_room_count** - Number of hotel rooms specified in the search

In [19]:
df['children_per_adult'] = df['srch_children_count'] / df['srch_adults_count']
df['total_people'] = df['srch_children_count'] + df['srch_adults_count']
df['people_per_room'] = df['total_people'] / df['srch_room_count']

## Others

**srch_query_affinity_score** - The log of the probability a hotel will be clicked on in Internet searches (hence the values are negative)  A null signifies there are no data (i.e. hotel did not register in any searches)

In [20]:
df['srch_query_affinity_score_prob'] = np.exp(df['srch_query_affinity_score'])

# Data split

In [21]:
df.shape

(9917530, 123)

In [22]:
df.sort_values([group_col, time_col], inplace=True)

In [23]:
# further all feature engineering will be separate

subm_df = deepcopy(df[df['subm'] == True])
df = deepcopy(df[df['subm'] == False])

subm_df.drop(['subm'], axis=1, inplace=True)
df.drop(['subm'], axis=1, inplace=True)

df.shape, subm_df.shape

((4958347, 122), (4959183, 122))

In [24]:
from utils import train_test_group_split, get_target

X = df.drop(drop_cols, axis=1)
y = df.apply(get_target, axis=1)
groups = X[group_col]


X_train_val, X_test, y_train_val, y_test, groups_train_val, groups_test = train_test_group_split(X, y, groups,
                                                                                                 group_array=groups,
                                                                                                 train_size=0.9)
# del X, y

X_train, X_val, y_train, y_val, groups_train, groups_val = train_test_group_split(X_train_val,
                                                                                  y_train_val,
                                                                                  groups_train_val,
                                                                                  group_array=groups_train_val,
                                                                                  train_size=0.9)

In [25]:
# make sure that this is empty
set(X_train[group_col]).intersection(set(X_test[group_col]))

set()

In [26]:
X_train.shape, X_val.shape, X_test.shape

((4016262, 118), (446248, 118), (495837, 118))

# Feature engineering with respect to the time 

# Correlations

In [27]:
profile = ProfileReport(X_train, title="Pandas Profiling Report", minimal=True)
profile.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

In [29]:
profile = ProfileReport(X_train.sample(1000), title="Pandas Profiling Report")
profile.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

# Save tables

In [28]:
X_train.to_csv(os.path.join(data_path, 'X_train.csv'), index=False)
y_train.to_csv(os.path.join(data_path, 'y_train.csv'), index=False)
groups_train.to_csv(os.path.join(data_path, 'groups_train.csv'), index=False)
print('train is done')

X_val.to_csv(os.path.join(data_path, 'X_val.csv'), index=False)
y_val.to_csv(os.path.join(data_path, 'y_val.csv'), index=False)
groups_val.to_csv(os.path.join(data_path, 'groups_val.csv'), index=False)
print('val is done')

X_test.to_csv(os.path.join(data_path, 'X_test.csv'), index=False)
y_test.to_csv(os.path.join(data_path, 'y_test.csv'), index=False)
groups_test.to_csv(os.path.join(data_path, 'groups_test.csv'), index=False)
print('test is done')

subm_df.to_csv(os.path.join(data_path, 'submission_df_preprocessed.csv'), index=False)
print('submission is done')

train is done
val is done
test is done
submission is done
