# Imports

In [13]:
%%capture
!pip install -U feature-engine

In [8]:
import os
import pandas as pd
import numpy as np
from copy import deepcopy
import category_encoders as ce


from utils import get_target, get_time_features

# Parameters

In [9]:
# --- Input location ----------------------------------------------------------
data_path = 'data'
train_filename = 'training_set_VU_DM.csv'

# True  -> the data-reading cell loads the whole training file.
# False -> the data-reading cell loads only a small random sample of rows.
USE_FULL = False

# --- Column roles ------------------------------------------------------------
# Column used as the grouping key when the data is split.
group_col = 'srch_id'
# Timestamp column; also used as the secondary sort key before splitting.
time_col = 'date_time'

# The three lists below are not referenced by the other cells visible in this
# notebook; presumably they are consumed by downstream modelling code.
CAT_FEATURES = [
    'site_id',
    'visitor_location_country_id',
    'prop_country_id',
    'prop_id',
    'srch_destination_id',
]

bool_cols = [
    'prop_brand_bool',
    'promotion_flag',
    'srch_saturday_night_bool',
    'random_bool',
]

drop_cols = [
    'position',
    'click_bool',
    'gross_bookings_usd',
    'booking_bool',
]

# Data reading

In [10]:
import random

# Seed for the development sample so that Restart & Run All always reads the
# same rows (the original drew from the unseeded global `random` state).
SAMPLE_SEED = 42

train_path = os.path.join(data_path, train_filename)

if USE_FULL:
    df = pd.read_csv(train_path)
else:
    # Probability of keeping any single data row.
    p = 0.01
    sample_rng = random.Random(SAMPLE_SEED)
    # Row 0 is the header and is always kept; every other row is kept
    # independently with probability p.
    # NOTE(review): rows are sampled one by one, not per search, so a sampled
    # srch_id will usually retain only part of its result list.
    df = pd.read_csv(train_path,
                     header=0,
                     skiprows=lambda i: i > 0 and sample_rng.random() > p)

# Parse the timestamp strings into datetimes for the time-feature cells below.
df[time_col] = pd.to_datetime(df[time_col])
df.shape

(49606, 54)

# Time-independent feature engineering

## Decompose time

In [11]:
# Add the calendar columns produced by `get_time_features`. A copy is passed so
# the helper cannot modify the frame created by the data-reading cell.
df = get_time_features(df.copy(), time_col)

# Some algorithms cannot consume datetime values, so the time column is replaced
# by integer codes. Rows are sorted chronologically before fitting, presumably so
# that the codes follow time order.
# Note: the encoder overwrites the values in `time_col`; the raw timestamps are
# not kept alongside the codes. Each model can pick the time columns it needs.
df = df.sort_values([time_col])
enc = ce.OrdinalEncoder(cols=[time_col])
df = enc.fit_transform(df)

basics are calculated
week-related are calculated


In [15]:
# Show every column when a frame is displayed. The fully qualified key is used
# because the short form 'max_columns' is resolved by pattern matching and is
# ambiguous on pandas versions that also define 'styler.render.max_columns'.
pd.set_option('display.max_columns', None)
# Preview the first five rows after the time columns were added.
df.head(n=5)

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,position,price_usd,promotion_flag,srch_destination_id,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,srch_query_affinity_score,orig_destination_distance,random_bool,comp1_rate,comp1_inv,comp1_rate_percent_diff,comp2_rate,comp2_inv,comp2_rate_percent_diff,comp3_rate,comp3_inv,comp3_rate_percent_diff,comp4_rate,comp4_inv,comp4_rate_percent_diff,comp5_rate,comp5_inv,comp5_rate_percent_diff,comp6_rate,comp6_inv,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool,day,month,year,quarter,week,dow,doy,days_in_month,is_weekend,is_month_start,is_month_end,is_quarter_start,is_quarter_end,is_year_start,is_year_end,week_id,season_num,week_summer_index
12040,81578,1,5,219,,,219,63105,4,4.0,0,5.16,0.6328,5.5,13,199.0,1,17012,1,15,2,0,1,1,,355.6,1,,,,0.0,0.0,,0.0,0.0,,,,,,,,,,,,,,,,,0,,0,1,11,2012,4,44,4,306,30,0,1,0,0,0,0,0,201244,4,10
12038,81578,1,5,219,,,219,6713,4,4.5,1,5.18,0.6015,5.78,20,229.0,0,17012,1,15,2,0,1,1,,355.8,1,,,,0.0,0.0,,0.0,0.0,,,,,0.0,0.0,,,,,,,,0.0,0.0,,0,,0,1,11,2012,4,44,4,306,30,0,1,0,0,0,0,0,201244,4,10
12039,81578,1,5,219,,,219,20587,4,4.5,0,5.16,0.6302,5.53,28,209.0,0,17012,1,15,2,0,1,1,,355.6,1,,,,,1.0,,1.0,1.0,10.0,,,,-1.0,1.0,6.0,,,,,,,-1.0,1.0,6.0,0,,0,1,11,2012,4,44,4,306,30,0,1,0,0,0,0,0,201244,4,10
47191,316376,2,5,219,,,219,117765,3,4.5,1,2.48,0.5868,4.68,1,103.0,0,19512,1,1,1,0,1,1,,301.26,0,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,0,,0,1,11,2012,4,44,4,306,30,0,1,0,0,0,0,0,201244,4,10
1707,11676,3,24,216,,,219,133072,2,3.5,1,1.79,0.0037,4.28,8,46.92,1,23904,3,103,2,0,1,0,,,1,,,,,,,,,,,,,,,,,,,,,,,,,0,,0,1,11,2012,4,44,4,306,30,0,1,0,0,0,0,0,201244,4,10


## Time cyclicals

In [21]:
# `CyclicalTransformer` was renamed to `CyclicalFeatures` in later feature-engine
# releases. Try the original name first so behaviour is unchanged wherever it
# still exists, and fall back to the new name otherwise.
try:
    from feature_engine.creation import CyclicalTransformer
except ImportError:
    from feature_engine.creation import CyclicalFeatures as CyclicalTransformer

# Period used for each calendar column when it is mapped to sine/cosine.
# With a period of 52, a week number of 53 lands on the same point as week 1.
max_values_dict = {'day': 31,
                   'month': 12,
                   'quarter': 4,
                   'week': 52,
                   'dow': 7,
                   'doy': 366,
                   'season_num': 4}

# drop_original=False keeps the integer columns next to the new
# `<name>_sin` / `<name>_cos` columns.
cyclical = CyclicalTransformer(variables=list(max_values_dict.keys()),
                               drop_original=False,
                               max_values=max_values_dict)

df = cyclical.fit_transform(df)

In [22]:
# Preview the first five rows, now including the *_sin / *_cos columns.
df.head(n=5)

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,position,price_usd,promotion_flag,srch_destination_id,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,srch_query_affinity_score,orig_destination_distance,random_bool,comp1_rate,comp1_inv,comp1_rate_percent_diff,comp2_rate,comp2_inv,comp2_rate_percent_diff,comp3_rate,comp3_inv,comp3_rate_percent_diff,comp4_rate,comp4_inv,comp4_rate_percent_diff,comp5_rate,comp5_inv,comp5_rate_percent_diff,comp6_rate,comp6_inv,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool,day,month,year,quarter,week,dow,doy,days_in_month,is_weekend,is_month_start,is_month_end,is_quarter_start,is_quarter_end,is_year_start,is_year_end,week_id,season_num,week_summer_index,day_sin,day_cos,month_sin,month_cos,quarter_sin,quarter_cos,week_sin,week_cos,dow_sin,dow_cos,doy_sin,doy_cos,season_num_sin,season_num_cos
12040,81578,1,5,219,,,219,63105,4,4.0,0,5.16,0.6328,5.5,13,199.0,1,17012,1,15,2,0,1,1,,355.6,1,,,,0.0,0.0,,0.0,0.0,,,,,,,,,,,,,,,,,0,,0,1,11,2012,4,44,4,306,30,0,1,0,0,0,0,0,201244,4,10,0.201299,0.97953,-0.5,0.866025,-2.449294e-16,1.0,-0.822984,0.568065,-0.433884,-0.900969,-0.857315,0.514793,-2.449294e-16,1.0
12038,81578,1,5,219,,,219,6713,4,4.5,1,5.18,0.6015,5.78,20,229.0,0,17012,1,15,2,0,1,1,,355.8,1,,,,0.0,0.0,,0.0,0.0,,,,,0.0,0.0,,,,,,,,0.0,0.0,,0,,0,1,11,2012,4,44,4,306,30,0,1,0,0,0,0,0,201244,4,10,0.201299,0.97953,-0.5,0.866025,-2.449294e-16,1.0,-0.822984,0.568065,-0.433884,-0.900969,-0.857315,0.514793,-2.449294e-16,1.0
12039,81578,1,5,219,,,219,20587,4,4.5,0,5.16,0.6302,5.53,28,209.0,0,17012,1,15,2,0,1,1,,355.6,1,,,,,1.0,,1.0,1.0,10.0,,,,-1.0,1.0,6.0,,,,,,,-1.0,1.0,6.0,0,,0,1,11,2012,4,44,4,306,30,0,1,0,0,0,0,0,201244,4,10,0.201299,0.97953,-0.5,0.866025,-2.449294e-16,1.0,-0.822984,0.568065,-0.433884,-0.900969,-0.857315,0.514793,-2.449294e-16,1.0
47191,316376,2,5,219,,,219,117765,3,4.5,1,2.48,0.5868,4.68,1,103.0,0,19512,1,1,1,0,1,1,,301.26,0,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,0,,0,1,11,2012,4,44,4,306,30,0,1,0,0,0,0,0,201244,4,10,0.201299,0.97953,-0.5,0.866025,-2.449294e-16,1.0,-0.822984,0.568065,-0.433884,-0.900969,-0.857315,0.514793,-2.449294e-16,1.0
1707,11676,3,24,216,,,219,133072,2,3.5,1,1.79,0.0037,4.28,8,46.92,1,23904,3,103,2,0,1,0,,,1,,,,,,,,,,,,,,,,,,,,,,,,,0,,0,1,11,2012,4,44,4,306,30,0,1,0,0,0,0,0,201244,4,10,0.201299,0.97953,-0.5,0.866025,-2.449294e-16,1.0,-0.822984,0.568065,-0.433884,-0.900969,-0.857315,0.514793,-2.449294e-16,1.0


# Data split

In [23]:
from utils import get_target, train_test_group_split

# Order rows by search and, within a search, by time before building the target.
df = df.sort_values([group_col, time_col])

# X is the full frame (no columns are removed here); y is computed row by row
# with the project helper `get_target`; groups holds each row's search id.
X = df
y = X.apply(get_target, axis=1)
groups = X[group_col]

# First split: 90% train+validation vs 10% test.
# Presumably `train_test_group_split` keeps all rows of a group on one side;
# the next cell inspects the train/test overlap.
# NOTE(review): no seed is passed here - confirm whether the helper is deterministic.
(X_train_val, X_test,
 y_train_val, y_test,
 groups_train_val, groups_test) = train_test_group_split(
    X, y, groups,
    group_array=groups,
    train_size=0.9,
)

# Second split: the train+validation part is divided 90% / 10% again.
(X_train, X_val,
 y_train, y_val,
 groups_train, groups_val) = train_test_group_split(
    X_train_val, y_train_val, groups_train_val,
    group_array=groups_train_val,
    train_size=0.9,
)

In [24]:
# A group (search id) must appear in exactly one partition. Fail loudly if any
# pair of partitions shares a group instead of relying on reading the output.
partition_groups = {'train': set(X_train[group_col]),
                    'val': set(X_val[group_col]),
                    'test': set(X_test[group_col])}

for left, right in (('train', 'val'), ('train', 'test'), ('val', 'test')):
    shared = partition_groups[left] & partition_groups[right]
    assert not shared, (
        f'{len(shared)} {group_col} value(s) appear in both {left} and {right}'
    )

# Same value the cell displayed originally; expected output: set()
partition_groups['train'].intersection(partition_groups['test'])

set()

In [25]:
# (rows, columns) of the train, validation and test feature tables, in that order.
tuple(part.shape for part in (X_train, X_val, X_test))

((40178, 86), (4466, 86), (4962, 86))

# Feature engineering

# Save tables

In [27]:
# Write every partition to `data_path` as CSV (the index is written as well).
# Order: features then target for train, validation and test.
tables_to_save = [
    (X_train, 'X_train.csv'),
    (y_train, 'y_train.csv'),
    (X_val, 'X_val.csv'),
    (y_val, 'y_val.csv'),
    (X_test, 'X_test.csv'),
    (y_test, 'y_test.csv'),
]

for table, csv_name in tables_to_save:
    table.to_csv(os.path.join(data_path, csv_name))