# Imports

In [1]:
%%capture
!pip install -U feature-engine

In [49]:
import os
import pandas as pd
import numpy as np
from copy import deepcopy
import category_encoders as ce
from feature_engine.creation import CyclicalTransformer
from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt

from utils import get_target, get_time_features

# Parameters

In [15]:
USE_FULL = True
data_path = 'data'
train_filename = 'training_set_VU_DM.csv'
test_filename = 'test_set_VU_DM.csv'

CAT_FEATURES = ['site_id',
                'visitor_location_country_id',
                'prop_country_id',
                'prop_id',
                'srch_destination_id']

bool_cols = ['prop_brand_bool',
             'promotion_flag',
             'srch_saturday_night_bool',
             'random_bool']

drop_cols = ['position', 'click_bool',
             'gross_bookings_usd', 'booking_bool']

group_col = 'srch_id'
time_col = 'date_time'

# Data reading

In [17]:
if USE_FULL:
    df = pd.read_csv(os.path.join(data_path, train_filename))
else:
    import random
    p = 0.01
    df = pd.read_csv(os.path.join(data_path, train_filename),
                     header=0,
                     skiprows=lambda i: i > 0 and random.random() > p)
df.shape

(4958347, 54)

In [18]:
subm_df = pd.read_csv(os.path.join(data_path, test_filename))
subm_df.shape

(4959183, 50)

In [21]:
# split later. be very careful
df['subm'] = False
subm_df['subm'] = True
df = pd.concat([df, subm_df])

df['date_time'] = pd.to_datetime(df['date_time'])

In [29]:
df.shape

(9917530, 87)

# Time-independent feature engineering

## Decompose time

In [22]:
%%time

# various time-related columns are added to the df
df = get_time_features(deepcopy(df), 'date_time')

# time column is encoded as some algorithms don't work with datetime. 
# keep both. each specific model may load needed columns
enc = ce.OrdinalEncoder(cols=['date_time'])
df.sort_values(['date_time'], inplace=True)
df = enc.fit_transform(df)
# df.drop('date_time', axis=1, inplace=True)

CPU times: user 5min 7s, sys: 1min 34s, total: 6min 41s
Wall time: 7min 38s


In [23]:
pd.set_option('max_columns', None)
df.head()

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,position,price_usd,promotion_flag,srch_destination_id,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,srch_query_affinity_score,orig_destination_distance,random_bool,comp1_rate,comp1_inv,comp1_rate_percent_diff,comp2_rate,comp2_inv,comp2_rate_percent_diff,comp3_rate,comp3_inv,comp3_rate_percent_diff,comp4_rate,comp4_inv,comp4_rate_percent_diff,comp5_rate,comp5_inv,comp5_rate_percent_diff,comp6_rate,comp6_inv,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool,subm,day,month,year,quarter,week,dow,doy,days_in_month,is_weekend,is_month_start,is_month_end,is_quarter_start,is_quarter_end,is_year_start,is_year_end,week_id,season_num,week_summer_index
485242,32491,1,24,216,,,225,327,4,4.5,0,3.66,0.1358,5.31,,142.71,0,14083,2,16,1,0,1,1,,,0,,,,1.0,0.0,8.0,1.0,0.0,4.0,1.0,0.0,4.0,1.0,0.0,7.0,,,,,,,,,,,,,True,1,11,2012,4,44,4,306,30,0,1,0,0,0,0,0,201244,4,10
485243,32491,1,24,216,,,225,20797,3,3.0,0,4.65,0.1955,4.97,,79.22,0,14083,2,16,1,0,1,1,,,0,,,,,,,-1.0,0.0,2.0,1.0,0.0,9.0,,0.0,,,,,,,,,,,,,,True,1,11,2012,4,44,4,306,30,0,1,0,0,0,0,0,201244,4,10
485244,32491,1,24,216,,,225,24202,3,0.0,0,4.73,0.3831,4.97,,93.07,1,14083,2,16,1,0,1,1,,,0,,,,,,,,,,1.0,0.0,9.0,,1.0,,,,,,,,,,,,,,True,1,11,2012,4,44,4,306,30,0,1,0,0,0,0,0,201244,4,10
485245,32491,1,24,216,,,225,33813,5,4.5,1,4.76,0.3524,6.13,,278.63,0,14083,2,16,1,0,1,1,,,0,,,,0.0,0.0,7.0,1.0,0.0,3.0,1.0,0.0,15.0,0.0,0.0,7.0,,,,,,,,,,,,,True,1,11,2012,4,44,4,306,30,0,1,0,0,0,0,0,201244,4,10
485246,32491,1,24,216,,,225,36599,3,4.0,0,4.64,0.3488,5.13,,100.92,1,14083,2,16,1,0,1,1,,,0,,,,1.0,0.0,6.0,0.0,0.0,8.0,1.0,0.0,11.0,1.0,0.0,2.0,,,,,,,,,,,,,True,1,11,2012,4,44,4,306,30,0,1,0,0,0,0,0,201244,4,10


## Time cyclicals

In [25]:
%%time

max_values_dict = {'day': 31,
                   'month': 12,
                   'quarter': 4,
                   'week': 52,
                   'dow': 7,
                   'doy': 366,
                   'season_num': 4}

cyclical = CyclicalTransformer(variables=list(max_values_dict.keys()), 
                               drop_original=False,
                               max_values=max_values_dict)

df = cyclical.fit_transform(df)
df.head()

CPU times: user 5.95 s, sys: 4.48 s, total: 10.4 s
Wall time: 10.5 s


Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,position,price_usd,promotion_flag,srch_destination_id,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,srch_query_affinity_score,orig_destination_distance,random_bool,comp1_rate,comp1_inv,comp1_rate_percent_diff,comp2_rate,comp2_inv,comp2_rate_percent_diff,comp3_rate,comp3_inv,comp3_rate_percent_diff,comp4_rate,comp4_inv,comp4_rate_percent_diff,comp5_rate,comp5_inv,comp5_rate_percent_diff,comp6_rate,comp6_inv,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool,subm,day,month,year,quarter,week,dow,doy,days_in_month,is_weekend,is_month_start,is_month_end,is_quarter_start,is_quarter_end,is_year_start,is_year_end,week_id,season_num,week_summer_index,day_sin,day_cos,month_sin,month_cos,quarter_sin,quarter_cos,week_sin,week_cos,dow_sin,dow_cos,doy_sin,doy_cos,season_num_sin,season_num_cos
485242,32491,1,24,216,,,225,327,4,4.5,0,3.66,0.1358,5.31,,142.71,0,14083,2,16,1,0,1,1,,,0,,,,1.0,0.0,8.0,1.0,0.0,4.0,1.0,0.0,4.0,1.0,0.0,7.0,,,,,,,,,,,,,True,1,11,2012,4,44,4,306,30,0,1,0,0,0,0,0,201244,4,10,0.201299,0.97953,-0.5,0.866025,-2.449294e-16,1.0,-0.822984,0.568065,-0.433884,-0.900969,-0.857315,0.514793,-2.449294e-16,1.0
485243,32491,1,24,216,,,225,20797,3,3.0,0,4.65,0.1955,4.97,,79.22,0,14083,2,16,1,0,1,1,,,0,,,,,,,-1.0,0.0,2.0,1.0,0.0,9.0,,0.0,,,,,,,,,,,,,,True,1,11,2012,4,44,4,306,30,0,1,0,0,0,0,0,201244,4,10,0.201299,0.97953,-0.5,0.866025,-2.449294e-16,1.0,-0.822984,0.568065,-0.433884,-0.900969,-0.857315,0.514793,-2.449294e-16,1.0
485244,32491,1,24,216,,,225,24202,3,0.0,0,4.73,0.3831,4.97,,93.07,1,14083,2,16,1,0,1,1,,,0,,,,,,,,,,1.0,0.0,9.0,,1.0,,,,,,,,,,,,,,True,1,11,2012,4,44,4,306,30,0,1,0,0,0,0,0,201244,4,10,0.201299,0.97953,-0.5,0.866025,-2.449294e-16,1.0,-0.822984,0.568065,-0.433884,-0.900969,-0.857315,0.514793,-2.449294e-16,1.0
485245,32491,1,24,216,,,225,33813,5,4.5,1,4.76,0.3524,6.13,,278.63,0,14083,2,16,1,0,1,1,,,0,,,,0.0,0.0,7.0,1.0,0.0,3.0,1.0,0.0,15.0,0.0,0.0,7.0,,,,,,,,,,,,,True,1,11,2012,4,44,4,306,30,0,1,0,0,0,0,0,201244,4,10,0.201299,0.97953,-0.5,0.866025,-2.449294e-16,1.0,-0.822984,0.568065,-0.433884,-0.900969,-0.857315,0.514793,-2.449294e-16,1.0
485246,32491,1,24,216,,,225,36599,3,4.0,0,4.64,0.3488,5.13,,100.92,1,14083,2,16,1,0,1,1,,,0,,,,1.0,0.0,6.0,0.0,0.0,8.0,1.0,0.0,11.0,1.0,0.0,2.0,,,,,,,,,,,,,True,1,11,2012,4,44,4,306,30,0,1,0,0,0,0,0,201244,4,10,0.201299,0.97953,-0.5,0.866025,-2.449294e-16,1.0,-0.822984,0.568065,-0.433884,-0.900969,-0.857315,0.514793,-2.449294e-16,1.0


## Competitors

In [37]:
for i in range(1, 9):
    df[f'comp{i}_rate_percent_diff_signed'] = df[f'comp{i}_rate'] * df[f'comp{i}_rate_percent_diff']

In [41]:
df[sorted([c for c in df.columns if 'comp' in c and 'rate' in c])].head()

Unnamed: 0,comp1_rate,comp1_rate_percent_diff,comp1_rate_percent_diff_signed,comp2_rate,comp2_rate_percent_diff,comp2_rate_percent_diff_signed,comp3_rate,comp3_rate_percent_diff,comp3_rate_percent_diff_signed,comp4_rate,comp4_rate_percent_diff,comp4_rate_percent_diff_signed,comp5_rate,comp5_rate_percent_diff,comp5_rate_percent_diff_signed,comp6_rate,comp6_rate_percent_diff,comp6_rate_percent_diff_signed,comp7_rate,comp7_rate_percent_diff,comp7_rate_percent_diff_signed,comp8_rate,comp8_rate_percent_diff,comp8_rate_percent_diff_signed
485242,,,,1.0,8.0,8.0,1.0,4.0,4.0,1.0,4.0,4.0,1.0,7.0,7.0,,,,,,,,,
485243,,,,,,,-1.0,2.0,-2.0,1.0,9.0,9.0,,,,,,,,,,,,
485244,,,,,,,,,,1.0,9.0,9.0,,,,,,,,,,,,
485245,,,,0.0,7.0,0.0,1.0,3.0,3.0,1.0,15.0,15.0,0.0,7.0,0.0,,,,,,,,,
485246,,,,1.0,6.0,6.0,0.0,8.0,0.0,1.0,11.0,11.0,1.0,2.0,2.0,,,,,,,,,


# Data split

In [None]:
df.sort_values([group_col, time_col], inplace=True)

In [44]:
# further all feature engineering will be separate

subm_df = deepcopy(df[df['subm'] == True])
df = deepcopy(df[df['subm'] == False])

subm_df.drop(['subm'], axis=1, inplace=True)
df.drop(['subm'], axis=1, inplace=True)

df.shape, subm_df.shape

((4958347, 94), (4959183, 94))

In [45]:
from utils import train_test_group_split, get_target

X = df.drop(drop_cols, axis=1)
y = df.apply(get_target, axis=1)
groups = X[group_col]


X_train_val, X_test, y_train_val, y_test, groups_train_val, groups_test = train_test_group_split(X, y, groups,
                                                                                                 group_array=groups,
                                                                                                 train_size=0.9)
# del X, y

X_train, X_val, y_train, y_val, groups_train, groups_val = train_test_group_split(X_train_val,
                                                                                  y_train_val,
                                                                                  groups_train_val,
                                                                                  group_array=groups_train_val,
                                                                                  train_size=0.9)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.sort_values([group_col, time_col], inplace=True)


In [46]:
# make sure that this is empty
set(X_train[group_col]).intersection(set(X_test[group_col]))

set()

In [47]:
X_train.shape, X_val.shape, X_test.shape

((4016262, 90), (446248, 90), (495837, 90))

# Feature engineering

# Correlations

In [50]:
profile = ProfileReport(X_train, title="Pandas Profiling Report", minimal=True)
profile.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
profile = ProfileReport(X_train.sample(10000), title="Pandas Profiling Report")
profile.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

# Save tables

In [51]:
X_train.to_csv(os.path.join(data_path, 'X_train.csv'), index=False)
y_train.to_csv(os.path.join(data_path, 'y_train.csv'), index=False)
groups_train.to_csv(os.path.join(data_path, 'groups_train.csv'), index=False)
print('train is done')

X_val.to_csv(os.path.join(data_path, 'X_val.csv'), index=False)
y_val.to_csv(os.path.join(data_path, 'y_val.csv'), index=False)
groups_val.to_csv(os.path.join(data_path, 'groups_val.csv'), index=False)
print('val is done')

X_test.to_csv(os.path.join(data_path, 'X_test.csv'), index=False)
y_test.to_csv(os.path.join(data_path, 'y_test.csv'), index=False)
groups_test.to_csv(os.path.join(data_path, 'groups_test.csv'), index=False)
print('test is done')

subm_df.to_csv(os.path.join(data_path, 'submission_df_preprocesseds.csv'), index=False)
print('submission is done')

train is done
val is done
test is done
submission is done
