In [108]:
import pandas as pd
import matplotlib as mlp
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [142]:
trainingdf = pd.read_csv('./data/training_set_VU_DM.csv')
startlen = len(trainingdf.index)

In [143]:
#Add column with day of the week
trainingdf['date_time'] = pd.to_datetime(trainingdf['date_time'])
trainingdf['weekday'] = trainingdf['date_time'].dt.dayofweek
trainingdf['weekday_name'] = trainingdf['date_time'].dt.day_name()

#Add column with part of day: morning, afternoon, evening, night
trainingdf['part_of_day'] = pd.cut(trainingdf['date_time'].dt.hour,[0,6,12,18,24],labels=['night','morning','afternoon','evening'],include_lowest=True)

#Add column with season: winter, spring, summer, autumn
# Doesn't work since the mins must increase monotonically? Other option I say is 
# trainingdf['season'] = pd.cut(trainingdf['date_time'].dt.strftime('%m-%d'),[1-1, 3-21, 6-21, 21-9, 12-21, 12-31],labels=['winter','spring','summer','autumn', 'winter'],include_lowest=True)
trainingdf['season'] = pd.cut(trainingdf['date_time'].dt.month,[0,3,6,9,12],labels=['winter','spring','summer','autumn'],include_lowest=True)

#Add column last minute whenever it is booked within 2 weeks upfront
trainingdf['last_minute'] = [1 if x <= 14 else 0 for x in trainingdf['srch_booking_window']]

#Fill NaN values from Gross_bookings_usd with 0
trainingdf['gross_bookings_usd'] = trainingdf['gross_bookings_usd'].fillna(0)

#Delete rows with price lower than lower quantile (0.001) and higher than upper quantile (0.999)
trainingdf = trainingdf[(trainingdf['price_usd'] >= trainingdf['price_usd'].quantile(0.001)) & (trainingdf['price_usd'] <= trainingdf['price_usd'].quantile(0.999))]

In [144]:
# Combining the competitors rate (based on the price), takes the rate only if all competitors have the same rate
# Not correct but this is what the 'highest' is --> lowest price, same price or highest price compared to all competitors
trainingdf['comp_rate'] = [1 if all(x == 1 for x in row) else 0 if all(x == 0 for x in row) else -1 for row in trainingdf[['comp1_rate', 'comp2_rate', 'comp3_rate', 'comp4_rate', 'comp5_rate', 'comp6_rate', 'comp7_rate', 'comp8_rate']].values]

In [146]:
#Add 1 to all values in comp1_rate, comp2_rate etc. if they are not NaN so we get a range of 0 to 2 and calculated the total score of all competitors per prop_id (hotel)
columns = ['comp1_rate', 'comp2_rate', 'comp3_rate', 'comp4_rate', 'comp5_rate', 'comp6_rate', 'comp7_rate', 'comp8_rate']
for column in columns:
    trainingdf[column] = trainingdf[column] + 1
    trainingdf['comp_rate'] = trainingdf.groupby('prop_id')[column].transform('count')

In [152]:
#Add 1 to all values in comp1_inv, comp2_inv etc. if they are not NaN so we get a range of 0 to 2 and calculated the total score of all competitors per prop_id (hotel)
columns2 = ['comp1_inv', 'comp2_inv', 'comp3_inv', 'comp4_inv', 'comp5_inv', 'comp6_inv', 'comp7_inv', 'comp8_inv']
for column in columns2:
    trainingdf[column] = trainingdf[column] + 1
    trainingdf['comp_inv'] = trainingdf.groupby('prop_id')[column].transform('count')

In [153]:
#Add 1 to all values in comp1_inv, comp2_inv etc. if they are not NaN so we get a range of 0 to 2 and calculated the total score of all competitors per prop_id (hotel)
columns2 = ['comp1_rate_percent_diff', 'comp2_rate_percent_diff', 'comp3_rate_percent_diff', 'comp4_rate_percent_diff', 'comp5_rate_percent_diff', 'comp6_rate_percent_diff', 'comp7_rate_percent_diff', 'comp8_rate_percent_diff']
for column in columns2:
    trainingdf['comp_rate_percent_diff'] = trainingdf.groupby('prop_id')[column].transform('mean')

In [155]:
#Needs to be run after rows are dropped
#Based on the occurences of the srch_id in the temporal time, add 1 to the column if click_bool is 1 and 0 if click_bool is 0 over the whole dataset
trainingdf['click_history'] = trainingdf.groupby('srch_id')['click_bool'].transform(lambda x: x.cumsum().shift())

In [157]:
trainingdf.head()

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,booking_bool,weekday,weekday_name,part_of_day,season,last_minute,comp_rate,comp_inv,comp_rate_percent_diff,click_history
0,1,2013-04-04 08:32:15,12,187,,,219,893,3,3.5,...,0,3,Thursday,morning,spring,1,508,510,11.990741,
1,1,2013-04-04 08:32:15,12,187,,,219,10404,4,4.0,...,0,3,Thursday,morning,spring,1,500,502,10.298507,0.0
2,1,2013-04-04 08:32:15,12,187,,,219,21315,3,4.5,...,0,3,Thursday,morning,spring,1,467,469,7.6875,0.0
3,1,2013-04-04 08:32:15,12,187,,,219,27348,2,4.0,...,0,3,Thursday,morning,spring,1,394,396,7.521739,0.0
4,1,2013-04-04 08:32:15,12,187,,,219,29604,4,3.5,...,0,3,Thursday,morning,spring,1,561,563,13.704082,0.0
