# Data preparation

In [59]:
import pandas as pd
import numpy as np 
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import GradientBoostingRegressor

In [2]:
# Directory holding the input CSV files. It ends with a slash so that file
# names can be appended directly (later cells also build paths as path + '...').
path = '../data/'
# Hotel characteristics, training set and test set.
# The original read '/features_hotels.csv', producing a doubled separator
# ('../data//features_hotels.csv'); all three reads now build the path the same way.
hotels = pd.read_csv(path + 'features_hotels.csv')
data = pd.read_csv(path + 'data.csv')
data_test = pd.read_csv(path + 'test_set.csv')

In [3]:
# Distinct requests: one row per unique combination of the columns below.
request_cols = ['city', 'language', 'mobile', 'request_number', 'date']
to_exclude = data[request_cols].drop_duplicates()
# to_exclude.to_csv('to_exclude.csv', index = False)
print('Nombre de requêtes uniques :', len(to_exclude))

Nombre de requêtes uniques : 8329


In [4]:
# For each request_number from 0 to 6, count the distinct avatar ids that
# issued a request with that number, then add the seven counts together.
tot = sum(
    np.unique(data.loc[data.request_number == rank, 'avatar_id'].values).size
    for rank in range(7)
)

print('Estimation du nombre total de requêtes :', tot)

Estimation du nombre total de requêtes : 11171


In [5]:
# Exclude the avatars that cause problems.
# Commented-out alternative list kept for reference:
# to_drop = [108,134,135,136,137,138,141]
to_drop = [134]
keep_mask = ~data['avatar_id'].isin(to_drop)
data = data.loc[keep_mask]

In [6]:
# Remove the avatar identifier, then collapse rows that became identical.
data = data.drop(columns=['avatar_id']).drop_duplicates()

In [7]:
# Attach the hotel characteristics to both sets, matching on hotel and city.
merge_keys = ['hotel_id', 'city']
data = data.merge(hotels, on=merge_keys)
data_test = data_test.merge(hotels, on=merge_keys)
# Put the test rows back in the order given by their 'index' column,
# renumber them from 0 and discard that column.
data_test = (
    data_test
    .sort_values('index')
    .reset_index(drop=True)
    .drop(columns=['index'])
)

In [8]:
# Create the request_number column in the test set.
# For each avatar, requests are renumbered from 1: the avatar's smallest
# order_requests value maps to 1, the next to 2, and so on.
# A single grouped transform replaces the per-avatar Python loop, which
# rescanned the whole frame several times for every avatar.
# NOTE(review): assumes every test row has a non-missing avatar_id - confirm.
first_order = data_test.groupby('avatar_id')['order_requests'].transform('min')
data_test['request_number'] = data_test['order_requests'] - first_order + 1

In [9]:
# Add a flag telling whether the request language equals the language
# associated with the hotel's city.
lang_cit = True

if lang_cit:
    # City -> language associated with that city.
    dic_lang = {
        'amsterdam': 'dutch',
        'copenhagen': 'danish',
        'madrid': 'spanish',
        'paris': 'french',
        'rome': 'italian',
        'sofia': 'bulgarian',
        'valletta': 'maltese',
        'vienna': 'austrian',
        'vilnius': 'lithuanian',
    }
    # Same two derived columns on the training set, then on the test set.
    for frame in (data, data_test):
        frame['city_language'] = frame['city'].map(dic_lang)
        frame['is_same_cl'] = frame['city_language'] == frame['language']

In [15]:
# Quantitative columns. 'price' comes last: the feature matrices are built
# from colQT[:-1] and 'price' is used as the target.
colQT = [
    'date',
    'request_number',
    'stock',
    'price',
]
# Qualitative columns, to be one-hot encoded.
colQL = [
    'city',
    'language',
    'mobile',
    'group',
    'brand',
    'parking',
    'pool',
    'children_policy',
    'is_same_cl',
]

In [56]:
# One-hot encode the qualitative columns. The encoder is fitted on the
# training set only; handle_unknown='ignore' makes a category that appears
# only in the test set encode as all zeros instead of raising in transform().
encoder = OneHotEncoder(handle_unknown='ignore')

train_data = encoder.fit_transform(data[colQL]).toarray()
test_data = encoder.transform(data_test[colQL]).toarray()

# Numeric features: every quantitative column except the target. Selecting by
# name (rather than colQT[:-1]) keeps this correct if colQT is reordered.
num_cols = [col for col in colQT if col != 'price']

# Final design matrices: one-hot block followed by the numeric columns.
X_train = np.concatenate((train_data, data[num_cols].values), axis=1)
X_test = np.concatenate((test_data, data_test[num_cols].values), axis=1)

# Regression target.
y = data.price.values

In [61]:
# Gradient-boosted regression trees on the one-hot + numeric features.
# subsample < 1 means each tree is fitted on a random fraction of the rows,
# so the seed is fixed to make the fit (and the submission) reproducible.
# NOTE: per the scikit-learn documentation, validation_fraction is only used
# when n_iter_no_change is set; as configured here no early stopping happens.
model = GradientBoostingRegressor(learning_rate = .02,
                                  max_depth = 4,
                                  n_estimators = 2500,
                                  validation_fraction = .2,
                                  criterion = 'friedman_mse',
                                  subsample = .2,
                                  max_leaf_nodes = 50,
                                  max_features = 1.0,
                                  random_state = 42,
                                  verbose = 1)

model.fit(X_train, y)

# Predicted prices for the test set, in the row order of X_test.
y_pred = model.predict(X_test)

      Iter       Train Loss      OOB Improve   Remaining Time 
         1        6714.8195         155.1052           27.88m
         2        6527.7848         149.2195           27.74m
         3        6391.3030         143.3055           27.61m
         4        6291.9077         135.2786           27.71m
         5        6153.1611         131.1026           27.76m
         6        5971.3124         124.8510           27.92m
         7        5883.8633         121.8140           27.91m
         8        5788.2376         116.2970           27.90m
         9        5628.9437         112.4307           27.86m
        10        5546.2129         107.5391           27.90m
        20        4572.1687          87.8675           27.57m
        30        3861.0074          76.8338           27.59m
        40        3272.5215          55.9824           27.54m
        50        2827.7335          35.4516           27.51m
        60        2453.4564          34.6076           27.51m
       

In [64]:
# from joblib import dump, load
# dump(model, 'model/one_hot.pkl')

['model/one_hot.pkl']

In [66]:
# Keep a handle on the model output before any post-processing.
predictions = y_pred

# Export settings.
_round = False  # also worth trying rounding up or rounding down
name = 'one_hot'

# Either round to the nearest integer or take an independent copy.
y_pred = np.round(predictions) if _round else predictions.copy()

# Write the submission: row position as 'index', prediction as 'price'.
sub = pd.DataFrame(y_pred)
submission_file = path + 'submit/' + name + '.csv'
sub.to_csv(submission_file, index=True, header=['price'], index_label='index')