# Data preparation

In [1]:
import pandas as pd
import numpy as np 

In [2]:
# Folder holding the input CSV files, relative to the notebook. It already ends
# with a separator, so file names are appended without a leading '/'.
path = '../../data/'
hotels = pd.read_csv(path + 'features_hotels.csv')  # hotel features, merged later on hotel_id and city
data = pd.read_csv(path + 'data.csv')  # rows with a price, used for training
data_test = pd.read_csv(path + 'test_set.csv')  # rows to predict

In [3]:
# Distinct requests: one row per unique combination of the key columns below.
request_keys = ['city', 'language', 'mobile', 'request_number', 'date']
to_exclude = data.loc[:, request_keys].drop_duplicates()
# Export left disabled: to_exclude.to_csv('to_exclude.csv', index=False)
n_unique_requests = len(to_exclude)
print('Nombre de requêtes uniques :', n_unique_requests)

Nombre de requêtes uniques : 8329


In [4]:
# For each request number 0..6, count the distinct avatars that issued a request
# with that number, and add the counts up.
# NOTE(review): the bound 7 is hard-coded — confirm it covers every
# request_number present in the data.
tot = sum(
    len(np.unique(data.loc[data['request_number'] == req_no, 'avatar_id'].values))
    for req_no in range(7)
)

print('Estimation du nombre total de requêtes :', tot)

Estimation du nombre total de requêtes : 11171


In [5]:
# Remove the avatars that cause problems.
# Earlier variants, kept for reference:
#   data = data.drop(index=data.loc[data.avatar_id == 134].index)
#   to_drop = [108, 134, 135, 136, 137, 138, 141]
to_drop = [134]
is_dropped = data['avatar_id'].isin(to_drop)
data = data[~is_dropped]

In [6]:
# Drop the avatar identifier, then collapse the rows that became identical.
data = data.drop(columns=['avatar_id']).drop_duplicates()

In [7]:
# Attach the hotel features to both sets, matching on hotel and city.
# NOTE(review): the merges use pandas' default join type, so rows without a
# match in `hotels` would be dropped — confirm none exist in the test set.
hotel_keys = ['hotel_id', 'city']
data = pd.merge(data, hotels, on=hotel_keys)
data_test = pd.merge(data_test, hotels, on=hotel_keys)
# Put the test rows back in 'index' order, then discard that column.
data_test = (
    data_test
    .sort_values('index')
    .reset_index(drop=True)
    .drop(columns=['index'])
)

In [8]:
# Create request_number in the test set: within each avatar, shift
# order_requests so that the avatar's smallest value becomes 1.
# One grouped transform computes every avatar's minimum in a single pass instead
# of filtering the whole frame once per avatar.
# assumes avatar_id and order_requests contain no missing values — TODO confirm
first_request = data_test.groupby('avatar_id')['order_requests'].transform('min')
data_test['request_number'] = data_test['order_requests'] - first_request + 1

In [9]:
# Add the feature "request language equals the language mapped to the city".
lang_cit = True

if lang_cit:
    # City -> language label associated with it.
    dic_lang = {
        'amsterdam': 'dutch',
        'copenhagen': 'danish',
        'madrid': 'spanish',
        'paris': 'french',
        'rome': 'italian',
        'sofia': 'bulgarian',
        'valletta': 'maltese',
        'vienna': 'austrian',
        'vilnius': 'lithuanian',
    }
    # Same two columns on the training set first, then on the test set.
    for frame in (data, data_test):
        frame['city_language'] = frame['city'].map(dic_lang)
        frame['is_same_cl'] = frame['city_language'] == frame['language']
    # Registration in a boolean-column list is left disabled:
    #     colBool += ['is_same_cl']

In [10]:
# Columns kept for modelling. 'price' comes last so that the test set, which is
# restricted to every name but the last, is selected without it.
col = [
    'city',
    'date',
    'language',
    'mobile',
    'request_number',
    'stock',
    'hotel_id',
    'price',
]
data = data.loc[:, col]
data_test = data_test.loc[:, col[:-1]]

In [11]:
def additive_smoothing(x, alpha=1.96):
    """Return a smoothed mean of ``x``.

    The value is ``N * mean(x + alpha) / (N + alpha * d)``, where ``N`` is
    ``x.count()`` and ``d`` is ``x.max() - x.min()``.

    Parameters
    ----------
    x : object supporting ``x + alpha`` and exposing ``mean``, ``max``, ``min``
        and ``count`` (the notebook passes it to a groupby ``agg`` on the
        price column).
    alpha : float, default 1.96
        Smoothing constant; 1.96 is the 95% bound of the normal distribution.

    Returns
    -------
    The smoothed value given by the formula above.
    """
    n_obs = x.count()
    value_range = x.max() - x.min()
    shifted_mean = (x + alpha).mean()
    return n_obs * shifted_mean / (n_obs + alpha * value_range)


# Set the function name explicitly; the aggregated column is labelled with it.
additive_smoothing.__name__ = 'additive_smoothing'

In [12]:
# Columns used as model features unchanged; the names of the per-category price
# statistics are appended to this list by the encoding loop.
colQT = [
    'request_number',
    'stock',
    'date',
]
# Columns for which per-category price statistics are computed.
notQT = [
    'city',
    'language',
    'mobile',
    'hotel_id',
]

In [13]:
# Target encoding: for every column of notQT, compute the mean, variance and
# smoothed mean of the training price per value of that column, then attach the
# three statistics to both sets under the names '<stat>_<column>'.
# Running this cell a second time appends the same names to colQT again.
for cat_col in notQT:
    price_stats = (
        data.groupby(cat_col)['price']
        .agg(['mean', 'var', additive_smoothing])
        .add_suffix('_' + cat_col)
    )
    colQT.extend(price_stats.columns.unique())
    data = data.join(price_stats, on=cat_col)
    data_test = data_test.join(price_stats, on=cat_col)

In [14]:
# Put the target last: colQT[:-1] is then the feature list and colQT[-1] the target.
colQT.append('price')

In [15]:
# Split colQT into the feature names (all but the last) and the target name (the last).
*feature_cols, target_col = colQT
df = data[feature_cols]
df_test = data_test[feature_cols]
y = data[target_col]

In [17]:
# Names used by the modelling cells; no target is defined for the test set.
X_train, Y_train = df, y
X_test, Y_test = df_test, None

In [18]:
# pd.read_csv('test.csv')

# Gradient Boosting Regressor (GBR)

In [19]:
# Show the training feature matrix as the cell's output.
X_train

Unnamed: 0,request_number,stock,date,mean_city,var_city,additive_smoothing_city,mean_language,var_language,additive_smoothing_language,mean_mobile,var_mobile,additive_smoothing_mobile,mean_hotel_id,var_hotel_id,additive_smoothing_hotel_id
0,1,3,12,165.55173,5985.767282,166.516354,154.748637,5999.095160,150.843929,159.610475,6843.099494,161.216981,86.930476,84.676763,83.732551
1,1,15,27,165.55173,5985.767282,166.516354,159.636337,6808.807107,158.623704,160.026551,6878.134125,161.643961,86.930476,84.676763,83.732551
2,1,2,11,165.55173,5985.767282,166.516354,156.157476,6641.903925,155.367276,159.610475,6843.099494,161.216981,86.930476,84.676763,83.732551
3,1,14,26,165.55173,5985.767282,166.516354,158.518461,6578.472846,154.689115,160.026551,6878.134125,161.643961,86.930476,84.676763,83.732551
4,1,13,25,165.55173,5985.767282,166.516354,156.713749,6541.710854,152.944919,159.610475,6843.099494,161.216981,86.930476,84.676763,83.732551
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
916700,1,84,39,165.55173,5985.767282,166.516354,164.508062,7283.720668,159.483584,159.610475,6843.099494,161.216981,101.290323,12.316971,76.015516
916701,1,84,39,165.55173,5985.767282,166.516354,156.456098,6441.953525,153.253943,160.026551,6878.134125,161.643961,101.290323,12.316971,76.015516
916702,1,110,44,165.55173,5985.767282,166.516354,165.404823,7131.140515,164.022882,160.026551,6878.134125,161.643961,101.290323,12.316971,76.015516
916703,1,104,43,165.55173,5985.767282,166.516354,155.846236,6642.503216,152.507311,159.610475,6843.099494,161.216981,101.290323,12.316971,76.015516


In [20]:
from sklearn.ensemble import GradientBoostingRegressor

# subsample < 1 makes every tree train on a random fraction of the rows, so the
# seed is fixed to make the fitted model reproducible from one run to the next.
SEED = 42

GBR = GradientBoostingRegressor(
    learning_rate=.02,
    max_depth=4,
    n_estimators=5000,
    # NOTE(review): per the scikit-learn documentation, validation_fraction is
    # only used when n_iter_no_change is set, which is not the case here.
    validation_fraction=.2,
    criterion='friedman_mse',
    subsample=.2,
    max_leaf_nodes=50,
    max_features=1.0,
    verbose=1,  # prints the training progress table
    random_state=SEED,
)

# `fit` holds what GBR.fit returns; the prediction cell calls predict on it.
fit = GBR.fit(X_train, Y_train)

      Iter       Train Loss      OOB Improve   Remaining Time 
         1        6586.5928         255.5471           40.21m
         2        6377.8262         245.1704           39.14m
         3        6135.9325         235.0299           39.22m
         4        5912.2876         226.2157           39.09m
         5        5684.8110         217.2539           38.94m
         6        5542.9139         208.6519           38.95m
         7        5249.4130         201.0086           38.97m
         8        5112.5302         192.8104           38.95m
         9        4893.0655         185.3722           38.88m
        10        4732.4229         178.8565           38.87m
        20        3265.9648         120.8039           38.58m
        30        2265.0591          81.9751           38.42m
        40        1624.2375          55.6364           38.31m
        50        1166.8814          38.0250           38.17m
        60         839.1105          25.9164           38.09m
       

In [None]:
# GBR.get_params().keys()

In [21]:
# Predict the test-set target with the object returned by GBR.fit.
y_pred = fit.predict(X_test)

In [22]:
# Show the predictions as the cell's output.
y_pred

array([111.08281877, 134.0924187 ,  57.54196131, ..., 144.12770683,
       194.19088865, 136.27661264])

In [26]:
from joblib import dump, load

# Persist the estimator with joblib. dump() is the last expression of the cell,
# so the list it returns is displayed as the cell's output.
model_file = '../model/target_encoding_gradio.pkl'
dump(GBR, model_file)

['../model/target_encoding_gradio.pkl']

In [27]:


import pickle

# Save the estimator with pickle, then read it back into model_loaded.
file_name = '../model/target_encoding_gradio'
# Context managers guarantee that the file is flushed and closed before it is
# reopened for reading, and that no handle is left open.
with open(file_name, 'wb') as model_out:
    pickle.dump(GBR, model_out)
# pickle.load can execute arbitrary code: only load files produced by this notebook.
with open(file_name, 'rb') as model_in:
    model_loaded = pickle.load(model_in)

# Submit

In [23]:
# Keep the raw predictions under their own name; the save cell reassigns y_pred from it.
predictions = y_pred

In [24]:
# Write the submission file.
_round = False  # also worth testing: predictions rounded up or down

name = 'target_encoding_gradio'

# Either round the predictions or work on a copy of them.
y_pred = np.round(predictions) if _round else predictions.copy()

sub = pd.DataFrame(y_pred)
submission_file = path + 'submit/' + name + '.csv'
# The row index is written as the 'index' column and the values as 'price'.
sub.to_csv(submission_file, index=True, header=['price'], index_label='index')