In [1]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
from math import sqrt, log
sns.set_style("darkgrid")
from tqdm.notebook import tqdm, trange

from sklearn.ensemble import RandomForestRegressor 
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split  
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler  

In [3]:
prints = False

In [4]:
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

# Import des données

In [79]:
path = '../data/' 
hotels = pd.read_csv(path + '/features_hotels.csv')
data = pd.read_csv(path + 'data.csv')
data_test = pd.read_csv(path + 'test_set.csv')

In [80]:
# création de la colonne request_number dans le test set
data_test['request_number'] = 1
for avatar in np.unique(data_test['avatar_id']):
    data_test.loc[data_test['avatar_id'] == avatar, 'request_number'] = data_test['order_requests'].loc[data_test['avatar_id']== avatar] - min(data_test['order_requests'].loc[data_test['avatar_id']== avatar])+1

In [40]:
data

Unnamed: 0,city,date,language,mobile,request_number,avatar_id,hotel_id,price,stock
0,valletta,2,bulgarian,1,1,134,55,109,10
1,valletta,2,bulgarian,1,1,134,81,71,0
2,valletta,2,bulgarian,1,1,134,106,133,0
3,valletta,2,bulgarian,1,1,134,134,151,0
4,valletta,2,bulgarian,1,1,134,200,146,10
...,...,...,...,...,...,...,...,...,...
916933,madrid,17,danish,0,1,256415,870,76,1
916934,madrid,17,danish,0,1,256415,398,68,9
916935,madrid,17,danish,0,1,256415,689,93,0
916936,madrid,17,danish,0,1,256415,276,74,0


In [41]:
# suppression de la requête de l'avatar_id 134 qui pose problème
# print(data.loc[data.avatar_id == 134])
data = data.drop(index = data.loc[data.avatar_id == 134].index)

In [81]:
data_test

Unnamed: 0,index,order_requests,city,date,language,mobile,avatar_id,hotel_id,stock,request_number
0,0,1,vilnius,21,romanian,0,1,161,46,1
1,1,1,vilnius,21,romanian,0,1,187,32,1
2,2,1,vilnius,21,romanian,0,1,279,12,1
3,3,1,vilnius,21,romanian,0,1,395,10,1
4,4,1,vilnius,21,romanian,0,1,488,42,1
...,...,...,...,...,...,...,...,...,...,...
6639,6639,843,rome,5,irish,0,794,987,1,3
6640,6640,844,vienna,1,irish,1,794,26,1,4
6641,6641,844,vienna,1,irish,1,794,263,0,4
6642,6642,844,vienna,1,irish,1,794,456,0,4


In [82]:
data = data.drop(['avatar_id'], axis = 1).drop_duplicates()

In [83]:
data

Unnamed: 0,city,date,language,mobile,request_number,hotel_id,price,stock
0,valletta,2,bulgarian,1,1,55,109,10
1,valletta,2,bulgarian,1,1,81,71,0
2,valletta,2,bulgarian,1,1,106,133,0
3,valletta,2,bulgarian,1,1,134,151,0
4,valletta,2,bulgarian,1,1,200,146,10
...,...,...,...,...,...,...,...,...
916828,paris,43,slovakian,0,1,65,144,132
916829,paris,43,slovakian,0,1,640,149,132
916830,paris,43,slovakian,0,1,827,80,53
916831,paris,43,slovakian,0,1,963,104,106


In [84]:
# ajout des caractéristiques des hotels
data = data.merge(hotels, on=['hotel_id','city'])
data_test = data_test.merge(hotels, on=['hotel_id','city'])
data_test = data_test.sort_values('index')    

# Mise en forme des données

In [85]:
colQT = ['request_number', 'stock', 'date']
colBool = ['mobile', 'pool', 'parking']
colQL = ['city', 'group', 'brand', 'children_policy']

In [16]:
# ajout de la variable ville == langue
lang_cit = True

if lang_cit == True: 
    dic_lang = {'amsterdam':'dutch', 'copenhagen':'danish', 'madrid':'spanish', 'paris':'french', 'rome':'italian', 'sofia':'bulgarian', 'valletta':'maltese', 'vienna':'austrian' ,'vilnius':'lithuanian'}
    data['city_language'] = data['city'].map(dic_lang)
    data['is_same_cl'] = data['city_language']==data['language']
    data_test['city_language'] = data_test['city'].map(dic_lang)
    data_test['is_same_cl'] = data_test['city_language']==data_test['language']
    colBool += ['is_same_cl']

In [50]:
for col in ['city', 'hotel_id', 'group', 'brand','children_policy','mobile','parking','pool'] :
    data[col] = pd.Categorical(data[col],ordered=False)
    data_test[col] = pd.Categorical(data_test[col],ordered=False)

In [51]:
for col in colBool :
    print(data[col].dtypes)

category
category
category


## Transformation des variables QL en QT

In [52]:
def additive_smoothing(x,alpha= 1.96):
    """ alpha = 1.96 borne à 95% pour la loi normale  """
    moy = (x+alpha).mean()
    d = x.max()-x.min()
    N = x.count()
    return N*moy/(N+alpha*d)

additive_smoothing.__name__ = 'additive_smoothing'

In [53]:
hotelId = False
if hotelId:
    colQL = ['hotel_id']
    colBool = ['mobile']

In [54]:
for i in colQL + colBool :
    df_tmp = data[[i,'price']].groupby(i).agg({'price':['mean','var',additive_smoothing]}).price
    df_tmp = df_tmp.add_suffix('_'+i)
    colQT+= list(df_tmp.columns.unique())
    data = data.join(df_tmp, on = i)
    data_test = data_test.join(df_tmp, on = i)


colQL = []
colBool = []

In [55]:
colQT += ['price']

In [56]:
dataQT = data[colQT[:-1]]
dataQT_test = data_test[colQT[:-1]]

In [57]:
dataBool = data[colBool]
dataBool_test = data_test[colBool]

In [58]:
df = pd.concat([dataDum,dataQT, dataBool],axis=1)
df_test = pd.concat([dataDum_test,dataQT_test, dataBool_test],axis=1)
df.head()

Unnamed: 0,request_number,stock,date,mean_city,var_city,additive_smoothing_city,mean_group,var_group,additive_smoothing_group,mean_brand,...,additive_smoothing_children_policy,mean_mobile,var_mobile,additive_smoothing_mobile,mean_pool,var_pool,additive_smoothing_pool,mean_parking,var_parking,additive_smoothing_parking
0,1,3,12,167.162619,6253.245151,167.844864,97.007655,769.200994,98.770057,97.007655,...,161.632022,162.832147,7209.010377,164.323796,144.194103,5113.341039,145.910267,163.918898,7341.625875,165.350089
1,1,15,27,167.162619,6253.245151,167.844864,97.007655,769.200994,98.770057,97.007655,...,161.632022,162.08494,7172.167241,163.602736,144.194103,5113.341039,145.910267,163.918898,7341.625875,165.350089
2,1,2,11,167.162619,6253.245151,167.844864,97.007655,769.200994,98.770057,97.007655,...,161.632022,162.832147,7209.010377,164.323796,144.194103,5113.341039,145.910267,163.918898,7341.625875,165.350089
3,1,14,26,167.162619,6253.245151,167.844864,97.007655,769.200994,98.770057,97.007655,...,161.632022,162.08494,7172.167241,163.602736,144.194103,5113.341039,145.910267,163.918898,7341.625875,165.350089
4,1,13,25,167.162619,6253.245151,167.844864,97.007655,769.200994,98.770057,97.007655,...,161.632022,162.832147,7209.010377,164.323796,144.194103,5113.341039,145.910267,163.918898,7341.625875,165.350089


In [59]:
# variable à expliquer
y = data[colQT[-1]]

# Séparation des données

In [60]:
submit = True

In [61]:
# séparation pour les tests et tunner les modèles
if submit :
    X_train = df
    X_test = df_test
    Y_train = y
    Y_test = None
else :
    X_train, X_test, Y_train, Y_test = train_test_split(df,y,test_size=.1,random_state=11) #25% pour le test_size

# Boosting

In [42]:
# # A parameter grid for XGBoost
# params = {
#         'min_child_weight': [1, 5, 10],
#         'gamma': [0.5, 1, 1.5, 2, 5],
#         'subsample': [0.6, 0.8, 1.0],
#         'colsample_bytree': [0.6, 0.8, 1.0],
#         'max_depth': [3, 5, 7, 10],
#         'learning_rate': [0.01, 0.02, 0.05]    
#         }


# parameters = {'learning_rate': [0.01,0.02,0.03,0.04],
#                   'subsample'    : [0.9, 0.5, 0.2, 0.1],
#                   'n_estimators' : [100,500,1000, 1500],
#                   'max_depth'    : [4,6,8,10]}

 class sklearn.ensemble.GradientBoostingRegressor(*, loss='squared_error', learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0, init=None, random_state=None, max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False, validation_fraction=0.1, n_iter_no_change=None, tol=0.0001, ccp_alpha=0.0)[source]¶

> for instance : best params : 'learning_rate': [0.02], 'subsample'    : [0.2], 'n_estimators' : [1000], 'max_depth'    : [4], 'criterion'    : ['friedman_mse']

> max_depth : test 4, 8, keep 4

> n_estimators : test 100,500,1000, keep 1000, test more ? 

> subsample : test 0.9, 0.5, 0.2, 0.1, keep 0.2 

> learning_rate : test .02, .05, keep ? and n_estimators : test 1000, 1500 keep ? and subsample : 0.2, 1., keep ? and max_depth : 2, 4, keep ? 
 {'criterion': 'friedman_mse', 'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 1500, 'subsample': 0.2}

In [62]:
%%time
# définition des paramètres
# boost = GradientBoostingRegressor()

GBR = GradientBoostingRegressor()

parameters = {'learning_rate': [0.02],
              'subsample'    : [0.2],
              'n_estimators' : [1000],
              'max_depth'    : [4],
              'criterion'    : ['friedman_mse'],
             }

grid_GBR = GridSearchCV(estimator=GBR, param_grid = parameters, cv = 2, n_jobs=-1)

rfFit = grid_GBR.fit(X_train, Y_train)

print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n",grid_GBR.best_estimator_)
print("\n The best score across ALL searched params:\n",grid_GBR.best_score_)
print("\n The best parameters across ALL searched params:\n",grid_GBR.best_params_)

KeyboardInterrupt: 

In [81]:
# apprentissage
params = grid_GBR.best_params_
GBR = GradientBoostingRegressor()
GBR.set_params(learning_rate = params['learning_rate'], max_depth = params['max_depth'], n_estimators = params['n_estimators'], criterion = params['criterion'], subsample = params['subsample'])
rfFit = GBR.fit(X_train,Y_train)

In [65]:
params = {'learning_rate': 0.02,
              'subsample'    : 0.2,
              'n_estimators' : 1000,
              'max_depth'    : 4,
             }
GBR = GradientBoostingRegressor()
GBR.set_params(learning_rate = params['learning_rate'], max_depth = params['max_depth'], n_estimators = params['n_estimators'], subsample = params['subsample'])
rfFit = GBR.fit(X_train,Y_train)

In [66]:
feature_importances = zip(X_train.columns,GBR.feature_importances_)
x = dict(feature_importances)
{k: v for k, v in sorted(x.items(), key=lambda item: -item[1])}

{'mean_brand': 0.31073972524623505,
 'additive_smoothing_brand': 0.2575039158817395,
 'var_brand': 0.19391880591557697,
 'stock': 0.11931133148358025,
 'additive_smoothing_city': 0.04097187932151894,
 'mean_city': 0.036920654349877624,
 'var_city': 0.02248311409869644,
 'date': 0.012574344752797943,
 'additive_smoothing_pool': 0.000973553549690841,
 'var_pool': 0.0008382042556813082,
 'mean_pool': 0.000796413642509469,
 'additive_smoothing_group': 0.0005684804409428559,
 'var_group': 0.0004433075639927659,
 'var_children_policy': 0.0003957294729474096,
 'mean_group': 0.0002872281725108597,
 'mean_parking': 0.00027553544068980375,
 'var_parking': 0.00023582404084947762,
 'additive_smoothing_children_policy': 0.00019229292654513423,
 'additive_smoothing_parking': 0.0001822784111028221,
 'request_number': 0.0001794960814453157,
 'mean_children_policy': 0.00015895706219671982,
 'additive_smoothing_mobile': 2.2684306723330364e-05,
 'mean_mobile': 1.3883797583630355e-05,
 'var_mobile': 1.235

In [84]:
if not submit:
# erreur de prédiction sur le test
    1-rfFit.score(X_test,Y_test)

In [67]:
# prédiction
y_pred = rfFit.predict(X_test)
if not submit:
    print("RMSE =",np.sqrt(mean_squared_error(y_pred,Y_test)))
    print("R2 =",r2_score(Y_test,y_pred))

# Random Forest

In [None]:
# définition des paramètres
forest = RandomForestRegressor(n_estimators=500,
                               max_depth=None,
                               min_samples_split=2, min_samples_leaf=1, 
                               max_features=1.0, max_leaf_nodes=None,
                               bootstrap=True, oob_score=True)
# apprentissage
rfFit = forest.fit(X_train,Y_train)
print(1-rfFit.oob_score_)

In [None]:
param=[{"max_features":list(range(2,10,1))}]
rf= GridSearchCV(RandomForestRegressor(n_estimators=100),param,cv=5,n_jobs=-1)
rfOpt=rf.fit(X_train, Y_train)
# paramètre optimal
print("Meilleur score = %f, Meilleur paramètre = %s" % (1. - rfOpt.best_score_,rfOpt.best_params_))

In [None]:
if not submit:
# erreur de prévision sur le test
    1-rfOpt.score(X_test,Y_test)

In [None]:
# prévision
y_pred = rfOpt.predict(X_test)
if not submit :
    print("RMSE =",np.sqrt(mean_squared_error(y_pred,Y_test)))
    print("R2=",r2_score(Y_test,y_pred))

# Export submission

In [75]:
diff_price = pd.read_csv('diff_price.csv')
diff_price

Unnamed: 0,city,language,mean,var
0,amsterdam,austrian,-0.057517,0.000008
1,amsterdam,bulgarian,0.001142,0.000007
2,amsterdam,croatian,-0.019485,0.000009
3,amsterdam,cypriot,-0.056665,0.000008
4,amsterdam,czech,-0.019717,0.000009
...,...,...,...,...
193,vilnius,polish,0.000000,0.000000
194,vilnius,romanian,0.019961,0.000011
195,vilnius,slovakian,0.019845,0.000011
196,vilnius,slovene,-0.018972,0.000013


In [93]:
data_test

Unnamed: 0,index,order_requests,city,date,language,mobile,avatar_id,hotel_id,stock,request_number,group,brand,parking,pool,children_policy
0,0,1,vilnius,21,romanian,0,1,161,46,1,Boss Western,J.Halliday Inn,1,0,0
29,1,1,vilnius,21,romanian,0,1,187,32,1,Accar Hotels,Marcure,1,1,0
58,2,1,vilnius,21,romanian,0,1,279,12,1,Independant,Independant,1,0,0
87,3,1,vilnius,21,romanian,0,1,395,10,1,Accar Hotels,Ibas,0,0,0
176,4,1,vilnius,21,romanian,0,1,488,42,1,Accar Hotels,Safitel,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3972,6639,843,rome,5,irish,0,794,987,1,3,Accar Hotels,Ibas,1,0,0
6241,6640,844,vienna,1,irish,1,794,26,1,4,Accar Hotels,Marcure,1,0,0
6285,6641,844,vienna,1,irish,1,794,263,0,4,Boss Western,Boss Western,1,0,0
6329,6642,844,vienna,1,irish,1,794,456,0,4,Yin Yang,Ardisson,1,0,0


In [94]:
data_test.merge(diff_price, how= 'left',on = ['city','language'])['mean']

0       0.019961
1       0.019961
2       0.019961
3       0.019961
4       0.019961
          ...   
6639   -0.039067
6640    0.000000
6641    0.000000
6642    0.000000
6643    0.000000
Name: mean, Length: 6644, dtype: float64

In [121]:
diff = 1-data_test.merge(diff_price, how= 'left',on = ['city','language'])['mean']
diff[np.isnan(diff)]=1

In [113]:
np.isnan(diff).sum()

0

In [77]:
X_test.merge(diff_price, on = ['city','language'])

Unnamed: 0,request_number,stock,date,mean_city,var_city,additive_smoothing_city,mean_group,var_group,additive_smoothing_group,mean_brand,...,additive_smoothing_children_policy,mean_mobile,var_mobile,additive_smoothing_mobile,mean_pool,var_pool,additive_smoothing_pool,mean_parking,var_parking,additive_smoothing_parking
0,1,46,21,111.070198,2717.428216,111.848527,137.159596,969.757213,138.689619,150.952010,...,161.632022,162.084940,7172.167241,163.602736,144.194103,5113.341039,145.910267,163.918898,7341.625875,165.350089
29,1,32,21,111.070198,2717.428216,111.848527,194.086751,10760.672184,194.673646,205.030550,...,161.632022,162.084940,7172.167241,163.602736,250.542351,7845.024135,250.597605,163.918898,7341.625875,165.350089
58,1,12,21,111.070198,2717.428216,111.848527,97.007655,769.200994,98.770057,97.007655,...,161.632022,162.084940,7172.167241,163.602736,144.194103,5113.341039,145.910267,163.918898,7341.625875,165.350089
87,1,10,21,111.070198,2717.428216,111.848527,194.086751,10760.672184,194.673646,80.882270,...,161.632022,162.084940,7172.167241,163.602736,144.194103,5113.341039,145.910267,161.306734,7069.897883,162.865978
176,1,42,21,111.070198,2717.428216,111.848527,194.086751,10760.672184,194.673646,302.091062,...,179.404189,162.084940,7172.167241,163.602736,250.542351,7845.024135,250.597605,163.918898,7341.625875,165.350089
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3972,3,1,5,149.597014,5749.358449,150.310856,194.086751,10760.672184,194.673646,80.882270,...,161.632022,162.084940,7172.167241,163.602736,144.194103,5113.341039,145.910267,163.918898,7341.625875,165.350089
6241,4,1,1,157.833740,5629.002714,157.928268,194.086751,10760.672184,194.673646,205.030550,...,161.632022,162.832147,7209.010377,164.323796,144.194103,5113.341039,145.910267,163.918898,7341.625875,165.350089
6285,4,0,1,157.833740,5629.002714,157.928268,137.159596,969.757213,138.689619,125.594781,...,161.632022,162.832147,7209.010377,164.323796,144.194103,5113.341039,145.910267,163.918898,7341.625875,165.350089
6329,4,0,1,157.833740,5629.002714,157.928268,154.660708,4453.728189,155.693415,160.202692,...,161.632022,162.832147,7209.010377,164.323796,144.194103,5113.341039,145.910267,163.918898,7341.625875,165.350089


In [122]:
print(y_pred*diff)

0       109.516780
1       133.996921
2        69.675513
3        58.907741
4       193.105272
           ...    
6639     85.428834
6640    264.802362
6641    151.113982
6642    172.894057
6643    151.113982
Name: mean, Length: 6644, dtype: float64


In [119]:
all_data = y_pred

In [123]:
sub = pd.DataFrame(y_pred*diff)
sub.to_csv('diff_city_language.csv',index=True, header=['price'], index_label = 'index')

In [69]:
# save
_round = False # tester aussi des arrondis au dessus ou en dessous

name = 'boosting_langCitv2'

if _round : 
    newY = np.round(y_pred)

else : newY = y_pred.copy()

if transfo:
    newY = newY**3
if submit :
    sub = pd.DataFrame(newY)
    sub.to_csv(path + 'submit/' + name + '.csv',index=True, header=['price'], index_label = 'index')

OSError: Cannot save file into a non-existent directory: '../data/submit'

# A faire

> gridsearch sur alpha 

> ajouter une variable écart de temps

> fusionner children policy 1 et 2

> transfos sur le prix

> transfos sur autres variables QT

> tester un réseau prenant en ajoutant comme variables les prédictions de nos différents modèles

> soumission à faire:


1) RF 


2) tester hotelId = True 
