# Data preparation

In [1]:
import pandas as pd
import numpy as np 

In [2]:
# Folder holding the CSV inputs. Kept as a plain string ending with '/'
# because the file names below (and the submission path at the end of the
# notebook) are appended by simple concatenation.
path = '../../data/'
# No leading '/' on the file name: `path` already ends with one, and the
# other two reads follow the same convention.
hotels = pd.read_csv(path + 'features_hotels.csv')
data = pd.read_csv(path + 'data.csv')
data_test = pd.read_csv(path + 'test_set.csv')

In [3]:
# One row per distinct (city, language, mobile, request_number, date)
# combination found in the training data.
request_keys = ['city', 'language', 'mobile', 'request_number', 'date']
to_exclude = data.loc[:, request_keys].drop_duplicates()
# to_exclude.to_csv('to_exclude.csv', index = False)
n_unique_requests = len(to_exclude)
print('Nombre de requêtes uniques :', n_unique_requests)

Nombre de requêtes uniques : 8329


In [4]:
# For each request_number from 0 to 6, count the distinct avatar_id values
# among the rows carrying that request_number, then add the counts up.
per_rank_counts = [
    np.unique(data.loc[data.request_number == rank, 'avatar_id'].values).shape[0]
    for rank in range(7)
]
tot = sum(per_rank_counts)

print('Estimation du nombre total de requêtes :', tot)

Estimation du nombre total de requêtes : 11171


In [5]:
# Exclude the avatars that cause problems.
# data = data.drop(index = data.loc[data.avatar_id == 134].index)
# to_drop = [108,134,135,136,137,138,141]
to_drop = [134]
keep_mask = ~data['avatar_id'].isin(to_drop)
data = data[keep_mask]

In [6]:
# Remove the avatar identifier, then collapse rows that are identical without it.
data = data.drop(columns='avatar_id').drop_duplicates()

In [7]:
# Attach the hotel features to both sets (default inner join on hotel_id and city).
merge_keys = ['hotel_id', 'city']
data = pd.merge(data, hotels, on=merge_keys)
data_test = pd.merge(data_test, hotels, on=merge_keys)
# Put the test rows back in the order given by their 'index' column, then
# discard that column.
data_test = (
    data_test
    .sort_values('index')
    .reset_index(drop=True)
    .drop(columns='index')
)

In [8]:
# Create the request_number column in the test set.
# For every row: order_requests minus the smallest order_requests of the same
# avatar, plus one — so each avatar's earliest request is numbered 1.
# A single grouped transform replaces the former per-avatar loop, which rebuilt
# boolean masks over the whole frame for every avatar.
# NOTE(review): rows whose avatar_id is missing now get NaN instead of the
# former default of 1 — assumes avatar_id is never missing; confirm.
first_order = data_test.groupby('avatar_id')['order_requests'].transform('min')
data_test['request_number'] = data_test['order_requests'] - first_order + 1

In [9]:
# Add the "city language == request language" feature.
# The column selection cell further down always asks for 'is_same_cl', so this
# switch has to stay True for the rest of the notebook to run.
lang_cit = True

if lang_cit:
    # City -> language label used for the comparison with the 'language' column.
    dic_lang = {
        'amsterdam': 'dutch',
        'copenhagen': 'danish',
        'madrid': 'spanish',
        'paris': 'french',
        'rome': 'italian',
        'sofia': 'bulgarian',
        'valletta': 'maltese',
        'vienna': 'austrian',
        'vilnius': 'lithuanian',
    }
    # Same two columns on the training and the test frame.
    for frame in (data, data_test):
        frame['city_language'] = frame['city'].map(dic_lang)
        frame['is_same_cl'] = frame['city_language'] == frame['language']
#     colBool += ['is_same_cl']

In [10]:
# Columns kept from here on. 'price' comes last; the test frame receives every
# column except that last one.
feature_cols = [
    'city', 'date', 'language', 'mobile', 'request_number', 'stock',
    'group', 'brand', 'parking', 'pool', 'children_policy', 'is_same_cl',
]
col = feature_cols + ['price']
data = data.loc[:, col]
data_test = data_test.loc[:, feature_cols]

In [11]:
def additive_smoothing(x, alpha=1.96):
    """Return a smoothed mean of ``x``.

    The value computed is
    ``N * mean(x + alpha) / (N + alpha * (max(x) - min(x)))``
    where ``N`` is ``x.count()``.

    Args:
        x: object providing ``mean``, ``max``, ``min`` and ``count`` and
            supporting addition with a scalar (in this notebook, the Series
            of prices of one group inside ``groupby(...).agg``).
        alpha: smoothing constant. The default 1.96 is, per the original
            author's note, the 95% bound of the normal distribution.

    Returns:
        The smoothed value.
    """
    n_obs = x.count()
    spread = x.max() - x.min()
    shifted_mean = (x + alpha).mean()
    return n_obs * shifted_mean / (n_obs + alpha * spread)


# Same value that ``def`` already assigned; kept explicit as in the original.
additive_smoothing.__name__ = 'additive_smoothing'

In [12]:
# Columns passed to the model unchanged. The encoding loop below appends the
# names of the generated statistic columns to this list.
colQT = ['request_number', 'stock', 'date']
# Columns that are summarised by per-category price statistics instead of
# being used directly.
notQT = [
    'city',
    'language',
    'mobile',
    'group',
    'brand',
    'parking',
    'pool',
    'children_policy',
    'is_same_cl',
]

In [13]:
# Target encoding: for every categorical column, compute the mean, variance
# and additive_smoothing of 'price' per category on the training frame, name
# the three results '<stat>_<column>', and join them onto both frames.
# NOTE(review): the statistics are computed from the same training rows they
# are joined back to — check whether that leakage is acceptable.
# NOTE(review): join is a left join, so a test category absent from `data`
# receives NaN; verify that none reach the model.
# The cell is not re-runnable as is: a second run joins columns that already exist.
for cat_col in notQT:
    price_stats = (
        data.groupby(cat_col)['price']
        .agg(['mean', 'var', additive_smoothing])
        .add_suffix('_' + cat_col)
    )
    colQT.extend(price_stats.columns.unique())
    data = data.join(price_stats, on=cat_col)
    data_test = data_test.join(price_stats, on=cat_col)

In [14]:
# The target goes last so that the next cell can split the list into
# colQT[:-1] (features) and colQT[-1] (target).
colQT.append('price')

In [15]:
# Split the column list: everything but the last name is a feature, the last
# name is the target.
*feature_names, target_name = colQT
df = data[feature_names]
df_test = data_test[feature_names]
y = data[target_name]

In [16]:
# Names used by the modelling cells. There are no labels for the test set, so
# Y_test is None.
X_train, Y_train = df, y
X_test, Y_test = df_test, None

In [17]:
# pd.read_csv('test.csv')

# Gradient Boosting Regressor (GBR)

In [None]:
# Show the training feature frame as the cell output.
X_train

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting model trained on the target-encoded features.
# random_state is fixed because subsample < 1 makes every boosting stage draw
# a random subset of rows; without a seed two runs give different models.
# The value 0 matches the seed used by the randomized search further down.
GBR = GradientBoostingRegressor(
    learning_rate=.01,
    max_depth=4,
    n_estimators=1500,
    # Only consulted when n_iter_no_change is set, which is not the case here.
    validation_fraction=.1,
    criterion='friedman_mse',
    subsample=.2,
    max_leaf_nodes=50,
    max_features=1.0,
    verbose=1,
    random_state=0,
)

# fit() returns the estimator itself; `fit` is the name used by the
# prediction cell.
fit = GBR.fit(X_train, Y_train)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

# Gradient boosting model to tune.
gbr = GradientBoostingRegressor()

# Hyper-parameters shared by every candidate.
# validation_fraction was removed from the grid: it is only consulted when
# n_iter_no_change is set, so varying it tripled the number of fits without
# changing any model.
common_grid = {'n_estimators': [100, 500, 1000],
               'max_depth': [3, 5, 7],
               'min_samples_split': [2, 4, 6],
               'learning_rate': [0.01, 0.1, 1.0],
               'subsample': [0.5, 0.8, 1.0],
               'max_features': [None, 'sqrt', 'log2'],
               'criterion': ['friedman_mse']}

# One sub-grid per loss:
# - 'l' is not a valid loss name; 'squared_error' is the least-squares loss
#   (named 'ls' before scikit-learn 1.0 — adapt if an older version is pinned).
# - alpha is only used by the huber (and quantile) loss and must lie strictly
#   between 0 and 1; the former candidates 1.0 and 2.0 were rejected.
param_grid = [
    {**common_grid, 'loss': ['squared_error']},
    {**common_grid, 'loss': ['huber'], 'alpha': [0.5, 0.75, 0.9]},
]

# Exhaustive search with 5-fold cross-validation (default scorer: the
# estimator's own score, i.e. R²).
# NOTE(review): this is still several thousand candidates times 5 folds;
# consider shrinking the grid before running it.
grid_search = GridSearchCV(gbr, param_grid, cv=5, return_train_score=True)
grid_search.fit(X_train, Y_train)

In [None]:
# Parameter combination with the best mean cross-validated score.
print(grid_search.best_params_)

In [None]:
# score() needs the true targets, and the test set has none (Y_test is None),
# so the former grid_search.score(X_test,) call could only raise.
# Show the mean cross-validated score of the best candidate instead.
grid_search.best_score_

In [None]:
from sklearn.model_selection import RandomizedSearchCV

X, y = X_train, Y_train

# Candidate values drawn by the randomized search.
# (The former GBR.set_params(...) call was dropped: GBR is rebuilt just below,
# so those settings never reached the searched estimator.)
param_grid = {
    'learning_rate': np.linspace(.001, 0.2, 5),
    'max_depth': [4],
    'n_estimators': [500],
    # Only consulted when n_iter_no_change is set, which is not the case here.
    'validation_fraction': [.1, .2, .3],
    'criterion': ['friedman_mse'],
    'subsample': np.linspace(.1, .6, 5),
    # Must be integers; np.linspace(40, 150, 5) produced floats such as 67.5.
    'max_leaf_nodes': [40, 67, 95, 122, 150],
    # Integers are read as a number of features per split. The former float
    # candidates 2.0 to 5.0 were outside the (0, 1] range allowed for fractions.
    # NOTE(review): assumes feature counts were intended (the former 1.0 meant
    # "all features") — confirm.
    'max_features': [1, 2, 3, 4, 5],
}

# Fresh estimator; seeded so that the row subsampling is repeatable.
GBR = GradientBoostingRegressor(random_state=0)

# 'accuracy' is a classification metric and cannot score continuous
# predictions; use a regression scorer (negated RMSE, higher is better).
random_search = RandomizedSearchCV(GBR,
                                   param_grid,
                                   n_iter=5,
                                   cv=6,
                                   scoring='neg_root_mean_squared_error',
                                   random_state=0,
                                   verbose=True)

random_search.fit(X, y)

print(random_search.best_params_)

In [None]:
# GBR.get_params().keys()

In [None]:
# `fit` is the estimator returned by GBR.fit(X_train, Y_train) in the training
# cell; predict the target for the test features with it.
y_pred = fit.predict(X_test)

In [None]:
# Show the predictions as the cell output.
y_pred

In [None]:
# from joblib import dump, load
# dump(GBR, 'model/target_encoding.pkl')

# Submit

In [None]:
# Keep the raw predictions under a separate name; the saving cell below
# reassigns y_pred.
predictions = y_pred

In [None]:
# Write the submission file.
_round = False  # also worth trying: rounding the predictions up or down

name = 'target_encoding_1500_estim'

# Either the rounded predictions or an untouched copy of them.
y_pred = np.round(predictions) if _round else predictions.copy()

sub = pd.DataFrame(y_pred)
submission_file = path + 'submit/' + name + '.csv'
# One 'price' column, with the row position written as an 'index' column.
sub.to_csv(submission_file, index=True, header=['price'], index_label='index')