In [None]:
from IPython.display import display, HTML
# Inject a CSS rule that sets the notebook's `.container` element to 80% width.
display(HTML("<style>.container { width:80% !important; }</style>"))

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
from math import sqrt, log
sns.set_style("darkgrid")
from tqdm.notebook import tqdm, trange

from sklearn.ensemble import RandomForestRegressor 
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split  
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler  

In [None]:
# Global switch: the exploratory plotting cells below only run when this is True.
prints = False

In [None]:
def timer(start_time=None):
    """Start a stopwatch, or report the time elapsed since it was started.

    Parameters
    ----------
    start_time : datetime.datetime, optional
        Value returned by an earlier ``timer()`` call. When omitted (or
        falsy) a new measurement is started.

    Returns
    -------
    datetime.datetime or None
        The current time when a measurement is started. ``None`` when
        reporting; the elapsed time is printed instead.
    """
    # Imported locally: the notebook's import cell never brings `datetime`
    # into scope, so the previous body raised NameError on its first call.
    from datetime import datetime

    if not start_time:
        return datetime.now()

    elapsed_seconds = (datetime.now() - start_time).total_seconds()
    thour, temp_sec = divmod(elapsed_seconds, 3600)
    tmin, tsec = divmod(temp_sec, 60)
    print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

# Import des données

In [None]:
# Data directory. It keeps its trailing slash, so file names are appended
# without a leading '/' (the previous '/features_hotels.csv' produced '../data//...').
path = '../data/'
hotels = pd.read_csv(path + 'features_hotels.csv')  # hotel characteristics
data = pd.read_csv(path + 'data.csv')                # requests with observed prices (training data)
data_test = pd.read_csv(path + 'test_set.csv')       # requests to predict

In [None]:
# Create the `request_number` column in the test set: the offset of each
# request from its avatar's first `order_requests` value, starting at 1.
# A single grouped transform replaces the previous per-avatar Python loop,
# which rescanned the whole frame once for every avatar.
# NOTE(review): rows with a missing avatar_id would get NaN here (the loop left
# them at 1) — assumes avatar_id is always set in the test set; confirm.
data_test['request_number'] = (
    data_test['order_requests']
    - data_test.groupby('avatar_id')['order_requests'].transform('min')
    + 1
)

In [None]:
# Display the test set, now including the request_number column.
data_test

In [None]:
# Remove every request issued by avatar_id 134, which causes problems
# (per the original author's note).
rows_avatar_134 = data.index[data.avatar_id == 134]
data = data.drop(index=rows_avatar_134)

In [None]:
# Display the training data after removing avatar 134.
data

In [None]:
# The avatar identifier is not used as a feature: drop it, then collapse the
# rows that become exact duplicates.
data = data.drop(columns=['avatar_id']).drop_duplicates()

In [None]:
# Display the training data after dropping avatar_id and duplicate rows.
data

In [None]:
# Attach the hotel characteristics to both sets, matching on (hotel_id, city).
data = pd.merge(data, hotels, on=['hotel_id', 'city'])
# The merge reorders rows, so restore the test set's original order through
# its `index` column.
data_test = pd.merge(data_test, hotels, on=['hotel_id', 'city']).sort_values('index')

# Approche naïve

Ici on génère des prix en utilisant les requêtes les plus proches.

In [None]:
# y_pred = []

# for i in tqdm(data_test.index) :
#     _, order, city, date, lang, mob, avatar, hotel, stock, req, group, brand, park, pool, child = data_test.iloc[i].values
#     data_tmp = data.loc[data.hotel_id == hotel]
#     delta_date = data_tmp.date.values - date
    
#     y_pred.append(data_tmp.iloc[np.argmin(delta_date)].price)

# Mise en forme des données

In [None]:
# Feature columns grouped by kind. The lists are extended / emptied by later cells.
# presumably QT = quantitative and QL = qualitative (see the "QL en QT" section title) — confirm
colQT = ['request_number', 'stock', 'date']
colBool = ['mobile', 'pool', 'parking']
colQL = ['city', 'language', 'group', 'brand', 'children_policy']

In [None]:
# Cast the qualitative and boolean columns to unordered categoricals,
# in the training set and in the test set alike.
for frame in (data, data_test):
    for col in ['city', 'language', 'hotel_id', 'group', 'brand',
                'children_policy', 'mobile', 'parking', 'pool']:
        frame[col] = pd.Categorical(frame[col], ordered=False)

In [None]:
# Sanity check: print the dtype of each boolean feature column.
for bool_col in colBool:
    print(data[bool_col].dtypes)

## Transformation des variables QL en QT

In [None]:
def additive_smoothing(x, alpha=1.96):
    """Return ``N * mean(x + alpha) / (N + alpha * d)`` for a series ``x``.

    ``N`` is the number of non-null values of ``x`` and ``d`` is its range
    (max - min).

    Parameters
    ----------
    x : pandas.Series
        Values to aggregate (used below on the prices of one group).
    alpha : float, default 1.96
        Smoothing constant; per the original author's note, 1.96 is the
        95% bound of the normal distribution.
    """
    n_obs = x.count()
    spread = x.max() - x.min()
    shifted_mean = (x + alpha).mean()
    return n_obs * shifted_mean / (n_obs + alpha * spread)


# Restates the function's name explicitly; pandas uses it to label the
# aggregated column.
additive_smoothing.__name__ = 'additive_smoothing'

In [None]:
# Optional variant: when enabled, hotel_id becomes the only qualitative column
# to encode and 'mobile' the only boolean feature kept.
hotelId = False
if hotelId:
    colQL = ['hotel_id']
    colBool = ['mobile']

In [None]:
# Replace each qualitative column by per-category statistics of the training
# price (mean, variance, additive_smoothing). The statistics are computed on
# the training set only and joined onto both sets.
for qual_col in colQL:
    price_stats = (
        data[[qual_col, 'price']]
        .groupby(qual_col)
        .agg({'price': ['mean', 'var', additive_smoothing]})
        .price
        .add_suffix('_' + qual_col)
    )
    # Register the new columns (e.g. 'mean_city') as quantitative features.
    colQT.extend(list(price_stats.columns.unique()))
    data = data.join(price_stats, on=qual_col)
    data_test = data_test.join(price_stats, on=qual_col)


# Every qualitative column has been encoded; none is left to process.
colQL = []

In [None]:
# Keep the target as the last entry of colQT; later cells rely on
# colQT[:-1] being the features and colQT[-1] the target.
colQT.append('price')

# Analyse de stock

In [None]:
# Mean stock per (hotel, date), then the resulting series for hotel 500.
df_stock = (
    data[['hotel_id', 'date', 'stock']]
    .groupby(['hotel_id', 'date'])
    .mean()
    .reset_index()
)
test = df_stock[df_stock.hotel_id == 500]

sns.scatterplot(x='date', y='stock', data=test)
plt.show()

# Exploration

In [None]:
if prints:
    # Distribution of the target variable.
    plt.figure()
    sns.histplot(data.price)
    plt.show()

## Unidimensionnelle

### QL

In [None]:
if prints:
    # One figure per qualitative column: train distribution on the left,
    # test distribution on the right.
    for col in colQL:
        fig, axes = plt.subplots(1, 2, figsize=(20, 3))
        panels = zip(axes, (data, data_test), ('train set', 'test set'))
        for ax, frame, title in panels:
            sns.histplot(ax=ax, x=frame[col])
            ax.tick_params(labelrotation=45)
            ax.set_title(title)
        plt.show()

### QT

In [None]:
if prints:
    # One figure per quantitative feature (the target, last in colQT, is
    # skipped): train distribution on the left, test on the right.
    for col in colQT[:-1]:
        fig, axes = plt.subplots(1, 2, figsize=(20, 3))
        panels = zip(axes, (data, data_test), ('train set', 'test set'))
        for ax, frame, title in panels:
            sns.histplot(ax=ax, x=frame[col])
            ax.tick_params(labelrotation=45)
            ax.set_title(title)
        plt.show()

## Bidimensionnelle

In [None]:
if prints:
    # Pairwise scatter plots of the quantitative columns (target included)
    # on the training set, with kernel density estimates on the diagonal.
    scatter_matrix(data[colQT], diagonal='kde', figsize=(10, 10), alpha=0.2)
    plt.tight_layout()
    plt.show()

In [None]:
if prints:
    # Same pairwise view on the test set, which has no target column.
    scatter_matrix(data_test[colQT[:-1]], diagonal='kde', figsize=(10, 10), alpha=0.2)
    plt.tight_layout()
    plt.show()

# Transformations

In [None]:
# Switch: when True, the models are trained on a transformed price (see the
# `if transfo` cells) instead of the raw price.
transfo = False

In [None]:
if prints:
    # Distribution of the cube root of the price.
    plt.figure()
    cube_root_price = data['price'].map(lambda price: price ** (1 / 3))
    sns.histplot(cube_root_price)
    plt.show()

In [None]:
if transfo :
    # Train on the cube root of the price (the 'SR' column name notwithstanding,
    # the exponent is 1/3); the export cell undoes it with ** 3.
    data['SRprice'] = data['price'].map(lambda x: x**(1/3))
    # The target is always the last entry of colQT.
    colQT[-1] = 'SRprice'

# Préparation des datasets 

In [None]:
# Switch: when True, the columns listed in colQL are one-hot encoded.
dummies = False

In [None]:
if not dummies:
    # No one-hot block; pd.concat below simply skips the None entries.
    dataDum = dataDum_test = None
else:
    dataDum = pd.get_dummies(data[colQL])
    dataDum_test = pd.get_dummies(data_test[colQL])
    # Warn when train and test do not yield the same number of indicator columns.
    if dataDum.shape[1] != dataDum_test.shape[1]:
        print('nombre de colonnes différent')

In [None]:
# Quantitative features: every colQT entry except the last one (the target).
dataQT, dataQT_test = data[colQT[:-1]], data_test[colQT[:-1]]

In [None]:
# Boolean features, taken as they are from both sets.
dataBool, dataBool_test = data[colBool], data_test[colBool]

In [None]:
# Assemble the design matrices column-wise. When dummies are disabled the
# first entry is None, which pd.concat drops silently.
df = pd.concat([dataDum, dataQT, dataBool], axis='columns')
df_test = pd.concat([dataDum_test, dataQT_test, dataBool_test], axis='columns')
df.head()

In [None]:
# Response variable: the last entry of colQT ('price', or 'SRprice' when transfo is on).
y = data[colQT[-1]]

# Séparation des données

In [None]:
# Switch: True trains on all the data and predicts the real test set;
# False holds out part of the training data to evaluate the models.
submit = True

In [None]:
# Choose the train / test matrices used by every model below.
if not submit:
    # Evaluation mode: hold out 10% of the training data (fixed seed) to
    # test and tune the models.
    X_train, X_test, Y_train, Y_test = train_test_split(df, y, test_size=.1, random_state=11)
else:
    # Submission mode: fit on everything, predict the real test set, for
    # which no target is available.
    X_train, X_test = df, df_test
    Y_train, Y_test = y, None

# Standardisation des données

In [None]:
# Switch: when True, features are standardised with a scaler fitted on X_train.
stand = False

In [None]:
if stand:
    # Keep untouched copies before the matrices are replaced by scaled arrays.
    X_train_nonScale, X_test_nonScale = X_train.copy(), X_test.copy()
    # Fit on the training data only, then apply the same transformation to
    # both sets.
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

# Boosting

In [None]:
# # A parameter grid for XGBoost
# params = {
#         'min_child_weight': [1, 5, 10],
#         'gamma': [0.5, 1, 1.5, 2, 5],
#         'subsample': [0.6, 0.8, 1.0],
#         'colsample_bytree': [0.6, 0.8, 1.0],
#         'max_depth': [3, 5, 7, 10],
#         'learning_rate': [0.01, 0.02, 0.05]    
#         }


# parameters = {'learning_rate': [0.01,0.02,0.03,0.04],
#                   'subsample'    : [0.9, 0.5, 0.2, 0.1],
#                   'n_estimators' : [100,500,1000, 1500],
#                   'max_depth'    : [4,6,8,10]}

Signature de référence (documentation scikit-learn) :

`class sklearn.ensemble.GradientBoostingRegressor(*, loss='squared_error', learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0, init=None, random_state=None, max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False, validation_fraction=0.1, n_iter_no_change=None, tol=0.0001, ccp_alpha=0.0)`

> max_depth : test 4, 8, keep 4

> n_estimators : test 100,500,1000, keep 1000, test more ? 

> subsample : test 0.9, 0.5, 0.2, 0.1, keep 0.2 

> learning_rate : test .02, .05, keep ? and n_estimators : test 1000, 1500 keep ? and subsample : 0.2, 1., keep ? and max_depth : 2, 4, keep ? 
 {'criterion': 'friedman_mse', 'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 1500, 'subsample': 0.2}

In [None]:
%%time
# définition des paramètres
# boost = GradientBoostingRegressor()

GBR = GradientBoostingRegressor()

parameters = {'learning_rate': [0.02],
              'subsample'    : [0.2],
              'n_estimators' : [1000],
              'max_depth'    : [4],
              'criterion'    : ['friedman_mse'],
             }

grid_GBR = GridSearchCV(estimator=GBR, param_grid = parameters, cv = 2, n_jobs=-1)

rfFit = grid_GBR.fit(X_train, Y_train)

print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n",grid_GBR.best_estimator_)
print("\n The best score across ALL searched params:\n",grid_GBR.best_score_)
print("\n The best parameters across ALL searched params:\n",grid_GBR.best_params_)

In [None]:
# apprentissage
# rfFit = GBR.fit(X_train,Y_train)

In [None]:
# Feature importances of the boosting model, highest first.
# They are read from the estimator refitted by the grid search: GridSearchCV
# fits clones of its `estimator`, so the `GBR` template is never fitted and
# reading feature_importances_ on it raises.
feature_importances = zip(X_train.columns, grid_GBR.best_estimator_.feature_importances_)
x = dict(feature_importances)
{k: v for k, v in sorted(x.items(), key=lambda item: -item[1])}

In [None]:
if not submit:
    # Prediction error (1 - R2) on the hold-out set. It has to be printed:
    # a bare expression nested inside an `if` block is not displayed by the
    # notebook, so the previous cell produced no output.
    print("1 - R2 =", 1 - rfFit.score(X_test, Y_test))

In [None]:
# Predict with the fitted boosting search; when a hold-out set exists,
# report its RMSE and R2.
y_pred = rfFit.predict(X_test)
if not submit:
    rmse = np.sqrt(mean_squared_error(y_pred, Y_test))
    print("RMSE =", rmse)
    print("R2 =", r2_score(Y_test, y_pred))

# Random Forest

In [None]:
# Random forest with 500 fully grown trees, using every feature at each split.
# A fixed random_state (same seed as the train/test split) makes the bootstrap
# samples, and therefore the OOB score, repeatable from run to run.
forest = RandomForestRegressor(n_estimators=500,
                               max_depth=None,
                               min_samples_split=2, min_samples_leaf=1,
                               max_features=1.0, max_leaf_nodes=None,
                               bootstrap=True, oob_score=True,
                               random_state=11)
# Fit, then print the out-of-bag error (1 - OOB score).
# Note: this rebinds rfFit, previously the boosting grid search.
rfFit = forest.fit(X_train,Y_train)
print(1-rfFit.oob_score_)

In [None]:
# Tune max_features (2 to 9) for a 100-tree forest with 5-fold cross-validation.
# A fixed random_state keeps the comparison between candidates repeatable.
param = [{"max_features": list(range(2, 10))}]
rf = GridSearchCV(RandomForestRegressor(n_estimators=100, random_state=11), param, cv=5, n_jobs=-1)
rfOpt = rf.fit(X_train, Y_train)
# Best setting; the printed value is 1 - best cross-validated score.
print("Meilleur score = %f, Meilleur paramètre = %s" % (1. - rfOpt.best_score_,rfOpt.best_params_))

In [None]:
if not submit:
    # Prediction error (1 - R2) of the tuned forest on the hold-out set.
    # It has to be printed: a bare expression nested inside an `if` block is
    # not displayed by the notebook, so the previous cell produced no output.
    print("1 - R2 =", 1 - rfOpt.score(X_test, Y_test))

In [None]:
# Predict with the tuned random forest (this overwrites the boosting
# predictions in y_pred); when a hold-out set exists, report RMSE and R2.
y_pred = rfOpt.predict(X_test)
if not submit:
    rmse = np.sqrt(mean_squared_error(y_pred, Y_test))
    print("RMSE =", rmse)
    print("R2=", r2_score(Y_test, y_pred))

# Export submission

In [None]:
# Quick look at the predictions that the export cell below will write.
print(y_pred)

In [None]:
# Save the submission.
# NOTE(review): y_pred holds the output of whichever model cell ran last (both
# the boosting and the random-forest cells assign it) — make sure `name`
# matches the model actually exported.
name = 'boostingBasicGS5'

# Undo the cube-root target transform when it was applied, then round.
# Previously the transformed branch replaced the rounded values with
# unrounded ones, so the two modes produced differently formatted prices.
newY = np.round(y_pred**3 if transfo else y_pred)
if submit :
    # Label each prediction with the test set's own `index` value instead of
    # its position, so the file stays correct even if `index` is not exactly
    # 0..n-1 (data_test is sorted by `index`, in the same row order as X_test).
    sub = pd.DataFrame({'price': newY}, index=data_test['index'].to_numpy())
    sub.to_csv(path + 'submit/' + name + '.csv',index=True, header=['price'], index_label = 'index')

# A faire

> gridsearch sur alpha 

> ajouter une variable écart de temps

> fusionner children policy 1 et 2

> transfos sur le prix

> transfos sur autres variables QT

> soumission à faire:


1) RF 


2) tester hotelId = True 
