In [186]:
import catboost
import pandas as pd
import features
import csv

from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingRegressor, StackingRegressor
from sklearn.linear_model import LinearRegression
import lightgbm as lgb
import xgboost as xgb
import catboost as ctb
from sklearn.metrics import mean_squared_error

from sklearn.compose import make_column_transformer

In [114]:
# Load the full dataset and let the project helper build the feature frame.
df = features.prepareDataframe(pd.read_csv("./data/allData.csv"))

# Keep only the hotels that also appear in the test set; rows belonging to any
# other hotel are dropped before training.
listHotelIDdata = pd.unique(df['hotel_id'])  # not used by the cells visible here
listHotelIDTestSet = pd.unique(pd.read_csv("./data/test_set.csv")['hotel_id'])
# .copy() detaches the filtered rows from the original frame, so the column
# assignment below does not trigger pandas' SettingWithCopyWarning.
df = df[df['hotel_id'].isin(listHotelIDTestSet)].copy()

# Extract the target column (price), then remove it from the features.
# "Unnamed: 0" is presumably the index column saved with the CSV -- TODO confirm.
y = df["price"]
df = df.drop(columns=["price", "Unnamed: 0"])

# Merge group and brand into one categorical feature (plain string concatenation).
df["brandgroup"] = df["group"] + df["brand"]
df = df.drop(columns=["group", "brand"])
print(df.shape)


(48533, 12)


In [115]:
# One-hot encode the categorical columns (hotel_id included, as an experiment);
# every other column is passed through unchanged.
columns_transfo = make_column_transformer(
    (OneHotEncoder(), ["brandgroup", "city", "language", "hotel_id"]),
    remainder="passthrough",
)
transformed = columns_transfo.fit_transform(df)
transformed = transformed.toarray()
df = pd.DataFrame(
    transformed,
    columns=columns_transfo.get_feature_names_out(),
)
# Hold out 20% of the rows for evaluation. `df` now has a fresh 0..n-1 index
# while `y` keeps the index of the filtered frame; the split pairs them by position.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    y,
    test_size=0.2,
    random_state=0,
)

In [116]:
# Min-max rescale the features. The per-column min/max are learned from the
# training split only and then re-applied to the held-out split.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [117]:
# Display the one-hot encoded feature frame. This is the frame built before
# scaling; the scaled values live in X_train / X_test.
df

Unnamed: 0,onehotencoder__brandgroup_Accar HotelsIbas,onehotencoder__brandgroup_Accar HotelsMarcure,onehotencoder__brandgroup_Accar HotelsNavatel,onehotencoder__brandgroup_Accar HotelsSafitel,onehotencoder__brandgroup_Boss WesternBoss Western,onehotencoder__brandgroup_Boss WesternJ.Halliday Inn,onehotencoder__brandgroup_Chillton WorldwideChill Garden Inn,onehotencoder__brandgroup_Chillton WorldwideQuadrupletree,onehotencoder__brandgroup_Chillton WorldwideTripletree,onehotencoder__brandgroup_IndependantIndependant,...,onehotencoder__hotel_id_992,onehotencoder__hotel_id_995,remainder__stock,remainder__date,remainder__mobile,remainder__avatar_id,remainder__order_requests,remainder__parking,remainder__pool,remainder__children_policy
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,280.0,44.0,0.0,132.0,1.0,1.0,1.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,56.0,44.0,0.0,132.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,280.0,44.0,0.0,132.0,1.0,1.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,111.0,44.0,0.0,132.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,195.0,44.0,0.0,132.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,73.0,30.0,0.0,91793.0,2.0,0.0,0.0,0.0
48529,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,24.0,30.0,0.0,91793.0,2.0,0.0,0.0,0.0
48530,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,101.0,30.0,0.0,91793.0,2.0,0.0,0.0,0.0
48531,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,103.0,30.0,0.0,91793.0,2.0,1.0,0.0,0.0


In [184]:
# Base learners of the stack. StackingRegressor expects a list of
# (name, estimator) tuples; list.append() takes exactly one argument, so each
# pair has to be a single tuple rather than two positional arguments.
models = [
    ('xg', xgb.XGBRegressor(n_estimators=500, max_depth=5, objective="reg:squarederror")),
    ('ct', ctb.CatBoostRegressor(learning_rate=0.3, depth=11, loss_function='RMSE')),
    ('lg', lgb.LGBMRegressor(boosting_type='gbdt', n_estimators=1500, num_leaves=500, learning_rate=0.05)),
]

# A linear regression combines the base learners' 5-fold cross-validated predictions.
model = StackingRegressor(estimators=models, final_estimator=LinearRegression(), cv=5)
# Disabled experiment: bagging on top of the stack.
# modelBagg = BaggingRegressor(base_estimator=model)


In [185]:
model.fit(X_train, y_train)

# RMSE on both splits. The square root is taken explicitly instead of passing
# `squared=False`, which is deprecated in scikit-learn 1.4 and removed in 1.6;
# this form works on every version.
train_score = mean_squared_error(y_train, model.predict(X_train)) ** 0.5
test_score = mean_squared_error(y_test, model.predict(X_test)) ** 0.5

print("Train Score:", train_score)
print("Test Score:", test_score)

Train Score: 1.1174788556480542
Test Score: 2.6288503478681045


In [120]:
test_data = pd.read_csv("./data/test_set.csv")
test_data = test_data.drop(columns=["index"])
# Add the hotel characteristics via the project helper.
test_data = features.prepareDataframe(test_data)

test_data["hotel_id"] = test_data["hotel_id"].astype(str)
test_data["brandgroup"] = test_data["group"] + test_data["brand"]
test_data = test_data.drop(columns=["group", "brand"])

# One-hot encode the non-numeric columns.
# NOTE(review): the encoder is re-fitted on the test set here rather than reusing
# the one fitted on the training data, so the produced columns can differ from
# the training columns; the rearrangeCol call below is relied on to reconcile them.
columns_transfo = make_column_transformer(
    (OneHotEncoder(), ['brandgroup', 'city', 'language', "hotel_id"]),
    remainder='passthrough')
transformed = columns_transfo.fit_transform(test_data).toarray()
test_data = pd.DataFrame(transformed, columns=columns_transfo.get_feature_names_out())

# presumably aligns the test columns with those of the training frame `df`
# -- confirm in features.rearrangeCol
test_data = features.rearrangeCol(df, test_data)
# print(test_data.columns)

# Rescale with the scaler fitted on the training set.
X_test_data_transformed = scaler.transform(test_data)

# Build the submission rows: positional index + predicted price.
# All rows are predicted in a single call instead of one predict() per row, and
# int() is applied to each scalar (truncating toward zero), which avoids the
# deprecated conversion of a 1-element array to int.
header = ["index", "price"]
predictions = model.predict(X_test_data_transformed)
data = [[i, int(price)] for i, price in enumerate(predictions)]

with open('predictionsKaggle.csv', 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    # write the header
    writer.writerow(header)
    # write data
    writer.writerows(data)

  df[col] = aFaire[col].tolist()
  df[col] = aFaire[col].tolist()
  df[col] = aFaire[col].tolist()
  df[col] = aFaire[col].tolist()
  df[col] = aFaire[col].tolist()
  df[col] = aFaire[col].tolist()
  df[col] = aFaire[col].tolist()
  df[col] = aFaire[col].tolist()
  df[col] = aFaire[col].tolist()
  df[col] = aFaire[col].tolist()
  df[col] = aFaire[col].tolist()
  df[col] = aFaire[col].tolist()
  df[col] = aFaire[col].tolist()
  df[col] = aFaire[col].tolist()
  df[col] = aFaire[col].tolist()
  df[col] = aFaire[col].tolist()
  df[col] = aFaire[col].tolist()
  df[col] = aFaire[col].tolist()
  df[col] = aFaire[col].tolist()
  df[col] = aFaire[col].tolist()
  df[col] = aFaire[col].tolist()
  df[col] = aFaire[col].tolist()
  df[col] = aFaire[col].tolist()
  df[col] = aFaire[col].tolist()
  df[col] = aFaire[col].tolist()
  df[col] = aFaire[col].tolist()
  df[col] = aFaire[col].tolist()
  df[col] = aFaire[col].tolist()
  df[col] = aFaire[col].tolist()
  df[col] = aFaire[col].tolist()
  df[col] 

In [98]:
# Count how many distinct hotels appear in the test set.
# dropna=False counts a missing id as a value of its own, as pd.unique would.
test_data = pd.read_csv("./data/test_set.csv")
test_data["hotel_id"].nunique(dropna=False)

185