In [None]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from datetime import datetime

from sklearn.ensemble import HistGradientBoostingRegressor, GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn.utils import resample
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingRandomSearchCV

In [None]:
# Load the feature-engineered training set.
training_csv = 'data/FE_training_set.csv'
data = pd.read_csv(training_csv)

In [None]:
# Combined regression target: a booking contributes 4 and a click contributes 1.
# Rows where either flag is missing yield NaN and are mapped to 0.
# The whole column is built in one expression and assigned once: the previous
# `data['target'].loc[mask] = 0` was a chained assignment, which pandas may apply
# to a temporary copy (and never applies to `data` under Copy-on-Write).
data['target'] = (
    (4 * data['booking_bool'] + data['click_bool'])
    .fillna(0)
    .astype(int)
)

In [None]:
# Distribution of the combined target (4 * booking_bool + click_bool) over all rows.
data['target'].hist()

In [None]:
# Exact number of rows per target value.
data['target'].value_counts()

In [None]:
# Preview the first rows of the training frame, including the new `target` column.
data.head()

In [None]:
# Collect the distinct search ids that contain at least one booked row.
has_booking = data['booking_bool'] == 1
ids = data.loc[has_booking, 'srch_id'].unique()

In [None]:
# Keep only rows belonging to searches that ended in a booking.
in_booked_search = data['srch_id'].isin(ids)
data = data[in_booked_search]

In [None]:
# Undersample the target == 0 rows so there are exactly as many of them as there
# are rows with a non-zero target (clicked and/or booked).
# Columns excluded from the model inputs: the search id, the outcome flags and
# the other columns the original cell dropped.
non_feature_cols = ['srch_id', 'click_bool', 'gross_bookings_usd', 'booking_bool', 'position', 'target']
is_negative = data['target'] == 0

X_under, y_under = resample(
    # Select rows first, then drop columns, so the full frame is not copied.
    data.loc[is_negative].drop(columns=non_feature_cols),
    data.loc[is_negative, 'target'],
    # Vectorised count of non-zero targets (builtin sum() iterates element by element).
    n_samples=int((~is_negative).sum()),
    replace=False,
    # Fixed seed so the drawn sample is the same after Restart & Run All.
    random_state=42,
)

In [None]:
# Build the balanced set: the undersampled target == 0 rows followed by every row
# with a non-zero target. Features and labels are concatenated in the same order
# so they stay aligned row for row.
is_positive = data['target'] != 0

X_bal = pd.concat([
    X_under,
    data.loc[is_positive].drop(
        columns=['srch_id', 'click_bool', 'gross_bookings_usd', 'booking_bool', 'position', 'target']
    ),
])

# pd.concat replaces Series.append, which was removed in pandas 2.0.
y_bal = pd.concat([y_under, data.loc[is_positive, 'target']])

In [None]:
# Hold out part of the balanced sample for evaluation (train_test_split's default
# split size). The fixed seed keeps the split, and therefore the reported test
# MSE, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X_bal, y_bal, random_state=42)

In [None]:
# Columns that the alternative HistGradientBoostingRegressor baseline
# (max_leaf_nodes=None, max_depth=4, categorical_features=categoricals) would
# treat as categorical. The GradientBoostingRegressor fitted below does not use
# this list.
categoricals = ['prop_country_id', 'prop_starrating','prop_brand_bool', 'promotion_flag',
                'srch_saturday_night_bool', 'random_bool']

# Missing values are replaced by the sentinel -9999; this worked much better
# than just replacing them with 0.
X_train = X_train.fillna(-9999)

# max_features is the square root of the feature count, so each split considers
# a random subset of features; random_state makes the fit reproducible.
baseline = GradientBoostingRegressor(
    max_leaf_nodes=None,
    max_depth=4,
    max_features=int(np.sqrt(X_train.shape[1])),
    random_state=42,
)
baseline.fit(X_train, y_train)

In [None]:
# Score every row of `data` using the same feature columns and the same
# missing-value sentinel as in training.
all_features = data.drop(
    columns=['srch_id', 'click_bool', 'gross_bookings_usd', 'booking_bool', 'position', 'target']
)
pred = baseline.predict(all_features.fillna(-9999))

In [None]:
# MSE over every row of `data`. This includes rows the model was fitted on, so it
# is not a held-out estimate.
mean_squared_error(data['target'], pred)

In [None]:
# Held-out error: apply the training-time sentinel to the test split, predict, score.
X_test_filled = X_test.fillna(-9999)
pred_test = baseline.predict(X_test_filled)
mean_squared_error(y_test, pred_test)

In [None]:
# Inspect the raw predicted scores for the rows of `data`.
pred

In [None]:
# Distribution of target values in the training split of the balanced sample.
plt.hist(y_train)

## Predict the test dataset and bring it into the requested format

In [None]:
# Load the feature-engineered test set.
test_csv = 'data/FE_test_set.csv'
test_data = pd.read_csv(test_csv)

In [None]:
# Preview the first rows of the test frame.
test_data.head()

In [None]:
# Model inputs for the test set: every column except the search id.
X = test_data.drop(columns=['srch_id'])

In [None]:
# 'NUL' is a textual missing-value marker in comp4_rate. Convert only that column
# to numeric, turning unparseable entries such as 'NUL' into NaN.
# The previous `X[X['comp4_rate'] == 'NUL'] = np.nan` used a boolean row mask, so
# it blanked EVERY column of the matching rows and discarded all their features.
# Idempotent: re-running on an already numeric column changes nothing.
X['comp4_rate'] = pd.to_numeric(X['comp4_rate'], errors='coerce')

In [None]:
# Score the test rows using the same missing-value sentinel as in training.
X_filled = X.fillna(-9999)
test_data['prediction'] = baseline.predict(X_filled)

In [None]:
# Convert only the comp4_rate column to numeric, turning the textual marker 'NUL'
# (and any other unparseable entry) into NaN. A boolean row mask assignment such
# as `X[mask] = np.nan` would instead blank every column of the matching rows.
# Idempotent: a no-op when the column is already numeric.
X['comp4_rate'] = pd.to_numeric(X['comp4_rate'], errors='coerce')

In [None]:
# Fill missing values with the training sentinel, then store the model score per row.
test_features = X.fillna(-9999)
test_data['prediction'] = baseline.predict(test_features)

In [None]:
# Final ordering: group rows by search id and, within each search, put the
# highest predicted score first. The index is renumbered after sorting.
sort_keys = ['srch_id', 'prediction']
sort_ascending = [True, False]
test_data.sort_values(
    by=sort_keys,
    ascending=sort_ascending,
    axis=0,
    ignore_index=True,
    inplace=True,
)

In [None]:
# Put the timestamp before the extension so the file keeps its .csv suffix, and
# use a compact format: str(datetime.now()) contains spaces and colons, which
# are awkward in paths and invalid in Windows filenames.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
filename = f'data/predictions/prediction_{timestamp}.csv'

# Only the (srch_id, prop_id) pairs are written, in the current row order of test_data.
test_data[['srch_id', 'prop_id']].to_csv(filename, index=False)