In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Load the data
df = pd.read_csv("datasets/data_cleaned.csv")
df.head()

Unnamed: 0,srch_id,site_id,visitor_location_country_id,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,...,srch_saturday_night_bool,orig_destination_distance,random_bool,click_bool,booking_bool,prop_review_score_is_nan,srch_query_affinity_score_is_nan,prop_review_score_is_zero,prop_starrating_is_zero,target
0,1,12,187,219,893,3,3.5,1,2.83,0.0438,...,1,1776.833608,1,0,0,0,1,0,0,0
1,1,12,187,219,10404,4,4.0,1,2.2,0.0149,...,1,1760.058186,1,0,0,0,1,0,0,0
2,1,12,187,219,21315,3,4.5,1,2.2,0.0245,...,1,1760.820221,1,0,0,0,1,0,0,0
3,1,12,187,219,27348,2,4.0,1,2.83,0.0125,...,1,1786.456451,1,0,0,0,1,0,0,0
4,1,12,187,219,29604,4,3.5,1,2.64,0.1241,...,1,1857.881111,1,0,0,0,1,0,0,0


In [19]:
test_df = pd.read_csv('datasets/test_data_cleaned.csv')

In [23]:
# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Define the XGBoost parameters
xgb_params = {
    "objective": "rank:ndcg",
    "eval_metric": "ndcg@5",
    "max_depth": 6,
    "eta": 0.1,
    "subsample": 0.7,
    "colsample_bytree": 0.7,
    "seed": 42
}

# Train the XGBoost model
X_train = train_df.drop(['position', 'click_bool', 'booking_bool'], axis=1)
y_train = train_df[['target']] # , ['click_bool', 'booking_bool']
dtrain = xgb.DMatrix(X_train, y_train)
xgb_model = xgb.train(xgb_params, dtrain)


# ------ THIS PART IS USED FOR THE VALIDATION SET ------
# X_val = val_df.drop(['position', 'click_bool', 'booking_bool'], axis=1)

# dtest = xgb.DMatrix(X_val)
# preds = xgb_model.predict(dtest)

# # Format the predictions into the required submission format
# val_df['pred'] = preds
# val_df = val_df.sort_values(['srch_id', 'pred'], ascending=[True, False])
# val_df['rank'] = val_df.groupby('srch_id')['srch_id'].rank(method='first')
# val_df = val_df[['srch_id', 'prop_id', 'rank', 'click_bool', 'booking_bool']]
# ------ END VALIDATION SET ------

# ------ THIS PART IS USED FOR THE TEST SET ------
dtest = xgb.DMatrix(test_df)
preds = xgb_model.predict(dtest)

# Format the predictions into the required submission format
test_df['pred'] = preds
test_df = test_df.sort_values(['srch_id', 'pred'], ascending=[True, False])
test_df['rank'] = test_df.groupby('srch_id')['srch_id'].rank(method='first')
test_df = test_df[['srch_id', 'prop_id', 'rank']]
# ------ END TEST SET ------


In the predicted validation set some 'srch_id' contain only one instance. This is due to the small sample size and the train/val split which can result in only one instance being present in the validation data


In [24]:
test_df.head(60)

Unnamed: 0,srch_id,prop_id,rank
0,1,3180,1.0
1,1,5543,2.0
2,1,14142,3.0
3,1,22393,4.0
4,1,24194,5.0
5,1,28181,6.0
6,1,34263,7.0
7,1,37567,8.0
8,1,50162,9.0
9,1,54937,10.0


In [25]:
submission_df = test_df[['srch_id', 'prop_id']]
submission_df

Unnamed: 0,srch_id,prop_id
0,1,3180
1,1,5543
2,1,14142
3,1,22393
4,1,24194
...,...,...
4959178,332787,32019
4959179,332787,33959
4959180,332787,35240
4959181,332787,94437


In [26]:
submission_df.to_csv('datasets/submission.csv', index=False)