In [5]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score
from sklearn.model_selection import GroupShuffleSplit

# Load the data
#df = pd.read_csv("datasets/feature_engineered_data.csv")
df = pd.read_csv("datasets/feature_0.1_sample.csv")
df.head()

Unnamed: 0,srch_id,site_id,visitor_location_country_id,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,...,prop_starrating_is_zero,year,month,day,hour,weekend,is_international_stay,children_accepted,mean_day_stay,season_stay
0,1,12,187,219,893,0.6,0.7,1.0,0.405444,0.0438,...,0.0,1.0,0.272727,0.5,0.347826,0.0,1.0,0.0,0.254795,1
1,1,12,187,219,10404,0.8,0.8,1.0,0.315186,0.0149,...,0.0,1.0,0.272727,0.5,0.347826,0.0,1.0,0.0,0.254795,1
2,1,12,187,219,21315,0.6,0.9,1.0,0.315186,0.0245,...,0.0,1.0,0.272727,0.5,0.347826,0.0,1.0,0.0,0.254795,1
3,1,12,187,219,27348,0.4,0.8,1.0,0.405444,0.0125,...,0.0,1.0,0.272727,0.5,0.347826,0.0,1.0,0.0,0.254795,1
4,1,12,187,219,29604,0.8,0.7,1.0,0.378223,0.1241,...,0.0,1.0,0.272727,0.5,0.347826,0.0,1.0,0.0,0.254795,1


## XGBoost rank:ndcg function


In [10]:
def xgb_boost(xgb_params, df):
    # Split the data into train and validation sets
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

    # Target still needs to be defined differently
    target_col = 'position'

    # Train the XGBoost model on target
    X_train = train_df.drop(['position', 'click_bool', 'booking_bool'], axis=1)
    y_train = train_df[[target_col]] 

    # The training data matrix
    dtrain = xgb.DMatrix(X_train, y_train, enable_categorical=True)

    # Train the model
    xgb_model = xgb.train(xgb_params, dtrain, num_boost_round=1)

    # Define the validation data
    X_val = test_df.drop(['position', 'click_bool', 'booking_bool'], axis=1)

    # This is the validation data matrix
    dtest = xgb.DMatrix(X_val)

    # Make predictions on the validation set
    preds = xgb_model.predict(dtest)

    print([preds])

    # Prepare the true positions for the test data
    true_positions = test_df[target_col]

    print([true_positions])

    # Compute NDCG@5 score by comparing the predicted scores with the true positions
    ndcg_5 = ndcg_score([true_positions], [preds], k=5)

    # Print the NDCG@5 score
    print('NDCG@5 score on the test data:', ndcg_5)

# Define the XGBoost parameters
xgb_params = {
    "objective": "rank:ndcg",
    "eval_metric": "ndcg@5",
    "max_depth": 6,
    "eta": 0.1,
    "subsample": 0.7,
    "colsample_bytree": 0.7,
    "seed": 42
}

# Call the XGBoost function
xgb_boost(xgb_params, df)


[array([0.5, 0.5, 0.5, ..., 0.5, 0.5, 0.5], dtype=float32)]
[224887    27
61027     10
136455    18
264529    13
40617     21
          ..
200546    15
251039    16
129709     3
373923    29
436745     3
Name: position, Length: 99167, dtype: int64]
NDCG@5 score on the test data: 0.42830434590633565


## xgb.XGBRanker function

In [17]:
def xgb_ranker(df):
    gss = GroupShuffleSplit(test_size=.30, n_splits=1, random_state = 7).split(df, groups=df['srch_id'])

    X_train_inds, X_test_inds = next(gss)

    train_data= df.iloc[X_train_inds]
    X_train = train_data.loc[:, ~train_data.columns.isin(['srch_id','position', 'click_bool', 'booking_bool'])]
    y_train = train_data.loc[:, train_data.columns.isin(['position'])]

    groups = train_data['srch_id'].value_counts().sort_index()

    test_data= df.iloc[X_test_inds]
    #We need to keep the id for later predictions
    X_test = test_data.loc[:, ~test_data.columns.isin(['position', 'click_bool', 'booking_bool'])]
    y_test = test_data.loc[:, test_data.columns.isin(['position'])]

    # Define model parameters
    model = xgb.XGBRanker(  
        tree_method='auto',
        booster='gbtree',
        objective='rank:pairwise', #ndcg
        random_state=42, 
        learning_rate=0.1,
        colsample_bytree=0.9, 
        eta=0.05, 
        max_depth=6, 
        n_estimators=110, 
        subsample=0.75 
        )

    # Train the model
    model.fit(X_train, y_train, group=groups, verbose=True)

    temp = pd.concat([X_test, y_test], axis=1)[['srch_id','position']]
    # making the predictions
    temp['prediction'] = model.predict(X_test.drop('srch_id', axis=1))
    # getting lists of scores and predicted values for each srch_id
    temp = temp.groupby('srch_id').agg({'position':list, 'prediction':list}).reset_index()
    # calculating NDCG scores and adding them as in a column
    temp['NDCG'] = temp.apply(lambda x: ndcg_score(np.array([x.score]), np.array([x.prediction])), axis=1)

    


# Call the XGBoost function
xgb_ranker(df)


AttributeError: 'Series' object has no attribute 'score'

In [16]:
def xgb_ranker(df):
    gss = GroupShuffleSplit(test_size=.30, n_splits=1, random_state = 7).split(df, groups=df['srch_id'])

    X_train_inds, X_test_inds = next(gss)

    train_data= df.iloc[X_train_inds]
    X_train = train_data.loc[:, ~train_data.columns.isin(['srch_id','position', 'click_bool', 'booking_bool'])]
    y_train = train_data.loc[:, train_data.columns.isin(['position'])]

    groups = train_data.groupby('srch_id').size().to_frame('size')['size'].to_numpy()

    test_data= df.iloc[X_test_inds]

    #We need to keep the id for later predictions
    X_test = test_data.loc[:, ~test_data.columns.isin(['position', 'click_bool', 'booking_bool'])]
    y_test = test_data.loc[:, test_data.columns.isin(['srch_id'])]

    # Define model parameters
    model = xgb.XGBRanker(  
        tree_method='auto',
        booster='gbtree',
        objective='rank:pairwise',
        random_state=42, 
        learning_rate=0.1,
        colsample_bytree=0.9, 
        eta=0.05, 
        max_depth=6, 
        n_estimators=110, 
        subsample=0.75 
        )

    # Train the model
    model.fit(X_train, y_train, group=groups, verbose=True)

    # Make predictions on the test set
    def predict(model, df):
        return model.predict(df.loc[:, ~df.columns.isin(['srch_id'])])
  
    predictions = (X_test.groupby('srch_id').apply(lambda x: predict(model, x)))

    print(y_test)
    print(predictions)
    
    # Compute NDCG@5 score by comparing the predicted scores with the true positions
    ndcg_5 = ndcg_score([y_test], [predictions], k=5)

    # Print the NDCG@5 score
    print('NDCG@5 score on the test data:', ndcg_5)


# Call the XGBoost function
xgb_ranker(df)


KeyboardInterrupt: 

## Experimenting 

In [13]:
# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

train_df.drop(['target', 'orig_destination_distance'], axis=1, inplace=True)

# Define the XGBoost parameters
xgb_params = {
    "objective": "rank:ndcg",
    "eval_metric": "ndcg@5",
    "max_depth": 6,
    "eta": 0.1,
    "subsample": 0.7,
    "colsample_bytree": 0.7,
    "seed": 42
}


# Train the XGBoost model on target
X_train = train_df.drop(['position', 'click_bool', 'booking_bool'], axis=1)
y_train = train_df[['position']] 
dtrain = xgb.DMatrix(X_train, y_train, enable_categorical=True)
xgb_model = xgb.train(xgb_params, dtrain)


# ------ THIS PART IS USED FOR THE VALIDATION SET ------
# X_val = val_df.drop(['position', 'click_bool', 'booking_bool'], axis=1)

# dtest = xgb.DMatrix(X_val)
# preds = xgb_model.predict(dtest)

# # Format the predictions into the required submission format
# val_df['pred'] = preds
# val_df = val_df.sort_values(['srch_id', 'pred'], ascending=[True, False])
# val_df['rank'] = val_df.groupby('srch_id')['srch_id'].rank(method='first')
# val_df = val_df[['srch_id', 'prop_id', 'rank', 'click_bool', 'booking_bool']]
# ------ END VALIDATION SET ------

# ------ THIS PART IS USED FOR THE TEST SET ------
dtest = xgb.DMatrix(test_df, enable_categorical=True)
preds = xgb_model.predict(dtest)

# Format the predictions into the required submission format
test_df['pred'] = preds
test_df = test_df.sort_values(['srch_id', 'pred'], ascending=[True, False])
test_df['rank'] = test_df.groupby('srch_id')['srch_id'].rank(method='first')
test_df = test_df[['srch_id', 'prop_id', 'rank']]
# ------ END TEST SET ------


In the predicted validation set some 'srch_id' contain only one instance. This is due to the small sample size and the train/val split which can result in only one instance being present in the validation data


In [14]:
test_df.head(60)

Unnamed: 0,srch_id,prop_id,rank
0,1,3180,1.0
1,1,5543,2.0
2,1,14142,3.0
3,1,22393,4.0
4,1,24194,5.0
5,1,28181,6.0
6,1,34263,7.0
7,1,37567,8.0
8,1,50162,9.0
9,1,54937,10.0


In [15]:
submission_df = test_df[['srch_id', 'prop_id']]
submission_df

Unnamed: 0,srch_id,prop_id
0,1,3180
1,1,5543
2,1,14142
3,1,22393
4,1,24194
...,...,...
4959178,332787,32019
4959179,332787,33959
4959180,332787,35240
4959181,332787,94437


In [16]:
submission_df.to_csv('datasets/submission.csv', index=False)