In [16]:
import pandas as pd, pickle, numpy as np
from sklearn.metrics import ndcg_score
# Seed NumPy's legacy global RNG so that code drawing from np.random
# (e.g. np.random.shuffle) produces the same sequence on a fresh kernel run.
np.random.seed(42)

## Random Baseline

In [21]:
def dcg_score(gains):
    """Discounted cumulative gain of ``gains``, taken in the order given.

    The gain at 0-based position ``i`` is divided by ``log2(i + 2)``, so the
    first item is undiscounted (``log2(2) == 1``) and later items count
    progressively less. An empty sequence yields ``0``.
    """
    total = 0
    # Starting the counter at 2 gives the discount argument (i + 2) directly.
    for discount_arg, gain in enumerate(gains, start=2):
        total = total + gain / np.log2(discount_arg)
    return total


def ndcg(gains, at=5):
    """Normalised discounted cumulative gain at cut-off ``at`` (NDCG@at).

    Parameters
    ----------
    gains : sequence of numbers
        Relevance gains listed in predicted rank order (best-ranked first).
        Must support ``len`` and slicing, and contain at least ``at`` items.
    at : int, default 5
        Number of top-ranked items to evaluate.

    Returns
    -------
    float
        DCG of the first ``at`` gains divided by the DCG of the ``at``
        largest gains (the ideal ordering). Returns ``0.0`` when the ideal
        DCG is zero, which avoids a division by zero.

    Raises
    ------
    AssertionError
        If fewer than ``at`` gains are supplied.
    """
    assert len(gains) >= at, f"Trying to calculate NDCG@{at} while having {len(gains)} objects"
    dcg = dcg_score(gains[:at])
    # Ideal ordering: the largest gains first, truncated to the same cut-off.
    idcg = dcg_score(sorted(gains, reverse=True)[:at])
    if idcg == 0.:
        # Return a float so the result type matches the non-degenerate path.
        return 0.0
    return dcg / idcg

In [11]:
# Read the feature frames for the test, train and validation splits (in that order).
X_test, X_train, X_val = [
    pd.read_feather(feature_path)
    for feature_path in (
        '../data/X_test.feather',
        '../data/X_train.feather',
        '../data/X_val.feather',
    )
]

In [17]:
# Read the target frames for the test, train and validation splits (in that order).
y_test, y_train, y_val = [
    pd.read_feather(target_path)
    for target_path in (
        '../data/y_test.feather',
        '../data/y_train.feather',
        '../data/y_val.feather',
    )
]

In [12]:
def random_baseline(df):
    """Rank the properties of every search in a uniformly random order.

    Each row of ``df`` is given a distinct random ordinal. Rows are then
    sorted by search id (ascending) and by that ordinal (descending), and the
    ordinal is dropped, leaving a randomly ordered property list per search.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``srch_id`` and ``prop_id``. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``SearchId`` and ``PropertyId``, one row per row of ``df``,
        carrying the index labels of the corresponding input rows.

    Notes
    -----
    The shuffle draws from NumPy's global RNG (``np.random``), so the result
    depends on the current global seed/state.
    """
    ordinals = np.arange(len(df))
    np.random.shuffle(ordinals)

    # pandas is available under the alias ``pd``; the bare name ``pandas``
    # is not bound in this notebook and would raise NameError.
    predictions = pd.DataFrame({'SearchId': df["srch_id"], 'PropertyId': df["prop_id"], 'ord': ordinals})

    # Chain instead of inplace=True: same result, no in-place mutation.
    return (
        predictions
        .sort_values(by=['SearchId', 'ord'], ascending=[True, False])
        .drop(columns=['ord'])
    )

# Build a random ranking for each split; these prediction sets still need to be scored.
# random_baseline shuffles with NumPy's global RNG, so the order of these calls
# determines which permutation each split receives.
pred_test = random_baseline(X_test)
pred_train = random_baseline(X_train)
pred_val = random_baseline(X_val)