# Assignment 2B: Ranking

This notebook contains the skeleton for training a model and then applying it to produce a document ranking.

## Loading the precomputed features

The code below loads the precomputed features and combines them into feature vectors for query-document pairs.

For this part to work, you'll need to run the `1_Feature_computation` notebook first to generate the feature files (`data/train_features.csv` and `data/test_features.csv`) that are loaded below.

In [None]:
import json
import pandas as pd
import pickle

from IPython.display import clear_output

from sklearn.model_selection import GridSearchCV

In [None]:
# Relevance judgements as a nested mapping:
#   query id (str) -> {document id (str) -> relevance label (int)}
qrels = {}

with open("data/qrels.csv") as qrels_file:
    for raw_line in qrels_file:
        fields = raw_line.rstrip().split(',')
        query_id = fields[0]

        # Skip the CSV header row.
        if query_id == 'QueryId':
            continue

        # Create the per-query dict on first sight, then record the label.
        qrels.setdefault(query_id, {})[fields[1]] = int(fields[2])

In [None]:
train_X = pd.read_csv('data/train_features.csv')

# Relevance label for every (query, document) row of the training features.
# `DataFrame.iterrows()` yields (index, row) tuples, so subscripting the loop
# variable with a column name raises TypeError; the two id columns are zipped
# directly instead.
# The ids are passed through `str()` because `qrels` is keyed by the raw CSV
# strings, whereas pandas may have parsed numeric-looking ids as integers.
train_y = [
    qrels[str(q_id)][str(doc_id)]
    for q_id, doc_id in zip(train_X['q_id'], train_X['doc_id'])
]

In [None]:
# Features of the query-document pairs that will be ranked further below.
test_X = pd.read_csv(filepath_or_buffer='data/test_features.csv')

In [None]:
from sklearn import preprocessing

# Fit a single label encoder on the document ids of BOTH splits so that every
# id seen in either the training or the test features gets an integer code.
Encoder = preprocessing.LabelEncoder()
all_doc_ids = pd.concat([train_X['doc_id'], test_X['doc_id']], axis=0)

Encoder.fit(all_doc_ids)

In [None]:
# Replace the raw document ids with their integer codes in both feature frames.
# NOTE: this overwrites the column in place, so the cell must run exactly once
# per kernel session (re-running it would try to encode already-encoded ids).
for feature_frame in (train_X, test_X):
    feature_frame['doc_id'] = Encoder.transform(feature_frame['doc_id'])

## Training a model

Training needs to be done differently based on the scenario:

  * **Scenario 1**: The model is trained using cross-validation, that is on 4/5 of queries, then applied on the remaining 1/5 of queries (repeated 5 times).
  * **Scenario 2**: The model is trained on all available training data.
  
The feature vectors at this point are already created. These should contain both (a) the training queries and (b) the queries on which you want to apply your model.

Train your model on queries (a). For that you'll also need to load the corresponding relevance labels.

In [None]:
# Taken from 'https://www.kaggle.com/davidgasquez/ndcg-scorer'

"""Metrics to compute the model performance."""

import numpy as np
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import make_scorer


def dcg_score(y_true, y_score, k=5):
    """Discounted cumulative gain (DCG) at rank K.

    The items are ordered by decreasing ``y_score``; the ``k`` best-scored
    items contribute an exponential gain ``2**relevance - 1`` discounted by
    ``log2(position + 1)`` (positions start at 1).

    Parameters
    ----------
    y_true : array, shape = [n_items]
        Ground truth relevance of each item.
    y_score : array, shape = [n_items]
        Predicted score of each item (higher means ranked earlier).
    k : int
        Rank cut-off.

    Returns
    -------
    score : float
    """
    # Indices of the k highest-scored items, best first.
    ranked = np.argsort(y_score)[::-1][:k]
    top_relevance = np.take(y_true, ranked)

    exponential_gain = 2 ** top_relevance - 1

    # 1-based rank positions of the retained items.
    positions = np.arange(1, len(top_relevance) + 1)
    return np.sum(exponential_gain / np.log2(positions + 1))


def ndcg_score(ground_truth, predictions, k=5):
    """Normalized discounted cumulative gain (NDCG) at rank K.

    Normalized Discounted Cumulative Gain (NDCG) measures the performance of a
    recommendation system based on the graded relevance of the recommended
    entities. It varies from 0.0 to 1.0, with 1.0 representing the ideal
    ranking of the entities.

    Two input layouts are supported:

    * 2-D ``predictions`` (one row of class probabilities per sample): each
      sample is scored on its own against the one-hot encoded true label and
      the per-sample NDCG values are averaged.
    * 1-D ``predictions`` (one relevance score per item, e.g. the output of a
      regressor): the items are treated as a single ranked list and NDCG@k of
      that list is returned. Without this branch every "row" would be a scalar
      and the result would no longer depend on the predicted scores at all.

    Parameters
    ----------
    ground_truth : array, shape = [n_samples]
        Ground truth (true labels represented as integers).
    predictions : array, shape = [n_samples, n_classes] or [n_samples]
        Predicted probabilities, or predicted relevance scores when 1-D.
    k : int
        Rank.

    Returns
    -------
    score : float
        0 is used wherever the ideal DCG is zero (nothing relevant to rank).

    Example
    -------
    >>> ground_truth = [1, 0, 2]
    >>> predictions = [[0.15, 0.55, 0.2], [0.7, 0.2, 0.1], [0.06, 0.04, 0.9]]
    >>> score = ndcg_score(ground_truth, predictions, k=2)
    1.0
    >>> predictions = [[0.9, 0.5, 0.8], [0.7, 0.2, 0.1], [0.06, 0.04, 0.9]]
    >>> score = ndcg_score(ground_truth, predictions, k=2)
    0.6666666666
    """
    scores = np.asarray(predictions)

    # Regression-style input: one score per item, rank the whole list once.
    if scores.ndim == 1:
        relevance = np.asarray(ground_truth)
        best = dcg_score(relevance, relevance, k)
        if best == 0:
            return 0.0
        return float(dcg_score(relevance, scores, k) / best)

    # Classification-style input: one-hot encode the true labels.
    # NOTE(review): the class range is derived from the number of samples, as
    # in the referenced Kaggle source; presumably this guarantees a multi-class
    # (one column per class) encoding - confirm before changing it.
    lb = LabelBinarizer()
    lb.fit(range(len(scores) + 1))
    T = lb.transform(ground_truth)

    per_sample = []

    # Iterate over each y_true and compute the DCG score
    for y_true, y_score in zip(T, scores):
        actual = dcg_score(y_true, y_score, k)
        best = dcg_score(y_true, y_true, k)
        # An ideal DCG of zero means the label was not encoded; score it as 0
        # explicitly instead of swallowing every exception.
        per_sample.append(float(actual) / float(best) if best else 0)

    return np.mean(per_sample)


# NDCG scorer for model selection: calls ndcg_score(y_true, estimator.predict(X), k=5).
# The `needs_proba` argument is omitted on purpose: False is its default, and the
# parameter is deprecated in recent scikit-learn releases (replaced by
# `response_method`), so passing it only produces warnings or errors there.
ndcg_scorer = make_scorer(ndcg_score, k=5)

In [None]:
from sklearn.model_selection import GroupKFold
from sklearn.tree import DecisionTreeRegressor

# Hyper-parameter grid explored by the search.
parameters = {
    'max_depth': list(range(2, 20, 2)),
    'min_samples_split': list(range(200, 2001, 200)),
}

# Scenario 1 asks for cross-validation over QUERIES (train on 4/5 of the
# queries, validate on the remaining 1/5). A plain `cv=5` splits rows, so
# documents of one query could end up in both the training and the validation
# fold; GroupKFold keeps all rows of a query in the same fold.
# Requires at least 5 distinct query ids in the training data.
gridSearchResult = GridSearchCV(
    DecisionTreeRegressor(random_state=0),
    parameters,
    cv=GroupKFold(n_splits=5),
    scoring=ndcg_scorer,
    verbose=5,
).fit(train_X, train_y, groups=train_X['q_id'])

# Drop the verbose per-fit log once the search has finished.
clear_output()

In [None]:
# Tabulate the cross-validation results, one column per parameter combination
# after transposing, ordered from best to worst test-score rank.
cv_results = gridSearchResult.cv_results_
results_df = pd.DataFrame(cv_results)
results_df.sort_values(by='rank_test_score').transpose()

In [None]:
# Take the best estimator found by the grid search and fit it on all of the
# training data (`fit` returns the estimator itself, so it can be chained).
Regressor = gridSearchResult.best_estimator_.fit(train_X, train_y)
Regressor

## Applying the model to produce a ranking

Apply the trained model to queries (b) and sort the documents of each query by their predicted relevance score.

In [None]:
# `Regressor` has already been fitted on the complete training set in the
# "Training a model" section, so it is not trained a further time here.
# The per-query ranking is produced in the following cell.
Regressor

In [None]:
# Maximum number of documents kept per query in the final ranking.
MAX_DOCS_PER_QUERY = 100
RANKING_COLUMNS = ['q_id', 'doc_id', 'relevance']

# Query ids in order of first appearance in the test features.
queries = list(test_X['q_id'].unique())

# Collect one small frame per query and concatenate once at the end; growing a
# DataFrame with `pd.concat` inside the loop copies it on every iteration and
# concatenating onto an empty frame degrades the column dtypes to `object`.
per_query_rankings = []

for q_id in queries:
    test_X_qid = test_X[test_X['q_id'] == q_id]

    # Score every candidate document of this query, best first.
    scored = test_X_qid.assign(relevance=Regressor.predict(test_X_qid))
    top_docs = scored.sort_values(by='relevance', ascending=False).head(MAX_DOCS_PER_QUERY)

    per_query_rankings.append(top_docs[RANKING_COLUMNS])

if per_query_rankings:
    rankings = pd.concat(per_query_rankings, ignore_index=True)
else:
    # No test queries at all: keep the expected (empty) table layout.
    rankings = pd.DataFrame(columns=RANKING_COLUMNS)

# Map the integer codes back to the original document ids.
rankings['doc_id'] = Encoder.inverse_transform(rankings['doc_id'].astype(int))

rankings = rankings.rename(columns={"q_id": "QueryId", "doc_id": "DocumentId"})

In [None]:
# Persist the full ranking, relevance scores included, without the row index.
rankings.to_csv(path_or_buf='own_ranking.csv', index=False)

In [None]:
# The submission file holds only the query/document pairs, in ranked order.
submission_columns = ['QueryId', 'DocumentId']
rankings[submission_columns].to_csv('submission.csv', index=False)

In [None]:
# Show the final ranking table (QueryId, DocumentId, relevance) as the cell output.
rankings