# Assignment 2B: Ranking

This notebook contains the skeleton for training a model and then applying it to produce a document ranking.

## Loading the precomputed features

The code below loads the precomputed features and combines them into feature vectors for query-document pairs.

For this part to work, you'll need to run the `1_Feature_computation` notebook first to generate the sample features JSON files.

In [1]:
import json
import pandas as pd
import pickle

from IPython.display import clear_output

from sklearn.model_selection import GridSearchCV

In [2]:
def load_train_features_and_labels(filepath):
    
    query_doc_relations = pickle.load(open("data/qrels.p", "rb" )) # Computed in 'Preprocessing.ipynb' file
    
    features_dict = pickle.load(open(filepath, "rb" ))
    
    X = pd.DataFrame(columns=[
        'q_id', 
        'doc_id', 
        'bm25_anchors', 
        'bm25_content', 
        'bm25_title', 
        'lm_anchors', 
        'lm_content', 
        'lm_title', 
        'q_len', 
        'q_token_len', 
        'doc_pagerank', 
        'doc_main_indx_length', 
        'doc_anchors_indx_length'
    ])
    
    y = []
    i = 0
    for q_id, docs in features_dict.items():
        for doc_id, features in docs.items():
            clear_output()
            print("For {} - {}".format(q_id, doc_id))
            X = X.append(pd.DataFrame(
                [[q_id, doc_id, features['bm25_anchors'], features['bm25_content'], features['bm25_title'], features['lm_anchors'], features['lm_content'], features['lm_title'], features['q_len'],features['q_token_len'], features['doc_pagerank'], features['doc_main_indx_length'], features['doc_anchors_indx_length']]], 
                columns=[
                    'q_id', 
                    'doc_id', 
                    'bm25_anchors', 
                    'bm25_content', 
                    'bm25_title', 
                    'lm_anchors', 
                    'lm_content', 
                    'lm_title', 
                    'q_len', 
                    'q_token_len', 
                    'doc_pagerank', 
                    'doc_main_indx_length', 
                    'doc_anchors_indx_length'
                ]))
            
            if doc_id in query_doc_relations[q_id]:
                y.append(query_doc_relations[q_id][doc_id])
            else:
                y.append(0)
            
            print(X.iloc[i])
            print(y[i])
            i += 1
    
    X = X.reset_index(drop=True)
    clear_output()
    return (X, y)


def load_test_features(filepath):
    
    query_doc_relations = pickle.load(open("data/qrels.p", "rb" )) # Computed in 'Preprocessing.ipynb' file
    
    features_dict = pickle.load(open(filepath, "rb" ))
    
    X = pd.DataFrame(columns=[
        'q_id', 
        'doc_id', 
        'bm25_anchors', 
        'bm25_content', 
        'bm25_title', 
        'lm_anchors', 
        'lm_content', 
        'lm_title', 
        'q_len', 
        'q_token_len', 
        'doc_pagerank', 
        'doc_main_indx_length', 
        'doc_anchors_indx_length'
    ])
    
    i = 0
    for q_id, docs in features_dict.items():
        for doc_id, features in docs.items():
            clear_output()
            print("For {} - {}".format(q_id, doc_id))
            X = X.append(pd.DataFrame(
                [[q_id, doc_id, features['bm25_anchors'], features['bm25_content'], features['bm25_title'], features['lm_anchors'], features['lm_content'], features['lm_title'], features['q_len'],features['q_token_len'], features['doc_pagerank'], features['doc_main_indx_length'], features['doc_anchors_indx_length']]], 
                columns=[
                    'q_id', 
                    'doc_id', 
                    'bm25_anchors', 
                    'bm25_content', 
                    'bm25_title', 
                    'lm_anchors', 
                    'lm_content', 
                    'lm_title', 
                    'q_len', 
                    'q_token_len', 
                    'doc_pagerank', 
                    'doc_main_indx_length', 
                    'doc_anchors_indx_length'
                ]))
            
            print(X.iloc[i])
            i += 1
    X = X.reset_index(drop=True)
    clear_output()
    return X

In [3]:
train_X, train_y = load_train_features_and_labels(filepath = "data/train_features.p")
test_X = load_test_features(filepath = "data/test_features.p")

In [4]:
from sklearn import preprocessing
Encoder = preprocessing.LabelEncoder()

Encoder.fit(pd.concat([train_X['doc_id'], test_X['doc_id']], axis=0))

LabelEncoder()

In [5]:
train_X['doc_id'] = Encoder.transform(train_X['doc_id'])
test_X['doc_id'] = Encoder.transform(test_X['doc_id'])

In [12]:
# Taken from 'https://www.kaggle.com/davidgasquez/ndcg-scorer'

"""Metrics to compute the model performance."""

import numpy as np
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import make_scorer


def dcg_score(y_true, y_score, k=5):
    """Discounted cumulative gain (DCG) at rank K.

    Parameters
    ----------
    y_true : array, shape = [n_samples]
        Ground truth (true relevance labels).
    y_score : array, shape = [n_samples, n_classes]
        Predicted scores.
    k : int
        Rank.

    Returns
    -------
    score : float
    """
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order[:k])

    gain = 2 ** y_true - 1

    discounts = np.log2(np.arange(len(y_true)) + 2)
    return np.sum(gain / discounts)


def ndcg_score(ground_truth, predictions, k=5):
    """Normalized discounted cumulative gain (NDCG) at rank K.

    Normalized Discounted Cumulative Gain (NDCG) measures the performance of a
    recommendation system based on the graded relevance of the recommended
    entities. It varies from 0.0 to 1.0, with 1.0 representing the ideal
    ranking of the entities.

    Parameters
    ----------
    ground_truth : array, shape = [n_samples]
        Ground truth (true labels represended as integers).
    predictions : array, shape = [n_samples, n_classes]
        Predicted probabilities.
    k : int
        Rank.

    Returns
    -------
    score : float

    Example
    -------
    >>> ground_truth = [1, 0, 2]
    >>> predictions = [[0.15, 0.55, 0.2], [0.7, 0.2, 0.1], [0.06, 0.04, 0.9]]
    >>> score = ndcg_score(ground_truth, predictions, k=2)
    1.0
    >>> predictions = [[0.9, 0.5, 0.8], [0.7, 0.2, 0.1], [0.06, 0.04, 0.9]]
    >>> score = ndcg_score(ground_truth, predictions, k=2)
    0.6666666666
    """
    lb = LabelBinarizer()
    lb.fit(range(len(predictions) + 1))
    T = lb.transform(ground_truth)

    scores = []

    # Iterate over each y_true and compute the DCG score
    for y_true, y_score in zip(T, predictions):
        actual = dcg_score(y_true, y_score, k)
        best = dcg_score(y_true, y_true, k)
        try:
            score = float(actual) / float(best)
        except:
            score = 0
        scores.append(score)

    return np.mean(scores)


# NDCG Scorer function
ndcg_scorer = make_scorer(ndcg_score, needs_proba=False, k=5)

In [14]:
from sklearn.tree import DecisionTreeRegressor
parameters = {
    'max_depth': [depth for depth in range(2, 20, 2)],
    'min_samples_split': [min_samples for min_samples in range(200, 2001, 200)]
}

gridSearchResult = GridSearchCV(DecisionTreeRegressor(random_state = 0),
                                parameters, 
                                cv=5, 
                                scoring=ndcg_scorer, 
                                verbose=5
                               ).fit(train_X, train_y)
clear_output()

In [15]:
results_df = pd.DataFrame.from_dict(gridSearchResult.cv_results_)
results_df.sort_values(by=['rank_test_score']).T

Unnamed: 0,0,64,63,62,61,60,59,58,57,65,...,31,30,29,28,27,26,25,24,22,89
mean_fit_time,0.0155995,0.0318011,0.0348007,0.0374002,0.0410009,0.0447998,0.0242002,0.0242011,0.0240006,0.0290006,...,0.0327996,0.0362006,0.0224003,0.0225999,0.0226012,0.0240001,0.0256006,0.0264015,0.0280005,0.025001
std_fit_time,0.000489437,0.00614395,0.00633669,0.00624811,0.00489896,0.00470735,0.00762774,0.00711035,0.00723925,0.00596652,...,0.00172055,0.000748125,0.00387793,0.00392901,0.00417552,0.00352127,0.00241658,0.00233104,0.00209722,0.00777147
mean_score_time,0.0745993,0.0751997,0.0755998,0.0756,0.0749992,0.0748005,0.0756003,0.0741989,0.0745997,0.0749983,...,0.0744,0.0765995,0.0743996,0.075,0.0741988,0.0739997,0.0735994,0.0751984,0.0745993,0.0763992
std_score_time,0.0010191,0.000747935,0.00149682,0.000490349,0.00063324,0.000399972,0.00101874,0.000400712,0.000799036,0.000632486,...,0.000800026,0.00162529,0.00101978,0.00126422,0.000747411,5.30983e-07,0.000490741,0.0020396,0.00119974,0.00102064
param_max_depth,2,14,14,14,14,14,12,12,12,14,...,8,8,6,6,6,6,6,6,6,18
param_min_samples_split,200,1000,800,600,400,200,2000,1800,1600,1200,...,400,200,2000,1800,1600,1400,1200,1000,600,2000
params,"{'max_depth': 2, 'min_samples_split': 200}","{'max_depth': 14, 'min_samples_split': 1000}","{'max_depth': 14, 'min_samples_split': 800}","{'max_depth': 14, 'min_samples_split': 600}","{'max_depth': 14, 'min_samples_split': 400}","{'max_depth': 14, 'min_samples_split': 200}","{'max_depth': 12, 'min_samples_split': 2000}","{'max_depth': 12, 'min_samples_split': 1800}","{'max_depth': 12, 'min_samples_split': 1600}","{'max_depth': 14, 'min_samples_split': 1200}",...,"{'max_depth': 8, 'min_samples_split': 400}","{'max_depth': 8, 'min_samples_split': 200}","{'max_depth': 6, 'min_samples_split': 2000}","{'max_depth': 6, 'min_samples_split': 1800}","{'max_depth': 6, 'min_samples_split': 1600}","{'max_depth': 6, 'min_samples_split': 1400}","{'max_depth': 6, 'min_samples_split': 1200}","{'max_depth': 6, 'min_samples_split': 1000}","{'max_depth': 6, 'min_samples_split': 600}","{'max_depth': 18, 'min_samples_split': 2000}"
split0_test_score,0.921831,0.921831,0.921831,0.921831,0.921831,0.921831,0.921831,0.921831,0.921831,0.921831,...,0.921831,0.921831,0.921831,0.921831,0.921831,0.921831,0.921831,0.921831,0.921831,0.921831
split1_test_score,0.904158,0.904158,0.904158,0.904158,0.904158,0.904158,0.904158,0.904158,0.904158,0.904158,...,0.904158,0.904158,0.904158,0.904158,0.904158,0.904158,0.904158,0.904158,0.904158,0.904158
split2_test_score,0.948555,0.948555,0.948555,0.948555,0.948555,0.948555,0.948555,0.948555,0.948555,0.948555,...,0.948555,0.948555,0.948555,0.948555,0.948555,0.948555,0.948555,0.948555,0.948555,0.948555


In [16]:
BestDecisionTreeRegressor = gridSearchResult.best_estimator_
BestDecisionTreeRegressor.fit(train_X, train_y)

DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=200, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=0, splitter='best')

In [17]:
from sklearn.ensemble import RandomForestRegressor

parameters = {
    'n_estimators': [est for est in range(2, 11, 2)],
    'max_depth': [depth for depth in range(2, 20, 2)],
    'min_samples_split': [min_samples for min_samples in range(200, 2001, 200)]
}
gridSearchResult = GridSearchCV(RandomForestRegressor(random_state = 0),
                                parameters, 
                                cv=5, 
                                scoring=ndcg_scorer, 
                                verbose=5
                               ).fit(train_X, train_y)
clear_output()

In [18]:
results_df = pd.DataFrame.from_dict(gridSearchResult.cv_results_)
results_df.sort_values(by=['rank_test_score']).T

Unnamed: 0,0,306,305,304,303,302,301,300,299,298,...,149,148,147,146,145,144,143,142,112,449
mean_fit_time,0.0194004,0.0868003,0.0463999,0.234401,0.190199,0.144,0.100801,0.0536,0.0686006,0.0566004,...,0.0675999,0.0558004,0.0451999,0.0382006,0.0208022,0.0816002,0.0667995,0.0548001,0.0830006,0.0720007
std_fit_time,0.00149605,0.00962096,0.00431744,0.0147323,0.0132581,0.00952901,0.00810889,0.00300756,0.0109653,0.00803947,...,0.00958327,0.0067051,0.00594657,0.00470844,0.00391871,0.0108917,0.00890867,0.00770439,0.00303339,0.0117304
mean_score_time,0.0765997,0.0743997,0.0748002,0.0751994,0.0758005,0.0756,0.0761992,0.0752,0.0745995,0.075,...,0.074,0.0745995,0.0740001,0.0756004,0.0759978,0.0751996,0.0746005,0.0746004,0.0745996,0.0777999
std_score_time,0.000489517,0.000490506,0.000748494,0.000400425,0.000980028,0.000800061,0.00231525,0.00074931,0.00048998,0.000632259,...,5.76165e-07,0.000800109,1.78416e-07,0.000489395,0.000631958,0.000747832,0.00049031,0.000490624,0.000489804,0.00074894
param_max_depth,2,14,14,14,14,14,14,14,12,12,...,6,6,6,6,6,6,6,6,6,18
param_min_samples_split,200,400,400,200,200,200,200,200,2000,2000,...,2000,2000,2000,2000,2000,1800,1800,1800,600,2000
param_n_estimators,2,4,2,10,8,6,4,2,10,8,...,10,8,6,4,2,10,8,6,6,10
params,"{'max_depth': 2, 'min_samples_split': 200, 'n_...","{'max_depth': 14, 'min_samples_split': 400, 'n...","{'max_depth': 14, 'min_samples_split': 400, 'n...","{'max_depth': 14, 'min_samples_split': 200, 'n...","{'max_depth': 14, 'min_samples_split': 200, 'n...","{'max_depth': 14, 'min_samples_split': 200, 'n...","{'max_depth': 14, 'min_samples_split': 200, 'n...","{'max_depth': 14, 'min_samples_split': 200, 'n...","{'max_depth': 12, 'min_samples_split': 2000, '...","{'max_depth': 12, 'min_samples_split': 2000, '...",...,"{'max_depth': 6, 'min_samples_split': 2000, 'n...","{'max_depth': 6, 'min_samples_split': 2000, 'n...","{'max_depth': 6, 'min_samples_split': 2000, 'n...","{'max_depth': 6, 'min_samples_split': 2000, 'n...","{'max_depth': 6, 'min_samples_split': 2000, 'n...","{'max_depth': 6, 'min_samples_split': 1800, 'n...","{'max_depth': 6, 'min_samples_split': 1800, 'n...","{'max_depth': 6, 'min_samples_split': 1800, 'n...","{'max_depth': 6, 'min_samples_split': 600, 'n_...","{'max_depth': 18, 'min_samples_split': 2000, '..."
split0_test_score,0.921831,0.921831,0.921831,0.921831,0.921831,0.921831,0.921831,0.921831,0.921831,0.921831,...,0.921831,0.921831,0.921831,0.921831,0.921831,0.921831,0.921831,0.921831,0.921831,0.921831
split1_test_score,0.904158,0.904158,0.904158,0.904158,0.904158,0.904158,0.904158,0.904158,0.904158,0.904158,...,0.904158,0.904158,0.904158,0.904158,0.904158,0.904158,0.904158,0.904158,0.904158,0.904158


In [19]:
BestRandomForestRegressor = gridSearchResult.best_estimator_
BestRandomForestRegressor

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=2,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=200,
                      min_weight_fraction_leaf=0.0, n_estimators=2, n_jobs=None,
                      oob_score=False, random_state=0, verbose=0,
                      warm_start=False)

## Applying the model to produce a ranking

Apply the train model on queries (b) and sort documents according to the predicted relevance score.

In [20]:
# TODO
BestDecisionTreeRegressor.fit(train_X, train_y)

DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=200, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=0, splitter='best')

In [21]:
rankings = pd.DataFrame(columns=[
        'q_id',
        'doc_id',
        'relevance'
])

queries = list(test_X['q_id'].unique())

for q_id in queries:
    test_X_qid = test_X[test_X['q_id'] == q_id]
    
    predictions = BestDecisionTreeRegressor.predict(test_X_qid)
    
    test_X_qid = test_X_qid.assign(relevance=predictions)
    
    test_X_qid = test_X_qid.sort_values(by=['relevance'], ascending = False).head(100).reset_index()
    
#     print(test_X_qid[['q_id', 'doc_id', 'relevance']])
    rankings = pd.concat([rankings, test_X_qid[['q_id', 'doc_id', 'relevance']]], ignore_index=True)
    
rankings['doc_id'] = rankings['doc_id'].astype(int)
rankings['doc_id'] = Encoder.inverse_transform(rankings['doc_id'])

rankings.rename(columns={"q_id": "QueryId", "doc_id": "DocumentId"}, inplace = True)

In [23]:
rankings.to_csv('own_ranking.csv', index = False)

In [24]:
rankings[['QueryId', 'DocumentId']].to_csv('submission.csv', index = False)

In [25]:
rankings

Unnamed: 0,QueryId,DocumentId,relevance
0,251,clueweb12-0000wb-17-37013,0.114768
1,251,clueweb12-0401wb-99-08601,0.114768
2,251,clueweb12-0610wb-29-30160,0.114768
3,251,clueweb12-1013wb-15-30773,0.114768
4,251,clueweb12-0000wb-74-21553,0.114768
...,...,...,...
4969,300,clueweb12-1807wb-66-15979,0.028472
4970,300,clueweb12-1201wb-96-10655,0.028472
4971,300,clueweb12-1806wb-44-35910,0.028472
4972,300,clueweb12-0400wb-15-10212,0.028472
