# Assignment 2B: Evaluation

This notebook contains the skeleton for evaluating a document ranking against the ground truth.

## Loading ranking file

The file format is [specified in the assignment](https://github.com/kbalog/uis-dat640-fall2019/tree/master/assignments/assignment-2b#output-file-format).

In [1]:
import math
import pandas as pd
import pickle

from IPython.display import clear_output
from sklearn import preprocessing

from sklearn.metrics import mean_squared_error

In [2]:
RANKING_FILE = "data/ranking_bm25.csv"

ranking_csv = pd.read_csv(RANKING_FILE)

# Distinct query ids, in order of first appearance in the file.
qids = ranking_csv['QueryId'].unique()

# Map every query id (as a string) to its DocumentId values in file order.
rankings = {
    str(query_id): ranking_csv.loc[ranking_csv['QueryId'] == query_id, 'DocumentId'].to_list()
    for query_id in qids
}

## Loading relevance judgments

In [3]:
# Relevance judgments (qrels), built in Preprocessing.ipynb.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
# The context manager closes the file handle as soon as the data has been read.
with open("data/qrels.p", "rb") as qrels_file:
    gtruth = pickle.load(qrels_file)

## Computing NDCG scores

In [4]:
def dcg(rel, p):
    """Discounted cumulative gain of a ranked list of gains at cutoff ``p``.

    The gain at rank 1 is taken undiscounted; the gain at rank r >= 2 is
    divided by log2(r).

    Args:
        rel: Sequence of gains (relevance levels), ordered by rank.
        p: Rank cutoff; only the first ``p`` gains are considered.

    Returns:
        The DCG value, or 0.0 when ``rel`` is empty or ``p`` is not positive.
    """
    cutoff = min(p, len(rel))
    if cutoff <= 0:
        # Nothing to accumulate: an empty ranking or a non-positive cutoff.
        return 0.0

    score = rel[0]
    for i in range(1, cutoff):
        # i is a 0-based index, so the 1-based rank of this item is i + 1.
        score += rel[i] / math.log2(i + 1)
    return score


def _ndcg_report_args(values, digits=None):
    """Build the positional arguments for ``print`` that render one NDCG report.

    Args:
        values: Dict mapping cutoff -> NDCG value, in display order.
        digits: When given, every value is rounded to this many decimals.

    Returns:
        A flat list ``[label, value, label, value, ...]`` where the first label
        starts with a tab and the following ones with a newline and a tab.
    """
    args = []
    for position, (cutoff, value) in enumerate(values.items()):
        lead = "\t" if position == 0 else "\n\t"
        args.append("{}NDCG@{}:".format(lead, cutoff))
        args.append(value if digits is None else round(value, digits))
    return args


def get_ndcg_scores(gtruth, rankings, cutoffs=(5, 10, 20, 100)):
    """Print the NDCG averaged over all queries in ``rankings``.

    Per-query details are printed while iterating and then erased with
    ``clear_output()``, so only the averages remain visible in the cell.

    Args:
        gtruth: Dict mapping query id -> dict of doc id -> gain.
        rankings: Dict mapping query id -> ranked list of doc ids.
        cutoffs: Rank cutoffs to report NDCG for.

    Returns:
        None. The scores are only printed.

    Raises:
        KeyError: If a query id in ``rankings`` has no entry in ``gtruth``.
    """
    ndcg_sums = {k: 0 for k in cutoffs}

    for qid, ranking in sorted(rankings.items()):
        gt = gtruth[qid]
        print("Query", qid)

        # Gains of the ranked docs; unjudged docs count as gain 0.
        gains = [gt.get(doc_id, 0) for doc_id in ranking]
        print("\tGains:", gains)

        # Relevance levels of the idealized ranking.
        gain_ideal = sorted(gt.values(), reverse=True)
        print("\tIdeal gains:", gain_ideal)

        ndcg_at = {}
        for k in cutoffs:
            # Empty lists are short-circuited here so no DCG is computed for them.
            ideal_dcg = dcg(gain_ideal, k) if gain_ideal else 0
            actual_dcg = dcg(gains, k) if gains else 0
            # A query without any positive ideal gain scores 0 instead of dividing by zero.
            ndcg_at[k] = actual_dcg / ideal_dcg if ideal_dcg != 0 else 0.0
            ndcg_sums[k] += ndcg_at[k]

        print(*_ndcg_report_args(ndcg_at, digits=3))
        print("----------------------------------------------------------------------------------------")

    clear_output()
    # Avoid dividing by zero when there are no queries at all.
    num_queries = max(len(rankings), 1)
    averages = {k: total / num_queries for k, total in ndcg_sums.items()}
    print("Average")
    print(*_ndcg_report_args(averages))

In [5]:
get_ndcg_scores(gtruth, rankings)

Average
	NDCG@5: 0.1779689659560673 
	NDCG@10: 0.16570814178359747 
	NDCG@20: 0.15341632649418818 
	NDCG@100: 0.11104042798957728


In [6]:
def load_features_and_labels(filepath, qrels_path="data/qrels.p"):
    """Load pickled query-document features and their relevance labels.

    The features pickle is expected to be a dict ``q_id -> doc_id -> features``
    where ``features`` is a dict holding the eleven feature names listed below.
    Labels come from the qrels pickle (``q_id -> doc_id -> label``); documents
    missing from a query's judgments get label 0.

    All rows are collected first and the DataFrame is built once, instead of
    appending row by row (quadratic, and ``DataFrame.append`` no longer exists
    in pandas 2.x). No progress output is printed.

    Args:
        filepath: Path to the pickled features dict.
        qrels_path: Path to the pickled relevance judgments
            (computed in 'Preprocessing.ipynb').

    Returns:
        Tuple ``(X, y)``: ``X`` is a DataFrame with one row per (query, doc)
        pair and a fresh 0..n-1 index; ``y`` is the list of labels in row order.

    Raises:
        KeyError: If a query in the features file is absent from the qrels,
            or a feature dict lacks one of the expected feature names.
    """
    columns = [
        'q_id',
        'doc_id',
        'bm25_anchors',
        'bm25_content',
        'bm25_title',
        'lm_anchors',
        'lm_content',
        'lm_title',
        'q_len',
        'q_token_len',
        'doc_pagerank',
        'doc_main_indx_length',
        'doc_anchors_indx_length',
    ]
    feature_columns = columns[2:]

    # NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
    with open(qrels_path, "rb") as qrels_file:
        query_doc_relations = pickle.load(qrels_file)
    with open(filepath, "rb") as features_file:
        features_dict = pickle.load(features_file)

    rows = []
    y = []
    for q_id, docs in features_dict.items():
        for doc_id, features in docs.items():
            rows.append([q_id, doc_id] + [features[name] for name in feature_columns])
            # Unjudged documents are treated as non-relevant.
            y.append(query_doc_relations[q_id].get(doc_id, 0))

    X = pd.DataFrame(rows, columns=columns)
    return (X, y)

In [7]:
def load_test_features(filepath):
    """Load pickled query-document features for unlabeled (test) data.

    The pickle is expected to be a dict ``q_id -> doc_id -> features`` where
    ``features`` is a dict holding the eleven feature names listed below.

    All rows are collected first and the DataFrame is built once, instead of
    appending row by row (quadratic, and ``DataFrame.append`` no longer exists
    in pandas 2.x). No progress output is printed, and the relevance judgments
    are not loaded because no labels are produced here.

    Args:
        filepath: Path to the pickled features dict.

    Returns:
        DataFrame with one row per (query, doc) pair and a fresh 0..n-1 index.

    Raises:
        KeyError: If a feature dict lacks one of the expected feature names.
    """
    columns = [
        'q_id',
        'doc_id',
        'bm25_anchors',
        'bm25_content',
        'bm25_title',
        'lm_anchors',
        'lm_content',
        'lm_title',
        'q_len',
        'q_token_len',
        'doc_pagerank',
        'doc_main_indx_length',
        'doc_anchors_indx_length',
    ]
    feature_columns = columns[2:]

    # NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
    with open(filepath, "rb") as features_file:
        features_dict = pickle.load(features_file)

    rows = [
        [q_id, doc_id] + [features[name] for name in feature_columns]
        for q_id, docs in features_dict.items()
        for doc_id, features in docs.items()
    ]

    X = pd.DataFrame(rows, columns=columns)
    return X

In [8]:
# Labelled training features plus the unlabelled test features.
X, y = load_features_and_labels("data/train_features.p")
test_X = load_test_features("data/test_features.p")

In [9]:
# Fit a single encoder on the doc ids of both frames so they share one id -> integer mapping.
Encoder = preprocessing.LabelEncoder().fit(
    pd.concat([X['doc_id'], test_X['doc_id']], axis=0)
)

# Replace the raw doc ids with their integer codes in both frames.
X['doc_id'] = Encoder.transform(X['doc_id'])
test_X['doc_id'] = Encoder.transform(test_X['doc_id'])

### Train-Test Split

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [11]:
def get_rankings(test_X, Regressor, ranking_file_name, encoder=None, top_k=100):
    """Score every (query, doc) row, keep the best docs per query and save the ranking.

    For each distinct ``q_id`` the rows are scored with ``Regressor.predict``
    (on all columns of ``test_X``), sorted by descending score and cut to the
    ``top_k`` best. The combined ranking is written to ``ranking_file_name``
    as CSV with the columns ``QueryId`` and ``DocumentId``.

    Args:
        test_X: DataFrame with at least 'q_id' and 'doc_id' columns, where
            'doc_id' holds label-encoded ids; it must contain exactly the
            columns the regressor was fitted on.
        Regressor: Fitted model exposing ``predict``.
        ranking_file_name: Path of the CSV file to write.
        encoder: Object whose ``inverse_transform`` maps encoded doc ids back
            to the original ids. Defaults to the notebook-level ``Encoder``.
        top_k: Maximum number of documents kept per query.

    Returns:
        Dict mapping each query id to its list of original doc ids, best first.
    """
    if encoder is None:
        # Fall back to the LabelEncoder fitted at notebook level.
        encoder = Encoder

    rankings_dict = {}
    per_query_frames = []

    for q_id in test_X['q_id'].unique():
        query_rows = test_X[test_X['q_id'] == q_id]
        scored = query_rows.assign(relevance=Regressor.predict(query_rows))
        top_docs = scored.sort_values(by=['relevance'], ascending=False).head(top_k)

        rankings_dict[q_id] = list(encoder.inverse_transform(top_docs['doc_id']))
        per_query_frames.append(top_docs[['q_id', 'doc_id', 'relevance']])

    # Concatenate once at the end instead of growing a frame inside the loop.
    if per_query_frames:
        rankings = pd.concat(per_query_frames, ignore_index=True)
    else:
        rankings = pd.DataFrame(columns=['q_id', 'doc_id', 'relevance'])

    rankings['doc_id'] = encoder.inverse_transform(rankings['doc_id'].astype(int))

    output = rankings.rename(columns={"q_id": "QueryId", "doc_id": "DocumentId"})
    output[['QueryId', 'DocumentId']].to_csv(ranking_file_name, index=False)

    clear_output()
    return rankings_dict

## Finding the Scores 

In [12]:
from sklearn.tree import DecisionTreeRegressor

### Query-Document Features

In [13]:
# Ids plus the six query-document retrieval scores.
qd_columns = [
    'q_id', 'doc_id',
    'bm25_anchors', 'bm25_content', 'bm25_title',
    'lm_anchors', 'lm_content', 'lm_title',
]

Regressor = DecisionTreeRegressor(
    max_depth=2,
    min_samples_split=200,
).fit(X_train[qd_columns], y_train)

# Rank the validation rows, save the ranking and report its NDCG.
rankings = get_rankings(X_val[qd_columns], Regressor, ranking_file_name='ranking_QD.csv')

get_ndcg_scores(gtruth, rankings)

Average
	NDCG@5: 0.044138193945731284 
	NDCG@10: 0.046388364114609885 
	NDCG@20: 0.043307932502692895 
	NDCG@100: 0.025958321382731006


### Query-Document + Query Features

In [14]:
# Ids, the six query-document retrieval scores and the two query-only features.
qd_q_columns = [
    'q_id', 'doc_id',
    'bm25_anchors', 'bm25_content', 'bm25_title',
    'lm_anchors', 'lm_content', 'lm_title',
    'q_len', 'q_token_len',
]

Regressor = DecisionTreeRegressor(
    max_depth=2,
    min_samples_split=200,
).fit(X_train[qd_q_columns], y_train)

# Rank the validation rows, save the ranking and report its NDCG.
rankings = get_rankings(X_val[qd_q_columns], Regressor, ranking_file_name='ranking_QD_Q.csv')

get_ndcg_scores(gtruth, rankings)

Average
	NDCG@5: 0.044138193945731284 
	NDCG@10: 0.046388364114609885 
	NDCG@20: 0.043307932502692895 
	NDCG@100: 0.025958321382731006


### Query-Document + Document Features

In [15]:
# Ids, the six query-document retrieval scores and the three document-only features.
qd_d_columns = [
    'q_id', 'doc_id',
    'bm25_anchors', 'bm25_content', 'bm25_title',
    'lm_anchors', 'lm_content', 'lm_title',
    'doc_pagerank', 'doc_main_indx_length', 'doc_anchors_indx_length',
]

Regressor = DecisionTreeRegressor(
    max_depth=2,
    min_samples_split=200,
).fit(X_train[qd_d_columns], y_train)

# Rank the validation rows, save the ranking and report its NDCG.
rankings = get_rankings(X_val[qd_d_columns], Regressor, ranking_file_name='ranking_QD_D.csv')

get_ndcg_scores(gtruth, rankings)

Average
	NDCG@5: 0.044138193945731284 
	NDCG@10: 0.046388364114609885 
	NDCG@20: 0.043307932502692895 
	NDCG@100: 0.025958321382731006


### Query-Document Features + Query + Document Features

In [16]:
# Fit on every column of the training split.
Regressor = DecisionTreeRegressor(
    max_depth=2,
    min_samples_split=200,
).fit(X_train, y_train)

# Rank the validation rows, save the ranking and report its NDCG.
rankings = get_rankings(X_val, Regressor, ranking_file_name='ranking_all.csv')

get_ndcg_scores(gtruth, rankings)

Average
	NDCG@5: 0.044138193945731284 
	NDCG@10: 0.046388364114609885 
	NDCG@20: 0.043307932502692895 
	NDCG@100: 0.025958321382731006


## Finding the Best Features

In [17]:
from sklearn.feature_selection import SelectKBest, chi2

feature_names = ['bm25_anchors', 'bm25_content', 'bm25_title','lm_anchors','lm_content','lm_title','q_len', 'q_token_len','doc_pagerank','doc_main_indx_length','doc_anchors_indx_length']

# Keep the 5 features with the highest chi-squared score against the labels.
# NOTE(review): the selector is fitted on all of X / y, which includes the rows later
# used for validation - confirm this is intended.
k_best = SelectKBest(chi2, k=5).fit(X[feature_names], y)

# Boolean mask aligned with feature_names; True marks a selected feature.
mask = k_best.get_support()

# The loop variable must not be called `bool`: at notebook level that would
# overwrite the builtin for every later cell.
best_features = [feature for selected, feature in zip(mask, feature_names) if selected]

best_features

['bm25_anchors',
 'bm25_content',
 'bm25_title',
 'doc_main_indx_length',
 'doc_anchors_indx_length']

In [18]:
# Baseline for the feature-selection comparison: fit on every column of the training split.
Regressor = DecisionTreeRegressor(
    max_depth=2,
    min_samples_split=200,
).fit(X_train, y_train)

# Rank the validation rows, save the ranking and report its NDCG.
rankings = get_rankings(X_val, Regressor, ranking_file_name='ranking_all_test.csv')

get_ndcg_scores(gtruth, rankings)

Average
	NDCG@5: 0.044138193945731284 
	NDCG@10: 0.046388364114609885 
	NDCG@20: 0.043307932502692895 
	NDCG@100: 0.025958321382731006


In [19]:
# Ids plus the features picked by SelectKBest.
best_columns = ['q_id', 'doc_id'] + best_features

Regressor = DecisionTreeRegressor(
    max_depth=2,
    min_samples_split=200,
).fit(X_train[best_columns], y_train)

# Rank the validation rows, save the ranking and report its NDCG.
rankings = get_rankings(X_val[best_columns], Regressor, ranking_file_name='ranking_best_featuers_test.csv')

get_ndcg_scores(gtruth, rankings)

Average
	NDCG@5: 0.033444050971747394 
	NDCG@10: 0.038253602483916245 
	NDCG@20: 0.04039123782635726 
	NDCG@100: 0.02421182661920442
