In [None]:
# Start from a clean namespace so the notebook behaves the same on every full run
# (IPython `%reset`: -s = soft reset that keeps history, -f = no confirmation prompt).
%reset -sf

This notebook covers
- Dataset Preparation (train-test split)
- The 7 methods mentioned in our report, and the variants we experimented with
- Algorithm for evaluation metrics
- Evaluation procedure with the test set
- Preparation of the dataset for hand evaluation
- Evaluating NDCG with the hand-labelled dataset
- Query and update indexes with unseen questions

The following steps are done in another notebook
- Spellcheck and SpaCy tokenisation for the training set 
- SentenceTransformer computation of vectors for the training set 
- Downloading of the SpaCy and GenSim models
- Generating the training data for the supervised model



## How to run the GUI

To run the GUI to query and index unseen questions, please
- Run the entire notebook (takes around 5 minutes to install and index)
- Go to the last section on GUI to see the sample to query and index

(If you want to see evaluation results on the full test set instead, see Version 38)

In [None]:
# notebook hyperparameters
TEST_SET_SIZE = 1000     # number of held-out query questions (asserted after the split)
RANKED_LIST_SIZE = 100   # number of results every retrieval method must return per query
RANDOM_STATE = 42        # seed for `random`, numpy and the pandas shuffle of duplicate pairs
EVALUATING = False  # setting to False will only evaluate 10 queries (and overwrites TEST_SET_SIZE with 10)

In [None]:
import os, collections, random, itertools, functools, time, json

from collections import defaultdict, Counter
from math import log
from copy import deepcopy

import tqdm.notebook as tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Seed both global generators: `random` drives the random baseline and tie shuffling,
# numpy's global state is seeded alongside it for anything that draws from it.
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

In [None]:
# load data
df = pd.read_csv("/kaggle/input/quora-question-pairs/train.csv.zip")
# Cast to str so every question is a string; missing questions become the literal text "nan".
df["question1"] = df["question1"].astype(str)  # resolve nan
df["question2"] = df["question2"].astype(str)
# Shift both id columns down by one so that qids start at 0.
df["qid1"] -= 1  #  start index from zero
df["qid2"] -= 1

In [None]:
# Eyeball ten random question pairs.
df.sample(10)

# Preprocessing Dataset

In [None]:
# Lookup table from question id to question text, filled from both sides of every pair.
qid_to_question = {}
pair_columns = zip(df["qid1"], df["qid2"], df["question1"], df["question2"])
for left_qid, right_qid, left_text, right_text in pair_columns:
    qid_to_question.update({left_qid: left_text, right_qid: right_text})

In [None]:
# extract 1000 questions for testing
test_query_qids = set()

# Shuffle the duplicate pairs reproducibly; only rows labelled is_duplicate == 1 are considered.
df_duplicate = df[df["is_duplicate"] == 1].sample(frac=1, random_state=RANDOM_STATE)
for qid1, qid2, is_duplicate in zip(df_duplicate["qid1"], df_duplicate["qid2"], df_duplicate["is_duplicate"]):
    # Hold out qid2 as a test query while its partner qid1 is not a test query and the quota is not full.
    if is_duplicate and qid1 not in test_query_qids and len(test_query_qids) < TEST_SET_SIZE:
        test_query_qids.add(qid2)
    # If both ends of this pair are now test queries, release both back to the training side.
    if qid1 in test_query_qids and qid2 in test_query_qids:
        # to guarantee that there is a duplicate question in the training set
        test_query_qids.remove(qid1)
        test_query_qids.remove(qid2)
    # NOTE(review): the check above only covers the pair being processed; a partner that is
    # added to the test set by a later pair is not re-checked -- confirm every test query
    # really keeps a duplicate in the training set.
assert len(test_query_qids) == TEST_SET_SIZE  # if fail, change random_state

# Sorted lists give a stable order; everything that is not a test query is a training question.
test_query_qids_list = sorted(test_query_qids)
train_query_qids_list = sorted(set(qid_to_question.keys()) - test_query_qids)
assert test_query_qids_list[:3] == [331, 489, 501]   # to check random state fixed

In [None]:
# When EVALUATING is False, keep only the first 10 test queries for a quick run.
# TEST_SET_SIZE is overwritten so the shape assertions in the metric functions still hold.
# The set `test_query_qids` is not truncated, so all originally held-out qids stay out of training.
if not EVALUATING:
    test_query_qids_list = test_query_qids_list[:10]
    TEST_SET_SIZE = 10

In [None]:
# Labelled relations between training questions (symmetric adjacency sets).
# Pairs that touch a held-out test query are skipped entirely.

qid_to_duplicate_qids = defaultdict(set)
qid_to_nonduplicate_qids = defaultdict(set)

for left_qid, right_qid, label in zip(df["qid1"], df["qid2"], df["is_duplicate"]):
    if left_qid in test_query_qids or right_qid in test_query_qids:
        continue
    relation = qid_to_duplicate_qids if label else qid_to_nonduplicate_qids
    relation[left_qid].add(right_qid)
    relation[right_qid].add(left_qid)

In [None]:
# complete graph of duplicate relationships
# Treat "is duplicate of" as transitive: find each connected component of the labelled
# duplicate graph over training questions, then link every pair inside the component.

qid_to_duplicate_qids_complete = defaultdict(set)     # qid -> all other qids in its component
qid_to_qid_group_leader = {}                          # qid -> representative qid of its component
qid_group_leader_to_duplicate_qid_group = defaultdict(set)  # representative -> full component

visited_qids = set()
for train_qid in train_query_qids_list:
    if train_qid in visited_qids:
        continue
    # train_qid is the first unvisited member of its component, so it becomes the leader.
    current_qids_group = set([train_qid])
    qid_to_qid_group_leader[train_qid] = train_qid
    stack = [train_qid]
    
    # Depth-first traversal over labelled duplicate edges.
    while stack:
        cur_qid = stack.pop()
        for nex_qid in qid_to_duplicate_qids[cur_qid]:
            if nex_qid in current_qids_group:
                continue
            qid_to_qid_group_leader[nex_qid] = train_qid
            stack.append(nex_qid)
            current_qids_group.add(nex_qid)

    # complete the graph
    # (a component of size one yields no pairs, so singletons get no entry here)
    for qid1, qid2 in itertools.combinations(current_qids_group, r=2):
        qid_to_duplicate_qids_complete[qid1].add(qid2)
        qid_to_duplicate_qids_complete[qid2].add(qid1)
    qid_group_leader_to_duplicate_qid_group[train_qid] = current_qids_group
    visited_qids.update(current_qids_group)

In [None]:
# Expected answers for each test query.
# `test_qid_to_duplicate_qids` holds the directly labelled training duplicates;
# the `_complete` variant also includes the whole duplicate group of each of them.

test_qid_to_duplicate_qids = defaultdict(set)
test_qid_to_duplicate_qids_complete = defaultdict(set)

for left_qid, right_qid, _ in zip(df_duplicate["qid1"], df_duplicate["qid2"], df_duplicate["is_duplicate"]):
    # Normalise the pair so that a test query, if there is one, sits on the left.
    if right_qid in test_query_qids:
        left_qid, right_qid = right_qid, left_qid
    # Keep only (test query, training question) pairs.
    if left_qid not in test_query_qids or right_qid in test_query_qids:
        continue
    test_qid_to_duplicate_qids[left_qid].add(right_qid)
    test_qid_to_duplicate_qids_complete[left_qid].add(right_qid)
    partner_group = qid_group_leader_to_duplicate_qid_group[qid_to_qid_group_leader[right_qid]]
    test_qid_to_duplicate_qids_complete[left_qid].update(partner_group)

In [None]:
# Count training pairs labelled "not duplicate" whose two questions nevertheless
# ended up in the same duplicate group (i.e. the labels contradict transitivity).

cnt = sum(
    1
    for left_qid, right_qid, label in zip(df["qid1"], df["qid2"], df["is_duplicate"])
    if not label
    and left_qid not in test_query_qids
    and right_qid not in test_query_qids
    and qid_to_qid_group_leader[left_qid] == qid_to_qid_group_leader[right_qid]
)
print("Number of inconsistencies: ", cnt)  # slightly smaller than 96 because some edges are associated with the test set

In [None]:
# A row belongs to the test frame as soon as either of its questions is a held-out query.
test_mask = df["qid1"].isin(test_query_qids) | df["qid2"].isin(test_query_qids)
train_df = df.loc[~test_mask].copy()
test_df = df.loc[test_mask].copy()

In [None]:
# clean up
# Drop the helpers that are only needed while building the split, so later cells
# cannot accidentally depend on them.
del qid_to_qid_group_leader, qid_group_leader_to_duplicate_qid_group
del cnt
del test_query_qids   # not sorted, use test_query_qids_list
del df                # all data you can train on is in train_df
# NOTE: the name `df` is reused further down for the document-frequency dict of the VSM index.

# enable use of complete graphs
# From here on both lookups refer to the transitively completed duplicate relations.
test_qid_to_duplicate_qids = test_qid_to_duplicate_qids_complete
qid_to_duplicate_qids = qid_to_duplicate_qids_complete

# Evaluation Metrics

In [None]:
def method_random_guess(test_qid):
    """Random baseline: ignore the query and draw RANKED_LIST_SIZE training qids.

    Drawing is with replacement, so a qid may appear more than once.
    Returns (ranklist, scores), both of length RANKED_LIST_SIZE; all scores are 0.
    """
    guessed_qids = random.choices(train_query_qids_list, k=RANKED_LIST_SIZE)
    placeholder_scores = [0] * RANKED_LIST_SIZE
    return guessed_qids, placeholder_scores

# TEST_SET_SIZE x RANKED_LIST_SIZE (the ranked list of similar qn for each test qn);
# used below to smoke-test every metric function on the random baseline.
ranklists_method_random_guess = [method_random_guess(test_qid)[0] for test_qid in test_query_qids_list]

In [None]:
def show_sample_query_results(test_qid, method_ranklist, method_scores=[0]*RANKED_LIST_SIZE, num_to_show=10):
    """Print the query text followed by its top results (for eyeballing, not a metric).

    Each result line shows its rank, score, whether the result is a registered
    duplicate of the query, and the result's question text. At most `num_to_show`
    results are printed (fewer if `method_scores` is shorter).
    """
    print("Query: {}".format(qid_to_question[test_qid]))
    top_results = zip(method_scores, method_ranklist[:num_to_show])
    for position, (result_score, result_qid) in enumerate(top_results, start=1):
        if result_qid in test_qid_to_duplicate_qids[test_qid]:
            relevance = "Registered"
        else:
            relevance = "Unregistered"
        result_text = qid_to_question[result_qid]
        print("Rank {} - Score {:.4f} - {}:  \t{}".format(position, result_score, relevance, result_text))

In [None]:
# Show the random baseline's output for the first test query.
show_sample_query_results(test_query_qids_list[0], *method_random_guess(test_query_qids_list[0]))

In [None]:
def evaluation_with_first_relevant_rank(method_ranklists, considered=1, eps=10**-6, debug=True, **kwargs):
    """Rank statistics of the best-ranked duplicates, plus MRR.

    Parameters:
        method_ranklists: TEST_SET_SIZE ranklists of RANKED_LIST_SIZE qids each,
            aligned with `test_query_qids_list`.
        considered: how many of the best-ranked duplicates of each query go into
            the histogram / median statistics.
        eps: added to the MRR before inverting, to avoid division by zero.
        debug: when True, also print the median rank and the out-of-result share.
        **kwargs: ignored (lets `evaluation_process` pass shared options).

    A duplicate that does not appear in the ranklist gets rank RANKED_LIST_SIZE+1
    and contributes a reciprocal rank of 0. A query with no registered duplicates
    also contributes a reciprocal rank of 0 and nothing to the rank statistics.

    Returns:
        (mrr, har, median_rank, proportion_out_of_result)
    """
    assert np.array(method_ranklists).shape == (TEST_SET_SIZE, RANKED_LIST_SIZE)
    not_found_rank = RANKED_LIST_SIZE + 1
    reciprocal_ranks = []
    ranks = []
    for test_qid, ranklist in zip(test_query_qids_list, method_ranklists):
        # Rank of the FIRST occurrence of each result; a ranklist may contain repeats
        # and the best position is the one that counts.
        result_qid_to_rank = {}
        for rank, result_qid in enumerate(ranklist, start=1):
            result_qid_to_rank.setdefault(result_qid, rank)

        duplicate_ranks = sorted(
            result_qid_to_rank.get(expected_qid, not_found_rank)
            for expected_qid in test_qid_to_duplicate_qids[test_qid]
        )
        if not duplicate_ranks:
            # nothing could have been retrieved for this query
            reciprocal_ranks.append(0)
            continue

        ranks.extend(duplicate_ranks[:considered])
        best_rank = duplicate_ranks[0]
        reciprocal_ranks.append(0 if best_rank > RANKED_LIST_SIZE else 1/best_rank)
    
    plt.figure(figsize=(14,4))
    plt.title("Highest rank of duplicate question")
    plt.hist(ranks, bins=np.arange(RANKED_LIST_SIZE+2))
    plt.xlabel("Rank")
    plt.ylabel("Frequency")
    plt.show()
    
    mrr = sum(reciprocal_ranks)/len(reciprocal_ranks)
    har = 1/(mrr+eps)
    print(f"Mean Reciprocal Rank (MRR) is {mrr:.2f}")
    print(f"Harmonic Average Rank (HAR) is {har:.2f}")    
    
    p50 = np.median(ranks) if ranks else float("nan")
    proportion_out_of_result = ranks.count(not_found_rank)/len(ranks) if ranks else 0.0
    if debug:
        print("Median rank: {:.2f}".format(p50))
        print("Proportion out of result: {:.3f}".format(proportion_out_of_result))
    
    return mrr, har, p50, proportion_out_of_result

In [None]:
# Smoke-test the metric on the random baseline; the returned tuple is discarded.
_ = evaluation_with_first_relevant_rank(ranklists_method_random_guess)

In [None]:
def evaluation_with_auc(method_ranklists, k=10, weights=None, debug=True, **kwargs):
    """Share of duplicates among the top-k results, optionally weighted by rank.

    For every rank position 1..k the number of test queries with a registered
    duplicate at that position is counted. Without `weights` the positions are
    averaged uniformly; with `weights` (one per position) they are combined with
    the normalised weights. The result is divided by TEST_SET_SIZE.

    Returns a value in [0, 1]; 1 means every top-k slot holds a duplicate.
    """
    assert np.array(method_ranklists).shape == (TEST_SET_SIZE, RANKED_LIST_SIZE)

    # per-position hit counts, accumulated over all test queries
    counts = np.zeros(k)
    for test_qid, ranklist in zip(test_query_qids_list, method_ranklists):
        hits_per_position = [
            1 if (result_qid in test_qid_to_duplicate_qids[test_qid]) else 0
            for result_qid in ranklist[:k]
        ]
        counts += np.array(hits_per_position)

    # combine the positions: normalised weights if given, plain mean otherwise
    if weights:
        counts *= np.array(weights)/sum(weights)
    else:
        counts /= k

    auc = sum(counts)/(TEST_SET_SIZE)

    if debug:
        print(f"{auc:.2%} of top {k} results are duplicates")

    return auc # between [0,1], 1 is perfect

In [None]:
# Smoke-test on the random baseline: uniform weighting first, then linearly decaying rank weights.
_ = evaluation_with_auc(ranklists_method_random_guess)
_ = evaluation_with_auc(ranklists_method_random_guess, weights = [10,9,8,7,6,5,4,3,2,1])

In [None]:
def single_r_precision(test_qid, ranklist):
    """R-precision of one test query.

    With R = number of registered duplicates of `test_qid`, counts how many of the
    first R results are registered duplicates.

    Returns (R, hits_in_top_R, hits_in_top_R / R), or (0, 0, 0) when R is 0.
    """
    expected_qids = test_qid_to_duplicate_qids[test_qid] # this dict needs to be updated when train:test set separation is updated
    num_duplicate = len(expected_qids)
    if num_duplicate == 0:
        return 0, 0, 0

    hits_in_top_r = 0
    for result_qid in ranklist[:num_duplicate]:
        if result_qid in expected_qids:
            hits_in_top_r += 1
    return num_duplicate, hits_in_top_r, hits_in_top_r/num_duplicate


def evaluation_with_r_precision(method_ranklists, k=10, report_k=0, debug=True, **kwargs):
    """Average and duplicate-weighted average R-precision over the test set.

    Parameters:
        method_ranklists: TEST_SET_SIZE ranklists of RANKED_LIST_SIZE qids each,
            aligned with `test_query_qids_list`.
        k: number of lowest-scoring test queries to report when `report_k` > 0.
        report_k: when > 0, additionally return the qids of the `k` test queries
            with the lowest R-precision (for error analysis).
        debug: when True, print the input shape and the metrics.
        **kwargs: ignored (lets `evaluation_process` pass shared options).

    Returns:
        (avg_r_precision, weighted_avg_r_precision) or, when `report_k` > 0,
        (avg_r_precision, weighted_avg_r_precision, k_lowest_r_precision_test_qids).
    """
    ranklists_shape = np.array(method_ranklists).shape
    if debug:
        print(ranklists_shape)
    assert ranklists_shape == (TEST_SET_SIZE, RANKED_LIST_SIZE) # method_ranklists size is (1000,100)

    total_num_duplicates = np.zeros(TEST_SET_SIZE, dtype=int)
    # Must be a float array: an integer array silently truncates fractional
    # R-precision values (e.g. 0.5 -> 0) on assignment.
    r_precision = np.zeros(TEST_SET_SIZE, dtype=float)

    for i, (test_qid, ranklist) in enumerate(zip(test_query_qids_list, method_ranklists)):
        total_num_duplicates[i], _, r_precision[i] = single_r_precision(test_qid, ranklist)

    # note: if want do error analysis, intervene here to find test cases with low r precision
    if report_k > 0:
        # argsort works for any k (argpartition raises once k reaches the array length)
        k_lowest_r_precision_idx = np.argsort(r_precision, kind="stable")[:k]
        k_lowest_r_precision_test_qids = np.array(test_query_qids_list)[k_lowest_r_precision_idx]

    ## Calculate metrics
    avg_r_precision = r_precision.mean()
    duplicates_in_total = total_num_duplicates.sum()
    if duplicates_in_total > 0:
        weighted_avg_r_precision = np.multiply(r_precision, total_num_duplicates).sum() / duplicates_in_total
    else:
        weighted_avg_r_precision = 0.0  # no registered duplicates at all

    if debug:
        print(f"Average R-Precision = {avg_r_precision:.2%}")
        print(f"Weighted Average R-Precision by proportion of duplicates = {weighted_avg_r_precision:.2%}") 
        if avg_r_precision > weighted_avg_r_precision:
            print("A higher average R-Precisions suggests that there are many test queries with high R-Precision but there are some test queries with high number of duplicates that model is not effective with.")

    if not report_k:
        return avg_r_precision, weighted_avg_r_precision
    return avg_r_precision, weighted_avg_r_precision, k_lowest_r_precision_test_qids

In [None]:
# Smoke-test the metric on the random baseline; the returned tuple is discarded.
_ = evaluation_with_r_precision(ranklists_method_random_guess, k=10)

In [None]:
def evaluation_with_precision_recall_at_k(method_ranklists, k=10, exclude_precision=False, exclude_recall=False, debug=True, **kwargs):
    """Macro-averaged Precision@k and Recall@k over the test set.

    P@k: what share of the top-k retrieved results is relevant?
    R@k: what share of all registered duplicates of the query is within the top k?
    A query without any registered duplicate contributes a recall of 0.

    Parameters:
        method_ranklists: TEST_SET_SIZE ranklists of RANKED_LIST_SIZE qids each,
            aligned with `test_query_qids_list`.
        k: rank cut-off.
        exclude_precision, exclude_recall, debug, **kwargs: accepted for a uniform
            call signature but currently unused.

    Returns:
        (mean_precision_at_k, mean_recall_at_k)
    """
    assert np.array(method_ranklists).shape == (TEST_SET_SIZE, RANKED_LIST_SIZE)

    precisions_at_k = []
    recalls_at_k = []
    for test_qid, ranklist in zip(test_query_qids_list, method_ranklists):
        expected_qids = test_qid_to_duplicate_qids[test_qid]
        # relevant results among the top k (everything after rank k is ignored)
        num_relevant = sum(1 for result_qid in ranklist[:k] if result_qid in expected_qids)

        precisions_at_k.append(num_relevant/k)

        total_num_duplicates_for_query = len(expected_qids)
        if total_num_duplicates_for_query:
            recalls_at_k.append(num_relevant/total_num_duplicates_for_query)
        else:
            recalls_at_k.append(0.0)  # nothing to retrieve; avoid dividing by zero

    mean_precision_at_k = sum(precisions_at_k)/len(precisions_at_k) # macro average
    mean_recall_at_k = sum(recalls_at_k)/len(recalls_at_k) # macro average
    print(f"Macro Average Precision@k={k} is {mean_precision_at_k:.2%}")
    print(f"Macro Average Recall@k={k} is {mean_recall_at_k:.2%}")
    return (mean_precision_at_k, mean_recall_at_k)

# Smoke-test the metric on the random baseline; the returned tuple is discarded.
_ = evaluation_with_precision_recall_at_k(ranklists_method_random_guess, k=10)

In [None]:
def evaluation_with_map(method_ranklists, debug=True, **kwargs):
    """Mean Average Precision (MAP) over the test set.

    For each test query:
      1. find the rank of every registered duplicate that exists in the training set;
      2. for the j-th retrieved duplicate (in rank order) at rank K, P@K = j / K;
         duplicates missing from the ranklist contribute 0;
      3. AP = sum of those precisions / number of expected duplicates
         (a query with no expected duplicates gets AP = 0).
    MAP is the macro average of AP across queries.

    Parameters:
        method_ranklists: TEST_SET_SIZE ranklists of RANKED_LIST_SIZE qids each,
            aligned with `test_query_qids_list`.
        debug, **kwargs: accepted for a uniform call signature but currently unused.

    Returns:
        MAP as a float in [0, 1].
    """
    assert np.array(method_ranklists).shape == (TEST_SET_SIZE, RANKED_LIST_SIZE)

    # set membership instead of scanning the (large) training list for every duplicate
    train_qid_set = set(train_query_qids_list)

    average_precisions = []
    for test_qid, ranklist in zip(test_query_qids_list, method_ranklists):
        # duplicates that could possibly be retrieved, i.e. that are in the training set
        dup_qids_in_train_set = [
            dup_qid for dup_qid in test_qid_to_duplicate_qids[test_qid] if dup_qid in train_qid_set
        ]
        if not dup_qids_in_train_set:
            average_precisions.append(0)
            continue

        # rank of the first occurrence of every result
        result_qid_to_rank = {}
        for rank, result_qid in enumerate(ranklist, start=1):
            result_qid_to_rank.setdefault(result_qid, rank)

        # Ranks of the retrieved duplicates, best first. A duplicate found exactly at
        # rank RANKED_LIST_SIZE counts as retrieved; only absent ones are left out.
        retrieved_ranks = sorted(
            result_qid_to_rank[dup_qid]
            for dup_qid in dup_qids_in_train_set
            if dup_qid in result_qid_to_rank
        )
        # j / rank = num_dup_so_far / rank_of_latest_dup_found
        precision_sum = sum(j / rank for j, rank in enumerate(retrieved_ranks, start=1))

        # unretrieved duplicates add 0 to the sum but still count in the denominator
        average_precisions.append(precision_sum/len(dup_qids_in_train_set))

    MAP = sum(average_precisions)/len(average_precisions)
    print(f"Mean Average Precision (MAP) is {MAP:.2%}")
    return MAP

# Smoke-test the metric on the random baseline; the returned value is discarded.
_ = evaluation_with_map(ranklists_method_random_guess)

In [None]:
def evaluation_process(method, test_query_qids_list=test_query_qids_list, 
                       calculate_metrics=True, use_tqdm=True, **kwargs):
    """Run `method` on every test query and optionally report all metrics.

    Parameters:
        method: callable taking a test qid and returning (ranklist, scores).
        test_query_qids_list: qids to query (defaults to the list as it was when
            this function was defined).
        calculate_metrics: when True, run the evaluation functions on the ranklists.
        use_tqdm: when True, show a progress bar while querying.
        **kwargs: forwarded to every evaluation function.

    Returns:
        (ranklists, scorelists), one entry per test query.
    """
    progress = tqdm.tqdm if use_tqdm else iter

    ranklists, scorelists = [], []
    for test_qid in progress(test_query_qids_list):
        ranklist, scores = method(test_qid)
        ranklists += [ranklist]
        scorelists += [scores]

    if not calculate_metrics:
        return ranklists, scorelists

    evaluation_with_first_relevant_rank(ranklists, **kwargs)
    # evaluation_with_auc(ranklists, **kwargs)
    evaluation_with_r_precision(ranklists, **kwargs)
    evaluation_with_precision_recall_at_k(ranklists, k=10, **kwargs)
    evaluation_with_map(ranklists, **kwargs)
    return ranklists, scorelists

In [None]:
# Full evaluation of the random baseline; keeps (ranklists, scorelists).
results_random_guess = evaluation_process(method_random_guess)

# Preprocessing the Text

In [None]:
## This entire cell is important to enable tokeniser pipeline 
## Use this to replace tokenise function if using Tokenise then Spellcheck (TSC) pipeline

######### spacy basic tokenizer
import spacy
print("Spacy version: ", spacy.__version__)
from spacy.tokenizer import Tokenizer  # https://spacy.io/api/tokenizer

# !python3 -m spacy download en_core_web_sm
print("Loading Spacy en_core_web_sm loaded")
nlp = spacy.load("en_core_web_sm")
# Bare tokenizer built from the vocabulary only (no prefix/suffix/infix rules are passed),
# so punctuation attached to a word is presumably not split off here -- see spacy_tokenise.
tokenizer = Tokenizer(nlp.vocab)
tokenizer.add_special_case("[math]", [{"ORTH": "[math]"}]) # see qid=7: '[math]23^{24}[/math]' becomes one token
# add more special cases here if found

In [None]:
def spacy_tokenise(text, lower=False, split_last_punc=True):
    """
    Return the list of token strings for a question text.

    Parameters:
        text: the question text.
        lower: lowercase the text before tokenising (lowercase only after spell check).
        split_last_punc: split one trailing "!", "?", "," or ":" off a token,
            e.g. "me?" => "me", "?".

    Notes:
        - a newline is a token
        - "'s" is a token
        - '(Koh-i-Noor)' is a token
        - a token that consists of a single punctuation mark is kept as is
          (it is not split into an empty string plus the mark)
        - see the tokenizer instantiation code for special cases or to add more
    """
    if lower:
        text = text.lower()
    token_list = [token.text for token in tokenizer(text)]

    # further split tokens that end with certain punct e.g. "me?" => "me", "?"
    if split_last_punc:
        trailing_punctuation = ("!", "?", ",", ":")
        split_tokens = []
        for token in token_list:
            # len > 1: never emit an empty-string token for a lone punctuation mark
            if len(token) > 1 and token[-1] in trailing_punctuation:
                split_tokens.extend([token[:-1], token[-1]])
            else:
                split_tokens.append(token)
        token_list = split_tokens
    return token_list

######### symspell spellchecker
print("Loading symspell")
# NOTE(review): unpinned install at run time; consider pinning a version for reproducibility.
!pip install symspellpy
from symspellpy.symspellpy import SymSpell, Verbosity  # https://github.com/mammothb/symspellpy
import pkg_resources

# instantiate spellchecker
sym = SymSpell(max_dictionary_edit_distance=2, prefix_length=7, count_threshold=1)
# https://symspellpy.readthedocs.io/en/latest/api/symspellpy.html
# English frequency dictionary that ships inside the symspellpy package.
dictionary_path = pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt")
# Positional arguments 0 and 1 are the column indexes of the term and of its count.
sym.load_dictionary(dictionary_path, 0, 1) # might take a short while

def spellcheck_single(word):
    """Spell-correct a single token.

    Non-ASCII tokens are returned untouched. Otherwise the closest dictionary
    suggestions within edit distance 2 are looked up; if the lowercased input is
    among them it is returned (lowercased), else the top suggestion is returned.
    A word without any correction is returned by the lookup as is.
    """
    # do not spellcheck non ascii words e.g. シ
    if not word.isascii():
        return word

    candidates = sym.lookup(
        word,
        Verbosity.CLOSEST,
        max_edit_distance=2,
        include_unknown=True,  # a misspelled word with no found corrections is returned as is
        ignore_token=r"[:,.!?\\-]",  # tokens matching this pattern are not corrected
    )
    candidate_terms = [candidate._term for candidate in candidates]

    # a legit input word is kept (lowercased); anything else becomes the top suggestion
    lowered_word = word.lower()
    return lowered_word if lowered_word in candidate_terms else candidate_terms[0]

def spellcheck_compound(sent):
    """Spell-correct a whole sentence and return the top suggestion as a string."""
    best_suggestion = sym.lookup_compound(sent, max_edit_distance=2)[0]
    return best_suggestion._term

######### tokenise pipeline
def tokenise_then_spellcheck(sent):
    """Tokenise first, then spell-correct and lowercase every token.

    8 times faster than spellcheck_then_tokenise.
    """
    checked_tokens = []
    for raw_token in spacy_tokenise(sent):  # NOTE: replace tokenise with spacy_tokenise
        corrected = spellcheck_single(raw_token)
        checked_tokens.append(corrected.lower())  # lower after spell check
    return checked_tokens

def spellcheck_then_tokenise(sent):
    """Spell-correct the whole sentence first, then tokenise the lowercased result."""
    return spacy_tokenise(spellcheck_compound(sent), lower=True)  # lower after spell check

In [None]:
# define tokenisation process

import pickle
# Token lists produced ahead of time by the tokenise-then-spellcheck pipeline (other notebook).
qid_to_tokens_preprocessed_filename = "../input/quora-question-pairs-tokenise-pipeline/qid_to_processed_token_list_tokenise_then_spellcheck.pkl"
# NOTE: pickle.load executes arbitrary code from the file; only load trusted dataset files.
with open(qid_to_tokens_preprocessed_filename, "rb") as f:
    qid_to_tokens_preprocessed = pickle.load(f)

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# NOTE(review): stopwords.words() is called without a language, which appears to load the
# stopword lists of every language in the corpus -- confirm this is intended.
stopword_set = set(stopwords.words())
# "?" and "," are treated as stopwords too, so they never enter the index.
stopword_set.update(["?", ","])

def nltk_tokenize(sentence):
    """Lowercase the sentence and split it with NLTK's word tokenizer."""
    lowered_sentence = sentence.lower()
    return word_tokenize(lowered_sentence)

def tokenise_qid(qid, qid_to_tokens_preprocessed=qid_to_tokens_preprocessed, 
                 tokenise_method=tokenise_then_spellcheck):
    """Return the token list of a question; stopwords and repeats are kept.

    The precomputed tokens are used when the qid is present in
    `qid_to_tokens_preprocessed`; otherwise the question text is tokenised on the
    fly with `tokenise_method`.
    """
    precomputed = qid_to_tokens_preprocessed
    if not precomputed or qid not in precomputed:
        return tokenise_method(qid_to_question[qid])
    return precomputed[qid]

In [None]:
def preprocess_vsm(train_query_qids_list=train_query_qids_list, stopword_set=stopword_set, exclude_stopwords=True):
    '''
    Build the inverted index and statistics used by the vector-space models.

    Inputs:
        train_query_qids_list = qids of the questions to index (the training corpus)
        stopword_set = tokens to leave out of the index when exclude_stopwords is True
        exclude_stopwords = whether to drop stopwords from the index
    
    Outputs:
        qid_to_tokens = {qid: set(tokens)}
        token_to_qids = {token: set(qids)}
        tf = {token: {qid: TF as int}}   # number of occurrences of the token in the question
        df = {token: DF as int}          # number of questions containing the token
        L = {qid: question length as int}  # counts every token, including stopwords and repeats
    '''
    qid_to_tokens = defaultdict(set)
    token_to_qids = defaultdict(set)
    tf = defaultdict(Counter)
    df = defaultdict(int)
    L = defaultdict(int)

    for qid in tqdm.tqdm(train_query_qids_list):
        qid_tokenised = tokenise_qid(qid)

        # Count occurrences per distinct token so tf is a real term frequency
        # (iterating the token set and adding 1 would store 1 for every token).
        for token, occurrences in Counter(qid_tokenised).items():
            if exclude_stopwords and token in stopword_set:
                continue

            # store qid-to-token mapping
            qid_to_tokens[qid].add(token)
            token_to_qids[token].add(qid)

            # store term frequency
            tf[token][qid] = occurrences

            # each distinct token adds one document to its doc frequency
            df[token] += 1

        # store doc length in L (double-count repeated tokens)
        L[qid] = len(qid_tokenised)
        
    # output
    return qid_to_tokens, token_to_qids, tf, df, L

In [None]:
# Build the index over the training questions.
# NOTE: `df` now names the document-frequency dict (the DataFrame `df` was deleted earlier).
qid_to_tokens, token_to_qids, tf, df, L = preprocess_vsm()

# save a copy of the original to allow reset later
qid_to_tokens_original, token_to_qids_original = deepcopy(qid_to_tokens), deepcopy(token_to_qids)
tf_original, df_original, L_original = deepcopy(tf), deepcopy(df), deepcopy(L)

# Model 0 - Baseline
Order by the number of overlapping non-stopword words. Random if tie.

In [None]:
def method_overlapping_root_word_count(query_qid, ignore_stopwords=True):
    """Rank training questions by how many distinct query tokens they contain.

    Ties are broken randomly. RANKED_LIST_SIZE random questions are seeded with a
    tiny score (0.01) so a full-length list is returned even with few matches.

    Returns (ranklist, scores), best match first.
    """
    query_tokens = set(tokenise_qid(query_qid))
    if ignore_stopwords:
        query_tokens = [token for token in query_tokens if token not in stopword_set]

    overlap = collections.Counter()

    # prefill with random results to address the possibility of no matches
    for filler_qid in random.choices(train_query_qids_list, k=RANKED_LIST_SIZE):
        overlap[filler_qid] = 0.01

    # every question containing the token gains one point
    for query_token in query_tokens:
        overlap += collections.Counter(token_to_qids[query_token])

    scored_qids = list(overlap.items())
    random.shuffle(scored_qids)  # so that qids are not ordered
    scored_qids.sort(key=lambda pair: pair[1], reverse=True)
    top_results = scored_qids[:RANKED_LIST_SIZE]

    ranked_qids = [qid for qid, _ in top_results]
    ranked_scores = [score for _, score in top_results]
    return ranked_qids, ranked_scores

In [None]:
# Show the baseline's output for the first test query.
show_sample_query_results(test_query_qids_list[0], *method_overlapping_root_word_count(test_query_qids_list[0]))

In [None]:
# Full evaluation of the token-overlap baseline; keeps (ranklists, scorelists).
results_overlapping_root_word_count = evaluation_process(method_overlapping_root_word_count)

# Model 1 - TFIDF

In [None]:
def compute_idf(doc_freq, N):
    '''
    Inverse document frequency, ln(N / doc_freq).

    Inputs:
        doc_freq = number of documents containing the token (must be > 0)
        N = corpus size including query

    Output:
        idf = IDF as float
    '''
    inverse_document_share = N/doc_freq
    return log(inverse_document_share)

In [None]:
from functools import reduce
import operator

def prod(iterable):
    """Multiply all items of `iterable` together; an empty iterable gives 1."""
    result = 1
    for factor in iterable:
        result = result * factor
    return result


def _vsm_tfidf_scores(qid_tmp, query_tokenised, compute_idf, exclude_stopwords, eps):
    """Cosine similarity (tf-idf weights) between the query and every doc sharing a query token."""
    weights = defaultdict(lambda: defaultdict(float))
    N = len(qid_to_tokens) # original corpus + query

    # only bother computing for tokens in the query
    for token in set(query_tokenised):
        if token in stopword_set and exclude_stopwords:
            continue
        weights[qid_tmp][token] = tf[token][qid_tmp] * compute_idf(df[token], N)

        for qid in tf[token].keys():
            weights[qid][token] = tf[token][qid] * compute_idf(df[token], N)

            # also weight the other tokens of these docs: needed for the doc vector length
            for other_token in qid_to_tokens[qid]:
                weights[qid][other_token] = tf[other_token][qid] * compute_idf(df[other_token], N)

    query_weights = weights[qid_tmp]
    query_vector_length = (sum([w**2 for w in query_weights.values()]))**0.5

    cosine_similarities = defaultdict(float)
    for qid in weights.keys():
        # numerator: dot product of query and doc over the shared tokens
        cosine_numerator = 0
        for token in weights[qid].keys():
            if token in query_weights:
                cosine_numerator += weights[qid][token] * query_weights[token]

        qid_vector_length = (sum([w**2 for w in weights[qid].values()]))**0.5

        # eps keeps the division defined when a vector has zero length
        cosine_similarities[qid] = cosine_numerator / (query_vector_length+eps) / (qid_vector_length+eps)

    return cosine_similarities


def _vsm_bm25_scores(qid_tmp, query_tokenised, compute_idf, k1, k3, b):
    """BM25 retrieval status value (RSV) of every doc sharing a query token."""
    rsv_terms = defaultdict(lambda: defaultdict(float))
    N = len(qid_to_tokens) # original corpus + query
    L_avg = sum(L.values())/len(L.values())

    # only bother computing for tokens in the query
    for token in set(query_tokenised):
        query_tf = tf[token][qid_tmp]
        for qid in tf[token].keys():
            doc_tf = tf[token][qid]
            # document length is normalised RELATIVE to the average length (L / L_avg)
            length_norm = (1-b) + b*L[qid]/L_avg
            rsv_terms[qid][token] = compute_idf(df[token], N) \
                * (k1+1)*doc_tf / (k1*length_norm + doc_tf) \
                    * (k3+1)*query_tf / (k3 + query_tf)

    return {qid: sum(rsv_terms[qid].values()) for qid in rsv_terms.keys()}


def _vsm_unigram_scores(query_tokenised, smoothing, alpha):
    """Negative log of the product of matched-token probabilities for every doc sharing a query token."""
    probabilities = defaultdict(lambda: defaultdict(float))
    corpus_model = defaultdict(float)

    # only bother computing for tokens in the query
    for token in set(query_tokenised):
        for qid in tf[token].keys():
            if smoothing=='add-one':
                probabilities[qid][token] = (tf[token][qid]+1) / (L[qid]+len(query_tokenised))
            else:
                probabilities[qid][token] = (tf[token][qid]) / (L[qid])

            # for linear-interpolation smoothing, build corpus language model
            if smoothing=='linear-interpolation':
                corpus_model[token] += tf[token][qid]

    if smoothing=='linear-interpolation':
        # finish the corpus language model: corpus tf divided by corpus length
        total_corpus_length = sum(L.values())
        for token in corpus_model.keys():
            corpus_model[token] = corpus_model[token] / total_corpus_length

        # then mix document and corpus models
        for qid in probabilities.keys():
            for token in probabilities[qid].keys():
                probabilities[qid][token] = alpha*probabilities[qid][token] + (1-alpha)*corpus_model[token]

    # NOTE(review): the score is -log(product) over MATCHED tokens only and results are
    # sorted in descending order, so docs matching more / rarer tokens rank first --
    # this is not a query likelihood; kept as is, confirm it is the intended ranking.
    return {qid: -log(prod(probabilities[qid].values())) for qid in probabilities.keys()}


def use_vsm(qid_query,
    method='tf-idf', compute_idf=compute_idf,
    k1=1.5, k3=1.5, b=0.75,
    smoothing='add-one', alpha=0.75, eps=10**(-6),
    exclude_stopwords=True,
    return_top=RANKED_LIST_SIZE):
    '''
    Rank the indexed questions against one query with a vector-space / probabilistic model.

    The module-level indexes qid_to_tokens, tf, df and L (built by preprocess_vsm)
    are used directly. The query is registered in them under a temporary key for
    the duration of the call and always removed again, also when scoring fails.

    Inputs:
        qid_query = qid of the question to match
        method = 'boolean', 'tf-idf', 'bm25' or 'unigram'
                 ('boolean' is tf-idf with every idf fixed to 1)
        compute_idf = function (doc_freq, N) -> idf
        k1, k3, b = tuning params           # used by method='bm25'
        smoothing = 'add-one' or 'linear-interpolation'   # used by method='unigram'
        alpha = weight of the document model in linear interpolation
        eps = small constant guarding the cosine denominator
        exclude_stopwords = whether query stopwords are ignored
        return_top = num of docs to return

    Procedure:
        1. Tokenise the query and add it to tf, df, L, qid_to_tokens
        2. Compute the per-(token, doc) terms only for docs containing query tokens
        3. Combine them into one score per doc
        4. Return docs in decreasing score order, padded to return_top entries

    Output:
        ranking = [qids in decreasing order of match]
        scoring = [corresponding scores]
        If fewer than return_top docs match, both lists are padded with 0.
    '''
    
    assert method in ['boolean','tf-idf','bm25','unigram'], "Supported methods: 'boolean', 'tf-idf', 'bm25', 'unigram'"
    assert len(L.keys()) > 0 if method=='bm25' else True, "Please include L for bm25"
    assert len(L.keys()) > 0 if method=='unigram' else True, "Please include L for unigram"
    assert smoothing in ['add-one','linear-interpolation'] if method=='unigram' else True
    assert alpha >= 0 and alpha <= 1 if smoothing=='linear-interpolation' else True

    # temporary key for the query; a float timestamp cannot collide with the integer qids
    qid_tmp = time.time()

    ''' STEP 1: PROCESS QUERY '''
    query_tokenised = tokenise_qid(qid_query)
    registered_tokens = []  # tokens actually added to the indexes, for exact cleanup

    try:
        for token in set(query_tokenised):
            if token in stopword_set and exclude_stopwords:
                continue
            qid_to_tokens[qid_tmp].add(token)
            tf[token][qid_tmp] = sum(1 for t in query_tokenised if t == token)
            df[token] += 1
            registered_tokens.append(token)

        # store query length
        L[qid_tmp] = len(query_tokenised)

        if method=='boolean':
            def _unit_idf(doc_freq, N):
                return 1
            compute_idf = _unit_idf
            method = 'tf-idf'

        ''' STEPS 2-3: SCORE DOCUMENTS '''
        if method=='tf-idf':
            scores = _vsm_tfidf_scores(qid_tmp, query_tokenised, compute_idf, exclude_stopwords, eps)
        elif method=='bm25':
            scores = _vsm_bm25_scores(qid_tmp, query_tokenised, compute_idf, k1, k3, b)
        else:  # 'unigram'
            scores = _vsm_unigram_scores(query_tokenised, smoothing, alpha)
    finally:
        # Remove every trace of the query from the shared indexes. L must be cleaned
        # too, otherwise each call would permanently skew the average document length.
        qid_to_tokens.pop(qid_tmp, None)
        for token in registered_tokens:
            del tf[token][qid_tmp]
            df[token] -= 1
        L.pop(qid_tmp, None)

    ''' STEP 4: RANK DOCUMENTS AND RETURN RESULT '''
    scores.pop(qid_tmp, None) # remove query from result
    ranking = sorted(scores, key=scores.get, reverse=True)
    scoring = sorted(scores.values(), reverse=True)

    # if too few documents match the query, add dummy documents
    num_missing = return_top - len(ranking)
    if num_missing > 0:
        ranking.extend([0]*num_missing)
        scoring.extend([0]*num_missing)

    # return top k results
    return ranking[:return_top], scoring[:return_top]

In [None]:
def method_boolean(qid):
    """Retrieve candidates for `qid` through `use_vsm` with its 'boolean' method.

    Returns exactly what `use_vsm` returns: the ranked qids and their scores.
    """
    ranking_and_scores = use_vsm(qid, method='boolean')
    return ranking_and_scores

In [None]:
# Show the boolean-retrieval results for the first test query.
show_sample_query_results(test_query_qids_list[0], *method_boolean(test_query_qids_list[0]))

In [None]:
# Run the evaluation procedure for boolean retrieval; the result is reused in the hand-evaluation section.
results_boolean = evaluation_process(method_boolean)

In [None]:
def method_tf_idf(qid):
    """Retrieve candidates for `qid` through `use_vsm` with its 'tf-idf' method.

    Returns exactly what `use_vsm` returns: the ranked qids and their scores.
    """
    ranking_and_scores = use_vsm(qid, method='tf-idf')
    return ranking_and_scores

In [None]:
# Show the tf-idf results for the first test query.
show_sample_query_results(test_query_qids_list[0], *method_tf_idf(test_query_qids_list[0]))

In [None]:
# Run the evaluation procedure for tf-idf; the result is reused in the hand-evaluation section.
results_tf_idf = evaluation_process(method_tf_idf)

# Model 2 - BM25

In [None]:
def method_bm25(qid):
    """Retrieve candidates for `qid` through `use_vsm` with its 'bm25' method.

    Returns exactly what `use_vsm` returns: the ranked qids and their scores.
    """
    ranking_and_scores = use_vsm(qid, method='bm25')
    return ranking_and_scores

In [None]:
# Show the BM25 results for the first test query.
show_sample_query_results(test_query_qids_list[0], *method_bm25(test_query_qids_list[0]))

In [None]:
# Run the evaluation procedure for BM25; the result is reused in the hand-evaluation section.
results_bm25 = evaluation_process(method_bm25)

# Model 3 - Unigram Language Model

In [None]:
def method_unigram(qid):
    """Retrieve candidates for `qid` with the unigram language model of `use_vsm`.

    The model is run with linear-interpolation smoothing and alpha=0.
    Returns exactly what `use_vsm` returns: the ranked qids and their scores.
    """
    # see previous versions for results for other alpha values and smoothing='add-one'
    unigram_options = {'method': 'unigram', 'smoothing': 'linear-interpolation', 'alpha': 0}
    return use_vsm(qid, **unigram_options)

In [None]:
# Show the unigram language model results for the first test query.
show_sample_query_results(test_query_qids_list[0], *method_unigram(test_query_qids_list[0]))

In [None]:
# Run the evaluation procedure for the unigram model; the result is reused in the hand-evaluation section.
results_unigram = evaluation_process(method_unigram)

# Model 4 - Word Embeddings

#### Using spaCy models

In [None]:
def to_vec(token_or_list):
    """Embed a token string, or a list of tokens, with the `nlp` pipeline.

    A list is first joined with single spaces into one sentence; the `.vector`
    of the parsed text is returned (a word vector for a single token, a
    document vector for a sentence).
    """
    text = ' '.join(token_or_list) if type(token_or_list) == list else token_or_list
    return nlp(text).vector

# Second spaCy pipeline ("en_core_web_lg"), used by to_vec2 below.
nlp2 = spacy.load("en_core_web_lg")
def to_vec2(token_or_list):
    """Embed a token string, or a list of tokens, with the `nlp2` pipeline.

    A list is first joined with single spaces into one sentence; the `.vector`
    of the parsed text is returned (a word vector for a single token, a
    document vector for a sentence).
    """
    text = ' '.join(token_or_list) if type(token_or_list) == list else token_or_list
    return nlp2(text).vector

In [None]:
# Load pre-processed dict
# Load the pre-computed mapping qid -> question vector (built in another notebook).
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open("../input/quora-question-pairs-tokenise-pipeline/qid_to_vec.pkl", "rb") as f:
    qid_to_vec = pickle.load(f)

# Sanity check: print the vector shape of question 0.
print("Pre-processed question vector is of shape {}".format(qid_to_vec[0].shape))

In [None]:
from numpy import dot
from numpy.linalg import norm

def method_spacy_embedding_similarity(test_qid):
    """Re-rank the baseline candidates by cosine similarity of spaCy vectors.

    The query is tokenised, spell-checked and embedded with `to_vec`; the
    candidates come from `method_overlapping_root_word_count` and their
    vectors are read from the pre-computed `qid_to_vec`.

    A pair in which either vector has zero norm gets similarity 0.0. Without
    this the division yields NaN, and because the descending order is obtained
    by reversing `argsort` (which puts NaN last), such candidates would be
    ranked first.

    Returns:
        (ranklist, cos_sims): two NumPy arrays of at most RANKED_LIST_SIZE
        entries, candidate qids and their similarities, best first.
    """
    tokens = tokenise_then_spellcheck(qid_to_question[test_qid])
    test_vec = to_vec(tokens)
    test_norm = norm(test_vec)

    # Run the baseline model as a filter; only its candidates are re-ranked.
    qid_list, _ = method_overlapping_root_word_count(test_qid)

    cos_sims = []  # bigger is better
    for train_qid in qid_list:
        train_vec = qid_to_vec[train_qid]
        denominator = test_norm * norm(train_vec)
        if denominator > 0:
            cos_sims.append(dot(test_vec, train_vec) / denominator)
        else:
            # zero-norm vector: cosine is undefined, treat as "no similarity"
            cos_sims.append(0.0)

    cos_sims = np.array(cos_sims)
    qid_list = np.array(qid_list)
    inds = cos_sims.argsort()[::-1]  # reverse so the biggest come first
    cos_sims = cos_sims[inds]
    ranklist = qid_list[inds]

    return ranklist[:RANKED_LIST_SIZE], cos_sims[:RANKED_LIST_SIZE]

In [None]:
# Show the spaCy embedding-similarity results for the first test query.
show_sample_query_results(test_query_qids_list[0], *method_spacy_embedding_similarity(test_query_qids_list[0]))

In [None]:
# Run the evaluation procedure for this method; the result is reused in the hand-evaluation section.
results_spacy_embedding_similarity = evaluation_process(method_spacy_embedding_similarity)

In [None]:
# Load the second pre-computed mapping qid -> question vector.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open("../input/quora-question-pairs-tokenise-pipeline/qid_to_vec_trf.pkl", "rb") as f: # note, actually lg not trf
    qid_to_vec2 = pickle.load(f)

# Sanity check: print the vector shape of question 0.
print("Pre-processed question vector is of shape {}".format(qid_to_vec2[0].shape)) # 300 dim vec

In [None]:
def method_spacy_embedding_similarity_lg(test_qid):
    """Re-rank the baseline candidates by cosine similarity of `nlp2` vectors.

    The query is tokenised, spell-checked and embedded with `to_vec2`; the
    candidates come from `method_overlapping_root_word_count` and their
    vectors are read from the pre-computed `qid_to_vec2`.

    A pair in which either vector has zero norm gets similarity 0.0. Without
    this the division yields NaN, and because the descending order is obtained
    by reversing `argsort` (which puts NaN last), such candidates would be
    ranked first.

    Returns:
        (ranklist, cos_sims): two NumPy arrays of at most RANKED_LIST_SIZE
        entries, candidate qids and their similarities, best first.
    """
    tokens = tokenise_then_spellcheck(qid_to_question[test_qid])
    test_vec = to_vec2(tokens)
    test_norm = norm(test_vec)

    # Run the baseline model as a filter; only its candidates are re-ranked.
    qid_list, _ = method_overlapping_root_word_count(test_qid)

    cos_sims = []  # bigger is better
    for train_qid in qid_list:
        train_vec = qid_to_vec2[train_qid]
        denominator = test_norm * norm(train_vec)
        if denominator > 0:
            cos_sims.append(dot(test_vec, train_vec) / denominator)
        else:
            # zero-norm vector: cosine is undefined, treat as "no similarity"
            cos_sims.append(0.0)

    cos_sims = np.array(cos_sims)
    qid_list = np.array(qid_list)
    inds = cos_sims.argsort()[::-1]  # reverse so the biggest come first
    cos_sims = cos_sims[inds]
    ranklist = qid_list[inds]

    return ranklist[:RANKED_LIST_SIZE], cos_sims[:RANKED_LIST_SIZE]

In [None]:
# Show the results of the `nlp2`-based embedding similarity for the first test query.
show_sample_query_results(test_query_qids_list[0], *method_spacy_embedding_similarity_lg(test_query_qids_list[0]))

In [None]:
# Run the evaluation procedure for this method; the result is reused in the hand-evaluation section.
results_spacy_embedding_similarity_lg = evaluation_process(method_spacy_embedding_similarity_lg)

#### Gensim Word Mover's Distance on the baseline candidates
* Re-ranks the candidates returned by the baseline `method_overlapping_root_word_count` by their Word Mover's Distance to the query  
* Current pre-trained model: `glove-wiki-gigaword-50`


In [None]:
import gensim
import gensim.downloader
# gensim.downloader.info() # find more models to download

from gensim.models import KeyedVectors

# Prefer the vectors saved in the input dataset; fall back to downloading them.
# `except Exception` (rather than a bare `except`) lets KeyboardInterrupt and
# SystemExit propagate instead of silently starting a download.
try:
    model = KeyedVectors.load("../input/ir-project-download-keyed-vectors/glove-wiki-gigaword-50.keyedvectors")
except Exception:  # saved vectors missing or unreadable
    model = gensim.downloader.load('glove-wiki-gigaword-50')
    # model.save("/kaggle/working/glove-wiki-gigaword-50.keyedvectors") # if not already saved

In [None]:
def method_wordmover_distance(test_qid, model):
    """Re-rank the baseline candidates by Word Mover's Distance to the query.

    Candidates come from `method_overlapping_root_word_count`; each one is
    scored as `1 - model.wmdistance(query_tokens, candidate_tokens)`, so a
    larger score means a closer candidate.

    Returns two lists of at most RANKED_LIST_SIZE entries: the candidate qids
    and their scores, best first.
    """
    # out of box duplicate finder does not work!

    # baseline model acts as the candidate filter (its scores are not used)
    candidate_qids, _ = method_overlapping_root_word_count(test_qid)

    query_tokens = tokenise_qid(test_qid)

    scored_candidates = []
    for candidate_qid in candidate_qids:
        candidate_tokens = tokenise_qid(candidate_qid)
        similarity = 1-model.wmdistance(query_tokens, candidate_tokens)
        scored_candidates.append((similarity, candidate_qid))

    # ascending sort followed by a reversal: best score first, ties by larger qid
    scored_candidates.sort()
    scored_candidates.reverse()
    top_candidates = scored_candidates[:RANKED_LIST_SIZE]

    return [qid for _, qid in top_candidates], [similarity for similarity, _ in top_candidates]

In [None]:
def method_wordmover_distance_glovewiki50(test_qid):
    """Word Mover's Distance re-ranking using the module-level `model`.

    `model` is looked up each time the function is called, so it uses whatever
    that global is bound to at that moment.
    """
    current_model = model
    return method_wordmover_distance(test_qid, current_model)

In [None]:
# Show the Word Mover's Distance results for the first test query.
show_sample_query_results(test_query_qids_list[0], *method_wordmover_distance_glovewiki50(test_query_qids_list[0]))

In [None]:
# Run the evaluation procedure for this method; the result is reused in the hand-evaluation section.
results_wordmover_distance_glovewiki50 = evaluation_process(method_wordmover_distance_glovewiki50)

In [None]:
# Other pre-trained vectors to compare with Word Mover's Distance.
models_to_try = ['glove-wiki-gigaword-300', 'glove-twitter-50','word2vec-google-news-300','fasttext-wiki-news-subwords-300']

# The comparison only runs in full-evaluation mode.
if not EVALUATING:
    models_to_try = []

for m in models_to_try:
    print("Model: ",m)
    # Keep each candidate in its own variable. The global `model` is read at call
    # time by method_wordmover_distance_glovewiki50, so rebinding it here would make
    # that method silently use the last model tried.
    try:
        model_to_try = KeyedVectors.load(f"../input/ir-project-download-keyed-vectors/{m}.keyedvectors")
    except Exception:  # saved vectors missing or unreadable: download instead
        model_to_try = gensim.downloader.load(m)

    # The default argument binds this iteration's model to the function.
    def method_wordmover_distance_new_model(test_qid, model=model_to_try):
        return method_wordmover_distance(test_qid, model)

    show_sample_query_results(test_query_qids_list[0], *method_wordmover_distance_new_model(test_query_qids_list[0]))

    _ = evaluation_process(method_wordmover_distance_new_model)

# Model 5 - Sentence Embeddings

Each sentence can be embedded as a vector with SentenceTransformer

In [None]:
!pip install sentence-transformers > /dev/null

In [None]:
from sentence_transformers import SentenceTransformer
# Sentence encoder used for queries and for newly indexed questions.
model_name = 'bert-base-nli-stsb-mean-tokens'
model_tf = SentenceTransformer(model_name)

In [None]:
# Load the sentence vectors pre-computed for this encoder, then key them by
# row position so that later cells can add entries under new qids.
model_name = "bert-base-nli-stsb-mean-tokens"
sentence_vectors = np.load(f"../input/quora-question-pairs-bert-sentence-vectors/sentence_vectors_{model_name}.npy")
sentence_vectors = dict(enumerate(sentence_vectors))

In [None]:
from scipy.spatial.distance import cosine

def method_sentence_vector(query_qid, method_preliminary=method_overlapping_root_word_count, preliminary_factor=1):
    """Re-rank preliminary candidates using sentence-embedding similarity.

    Args:
        query_qid: qid of the query; its text is read from `qid_to_question`.
        method_preliminary: retrieval method that supplies the candidates and
            their scores (recommended: method_overlapping_root_word_count,
            method_boolean, method_tf_idf).
        preliminary_factor: weight of the preliminary score in the final score.

    The final score is
    `preliminary_factor*preliminary_score + 1 - abs(cosine distance)`.
    The query's embedding is stored in `sentence_vectors[query_qid]` as a side
    effect.

    Returns two lists of at most RANKED_LIST_SIZE entries: candidate qids and
    their final scores, best first.
    """
    # embed the query and keep it next to the indexed vectors
    sentence_vectors[query_qid] = model_tf.encode(qid_to_question[query_qid], show_progress_bar=False)

    candidate_qids, candidate_scores = method_preliminary(query_qid)

    query_sentence_vector = sentence_vectors[query_qid]
    rescored = []
    for candidate_qid, candidate_score in zip(candidate_qids, candidate_scores):
        cosine_distance = cosine(query_sentence_vector, sentence_vectors[candidate_qid])
        rescored.append((candidate_qid, preliminary_factor*candidate_score+1-abs(cosine_distance)))

    # stable descending sort on the combined score
    rescored.sort(key=lambda pair: pair[1], reverse=True)
    top_results = rescored[:RANKED_LIST_SIZE]

    return [qid for qid, _ in top_results], [score for _, score in top_results]

In [None]:
# Sentence-vector ranking with the preliminary score ignored (preliminary_factor=0).
show_sample_query_results(test_query_qids_list[0], *method_sentence_vector(test_query_qids_list[0], preliminary_factor=0))

In [None]:
# Sentence-vector ranking with the preliminary score added at full weight (preliminary_factor=1).
show_sample_query_results(test_query_qids_list[0], *method_sentence_vector(test_query_qids_list[0], preliminary_factor=1))

In [None]:
# Run the evaluation procedure with the default arguments of method_sentence_vector.
results_sentence_vector = evaluation_process(method_sentence_vector)

# Model 6 - Supervised Model

In [None]:
# Settings for the supervised model's training data.
SUPERVISED_MODEL_TRAINING_SET_SIZE = 10000
LOAD_DATA_FOR_SUPERVISED = True  # True: read the prepared JSON files instead of recomputing
DIR_DATA_FOR_SUPERVISED = "../input/ir-project-supervised-model-data-preparation/"

# Draw the training queries from the keys of qid_to_duplicate_qids, excluding the test queries.
# random.sample needs a sequence (passing a set raises TypeError since Python 3.11);
# sorting the population also makes the draw reproducible for a fixed seed.
supervised_query_qids = random.sample(sorted(set(qid_to_duplicate_qids.keys()) - set(test_query_qids_list)),
                                      SUPERVISED_MODEL_TRAINING_SET_SIZE)

def create_supervised_features(qids, testing=True):
    """Run every retrieval method on `qids` and collect the raw results.

    Each method is passed to `evaluation_process` with the given qids as
    `test_query_qids_list`, `calculate_metrics=False` and `use_tqdm=False`.

    Args:
        qids: query qids to run the methods on.
        testing: not used by this function.

    Returns:
        dict mapping a method name to what `evaluation_process` returned for
        that method. The key order is fixed as listed below.
    """
    shared_options = {"test_query_qids_list": qids, "calculate_metrics": False, "use_tqdm": False}
    feature_methods = (
        ("overlapping_root_word_count", method_overlapping_root_word_count),
        ("boolean", method_boolean),
        ("tf_idf", method_tf_idf),
        ("bm25", method_bm25),
        ("unigram", method_unigram),
        ("spacy_embedding_similarity", method_spacy_embedding_similarity),
        ("spacy_embedding_similarity_lg", method_spacy_embedding_similarity_lg),
        ("wordmover_distance_glovewiki50", method_wordmover_distance_glovewiki50),
        ("sentence_vector", method_sentence_vector),
    )
    return {feature_name: evaluation_process(method, **shared_options)
            for feature_name, method in feature_methods}

def parse_ndarray(obj):  # https://stackoverflow.com/a/52604722/5894029
    """`default=` hook for `json.dump` that converts NumPy values to plain Python.

    Arrays become nested lists and NumPy scalars (e.g. np.int64, np.float32)
    become Python numbers. Returning None for a scalar would write it to the
    file as `null`, so scalars are converted explicitly.

    Raises:
        TypeError: for any other object, which is how the json module expects
            a `default` hook to report an unsupported type.
    """
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        return obj.item()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")

# When the prepared data is not being loaded, compute the per-method results for the
# sampled supervised queries and write both the qids and the results to JSON.
# NOTE(review): the files are written under DIR_DATA_FOR_SUPERVISED, which points into
# ../input — confirm that directory is writable before running with LOAD_DATA_FOR_SUPERVISED=False.
if not LOAD_DATA_FOR_SUPERVISED:
    method_to_ranklists_scorelists_supervised = create_supervised_features(supervised_query_qids)

    with open(DIR_DATA_FOR_SUPERVISED+'supervised_query_qids.json', 'w') as f:
        json.dump(supervised_query_qids, f, indent=4, default=parse_ndarray)    

    with open(DIR_DATA_FOR_SUPERVISED+'method_to_ranklists_scorelists_supervised.json', 'w') as f:
        json.dump(method_to_ranklists_scorelists_supervised, f, indent=4, default=parse_ndarray)

# Both names are always (re)bound from the JSON files, replacing the values computed above.
with open(DIR_DATA_FOR_SUPERVISED+'supervised_query_qids.json') as f:
    supervised_query_qids = json.load(f)

with open(DIR_DATA_FOR_SUPERVISED+'method_to_ranklists_scorelists_supervised.json') as f:
    method_to_ranklists_scorelists_supervised = json.load(f)

In [None]:
def parse_supervised_features_into_df(method_to_ranklists_scorelists_supervised, training=False, 
                                      supervised_query_qids_set=set(supervised_query_qids)):
    """Turn per-method retrieval results into one feature table.

    Args:
        method_to_ranklists_scorelists_supervised: dict mapping a method name
            to a (ranklists, scorelists) pair with one list per query.
        training: when True, candidates that are themselves in
            `supervised_query_qids_set` are left out.
        supervised_query_qids_set: qids excluded as candidates in training
            mode; the default is built once, when this function is defined.

    Returns:
        DataFrame indexed by (query qid, candidate qid) with one column per
        method holding that method's score; pairs a method did not return are
        left missing.

    Note: the query qids are always taken, in order, from the module-level
    `supervised_query_qids`, whatever queries the results were computed for.
    """
    features_by_pair = defaultdict(dict)
    for method_name, results in method_to_ranklists_scorelists_supervised.items():
        ranklists, scorelists = results
        for query_qid, ranklist, scorelist in zip(supervised_query_qids, ranklists, scorelists):
            for candidate_qid, score in zip(ranklist, scorelist):
                excluded = training and candidate_qid in supervised_query_qids_set
                if not excluded:
                    features_by_pair[query_qid, candidate_qid][method_name] = score

    return pd.DataFrame.from_dict(features_by_pair, orient='index')

def extract_supervised_labels_from_df(df_supervised):
    """Build the 0/1 labels for the rows of a supervised feature table.

    Each index entry is a (query qid, candidate qid) pair; its label is 1 when
    the candidate is listed in `qid_to_duplicate_qids` for that query, else 0.
    Labels are returned as a list in row order.
    """
    labels = []
    for query_qid, candidate_qid in df_supervised.index:
        is_duplicate = candidate_qid in qid_to_duplicate_qids[query_qid]
        labels.append(int(is_duplicate))
    return labels

# Build the training table (training=True drops candidates that are supervised queries)
# and the matching 0/1 duplicate labels.
df_supervised = parse_supervised_features_into_df(method_to_ranklists_scorelists_supervised, training=True)
supervised_labels = extract_supervised_labels_from_df(df_supervised)

# extracted and total number of positive labels
sum(supervised_labels), sum(len(qid_to_duplicate_qids[supervised_query_qid]) for supervised_query_qid in supervised_query_qids)

#### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit a class-balanced logistic regression on the feature table; scores a method
# did not provide (NaN) are replaced with 0 first.
clf = LogisticRegression(random_state=0, class_weight='balanced')
clf = clf.fit(np.nan_to_num(df_supervised.values), supervised_labels)

# Print the learned weight next to each feature (method) name.
for feature, coef in zip(df_supervised.columns, clf.coef_[0]):
    print("{:.4f}".format(coef), feature)

In [None]:
def method_supervised_model_logr(query_qid):
    """Rank candidates for `query_qid` with the fitted logistic regression `clf`.

    Every retrieval method is run on the query, the results are combined into
    a feature table (missing scores become 0), and candidates are ordered by
    the predicted probability of the positive class.

    Returns two lists of at most RANKED_LIST_SIZE entries: candidate qids and
    their probabilities, best first.
    """
    features = parse_supervised_features_into_df(create_supervised_features([query_qid]))
    duplicate_probabilities = clf.predict_proba(np.nan_to_num(features.values))[:,1]
    # the second index level holds the candidate qid
    candidates = features.reset_index()["level_1"]

    ranked = sorted(zip(duplicate_probabilities, candidates))
    ranked.reverse()
    top_ranked = ranked[:RANKED_LIST_SIZE]
    return [qid for _, qid in top_ranked], [probability for probability, _ in top_ranked]  # qid, scores

In [None]:
# Show the logistic-regression ranking for the first test query.
show_sample_query_results(test_query_qids_list[0], *method_supervised_model_logr(test_query_qids_list[0]))

In [None]:
# Run the evaluation procedure for this method; the result is reused in the hand-evaluation section.
results_supervised_model_logr = evaluation_process(method_supervised_model_logr)

#### LightGBM classification

In [None]:
import lightgbm as lgb

# Training data for LightGBM: the supervised feature table and its labels.
df_train = df_supervised.copy()
target_train = np.array(supervised_labels)

# Boolean mask selecting the first 20% of the rows as the validation split.
eval_set = np.arange(len(df_train)) < len(df_train)*0.2

lgb_all = lgb.Dataset(df_train, target_train)
lgb_train = lgb.Dataset(df_train[~eval_set], target_train[~eval_set])
lgb_eval = lgb.Dataset(df_train[eval_set], target_train[eval_set], reference=lgb_train)

In [None]:
# LightGBM settings: binary classification, with every feature constrained to have a
# non-decreasing effect on the prediction (one constraint per feature column).
params = {
#     'boosting_type': 'gbdt',
    'objective': 'binary',
    'monotone_constraints': [1]*len(df_supervised.columns),
#     'scale_pos_weight': 0.360,
#     'metric': {'auc'},
#     'num_leaves': 15,
#     'learning_rate': 0.05,
#     'feature_fraction': 0.9,
#     'bagging_fraction': 0.8,
#     'bagging_freq': 5,
    'verbose': -1,
}

# Train on the training split, validating on lgb_eval and stopping early.
# NOTE(review): the `verbose_eval` and `early_stopping_rounds` keyword arguments were
# removed from lgb.train in LightGBM 4.0 (replaced by callbacks) — confirm the installed version.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=1000,
                valid_sets=lgb_eval,
                verbose_eval=-1,
                early_stopping_rounds=10)

# Gain-based importance of each feature (method).
pd.DataFrame({"feature": df_train.columns, "importance": gbm.feature_importance(importance_type="gain")})[:20]

In [None]:
def method_supervised_model_lgbm(query_qid):
    """Rank candidates for `query_qid` with the trained LightGBM model `gbm`.

    Every retrieval method is run on the query, the results are combined into
    a feature table, and candidates are ordered by the model's prediction.

    Returns two lists of at most RANKED_LIST_SIZE entries: candidate qids and
    their predicted scores, best first.
    """
    features = parse_supervised_features_into_df(create_supervised_features([query_qid]))
    predictions = gbm.predict(features)
    # the second index level holds the candidate qid
    candidates = features.reset_index()["level_1"]

    ranked = sorted(zip(predictions, candidates))
    ranked.reverse()
    top_ranked = ranked[:RANKED_LIST_SIZE]
    return [qid for _, qid in top_ranked], [prediction for prediction, _ in top_ranked]  # qid, scores

In [None]:
# Show the LightGBM ranking for the first test query.
show_sample_query_results(test_query_qids_list[0], *method_supervised_model_lgbm(test_query_qids_list[0]))

In [None]:
# Run the evaluation procedure for this method; the result is reused in the hand-evaluation section.
results_supervised_model_lgbm = evaluation_process(method_supervised_model_lgbm)

# Preparation for Hand Evaluation Dataset

In [None]:
# Results of every evaluated method, keyed by method name.
# Each value is unpacked below as a (ranklists, scorelists) pair.
method_to_ranklists_scorelists = {
#     "random_guess": results_random_guess,
    "overlapping_root_word_count": results_overlapping_root_word_count,
    "boolean": results_boolean,
    "tf_idf": results_tf_idf,
    "bm25": results_bm25,
    "unigram": results_unigram,
    "spacy_embedding_similarity": results_spacy_embedding_similarity,
    "spacy_embedding_similarity_lg": results_spacy_embedding_similarity_lg,
    "wordmover_distance_glovewiki50": results_wordmover_distance_glovewiki50,
    "sentence_vector": results_sentence_vector,
    "supervised_model_logr": results_supervised_model_logr,
    "supervised_model_lgbm": results_supervised_model_lgbm
}

import json

def parse_ndarray(obj):  # https://stackoverflow.com/a/52604722/5894029
    """`default=` hook for `json.dump` that converts NumPy values to plain Python.

    Arrays become nested lists and NumPy scalars (e.g. np.int64, np.float32)
    become Python numbers. Returning None for a scalar would write it to the
    file as `null`, so scalars are converted explicitly.

    Raises:
        TypeError: for any other object, which is how the json module expects
            a `default` hook to report an unsupported type.
    """
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        return obj.item()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")

# with open('method_to_ranklists_scorelists.json', 'w') as f:
#     json.dump(method_to_ranklists_scorelists, f, indent=4, default=parse_ndarray)

In [None]:
# Test questions chosen for hand evaluation. The numbers are written 1-based and
# shifted by one to match the zero-based qids used in this notebook.
QUESTIONS_TO_HANDEVAL = {number-1 for number in (
    332, 490, 1955, 6319, 9690, 17279, 19619, 20557, 26378, 33734, 38984,
    49864, 57291, 89903, 116882, 126992, 131214, 144297, 159628, 201409,
    273666, 284107, 286721, 312887, 318523, 378759, 384832, 405081,
    405877, 423313, 464279, 480116, 533401)}
# Only the top candidates of each method are hand-labelled.
HANDEVAL_RANK_THRESHOLD = 10

# For every selected test question, pool the top candidates of all methods.
map_qid_to_handeval = defaultdict(set)

for ranklists, scorelists in method_to_ranklists_scorelists.values():
    for test_qid, ranklist in zip(test_query_qids_list, ranklists):
        if test_qid not in QUESTIONS_TO_HANDEVAL:
            continue
        for candidate_qid in ranklist[:HANDEVAL_RANK_THRESHOLD]:
            map_qid_to_handeval[test_qid].add(candidate_qid)

# Replace each pooled set with a sorted list.
for qid, pooled_candidates in map_qid_to_handeval.items():
    map_qid_to_handeval[qid] = sorted(pooled_candidates)

In [None]:
# One row per (test question, pooled candidate) pair, with both question texts.
dataframe_columns = ["test_qid", "test_question", "candidate_qid", "candidate_question"]
dataframe_entries = [
    [qid, qid_to_question[qid], candidate_qid, qid_to_question[candidate_qid]]
    for qid in sorted(map_qid_to_handeval.keys())
    for candidate_qid in map_qid_to_handeval[qid]
]

# Shuffle, then stable-sort by test qid: rows stay grouped per test question
# while the candidates inside each group end up in random order.
random.shuffle(dataframe_entries)
dataframe_entries.sort(key=lambda entry: entry[0])

In [None]:
df_handeval = pd.DataFrame(dataframe_entries, columns=dataframe_columns)
# one empty column per labeller, to be filled in by hand
for labeller_column in ("jh", "hk", "wt"):
    df_handeval[labeller_column] = np.nan

# df_handeval.to_csv("df_handeval.csv", index=None)

# Calculate NDCG with Hand Evaluation Dataset

This calculates NDCG from a snapshot of `method_to_ranklists_scorelists` and the hand-annotated `df_handeval`.

Because some steps are random, re-running the notebook may not reproduce the snapshot of `method_to_ranklists_scorelists` exactly.

In [None]:
# Replace the in-memory objects with the stored snapshot: the hand-annotated table
# and the ranklists/scorelists that were labelled.
df_handeval = pd.read_csv("../input/quoraquestionpairhandannotateddataset/df_handeval.csv")
with open('../input/quoraquestionpairhandannotateddataset/method_to_ranklists_scorelists.json') as f:
    method_to_ranklists_scorelists = json.load(f)

In [None]:
import math

def calculate_dcg_at_k(r, k, method=0):
    """Discounted cumulative gain of the first `k` gains in `r`.

    Args:
        r: gains in ranked order.
        k: number of ranks to include.
        method: 0 leaves rank 1 undiscounted and weights rank i >= 2 by
            1/log2(i); any other value weights rank i by 1/log2(i+1).

    Returns:
        The sum of gain * discount over the first `k` ranks, as a float.
    """
    if method == 0:
        discounts = [1.]
        discounts.extend(1/math.log(rank, 2) for rank in range(2, k+1))
    else:
        discounts = [1/math.log(rank, 2) for rank in range(2, k+2)]

    total = 0.
    for gain, discount in zip(r[:k], discounts):
        total = total + gain*discount
    return total

def calculate_ndcg_at_k(scores, ref, k=10, method=0):
    """Normalised DCG: DCG of `scores` divided by the DCG of `ref`.

    Args:
        scores: gains in the order produced by the method being judged.
        ref: gains in the reference (ideal) order.
        k: number of ranks to include.
        method: discount variant, passed on to `calculate_dcg_at_k`.

    Returns:
        The ratio of the two DCG values, or 0. when the reference DCG is 0.
    """
    reference_dcg = calculate_dcg_at_k(ref, k, method=method)
    achieved_dcg = calculate_dcg_at_k(scores, k, method=method)
    return achieved_dcg/reference_dcg if reference_dcg != 0 else 0.

In [None]:
# Hand-assigned score of every labelled (test question, candidate) pair.
test_qid_to_candidate_qid_to_scores = collections.defaultdict(dict)

for _,row in df_handeval.iterrows():
    test_qid = row["test_qid"]
    candidate_qid = row["candidate_qid"]
    # presumably the mean of the labellers' scores — TODO confirm against the CSV
    score = row["average"]
    test_qid_to_candidate_qid_to_scores[test_qid][candidate_qid] = score
    
# Ideal ordering per test question: all of its labelled scores, highest first.
test_qid_to_ideal_scores = collections.defaultdict(list)
for test_qid, candidate_qid_to_scores in test_qid_to_candidate_qid_to_scores.items():
    ideal_scores = sorted(candidate_qid_to_scores.values())[::-1]
    test_qid_to_ideal_scores[test_qid] = ideal_scores

# NDCG of every method on every hand-evaluated test question.
method_to_ndcg_score = collections.defaultdict(list)
count_out_of_eval = 0  # number of ranked candidates that have no hand label

# NOTE(review): ranklists are paired with test_query_qids_list by position, which assumes
# the loaded snapshot was produced with the same test queries in the same order — confirm.
for method_name, (ranklists, _) in method_to_ranklists_scorelists.items():
    for test_qid, ranklist in zip(test_query_qids_list, ranklists):
        if test_qid in QUESTIONS_TO_HANDEVAL:
            scores = []
            for candidate_qid in ranklist[:HANDEVAL_RANK_THRESHOLD]:
                if candidate_qid not in test_qid_to_candidate_qid_to_scores[test_qid]:
                    # unlabelled candidate: it is given a score of 1 and counted
                    scores.append(1)
                    print(method_name, len(scores))
                    count_out_of_eval += 1
                else:
                    scores.append(test_qid_to_candidate_qid_to_scores[test_qid][candidate_qid])
            ref = test_qid_to_ideal_scores[test_qid]
            ndcg_at_k = calculate_ndcg_at_k(scores, ref)
            method_to_ndcg_score[method_name].append(ndcg_at_k)

count_out_of_eval

In [None]:
# Report, per method, the mean NDCG and the individual values (full-evaluation mode only).
if EVALUATING:
    for method_name, scores in method_to_ndcg_score.items():
        labelled_scores = scores[3:]  # first three are not labelled
        print(method_name)
        mean_ndcg = sum(labelled_scores)/len(labelled_scores)
        print(f"{mean_ndcg:.5f}")
        print(" ".join(f"{x:.2f}" for x in labelled_scores))
        print()

# Indexing and Querying of Unseen Questions

This is the Graphical User Interface that we are presenting

In [None]:
def index_unseen_question(unseen_question_text_list):
    """Add unseen questions to every index used by the retrieval methods.

    Each question receives a new qid and is registered in `qid_to_question`,
    the two word-embedding tables (`qid_to_vec`, `qid_to_vec2`), the
    sentence-embedding table (`sentence_vectors`) and the term statistics
    (`qid_to_tokens`, `token_to_qids`, `tf`, `df`, `L`).

    Args:
        unseen_question_text_list: list of question strings. A single string
            is treated as a one-element list.

    The qids are `time.time()` floats. Consecutive calls to `time.time()` can
    return the same value, which would give several questions the same qid and
    make them overwrite each other, so a colliding qid is moved to the next
    representable float until it is unique.
    """
    if isinstance(unseen_question_text_list, str):
        # a bare string would otherwise be indexed character by character
        unseen_question_text_list = [unseen_question_text_list]

    unseen_sentence_vectors = model_tf.encode(unseen_question_text_list, show_progress_bar=True)

    qids_new = []
    for _ in unseen_question_text_list:
        qid_new = time.time()
        while qid_new in qid_to_question or qid_new in qids_new:
            qid_new = float(np.nextafter(qid_new, np.inf))
        qids_new.append(qid_new)

    for qid_new, unseen_sentence_vector, unseen_question_text in zip(qids_new, unseen_sentence_vectors, unseen_question_text_list):
        qid_to_question[qid_new] = unseen_question_text

        # compute and update word embedding
        token_list = tokenise_then_spellcheck(unseen_question_text)
        qid_to_vec[qid_new] = to_vec(token_list)
        qid_to_vec2[qid_new] = to_vec2(token_list)

        # update sentence embedding
        sentence_vectors[qid_new] = unseen_sentence_vector

    # update tf-idf: compute the statistics of the new questions only, then merge
    # them into the global structures
    qid_to_tokens_, token_to_qids_, tf_, df_, L_  = preprocess_vsm(qids_new)
    for qid in qid_to_tokens_:
        qid_to_tokens[qid] = qid_to_tokens_[qid]
    for token in token_to_qids_:
        token_to_qids[token].update(token_to_qids_[token])
    for token in tf_:
        for qid in tf_[token]:
            tf[token][qid] += tf_[token][qid]
    for token in df_:
        df[token] += df_[token]
    for qid in L_:
        L[qid] = L_[qid]

In [None]:
def query_unseen_question(unseen_question_text, method):
    """Run `method` on a question that is not in the dataset and show the results.

    The text is registered in `qid_to_question` under a new qid taken from
    `time.time()`; `method` is called with that qid and its ranklist and
    scores are passed to `show_sample_query_results`.
    """
    temporary_qid = time.time()
    qid_to_question[temporary_qid] = unseen_question_text

    ranklist_and_scores = method(temporary_qid)
    show_sample_query_results(temporary_qid, *ranklist_and_scores)

## Choose your query method

The following cell is the list of methods tested in this repository.

Uncomment the line for the method that you want to use.

In [None]:
# Retrieval method used by the query cells below; exactly one line should be uncommented.
# method = method_random_guess
# method = method_overlapping_root_word_count  # method 0
# method = method_boolean
# method = method_tf_idf  # method 1
# method = method_bm25  # method 2
# method = method_unigram  # method 3
# method = method_spacy_embedding_similarity
# method = method_spacy_embedding_similarity_lg  # method 4
# method = method_wordmover_distance_glovewiki50
method = method_sentence_vector  # method 5
# method = method_supervised_model_logr  # method 6
# method = method_supervised_model_lgbm

## Query an unseen question

Write the question you want to query in the following cell.

You will see the top results retrieved, the score according to the retrieval method, and the retrieved question string.

In [None]:
# Query before the related questions have been indexed.
query_unseen_question("Why are computer screens dark in color?", method=method)

## Index unseen questions

Now we index two questions of a similar meaning to the queried question.

In [None]:
# Add two paraphrases of the query to all indexes.
index_unseen_question([
    "Why are computer screens black when unpowered?",
    "Why are computer screens manufactured black?"])

## Query a question related to indexed questions

We make the same query and see that it manages to retrieve the added questions at a high ranking.

In [None]:
# Same query again, now that the two related questions are indexed.
query_unseen_question("Why are computer screens dark in color?", method=method)

Run the following cell if you want to reset the indexes.

In [None]:
# Restore the term-statistics structures from their saved originals (deep copies, so the
# originals stay untouched). qid_to_question and the embedding tables are not reset here.
qid_to_tokens, token_to_qids = deepcopy(qid_to_tokens_original), deepcopy(token_to_qids_original)
tf, df, L = deepcopy(tf_original), deepcopy(df_original), deepcopy(L_original)