In [None]:
#Necessary libraries
import os
import numpy as np
import pandas as pd
import pathlib
import shutil
from tqdm import tqdm 
import csv
from csv import reader
import json
import ast
import sys
import sent2vec
import datetime

import sklearn 
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Record the wall-clock time at which the notebook run started.
# NOTE(review): begin_time is not read again in the visible cells - presumably
# intended for an elapsed-time report; confirm or remove.
begin_time = datetime.datetime.now()

## [Sent2vec](https://github.com/epfml/sent2vec)

In [None]:
#Define Model
# Placeholder for the filesystem path of the pretrained sent2vec model; replace
# it before running. The value is handed to load_model() in the next cell.

Model_path = "path to the pretrained sent2vec model"

In [None]:
# Create the sent2vec wrapper object and load the pretrained model from Model_path.
sent2vec_model = sent2vec.Sent2vecModel()
sent2vec_model.load_model(Model_path)

# Print the model object and a success message between two banner lines.
banner = "*********"*8
for message in (banner, sent2vec_model, "Loading successful...", banner):
    print(message)

In [None]:
#CREATE QUERY DB
#define the path for query database
path = "local directory path for preprocessed Query Database"
#markers that identify a sentence containing a suppressed citation/reference
suppression_markers = ("CITATION_SUPPRESSED", "FRAGMENT_SUPPRESSED", "REFERENCE_SUPPRESSED")
#one record per document; the dataframe is built once after the loop because
#DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on every call
query_records = []
#create a sorted list of all the paths for the documents in the query database
path_list = sorted(pathlib.Path(path).iterdir())

#iterate through the query database list in sorted manner
for file in tqdm(path_list,desc = "query documents"):
    #read the file inside a context manager so the handle is closed even if reading fails
    with open(file, 'r', encoding="utf-8") as open_file:
        raw_str_list = open_file.read().splitlines()

    #Append short length sentences to previous sentence. (to avoid wrongly split sentences)
    str_list = []
    start_ind = 0
    if raw_str_list:
        #if the first line is shorter than 100 characters, merge it with the 2nd line and continue from the 3rd;
        #a file that has only one line keeps that line as-is instead of raising IndexError
        if len(raw_str_list[0])<100 and len(raw_str_list)>1:
            str_list.append(raw_str_list[0] + " " + raw_str_list[1])
            start_ind = 2
        else:
            str_list.append(raw_str_list[0])
            start_ind = 1

    for line in raw_str_list[start_ind:]:
        #a line of at most 100 characters is appended to the previous sentence
        #NOTE(review): the first line uses "< 100" while later lines use "< 101" - confirm which threshold is intended
        if len(line)<101:
            str_list[-1] += " " + line
        else:
            str_list.append(line)

    #keep every sentence that contains a suppression marker together with up to three
    #sentences before and after it; indices are collected in a set so that overlapping
    #windows do not embed the same sentence twice (get_score takes a maximum, so
    #duplicates could never change a score)
    context_idx = set()
    for i, sentence in enumerate(str_list):
        if any(marker in sentence for marker in suppression_markers):
            context_idx.update(range(max(0,i-3), min(len(str_list),i+4)))
    str_list_3 = [str_list[i] for i in sorted(context_idx)]

    #encode the selected sentences using the model (one vector per sentence);
    #a document without any marker ends up with an empty list
    sent_vec_list = [
        np.array(sent2vec_model.embed_sentence(line).flatten().tolist())
        for line in str_list_3
    ]

    #remember the document name together with its list of sentence vectors
    query_records.append({'Document ID':os.path.basename(file),'Document Vector':sent_vec_list})

#create the dataframe holding, for each query document, its list of sentence vectors
query_df = pd.DataFrame(query_records, columns=['Document ID','Document Vector'])

In [None]:
#CONVERT citation DB
#define the path for citation database
path = "local directory path for preprocessed citation Database"
#one record per document; the dataframe is built once after the loop because
#DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on every call
citation_records = []
#create a sorted list of all the paths for the documents in the citation database
path_list = sorted(pathlib.Path(path).iterdir())

#iterate through the citation database list in sorted manner
for file in tqdm(path_list,desc = "avg cite vect"):
    #read the file inside a context manager so the handle is closed even if reading fails
    with open(file, 'r', encoding="utf-8") as open_file:
        raw_str_list = open_file.read().splitlines()

    #Append short length sentences to previous sentence. (to avoid wrongly split sentences)
    str_list = []
    start_ind = 0
    if raw_str_list:
        #if the first line is shorter than 100 characters, merge it with the 2nd line and continue from the 3rd;
        #a file that has only one line keeps that line as-is instead of raising IndexError
        if len(raw_str_list[0])<100 and len(raw_str_list)>1:
            str_list.append(raw_str_list[0] + " " + raw_str_list[1])
            start_ind = 2
        else:
            str_list.append(raw_str_list[0])
            start_ind = 1

    for line in raw_str_list[start_ind:]:
        #a line of at most 100 characters is appended to the previous sentence
        #NOTE(review): the first line uses "< 100" while later lines use "< 101" - confirm which threshold is intended
        if len(line)<101:
            str_list[-1] += " " + line
        else:
            str_list.append(line)

    #encode every (merged) sentence of the document using the model, one vector per sentence
    sent_vec_list = [
        np.asarray(sent2vec_model.embed_sentence(line).flatten().tolist())
        for line in str_list
    ]

    #remember the document name together with its list of sentence vectors
    citation_records.append({'Document ID':os.path.basename(file),'Document Vector':sent_vec_list})

#create the dataframe holding, for each citation document, its list of sentence vectors
citation_df = pd.DataFrame(citation_records, columns=['Document ID','Document Vector'])

In [None]:
#STORE ACTUAL NUMBER OF CITATIONS IN DICTIONARY
#golden_citations: value of column 1 (query case) -> parsed list stored in column 3
#golden:           value of column 1 (query case) -> integer stored in column 2
golden_citations = {}
golden = {}

#newline='' is the mode recommended by the csv module so quoted fields with line breaks parse correctly
with open("Give path reference to Golden Citation CSV file for Task-1",'r',newline='') as actual_csv:
    #read the csv and iterate through it
    for row in reader(actual_csv):
        #skip blank lines and the header row
        if not row or row[1] == 'current case':
            continue
        #for each row, store the number of citations and true list of citations
        golden[row[1]] = int(row[2])
        #literal_eval parses the stored Python literal without executing arbitrary code
        golden_citations[row[1]] = ast.literal_eval(row[3])
#the with-block has already closed the file, so no explicit close() is needed

In [None]:
# Display the mapping from query case to its parsed gold citation list.
golden_citations

In [None]:
# Display the mapping from query case to its gold citation count.
golden

In [None]:
# Display the citation dataframe (document name and its list of sentence vectors).
citation_df

In [None]:
# Display the query dataframe (document name and its list of sentence vectors).
query_df

In [None]:
def get_score(x,y):
    """Return the largest cosine similarity between any row of ``x`` and any row of ``y``.

    Both arguments are passed unchanged to ``cosine_similarity``; the score is
    the maximum entry of the pairwise similarity matrix it returns.
    """
    pairwise = cosine_similarity(x,y)
    #the maximum of the row-wise maxima is the maximum of the whole matrix
    return pairwise.max()

In [None]:
# Predictions
#column layout of the prediction dataframe (names kept as they are written to the CSV)
pred_columns = ['Documend id','No of Golden Citations','Min Cosine Sim Value in TOP R','Actual Citations','Prediction List']
#one record per query; the dataframe is built once after the loop because
#DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on every call
pred_records = []
nan_score_files = []

#lookup table citation document name -> list of sentence vectors, built once so that
#citation_df does not have to be scanned for every (query, candidate) pair
cite_vec_by_id = dict(zip(citation_df["Document ID"], citation_df["Document Vector"]))

#search_space_dict (query case -> candidate citation names) is not defined in any visible cell -
#presumably it came from a removed cell. When it is absent, every document of the citation
#database is ranked instead, so the notebook still runs on a fresh kernel.
search_space = globals().get("search_space_dict")

#iterate through the query documents and their sentence vectors
for query_case, query_sentences in tqdm(zip(query_df['Document ID'], query_df['Document Vector']), total=len(query_df)):
    #define the query matrix (one row per sentence vector)
    query_vec = np.asarray(query_sentences)
    #pred_tup_list-> list of tuples of cosine values and citation name
    pred_tup_list = []
    #get the true number of citation as R
    R = golden[query_case]
    candidates = search_space[query_case] if search_space is not None else cite_vec_by_id.keys()
    #iterate through the candidate citation documents
    for cite_case in candidates:
        cite_vec = np.asarray(cite_vec_by_id[cite_case])
        #find the cosine similarity value; a document without any sentence vector cannot be
        #scored, so it gets NaN instead of crashing inside cosine_similarity
        if query_vec.size == 0 or cite_vec.size == 0:
            score = np.nan
        else:
            score = get_score(query_vec, cite_vec)

        if np.isnan(score):
            nan_score_files.append(query_case)

        #skip if cosine sim value == 1 (same document)
        #NOTE(review): this exact float comparison also drops any other candidate whose best
        #sentence pair scores exactly 1.0 - confirm that this is intended
        if score != 1.0:
            pred_tup_list.append((score,cite_case))

    #sort by cosine similarity, highest first; NaN does not order against numbers,
    #so NaN scores are explicitly ranked last
    pred_tup_list_sorted = sorted(
        pred_tup_list,
        key = lambda x: -np.inf if np.isnan(x[0]) else x[0],
        reverse=True,
    )

    #find the min similarity value amongst the top R citations predicted; slicing copes with
    #fewer than R candidates, and R == 0 (or no candidates) yields NaN
    top_r = pred_tup_list_sorted[:max(R,0)]
    min_score = top_r[-1][0] if top_r else np.nan

    #citation names in ranked order
    predictions = [name for _, name in pred_tup_list_sorted]

    #remember the necessary details for the dataframe
    pred_records.append({'Documend id':query_case,'No of Golden Citations':R,'Min Cosine Sim Value in TOP R':min_score,'Actual Citations':golden_citations.get(query_case),'Prediction List':predictions})

#define the dataframe storing the predictions
pred_df = pd.DataFrame(pred_records, columns=pred_columns)

In [None]:
# Number of distinct query cases that produced at least one NaN score.
len(set(nan_score_files))

In [None]:
# Display the prediction dataframe built in the previous cell.
pred_df

In [None]:
# Persist the predictions as CSV; replace the placeholder string with a real output path.
pred_df.to_csv("Give path to save the prediction CSV file")

In [None]:
#Precision@K Function
def prec_at_k(true_list,pred_list,k):
    """Return the fraction of the top ``k`` predictions that are relevant.

    Args:
        true_list: collection of relevant documents (membership is tested with ``in``).
        pred_list: ranked predictions, best first; any sliceable sequence.
        k: cut-off rank.

    Returns:
        ``hits / k`` as a float. The denominator is always ``k``, even when fewer
        than ``k`` predictions exist. Returns ``0.0`` when ``k <= 0`` instead of
        raising ZeroDivisionError.
    """
    #an empty cut-off contains no relevant predictions
    if k <= 0:
        return 0.0
    #count the relevant documents among the top k predictions
    hits = sum(1 for doc in pred_list[:k] if doc in true_list)
    #return total_relevant_predictions_in_top_k/k
    return hits/k


In [None]:
#Recall@K Function
def recall_at_k(true_list,pred_list,k,r):
    """Return the share of the ``r`` relevant documents found in the top ``k`` predictions.

    Args:
        true_list: collection of relevant documents (membership is tested with ``in``).
        pred_list: ranked predictions, best first; any sliceable sequence.
        k: cut-off rank.
        r: total number of relevant documents, used as the denominator.

    Returns:
        ``hits / r`` as a float, or ``0.0`` when ``r <= 0`` instead of raising
        ZeroDivisionError.
    """
    #without relevant documents there is nothing to recall
    if r <= 0:
        return 0.0
    #count the relevant documents among the top k predictions
    hits = sum(1 for doc in pred_list[:k] if doc in true_list)
    #return number of relevant documents in top k predictions/total number of relevant documents
    return hits/r


In [None]:
#Average Precision Function
def AP(true_list,pred_list):
    """Return the average precision of a ranked prediction list.

    The precision at each rank that holds a relevant document is summed and
    divided by the number of relevant documents found in ``pred_list``.
    Returns 0 when no prediction is relevant.
    """
    hits = 0            #relevant documents seen so far
    precision_sum = 0   #sum of the precision values at the relevant ranks
    for rank, doc in enumerate(pred_list, start=1):
        if doc in true_list:
            hits += 1
            #precision at this rank = relevant documents so far / documents inspected so far
            precision_sum += hits/rank
    #boundary case where no relevant document was found
    if hits == 0:
        return 0
    #return the Average Precision
    return precision_sum/hits


In [None]:
#Reciprocal Rank Function
def RR(true_list,pred_list):
    """Return 1/rank of the first relevant prediction, or 0 when none is relevant."""
    #walk down the ranking (ranks start at 1) and stop at the first relevant case
    for rank, doc in enumerate(pred_list, start=1):
        if doc in true_list:
            return 1/rank

    return 0


## Get all the results based on the Golden citation list

In [None]:
#column layout of the results dataframe
result_columns = ['Document id','Prec@1','Prec@5','Prec@10','Prec@R','Recall@100','AP','RR']
#one record per query; the dataframe is built once after the loop because
#DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on every call
result_records = []
#fetch the query name and its ranked predictions by column name rather than by position
for query_case, pred_list in tqdm(zip(pred_df['Documend id'], pred_df['Prediction List']), total=len(pred_df)):
    true_list = golden_citations.get(query_case)
    #R = number of gold citations of this query
    r = len(true_list)
    result_records.append({
        'Document id': query_case,
        'Prec@1': prec_at_k(true_list,pred_list,1),
        'Prec@5': prec_at_k(true_list,pred_list,5),
        'Prec@10': prec_at_k(true_list,pred_list,10),
        #a query without gold citations scores 0 instead of dividing by zero
        'Prec@R': prec_at_k(true_list,pred_list,r) if r else 0.0,
        'Recall@100': recall_at_k(true_list,pred_list,100,r) if r else 0.0,
        'AP': AP(true_list,pred_list),
        'RR': RR(true_list,pred_list),
    })

#define the dataframe for results
results = pd.DataFrame(result_records, columns=result_columns)

In [None]:
# Number of rows whose Recall@100 score is less than 1.
int((results['Recall@100'] < 1).sum())

In [None]:
# Persist the per-query metrics as CSV; replace the placeholder string with a real output path.
results.to_csv("save the results in CSV file for future reference")

In [None]:
# Summary statistics over the per-query metric columns.
results.describe()

Provide the path to the segment CSV file, which contains the following columns:
* Document ID	
* FRAGMENT_SUPPRESSED
* REFERENCE_SUPPRESSED
* CITATION_SUPPRESSED
* Golden_Citations
* Difference (#FRAGMENT_SUPPRESSED + #REFERENCE_SUPPRESSED - #Golden_Citations)

In [None]:
# Placeholder for the location of the segment CSV file; replace it before running.
segment_csv_path = "path to the segment CSV file"

In [None]:
# Load the segment CSV; index_col=0 makes the first CSV column the row index.
df_segment_csv = pd.read_csv(segment_csv_path, index_col=0)
# Display the loaded frame.
df_segment_csv

In [None]:
#Micro Precision Function
def micro_prec(true_list,pred_list,k):
    """Return the raw counts needed for micro-averaged precision.

    Returns:
        A tuple ``(correct, k)``: the number of the top ``k`` predictions that
        occur in ``true_list``, and the cut-off ``k`` itself.
    """
    #number of relevant documents among the top k predictions
    hits = sum(1 for doc in pred_list[0:k] if doc in true_list)
    #return the numerator and the denominator separately so callers can accumulate them
    return hits, k



In [None]:
#column layout of the results dataframe
result_columns = ['Document id','Prec@1','Prec@5','Prec@10','Prec@R','Recall@100','AP','RR','Correct_pred','Retrived_cases', 'Relevant_cases']
#one record per query; the dataframe is built once after the loop because
#DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on every call
result_records = []
#running totals for the micro-averaged scores
correct_pred = 0
retri_cases = 0
relevant_cases = 0

#fetch the query name and its ranked predictions by column name rather than by position
for query_case, pred_list in tqdm(zip(pred_df['Documend id'], pred_df['Prediction List']), total=len(pred_df)):
    true_list = golden_citations.get(query_case)
    #number of relevant (gold) documents of this query
    n_relevant = len(true_list)

    #r = how many top-ranked predictions are counted as retrieved for this query
    r = 5 # for constant predictions for each query case

# alternative: predict based on the number of FRAGMENT_SUPPRESSED or REFERENCE_SUPPRESSED mentioned in the query documents
# (the segment CSV lookups are only needed for this variant, so they are kept here instead of running unused)
#     nf_FS = df_segment_csv.loc[df_segment_csv["Document ID"]==query_case, "#FRAGMENT_SUPPRESSED"].iloc[0]
#     nf_RS = df_segment_csv.loc[df_segment_csv["Document ID"]==query_case, "#REFERENCE_SUPPRESSED"].iloc[0]
#     if nf_FS > 0:
#         r = nf_FS
#     else:
#         r = nf_RS
#     if r > 20:
#         r = int(r/5)

# alternative: predict based on the actual number of citations in the given golden citation file
#     r = len(true_list)

    prec_at_1 = prec_at_k(true_list,pred_list,1)
    prec_at_5 = prec_at_k(true_list,pred_list,5)
    prec_at_10 = prec_at_k(true_list,pred_list,10)
    prec_at_r = prec_at_k(true_list,pred_list,r) if r else 0.0

    #accumulate the counts for the micro-averaged precision / recall
    c_p, r_c = micro_prec(true_list,pred_list,r)
    correct_pred += c_p
    retri_cases += r_c
    relevant_cases += n_relevant

    #recall is divided by the number of relevant documents, not by the prediction cut-off r
    recall_at_100 = recall_at_k(true_list,pred_list,100,n_relevant) if n_relevant else 0.0
    ap = AP(true_list,pred_list)
    rr = RR(true_list,pred_list)
    #remember the details for the result dataframe
    result_records.append({'Document id':query_case, 'Prec@1': prec_at_1, 'Prec@5': prec_at_5 , 'Prec@10': prec_at_10, 'Prec@R': prec_at_r, 'Recall@100': recall_at_100, 'AP': ap, 'RR': rr, 'Correct_pred':c_p, 'Retrived_cases':r_c, 'Relevant_cases':n_relevant})

#define the dataframe for results
results = pd.DataFrame(result_records, columns=result_columns)


In [None]:
# Display the per-query metrics including the micro-average counts.
results

In [None]:
# Summary statistics over the per-query metric columns.
results.describe()

In [None]:
print("Correct Predictions: ", correct_pred)
print("Retrived Cases Predictions: ", retri_cases)
print("Relevant Cases: ", relevant_cases)

#micro-averaged scores over all queries; each ratio falls back to 0.0 when its
#denominator is zero (no retrieved cases, no relevant cases, or no correct prediction)
M_pre = correct_pred/retri_cases if retri_cases else 0.0
M_recall = correct_pred/relevant_cases if relevant_cases else 0.0
M_F = 2*M_pre*M_recall/(M_pre + M_recall) if (M_pre + M_recall) else 0.0

print("Micro Precision: ", M_pre)
print("Micro Recall: ", M_recall)
print("Micro F-Measure: ", M_F)
#tab-separated one-line summary of all six values
print(correct_pred, "\t", retri_cases, "\t", relevant_cases, "\t", M_pre, "\t", M_recall, "\t", M_F)

In [None]:
# Count the queries whose minimum top-R similarity is below 100.
# NOTE(review): the column holds cosine similarities, which cannot exceed 1, so this
# comparison is true for every non-NaN row - presumably a leftover threshold; confirm.
len(pred_df[pred_df["Min Cosine Sim Value in TOP R"]<100])

In [None]:
# Summary statistics of the minimum top-R similarity across all queries.
pred_df["Min Cosine Sim Value in TOP R"].describe()

In [None]:
# List the minimum top-R similarity values in ascending order.
sorted(pred_df["Min Cosine Sim Value in TOP R"])