In [None]:
import os
import progressbar
import codecs
import spacy
import en_core_web_sm
from spacy.attrs import ORTH
import re
import string

import nltk
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize 
import nltk.data

from time import time
import random
import multiprocessing as mp

import tqdm
from tqdm import tqdm
import pandas as pd
import csv 
from csv import reader
import ast

In [None]:
# Root directory that is walked recursively for prior-case (citation) ".txt" files.
# The value is a placeholder: replace it with a real path before running.
path_prior_cases = "local directory path for preprocessed citation Database"

In [None]:
from nltk.stem.porter import PorterStemmer
# Single shared Porter stemmer instance, used by stemming_tokenizer below.
porter_stemmer = PorterStemmer()
import re # regular expression


def stemming_tokenizer(str_input):
    """Tokenize ``str_input`` into lower-cased, Porter-stemmed words.

    Every character that is not an ASCII letter or a hyphen is treated as a
    separator, so digits and punctuation never reach the stemmer.

    Args:
        str_input: Raw text to tokenize.

    Returns:
        List of stemmed tokens, in their original order.
    """
    # Swap the pattern for r"[^A-Za-z0-9\-]" to keep digits as well.
    letters_only = re.sub(r"[^A-Za-z\-]", " ", str_input)
    tokens = letters_only.lower().split()
    return [porter_stemmer.stem(token) for token in tokens]

Reference: see the [sklearn.feature_extraction.text.TfidfVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html) documentation to check and adjust the vectorizer options used below.

In [None]:
""" Implementation of OKapi BM25 with sklearn's TfidfVectorizer
"""

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse


class BM25(object):
    """Okapi BM25 scorer built on top of sklearn's ``TfidfVectorizer``.

    The vectorizer provides the vocabulary and the IDF weights; raw term
    counts are obtained by calling the ``transform`` of its parent class
    (``CountVectorizer``) directly.

    Args:
        b: Weight of the document-length normalisation term.
        k1: Term-frequency saturation parameter.
    """

    def __init__(self, b=0.7, k1=1.6):
        # Unigram vocabulary; terms occurring in more than 65% of the
        # documents are discarded. An alternative configuration that was
        # tried: tokenizer=stemming_tokenizer, max_df=.90, min_df=1,
        # stop_words='english', use_idf=True, ngram_range=(2, 2).
        self.vectorizer = TfidfVectorizer(max_df=.65, min_df=1,
                                          use_idf=True,
                                          ngram_range=(1, 1))

        self.b = b
        self.k1 = k1

        # Cache of the fitted corpus so that transform() does not have to
        # re-tokenize and re-count it for every single query.
        self._fit_docs = None
        self._fit_counts = None
        self._fit_lengths = None

    def _count(self, docs):
        """Return the raw term-count matrix of ``docs`` (no TF-IDF weighting)."""
        return super(TfidfVectorizer, self.vectorizer).transform(docs)

    def fit(self, X):
        """Fit vocabulary and IDF to documents ``X`` and record their average length.

        The count matrix and per-document lengths of ``X`` are cached; they
        are reused by ``transform`` when it receives the very same object.
        Call ``fit`` again if ``X`` is modified in place afterwards.
        """
        self.vectorizer.fit(X)
        y = self._count(X)
        self.avdl = y.sum(1).mean()

        self._fit_docs = X
        self._fit_lengths = y.sum(1).A1
        # csc layout makes the per-query column slicing in transform() cheap
        self._fit_counts = y.tocsc()

    def transform(self, q, X):
        """Calculate BM25 between query ``q`` and documents ``X``.

        Args:
            q: Query text. Each distinct in-vocabulary term contributes once;
                repeated query terms do not add extra weight.
            X: Iterable of document texts to score. Passing the object that
                was given to ``fit`` reuses the cached counts.

        Returns:
            1-D numpy array with one BM25 score per document in ``X``.
        """
        b, k1, avdl = self.b, self.k1, self.avdl

        if X is self._fit_docs and self._fit_counts is not None:
            # same corpus object as in fit(): skip the expensive re-vectorization
            X = self._fit_counts
            len_X = self._fit_lengths
        else:
            # apply CountVectorizer
            X = self._count(X)
            len_X = X.sum(1).A1
            # convert to csc for better column slicing
            X = X.tocsc()

        q, = self._count([q])
        assert sparse.isspmatrix_csr(q)

        # keep only the columns of terms that occur in the query
        X = X[:, q.indices]
        denom = X + (k1 * (1 - b + b * len_X / avdl))[:, None]
        # sklearn adds 1 to its idf values, so subtract 1 to recover the
        # plain log-ratio form that BM25 expects
        idf = self.vectorizer._tfidf.idf_[None, q.indices] - 1.
        numer = X.multiply(np.broadcast_to(idf, X.shape)) * (k1 + 1)
        return (numer / denom).sum(1).A1


# create corpus for prior cases

In [None]:
# Recursively collect every prior-case text file below `path_prior_cases`.
# The trailing comma matters: (".txt") is just a string, (".txt",) is a tuple,
# which is what str.endswith expects when several suffixes are allowed.
my_suffixes = (".txt",)
citation_file_paths = [
    os.path.join(root, file_name)
    for root, _dirs, file_names in os.walk(path_prior_cases)
    for file_name in file_names
    if file_name.endswith(my_suffixes)
]

In [None]:
# Load every prior case, in sorted path order, so that corpus[i] and
# citation_names[i] always refer to the same document.
name_dict = {}  # full document text -> file name (identical texts overwrite each other)
corpus = []
citation_names = []
for file_path in sorted(citation_file_paths):
    # `with` closes the handle even if reading fails; undecodable bytes are dropped
    with codecs.open(file_path, "r", "utf-8", errors='ignore') as handle:
        text = handle.read()
    file_name = os.path.basename(file_path)
    corpus.append(text)
    citation_names.append(file_name)
    name_dict[text] = file_name

In [None]:
# Sanity check: number of prior-case documents that were loaded.
len(corpus)

# create a query corpus

In [None]:
# Root directory that is walked recursively for query-case ".txt" files.
# The value is a placeholder: replace it with a real path before running.
path_current_cases = "local directory path for preprocessed Query Database"

In [None]:
# Recursively collect every query text file below `path_current_cases`.
# The trailing comma matters: (".txt") is just a string, (".txt",) is a tuple,
# which is what str.endswith expects when several suffixes are allowed.
my_suffixes = (".txt",)
query_file_paths = [
    os.path.join(root, file_name)
    for root, _dirs, file_names in os.walk(path_current_cases)
    for file_name in file_names
    if file_name.endswith(my_suffixes)
]

In [None]:
query_corpus = []
query_names = []

# Placeholders that mark a removed citation inside a query document.
SUPPRESSION_MARKERS = ("CITATION_SUPPRESSED", "FRAGMENT_SUPPRESSED", "REFERENCE_SUPPRESSED")
# Number of sentences kept before and after each sentence containing a marker.
CONTEXT_WINDOW = 3


def _merge_short_lines(raw_lines):
    """Glue short lines onto their neighbour to undo wrongly split sentences.

    A first line shorter than 100 characters is joined with the second line;
    every later line of at most 100 characters is appended to the previous
    entry. Returns an empty list for an empty document.
    """
    if not raw_lines:
        return []
    if len(raw_lines[0]) < 100 and len(raw_lines) > 1:
        merged = [raw_lines[0] + " " + raw_lines[1]]
        remaining = raw_lines[2:]
    else:
        merged = [raw_lines[0]]
        remaining = raw_lines[1:]
    for line in remaining:
        if len(line) < 101:
            merged[-1] += " " + line
        else:
            merged.append(line)
    return merged


def _citation_context(sentences, window=CONTEXT_WINDOW):
    """Return each marker sentence together with `window` sentences on both sides.

    Windows of neighbouring markers may overlap, in which case the shared
    sentences are repeated.
    """
    context = []
    for idx, sentence in enumerate(sentences):
        if any(marker in sentence for marker in SUPPRESSION_MARKERS):
            context += sentences[max(0, idx - window):idx + window + 1]
    return context


# iterate through the query files in sorted order so that query_corpus[i]
# and query_names[i] always refer to the same document
for file in tqdm(sorted(query_file_paths), desc="query documents"):
    # `with` closes the file even if reading or decoding fails
    with open(file, 'r', encoding="utf-8") as open_file:
        text = open_file.read()

    str_list = _merge_short_lines(text.splitlines())
    # join with a space: an empty separator would fuse the last word of one
    # sentence with the first word of the next into a single bogus token
    query_corpus.append(" ".join(_citation_context(str_list)))
    query_names.append(os.path.basename(file))

In [None]:
# Sanity check: number of query documents that were processed.
len(query_corpus)

In [None]:
#STORE ACTUAL NUMBER OF CITATIONS IN DICTIONARY
golden_citations = {}  # value of column 1 -> parsed literal from column 3 (the true citations)
golden = {}            # value of column 1 -> integer from column 2 (number of citations)


# newline='' is what the csv module documents for files handed to csv.reader
with open("Give path reference to Golden Citation CSV file for Task-1", 'r', newline='') as actual_csv:
    for row in reader(actual_csv):
        # skip blank lines and the header row
        if not row or row[1] == 'current case':
            continue
        golden[row[1]] = int(row[2])
        # column 3 holds a Python literal; literal_eval parses it without executing code
        golden_citations[row[1]] = ast.literal_eval(row[3])
# no explicit close() needed: the with-block has already closed the file

In [None]:
# Display the golden-citation mapping read from the CSV (prints the whole dict).
golden_citations

In [None]:
# Display the per-query citation counts read from the CSV (prints the whole dict).
golden

In [None]:
# Build the scorer with its default b / k1 and fit the vocabulary, IDF weights
# and average document length on the prior-case corpus.
bm25 = BM25()
bm25.fit(corpus)

In [None]:
score_dict = {}        # query file name -> BM25 score of every corpus document
prediction_dict = {}   # query file name -> corpus indices ordered best-first
pred_columns = ['Documend id','No of Golden Citations','Min BM25 Sim Value in TOP R','Actual Citations','Prediction List']
pred_rows = []

for qu, qu_n in tqdm(zip(query_corpus, query_names), total=len(query_corpus)):
    # number of golden citations for this query (KeyError if the query is not in the CSV)
    R = golden[qu_n]

    doc_scores = bm25.transform(qu, corpus)
    score_dict[qu_n] = doc_scores

    # argsort is ascending; reverse it to rank the best-scoring documents first
    ranked = np.argsort(doc_scores)[::-1]
    prediction_dict[qu_n] = ranked

    # score of the R-th ranked document, i.e. the lowest score inside the top R
    min_tup = doc_scores[ranked[R-1]]

    predictions = [citation_names[case] for case in ranked]

    pred_rows.append({'Documend id':qu_n,'No of Golden Citations':R,'Min BM25 Sim Value in TOP R':min_tup,'Actual Citations':golden_citations.get(qu_n),'Prediction List':predictions})

# Build the frame once: DataFrame.append was removed in pandas 2.0 and copied
# the whole frame on every call.
pred_df = pd.DataFrame(pred_rows, columns=pred_columns)

In [None]:
# Display the prediction table (one row per query document).
pred_df

In [None]:
# Sanity check: number of distinct query names that received a ranking.
len(prediction_dict)

In [None]:
# Persist the prediction table; the path is a placeholder to be replaced before running.
pred_df.to_csv("Give path to save the prediction CSV file")

In [None]:
# Remember the current working directory (no directory is created here).
cwd = os.getcwd()

In [None]:
#Precision@K Function
def prec_at_k(true_list,pred_list,k):
    """Precision@k: share of the first ``k`` predictions that are relevant.

    Args:
        true_list: Collection of relevant document ids.
        pred_list: Ranked predictions, best first.
        k: Cut-off rank. The result is always divided by ``k``, even when
            fewer than ``k`` predictions are available.

    Returns:
        Precision as a float; 0.0 when ``k`` is not positive (for example
        Prec@R for a query without any golden citation).
    """
    if k <= 0:
        # avoids ZeroDivisionError when there is nothing to retrieve
        return 0.0
    hits = sum(1 for doc in pred_list[:k] if doc in true_list)
    return hits / k

In [None]:
#Recall@K Function
def recall_at_k(true_list,pred_list,k,r):
    """Recall@k: relevant documents found in the first ``k`` predictions, divided by ``r``.

    Args:
        true_list: Collection of relevant document ids.
        pred_list: Ranked predictions, best first.
        k: Cut-off rank.
        r: Total number of relevant documents for the query.

    Returns:
        Recall as a float; 0.0 when ``r`` is not positive (no relevant
        documents exist, so none can be recalled).
    """
    if r <= 0:
        # avoids ZeroDivisionError for queries without golden citations
        return 0.0
    hits = sum(1 for doc in pred_list[:k] if doc in true_list)
    return hits / r

In [None]:
#Average Precision Function
def AP(true_list,pred_list):
    """Average precision of a ranked prediction list.

    Walks the ranking once; at every relevant document the precision at that
    rank (relevant documents seen so far / rank) is added to a running sum.
    The sum is averaged over the relevant documents that actually occur in
    ``pred_list``, not over ``len(true_list)``.

    Args:
        true_list: Collection of relevant document ids.
        pred_list: Ranked predictions, best first.

    Returns:
        The average precision, or 0 when no relevant document is ranked.
    """
    hits = 0
    precision_sum = 0
    for position, candidate in enumerate(pred_list, start=1):
        if candidate in true_list:
            hits += 1
            # precision at this rank
            precision_sum += hits / position
    # boundary case: nothing relevant was found
    if hits == 0:
        return 0
    return precision_sum / hits


In [None]:
#Reciprocal Rank Function
def RR(true_list,pred_list):
    """Reciprocal rank of the first relevant document in ``pred_list``.

    Args:
        true_list: Collection of relevant document ids.
        pred_list: Ranked predictions, best first.

    Returns:
        ``1 / rank`` of the first relevant prediction (ranks start at 1), or
        0.0 when no prediction is relevant. Returning a number instead of an
        implicit ``None`` keeps the value usable in numeric summaries.
    """
    for rank, candidate in enumerate(pred_list, start=1):
        if candidate in true_list:
            return 1 / rank
    return 0.0

## Get all the results based on the Golden citation list

In [None]:
#define the dataframe for results
# Evaluate every query against its golden citation list.
result_columns = ['Document id','Prec@1','Prec@5','Prec@10','Prec@R','Recall@100','AP','RR']
result_rows = []
# iterate by position, because the rows are read with positional iloc below
for i in tqdm(range(len(pred_df))):
    # column 0 holds the query file name, column 4 the ranked prediction list
    query_case = pred_df.iloc[i,0]
    true_list = golden_citations.get(query_case)
    # R = number of golden citations of this query
    r = len(true_list)
    pred_list = pred_df.iloc[i,4].copy()

    result_rows.append({
        'Document id': query_case,
        'Prec@1': prec_at_k(true_list,pred_list,1),
        'Prec@5': prec_at_k(true_list,pred_list,5),
        'Prec@10': prec_at_k(true_list,pred_list,10),
        'Prec@R': prec_at_k(true_list,pred_list,r),
        'Recall@100': recall_at_k(true_list,pred_list,100,r),
        'AP': AP(true_list,pred_list),
        'RR': RR(true_list,pred_list),
    })

# Build the frame once: DataFrame.append was removed in pandas 2.0 and copied
# the whole frame on every call.
results = pd.DataFrame(result_rows, columns=result_columns)

In [None]:
# Number of queries for which not every golden citation appears in the top 100.
len(results[results['Recall@100']<1]) #print number of rows whose recall score is less than 1

In [None]:
# Persist the per-query metrics; the path is a placeholder to be replaced before running.
results.to_csv("save the results in CSV file for future reference")


In [None]:
# Summary statistics of the per-query metrics.
results.describe()

Set the path to the segment CSV file. The file should contain the following columns:
* Document ID	
* FRAGMENT_SUPPRESSED
* REFERENCE_SUPPRESSED
* CITATION_SUPPRESSED
* Golden_Citations
* Difference (#FRAGMENT_SUPPRESSED + #REFERENCE_SUPPRESSED - #Golden_Citations)

In [None]:
# Location of the segment CSV; the value is a placeholder to be replaced before running.
segment_csv_path = "path to the segment CSV file"

In [None]:
# Load the segment CSV, using its first column as the index, and display it.
df_segment_csv = pd.read_csv(segment_csv_path, index_col=0)
df_segment_csv

In [None]:
#Micro Precision Function
def micro_prec(true_list,pred_list,k):
    """Return the raw counts needed to accumulate micro-averaged precision.

    Args:
        true_list: Collection of relevant document ids.
        pred_list: Ranked predictions, best first.
        k: Number of top predictions that count as retrieved.

    Returns:
        Tuple ``(correct, k)``: how many of the first ``k`` predictions are in
        ``true_list``, and ``k`` itself as the number of retrieved cases.
    """
    retrieved = pred_list[0:k]
    correct = sum(1 for candidate in retrieved if candidate in true_list)
    return correct, k



In [None]:
#define the dataframe for results
# Evaluate every query with a fixed prediction budget and accumulate the raw
# counts needed for the micro-averaged scores.
result_columns = ['Document id','Prec@1','Prec@5','Prec@10','Prec@R','Recall@100','AP','RR','Correct_pred','Retrived_cases', 'Relevant_cases']
result_rows = []
correct_pred = 0     # correct predictions summed over all queries
retri_cases = 0      # retrieved (predicted) cases summed over all queries
relevant_cases = 0   # golden citations summed over all queries

# iterate by position, because the rows are read with positional iloc below
for i in tqdm(range(len(pred_df))):
    # column 0 holds the query file name, column 4 the ranked prediction list
    query_case = pred_df.iloc[i,0]
    true_list = golden_citations.get(query_case)
    n_relevant = len(true_list)

    # Number of cases predicted for each query (constant budget).
    # Alternatives:
    #   r = len(true_list)   # oracle: as many predictions as golden citations
    #   or derive r from the placeholder counts in the segment CSV:
    #     nf_FS = df_segment_csv.loc[df_segment_csv["Document ID"]==query_case, "#FRAGMENT_SUPPRESSED"].iloc[0]
    #     nf_RS = df_segment_csv.loc[df_segment_csv["Document ID"]==query_case, "#REFERENCE_SUPPRESSED"].iloc[0]
    #     r = nf_FS if nf_FS > 0 else nf_RS
    #     if r > 20: r = int(r/5)
    r = 5

    pred_list = pred_df.iloc[i,4].copy()
    prec_at_1 = prec_at_k(true_list,pred_list,1)
    prec_at_5 = prec_at_k(true_list,pred_list,5)
    prec_at_10 = prec_at_k(true_list,pred_list,10)
    prec_at_r = prec_at_k(true_list,pred_list,r)

    c_p, r_c = micro_prec(true_list,pred_list,r)
    correct_pred += c_p
    retri_cases += r_c
    relevant_cases += n_relevant

    # recall is normalised by the number of relevant cases, not by the
    # prediction budget r (which would let the value exceed 1)
    recall_at_100 = recall_at_k(true_list,pred_list,100,n_relevant)
    ap = AP(true_list,pred_list)
    rr = RR(true_list,pred_list)

    result_rows.append({'Document id':query_case, 'Prec@1': prec_at_1, 'Prec@5': prec_at_5 , 'Prec@10': prec_at_10, 'Prec@R': prec_at_r, 'Recall@100': recall_at_100, 'AP': ap, 'RR': rr, 'Correct_pred':c_p, 'Retrived_cases':r_c, 'Relevant_cases':n_relevant})

# Build the frame once: DataFrame.append was removed in pandas 2.0 and copied
# the whole frame on every call.
results = pd.DataFrame(result_rows, columns=result_columns)


In [None]:
# Display the per-query metrics including the micro-average counts.
results

In [None]:
# Summary statistics of the per-query metrics.
results.describe()

In [None]:
# Micro-averaged scores: pool the counts of all queries before dividing.
print("Correct Predictions: ", correct_pred)
print("Retrived Cases Predictions: ", retri_cases)
print("Relevant Cases: ", relevant_cases)

# Each ratio falls back to 0.0 when its denominator is zero (no retrieved
# cases, no relevant cases, or not a single correct prediction).
M_pre = correct_pred/retri_cases if retri_cases else 0.0
M_recall = correct_pred/relevant_cases if relevant_cases else 0.0
M_F = 2*M_pre*M_recall/ (M_pre + M_recall) if (M_pre + M_recall) else 0.0

print("Micro Precision: ", M_pre)
print("Micro Recall: ", M_recall)
print("Micro F-Measure: ", M_F)

In [None]:
# Number of queries whose R-th best BM25 score is below 100.
# NOTE(review): 100 is an unexplained threshold -- confirm its rationale.
len(pred_df[pred_df["Min BM25 Sim Value in TOP R"]<100])

In [None]:
# Distribution of the R-th best BM25 score across queries.
pred_df["Min BM25 Sim Value in TOP R"].describe()

In [None]:
# All R-th best BM25 scores in ascending order (prints one value per query).
sorted(pred_df["Min BM25 Sim Value in TOP R"])