In [None]:
import os
import progressbar
import codecs
import spacy
import en_core_web_sm
from spacy.attrs import ORTH
import re
import string
import pathlib 

import nltk
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize 
import nltk.data

from time import time
import random
import multiprocessing as mp

import tqdm
from tqdm import tqdm
import pandas as pd
import csv 
from csv import reader
import ast
import json

In [None]:
def custom_sentencizer(doc):
    """Pipeline component that only leaves sentence starts open after period tokens.

    For every token except the last two, the token that follows it is marked
    ``is_sent_start = False`` unless the current token begins or ends with a
    period.  After a period-bearing token the follower is still blocked when
    the document is not entirely lowercase and either the follower does not
    begin with an uppercase character or the token after the follower begins
    with a period.

    The first token and the final token of ``doc`` are never modified here:
    the scan stops two tokens early so that ``doc[idx + 2]`` is always valid.

    Args:
        doc: sequence of tokens exposing ``.text`` and a writable
            ``.is_sent_start`` (a spaCy ``Doc`` in this notebook).

    Returns:
        The same ``doc`` object, modified in place.
    """
    # When every token is lowercase, capitalisation cannot be used as a cue,
    # so every period-bearing token is left as a potential boundary.
    every_token_lower = all(tok.text.islower() for tok in doc)

    for idx, tok in enumerate(doc[:-2]):
        follower = doc[idx + 1]
        has_period = tok.text[0] == "." or tok.text[-1] == "."

        if not has_period:
            # No period here: the next token may not open a sentence.
            follower.is_sent_start = False
            continue

        if every_token_lower:
            continue

        if not follower.text[0].isupper() or doc[idx + 2].text[0] == '.':
            # Looks like an abbreviation or a run of dotted tokens rather
            # than a real sentence end.
            follower.is_sent_start = False

    return doc




def custom_splitter(text = None):
    """Build the spaCy pipeline used for legal text and optionally split ``text``.

    The ``en_core_web_sm`` model is loaded on every call, ``custom_sentencizer``
    is inserted before the parser, and a set of tokenizer special cases is
    registered for abbreviations common in the corpus.

    Args:
        text: optional raw text.  When omitted (``None``) only the configured
            pipeline is returned.

    Returns:
        ``nlp`` when ``text`` is ``None``; otherwise a tuple
        ``(sentences, nlp)`` where ``sentences`` is a list of sentence strings.
    """
    nlp = spacy.load("en_core_web_sm")
    nlp.add_pipe(custom_sentencizer, before = "parser")

    # Surface form -> ORTH registered as a single-token special case.
    # An earlier revision also mapped "v." and "vs." to "vs"; "Ltd.", "Pvt."
    # and "Corp." were noted by the author as further candidates.
    special_cases = {
        "Rs.": "rs.",
        "No.": "no.",
        "no.": "no.",
        "i.e.": "i.e.",
        "viz.": "viz.",
        "M/s.": "m/s.",
        "Mohd.": "mohd.",
        "Ex.": "exhibit",
        "Art.": "article",
        "Arts.": "articles",
        "S.": "section",
        "s.": "section",
        "ss.": "sections",
        "u/s.": "section",
        "u/ss.": "sections",
        "art.": "article",
        "arts.": "articles",
        "u/arts.": "articles",
        "u/art.": "article",
        "hon'ble": "honourable",
        "ITO": "Ito",
        "UBI": "Ubi",
        "Ors.": "ors.",
    }
    for surface, replacement in special_cases.items():
        nlp.tokenizer.add_special_case(surface, [{ORTH: replacement}])

    if text is None:
        return nlp

    # Newlines are flattened to spaces before parsing.
    parsed = nlp(text.replace('\n', ' '))
    sentences = [span.text for span in parsed.sents]

    return sentences, nlp




class custom_tokenizer:
    """Word / sentence tokenizer backed by the pipeline from ``custom_splitter()``.

    Attributes:
        NLP: callable pipeline; ``NLP(text)`` must be iterable over tokens
            (with ``.text`` and ``.is_punct``) and expose ``.sents``.
        trans: ``str.translate`` table that turns ``.`` and ``-`` into spaces
            and deletes every other character in ``string.punctuation``.
    """

    def __init__(self):
        # Called without text, custom_splitter() returns only the pipeline.
        self.NLP = custom_splitter()
        puncts = string.punctuation.replace('.', '').replace('-', '')
        self.trans = str.maketrans('.-', '  ', puncts)

    def to_words(self, text):
        """Lower-case ``text`` and return its non-punctuation tokens as strings.

        Two-character tokens starting with an apostrophe (e.g. ``'s``) are kept
        verbatim; every other token is passed through ``self.trans`` and
        stripped, so a token may come back empty or contain an inner space.
        """
        text = re.sub('\n', ' ', text.lower())
        # Raw string: '\s' in a normal literal is an invalid escape sequence.
        text = re.sub(r'\s+', ' ', text).strip()

        words = []
        for token in self.NLP(text.strip()):
            if token.is_punct:
                continue
            if token.text[0] == "'" and len(token.text) == 2:
                words.append(token.text.lower())
            else:
                words.append(token.text.translate(self.trans).strip().lower())
        return words

    def to_sentences(self, text):
        """Split ``text`` into sentences, dropping those of 5 characters or fewer.

        Runs of two or three dots (optionally separated by whitespace) are
        collapsed to a single ``'. '`` and all whitespace is normalised to
        single spaces before the pipeline is applied.
        """
        # Collapse ellipses / repeated dots first (three-dot form, then two-dot).
        text = re.sub(r'\.\s*\.\s*\.', '. ', text)
        text = re.sub(r'\.\s*\.', '. ', text)

        # Normalise whitespace.
        text = re.sub('\n', ' ', text)
        text = re.sub(r'\s+', ' ', text).strip()

        return [s.text for s in self.NLP(text).sents if len(s.text.strip()) > 5]

    def to_cleaned_sents(self, text):
        """Return each sentence of ``text`` as space-joined cleaned words plus a final '.'."""
        sents = self.to_sentences(text)
        return [' '.join(self.to_words(s)) + '.' for s in sents]
        
       
        
       
        
class simple_tokenizer:
    """Dependency-free tokenizer that splits on whitespace and on periods."""

    def to_words(self, s):
        """Return the whitespace-separated words of ``s`` after trimming outer dots and blanks."""
        trimmed = s.strip()
        undotted = trimmed.strip('.')
        return undotted.strip().split()

    def to_sentences(self, s):
        """Split ``s`` on '.', keep fragments longer than 5 characters, and re-append the '.'."""
        kept = []
        for fragment in s.split('.'):
            body = fragment.strip()
            if len(body) > 5:
                kept.append(body + '.')
        return kept


In [None]:
# Shared tokenizer used by the cells below; constructing it calls
# custom_splitter(), which loads the spaCy model.
tokenizer = custom_tokenizer()

In [None]:
""" Implementation of OKapi BM25 with sklearn's TfidfVectorizer
Distributed as CC-0 (https://creativecommons.org/publicdomain/zero/1.0/)
"""

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse


class BM25(object):
    """BM25 scorer that reuses a TfidfVectorizer for vocabulary, counts and IDF.

    ``b`` and ``k1`` are the usual BM25 length-normalisation and
    term-frequency saturation parameters.
    """

    def __init__(self, b=0.7, k1=1.6):
        """Create the underlying vectorizer and store the BM25 parameters."""
        # Previously tried vectorizer configurations, kept for reference:
#         self.vectorizer = TfidfVectorizer(norm=None, smooth_idf=False)
#         self.vectorizer = TfidfVectorizer(tokenizer=stemming_tokenizer, 
#                                           max_df=.90, min_df=1,
#                                           stop_words='english', 
#                                           use_idf=True, 
#                                           ngram_range=(2, 2))
        # Unigrams only; terms occurring in more than 65% of documents are dropped.
        self.vectorizer = TfidfVectorizer(max_df=.65, min_df=1,
                                  use_idf=True, 
                                  ngram_range=(1, 1))
        
        self.b = b
        self.k1 = k1

    def fit(self, X):
        """Fit the vocabulary/IDF on documents ``X`` and record the average document length.

        Sets ``self.avdl`` to the mean, over documents, of the per-row sum of
        the count matrix.
        """
        self.vectorizer.fit(X)
        # Call the parent-class transform to get raw term counts rather than
        # tf-idf weights.
        y = super(TfidfVectorizer, self.vectorizer).transform(X)
        self.avdl = y.sum(1).mean()

    def transform(self, q, X):
        """Return one BM25 score per document in ``X`` for the query string ``q``.

        Must be called after ``fit`` (it reads ``self.avdl`` and the fitted IDF).
        """
        b, k1, avdl = self.b, self.k1, self.avdl

        # apply CountVectorizer (parent-class transform -> raw counts)
        X = super(TfidfVectorizer, self.vectorizer).transform(X)
        # Per-document length = sum of counts in each row.
        len_X = X.sum(1).A1
        # The query is transformed as a one-document batch and unpacked to its single row.
        q, = super(TfidfVectorizer, self.vectorizer).transform([q])
        assert sparse.isspmatrix_csr(q)

        # convert to csc for better column slicing; keep only the columns of
        # terms that occur in the query
        X = X.tocsc()[:, q.indices]
        # tf + k1 * (1 - b + b * |d| / avdl), broadcast over the query-term columns
        denom = X + (k1 * (1 - b + b * len_X / avdl))[:, None]
        # idf(t) = log [ n / df(t) ] + 1 in sklearn, so it needs to be converted
        # to idf(t) = log [ n / df(t) ] by subtracting 1
        # NOTE(review): the vectorizer above does not pass smooth_idf=False (the
        # first commented-out configuration did), so the exact idf formula may
        # differ from the one quoted here — confirm against sklearn's docs.
        idf = self.vectorizer._tfidf.idf_[None, q.indices] - 1.
        numer = X.multiply(np.broadcast_to(idf, X.shape)) * (k1 + 1)
        # Sum the per-term contributions for each document.
        return (numer / denom).sum(1).A1


# create corpus for prior cases

In [None]:
# Root folder of the preprocessed citation database; each entry is one query case.
db_path = "local directory path for preprocessed citation Database"

db_path_list = list(pathlib.Path(db_path).iterdir())

# Maps "<case>/entailed_fragment.txt" (the query file) to the sorted list of
# candidate files, each recorded as "<case>/paragraphs/<file name>".
dir_dict = {}

for case_path in tqdm(sorted(db_path_list)):
    # Skip macOS archive / Finder artefacts at the database root.
    if os.path.basename(case_path) in ("__MACOSX", ".DS_Store"):
        continue

    query_key = str(case_path/"entailed_fragment.txt")
    has_query_file = False
    candidate_paths = []

    # os.walk lists file names in arbitrary order, so candidates are collected
    # first and only registered once the whole case folder has been scanned.
    # (Indexing dir_dict while walking raised KeyError whenever a candidate
    # was listed before entailed_fragment.txt.)
    for _root, _dirs, file_names in os.walk(case_path):
        for file in file_names:
            if file == "entailed_fragment.txt":
                has_query_file = True
            elif file in ("base_case.txt", ".DS_Store"):
                continue
            else:
                candidate_paths.append(str(case_path/"paragraphs"/file))

    if has_query_file:
        dir_dict[query_key] = sorted(candidate_paths)
    elif candidate_paths:
        # Make malformed case folders visible instead of failing mid-scan.
        print("Skipping case without entailed_fragment.txt:", case_path)

In [None]:
# score_dict : case directory -> array of BM25 scores (same order as dir_dict[case])
# final_pred : case directory name -> candidate file names ranked best-first
score_dict = {}
prediction_dict = {}
count = 0
final_pred = {}


def _read_sentences_as_text(path):
    """Read ``path`` as UTF-8 (undecodable bytes ignored) and return its sentences joined by spaces."""
    # The context manager closes the handle; the previous code leaked one
    # open file per query and per candidate.
    with codecs.open(path, "r", "utf-8", errors='ignore') as handle:
        raw_text = handle.read()
    sentences = tokenizer.to_sentences(raw_text.replace('\n', ' '))
    # Join with a space: joining with '' fuses the last word of a sentence with
    # the first word of the next whenever the sentence does not end in punctuation.
    return ' '.join(sentences)


for case_path in tqdm(dir_dict.keys()):
    # Query text for this case.
    query_case = _read_sentences_as_text(case_path)

    # Candidate corpus for this case, with parallel list of file names.
    cite_corpus = []
    citation_names = []
    for cite_path in dir_dict[case_path]:
        cite_corpus.append(_read_sentences_as_text(cite_path))
        citation_names.append(os.path.basename(cite_path))

    # A fresh model per case: IDF and average length are fitted on this
    # case's candidates only.
    bm25 = BM25()
    bm25.fit(cite_corpus)

    qu_n = os.path.dirname(case_path)
    doc_scores = bm25.transform(query_case, cite_corpus)
    score_dict[qu_n] = doc_scores

    # argsort is ascending; reverse it to rank the highest score first.
    ranked_indices = np.argsort(doc_scores)[::-1]
    final_pred[os.path.basename(qu_n)] = [citation_names[idx] for idx in ranked_indices]


In [None]:
# One row per query case: its id and the candidate names ranked best-first.
pred_df = pd.DataFrame(list(final_pred.items()), columns = ["Document id", "Prediction List"])
# Reduce each id to its final path component.
pred_df['Document id'] = pred_df['Document id'].apply(os.path.basename)

In [None]:
pred_df

In [None]:
# Load the gold-standard citations: a JSON object keyed by query document id.
golden_citations = {}
with open("Give path reference to Golden Citation JSON file for Task-2", 'r') as gold_file:
    golden_citations = json.loads(gold_file.read())
golden_citations

In [None]:
# Count how many queries have exactly 1, 2, 3 or 4 gold citations.
size_histogram = {1: 0, 2: 0, 3: 0, 4: 0}
for k in golden_citations.keys():
    n_citations = len(golden_citations[k])
    if n_citations in size_histogram:
        size_histogram[n_citations] += 1
c1, c2, c3, c4 = (size_histogram[size] for size in (1, 2, 3, 4))
print(len(golden_citations.keys()), c1, c2 , c3, c4)

In [None]:
# Average number of gold citations per query, computed from the loaded data.
# (This was hard-coded as 461 + 57*2 + 4*3 + 3*4 over 525 queries, which goes
# stale as soon as the gold file changes or a query has more than 4 citations.)
tot = sum(len(cites) for cites in golden_citations.values())
avg = tot/len(golden_citations) if golden_citations else 0.0
avg

In [None]:
#Precision@K Function
def prec_at_k(true_list,pred_list,k):
    """Precision@k: fraction of the top-``k`` predictions that are relevant.

    Args:
        true_list: container of relevant document ids.
        pred_list: ranked predictions (any sliceable sequence), best first.
        k: cut-off rank.

    Returns:
        ``hits / k`` as a float.  The denominator is always ``k``, even when
        fewer than ``k`` predictions exist.  Returns ``0.0`` for ``k <= 0``
        (e.g. Prec@R for a query with no relevant documents) instead of
        raising ZeroDivisionError.
    """
    if k <= 0:
        return 0.0
    # Slicing already yields a new sequence, so no explicit copy is needed.
    hits = sum(1 for doc in pred_list[0:k] if doc in true_list)
    return hits/k


In [None]:
#Recall@K Function
def recall_at_k(true_list,pred_list,k,r):
    """Recall@k: relevant documents found in the top-``k`` predictions divided by ``r``.

    Args:
        true_list: container of relevant document ids.
        pred_list: ranked predictions (any sliceable sequence), best first.
        k: cut-off rank.
        r: denominator supplied by the caller; pass the total number of
            relevant documents to obtain a true recall value.

    Returns:
        ``hits / r`` as a float, or ``0.0`` when ``r <= 0`` (no relevant
        documents) instead of raising ZeroDivisionError.
    """
    if r <= 0:
        return 0.0
    hits = sum(1 for doc in pred_list[0:k] if doc in true_list)
    return hits/r


In [None]:
#Average Precision Function
def AP(true_list,pred_list):
    """Average precision of a ranked prediction list.

    Walks the ranking once; each time a relevant document is met at rank
    ``n`` the precision at that rank (relevant-so-far / n) is added to a
    running sum.  The result is that sum divided by the number of relevant
    documents that appear in ``pred_list``.

    Returns:
        The average precision, or ``0`` when no prediction is relevant.
    """
    relevant_seen = 0
    precision_sum = 0
    for rank, candidate in enumerate(pred_list, start=1):
        if candidate in true_list:
            relevant_seen += 1
            precision_sum += relevant_seen / rank

    # Boundary case: nothing relevant was retrieved.
    if relevant_seen == 0:
        return 0
    return precision_sum / relevant_seen


In [None]:
#Reciprocal Rank Function
def RR(true_list,pred_list):
    """Reciprocal rank: ``1 / rank`` of the first relevant prediction.

    Returns:
        ``1 / rank`` (1-based) of the first element of ``pred_list`` found in
        ``true_list``, or ``0.0`` when none is relevant.  Returning 0 rather
        than falling through with ``None`` keeps such queries in the mean
        (MRR) instead of silently dropping them as NaN, and matches ``AP``.
    """
    for rank, candidate in enumerate(pred_list, start=1):
        if candidate in true_list:
            return 1/rank
    return 0.0


## Get all the results based on the Golden citation list

In [None]:
#define the dataframe for results
# Per-query metrics are gathered as plain records and turned into a DataFrame
# once at the end: DataFrame.append was removed in pandas 2.0 and re-copied the
# whole frame on every iteration.
result_rows = []
for i in tqdm(range(len(pred_df))):
    # Positional access throughout (iloc), independent of the index labels.
    query_case = pred_df.iloc[i,0]
    # Direct indexing: a query missing from the gold file raises KeyError naming
    # the offending id instead of an opaque TypeError on len(None).
    true_list = golden_citations[query_case]
    r = len(true_list)
    pred_list = pred_df.iloc[i,1].copy()

    prec_at_1 = prec_at_k(true_list,pred_list,1)
    prec_at_5 = prec_at_k(true_list,pred_list,5)
    prec_at_10 = prec_at_k(true_list,pred_list,10)
    # Prec@R and recall are undefined when there are no relevant documents;
    # report 0 rather than dividing by zero.
    prec_at_r = prec_at_k(true_list,pred_list,r) if r else 0.0
    recall_at_100 = recall_at_k(true_list,pred_list,100,r) if r else 0.0

    ap = AP(true_list,pred_list)
    rr = RR(true_list,pred_list)

    result_rows.append({'Document id':query_case, 'Prec@1': prec_at_1, 'Prec@5': prec_at_5 , 'Prec@10': prec_at_10, 'Prec@R': prec_at_r, 'Recall@100': recall_at_100, 'AP': ap, 'RR': rr})

results = pd.DataFrame(result_rows, columns=['Document id','Prec@1','Prec@5','Prec@10','Prec@R','Recall@100','AP','RR'])

In [None]:
print(results.describe())

In [None]:
#Micro Precision Function
def micro_prec(true_list,pred_list,k):
    """Return the raw counts needed for micro-averaged precision.

    Returns:
        A tuple ``(correct, k)``: the number of the top-``k`` predictions
        that occur in ``true_list``, and ``k`` itself (the number of
        predictions counted as retrieved).
    """
    top_k_pred = pred_list[0:k].copy()
    cor_pred = sum(1 for doc in top_k_pred if doc in true_list)
    return cor_pred, k

In [None]:
#define the dataframe for results
# Second evaluation pass: same per-query metrics plus the raw counts needed for
# micro-averaged precision / recall.  Rows are collected in a list and converted
# once (DataFrame.append was removed in pandas 2.0 and was quadratic).
result_rows = []
correct_pred = 0
retri_cases = 0
relevant_cases = 0

for i in tqdm(range(len(pred_df))):
    # Positional access throughout (iloc), independent of the index labels.
    query_case = pred_df.iloc[i,0]
    # Direct indexing: a query missing from the gold file raises KeyError naming
    # the offending id instead of an opaque TypeError later on.
    true_list = golden_citations[query_case]
    n_relevant = len(true_list)

    # Number of predictions treated as "retrieved" for every query.
    # Use r = n_relevant instead to retrieve as many as there are gold citations.
    r = 1
    pred_list = pred_df.iloc[i,1].copy()

    prec_at_1 = prec_at_k(true_list,pred_list,1)
    prec_at_5 = prec_at_k(true_list,pred_list,5)
    prec_at_10 = prec_at_k(true_list,pred_list,10)
    prec_at_r = prec_at_k(true_list,pred_list,r)

    c_p, r_c = micro_prec(true_list,pred_list,r)
    correct_pred += c_p
    retri_cases += r_c
    relevant_cases += n_relevant

    # Recall must be divided by the number of relevant documents, not by the
    # fixed retrieval size r (which let the value exceed 1 for multi-citation
    # queries).  Guarded for queries with no relevant documents.
    recall_at_100 = recall_at_k(true_list,pred_list,100,n_relevant) if n_relevant else 0.0
    ap = AP(true_list,pred_list)
    rr = RR(true_list,pred_list)

    result_rows.append({'Document id':query_case, 'Prec@1': prec_at_1, 'Prec@5': prec_at_5 , 'Prec@10': prec_at_10, 'Prec@R': prec_at_r, 'Recall@100': recall_at_100, 'AP': ap, 'RR': rr, 'Correct_pred':c_p, 'Retrived_cases':r_c, 'Relevant_cases':n_relevant})

results = pd.DataFrame(result_rows, columns=['Document id','Prec@1','Prec@5','Prec@10','Prec@R','Recall@100','AP','RR','Correct_pred','Retrived_cases', 'Relevant_cases'])


In [None]:
results

In [None]:
(results.describe())

In [None]:
print("Correct Predictions: ", correct_pred)
print("Retrived Cases Predictions: ", retri_cases)
print("Relevant Cases: ", relevant_cases)

# Micro-averaged metrics over all queries.  Each ratio falls back to 0.0 when
# its denominator is zero (nothing retrieved, nothing relevant, or no correct
# prediction at all) instead of raising ZeroDivisionError.
M_pre = correct_pred/retri_cases if retri_cases else 0.0
M_recall = correct_pred/relevant_cases if relevant_cases else 0.0
M_F = 2*M_pre*M_recall/ (M_pre + M_recall) if (M_pre + M_recall) else 0.0

print("Micro Precision: ", M_pre)
print("Micro Recall: ", M_recall)
print("Micro F-Measure: ", M_F)
# Tab-separated summary line for pasting into a results table.
print(correct_pred, "\t", retri_cases, "\t", relevant_cases, "\t", M_pre, "\t", M_recall, "\t", M_F)