In [1]:
import re
import pandas as pd

In [2]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser
import io
import os

fp = open('clause.pdf', 'rb')
rsrcmgr = PDFResourceManager()
retstr = io.StringIO()
print(type(retstr))
codec = 'utf-8'
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
para_dict = {}
page_no = 0
for pageNumber, page in enumerate(PDFPage.get_pages(fp)):
    if pageNumber == page_no:
        interpreter.process_page(page)
        data = retstr.getvalue()
        data = data.split('\n\n')
        para_dict[page_no] = data
        retstr.truncate(0)
        retstr.seek(0)

    page_no += 1

<class '_io.StringIO'>


In [4]:
#filtering short sentences as they are mostly titles and sub headings. 
sentence_list = []
for key, values in list(para_dict.items()):
    for sentence in values:
        if len(sentence.split()) < 5:
            values.remove(sentence)
        else: 
            sentence = re.sub('\n|\x0c', ' ', sentence)
            sentence_list.append(sentence)
            
    para_dict[key] = sentence_list
    sentence_list = []

#### Questions 
* When will be the termination date?
* How are the payments made?


In [5]:
ques1 = "When will be the termination date?"

### Finding top 30 similar paragraphs for question 1 using Tfidf and cosine Similarity

#### *When will be the termination date?

In [6]:
import nltk, string
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('punkt') # if necessary...


stemmer = nltk.stem.porter.PorterStemmer()
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)

def stem_tokens(tokens):
    return [stemmer.stem(item) for item in tokens]

'''remove punctuation, lowercase, stem'''
def normalize(text):
    return stem_tokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map)))

vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')

def cosine_sim(text1, text2):
    tfidf = vectorizer.fit_transform([text1, text2])
    return ((tfidf * tfidf.T).A)[0,1]


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/syamprakash/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
import spacy
import en_core_web_lg
nlp = en_core_web_lg.load()

In [8]:
def spacy_similarity(ques, text):
    ques = nlp(ques)
    text = nlp(text)
    similarity = ques.similarity(text)
    return similarity

In [9]:
def spacy_similarity_generator(paragraph_dictionary,ques1):
    sim_score_list = {}
    sim_dict = {}
    for key, values in paragraph_dictionary.items():
        for sentence in values:
            similarity_score = spacy_similarity(ques1, sentence)
            if similarity_score > 0:
                sim_score_list[similarity_score] = sentence
        sim_dict[key] = sim_score_list
        sim_score_list = {}
        return sim_dict

In [10]:
def tfidf_cosine_similarity_generator(paragraph_dictionary,ques1):
    sim_score_list = {}
    sim_dict = {}
    for key, values in para_dict.items():
        for sentence in values:
            similarity_score = cosine_sim(ques1, sentence)
            if similarity_score > 0:
                sim_score_list[similarity_score] = sentence
        sim_dict[key] = sim_score_list
        sim_score_list = {}
    return sim_dict

### Setting up the Bert model pipeline

In [11]:
from transformers import BertForQuestionAnswering, AutoTokenizer
modelname = 'deepset/bert-base-cased-squad2'
model = BertForQuestionAnswering.from_pretrained(modelname)
tokenizer = AutoTokenizer.from_pretrained(modelname)

In [12]:
from transformers import pipeline
nlp = pipeline('question-answering', model=model, tokenizer=tokenizer, top_k = 10)


In [13]:
tfidf_sim_dict = tfidf_cosine_similarity_generator(para_dict,ques1)



In [14]:
key_values_list = []
for key, values in tfidf_sim_dict.items():
    if len(values.values())>0:
        page_number = key
        paragraph = list(values.values())[0]
        similarity_score = list(values.keys())[0]
        key_values_list.append([page_number,paragraph,similarity_score])

In [15]:
df = pd.DataFrame(key_values_list, columns = ['Page Number','Paragraph','Similarity Score'])
df['Length'] = df['Paragraph'].str.len() 

In [16]:
df['Start Logits'] = 0
for i in range(1,len(df)):
    df.loc[i,'Start Logits'] = df.loc[i-1,'Length']+df.loc[i-1,'Start Logits']

In [17]:
df['End Logits'] = df['Length'].cumsum()

In [18]:
df.tail()

Unnamed: 0,Page Number,Paragraph,Similarity Score,Length,Start Logits,End Logits
4,7,"awarded Vendor shall, at their sole expense, m...",0.072147,2383,4523,6906
5,27,552 CFR PART 200 ContractsContracts for more t...,0.064023,3003,6906,9909
6,38,82Texas Business and Commerce Code § 272 Requi...,0.017423,2593,9909,12502
7,40,89Felony Conviction NoticeTexas Education Code...,0.013465,2997,12502,15499
8,43,101CERTIFICATION PROHIBITING DISCRIMINATION AG...,0.017007,4906,15499,20405


In [19]:
text = ""
for key, values in tfidf_sim_dict.items():
    for sentences in values.values():
        text = text+". "+sentences

In [20]:
context = text
predicted_answers = nlp({
    'question': ques1,
    'context': context
})

In [21]:
predicted_answer = pd.DataFrame(predicted_answers)

In [86]:
predicted_answer

Unnamed: 0,score,start,end,answer
0,0.646293,4943,4955,one (1) year
1,0.519622,4943,4955,one (1) year
2,0.168845,734,802,the last day of the month of the month of the...
3,0.132105,738,802,last day of the month of the month of the ori...
4,0.12097,4943,4997,one (1) year from the effective date of ter...
5,0.097723,734,773,the last day of the month of the month
6,0.078935,4948,4955,1) year
7,0.076459,738,773,last day of the month of the month
8,0.054853,4943,4998,one (1) year from the effective date of ter...
9,0.044819,4931,4955,a period of one (1) year


The required answer here is "One(1) year". We can adjust topk according to our need to remove duplicates

##### To improve this feature we can join the paragraph data and bert output to find which paragraph lies in. Another way of enhancing this feature would be to backtrack to the PDF and highlight the portion where the answer lies using tools like PDF-plumber. 