In [14]:
import math
import numpy as np
import nltk
import re
import gensim
from gensim.parsing.preprocessing import remove_stopwords
from gensim import corpora
from sklearn.feature_extraction.text import TfidfVectorizer 
import heapq
!pip install PyPDF2
from PyPDF2 import PdfFileReader



file = open('Article 26 Increase of the loan.pdf','rb')
PDF_read = PdfFileReader(file)
txt=""
for i in range(PDF_read.numPages):
    page_temp = PDF_read.getPage(i)
    page_String= page_temp.extractText()
    txt=txt+ page_String
#class for preprocessing and creating word embedding
class Preprocessing:
    #constructor
    def __init__(self,txt):
        # Tokenization
        nltk.download('punkt')  #punkt is nltk tokenizer 
        # breaking text to sentences
        tokens = nltk.sent_tokenize(txt) 
        self.tokens = tokens
        self.tfidfvectoriser=TfidfVectorizer()

    # Data Cleaning
    # remove extra spaces
    # convert sentences to lower case 
    # remove stopword
    def clean_sentence(self, sentence, stopwords=False):
        sentence = sentence.lower().strip()
        sentence = re.sub(r'[^a-z0-9\s]', '', sentence)
        if stopwords:
            sentence = remove_stopwords(sentence)
        return sentence

    # store cleaned sentences to cleaned_sentences
    def get_cleaned_sentences(self,tokens, stopwords=False):
        cleaned_sentences = []
        for line in tokens:
            cleaned = self.clean_sentence(line, stopwords)
            cleaned_sentences.append(cleaned)
        return cleaned_sentences

    #do all the cleaning
    def cleanall(self):
        cleaned_sentences = self.get_cleaned_sentences(self.tokens, stopwords=True)
        cleaned_sentences_with_stopwords = self.get_cleaned_sentences(self.tokens, stopwords=False)
        # print(cleaned_sentences)
        # print(cleaned_sentences_with_stopwords)
        return [cleaned_sentences,cleaned_sentences_with_stopwords]

    # TF-IDF Vectorizer
    def TFIDF(self,cleaned_sentences):
        self.tfidfvectoriser.fit(cleaned_sentences)
        tfidf_vectors=self.tfidfvectoriser.transform(cleaned_sentences)
        return tfidf_vectors

    #tfidf for question
    def TFIDF_Q(self,question_to_be_cleaned):
        tfidf_vectors=self.tfidfvectoriser.transform([question_to_be_cleaned])
        return tfidf_vectors

    # main call function
    def doall(self):
        cleaned_sentences, cleaned_sentences_with_stopwords = self.cleanall()
        tfidf = self.TFIDF(cleaned_sentences)
        return [cleaned_sentences,cleaned_sentences_with_stopwords,tfidf]
  
class TS_SS:
    
    #cosine similarity
    def Cosine(self, question_vector, sentence_vector):
        dot_product = np.dot(question_vector, sentence_vector.T)
        denominator = (np.linalg.norm(question_vector) * np.linalg.norm(sentence_vector))
        return dot_product/denominator
    
    #Euclidean distance
    def Euclidean(self, question_vector, sentence_vector):
        vec1 = question_vector.copy()
        vec2 = sentence_vector.copy()
        if len(vec1)<len(vec2): vec1,vec2 = vec2,vec1
        vec2 = np.resize(vec2,(vec1.shape[0],vec1.shape[1]))
        return np.linalg.norm(vec1-vec2)
    
    # angle between two vectors
    def Theta(self, question_vector, sentence_vector):
        return np.arccos(self.Cosine(question_vector, sentence_vector)) + np.radians(10)
    
    # triangle formed by two vectors and ED as third side
    def Triangle(self, question_vector, sentence_vector):
        theta = np.radians(self.Theta(question_vector, sentence_vector))
        return ((np.linalg.norm(question_vector) * np.linalg.norm(sentence_vector)) * np.sin(theta))/2
    
    # difference in magnitude of two vectors
    def Magnitude_Difference(self, vec1, vec2):
        return abs((np.linalg.norm(vec1) - np.linalg.norm(vec2)))
    
    # sector area similarity
    def Sector(self, question_vector, sentence_vector):
        ED = self.Euclidean(question_vector, sentence_vector)
        MD = self.Magnitude_Difference(question_vector, sentence_vector)
        theta = self.Theta(question_vector, sentence_vector)
        return math.pi * (ED + MD)**2 * theta/360

    #function which is acivated on call
    def __call__(self, question_vector, sentence_vector,method):
        if method==1: return self.Euclidean(question_vector, sentence_vector)
        elif method==2: return self.Cosine(question_vector, sentence_vector)
        else: return self.Triangle(question_vector, sentence_vector) * self.Sector(question_vector, sentence_vector)
        
        
def RetrieveAnswer(question_embedding, tfidf_vectors,method=1):
    similarity_heap = []
    if method==1: max_similarity = float('inf')
    else: max_similarity = -1
    index_similarity = -1

    for index, embedding in enumerate(tfidf_vectors):  
        find_similarity = TS_SS()
        similarity = find_similarity((question_embedding).toarray(),(embedding).toarray(),method).mean()
        if method==1:
            heapq.heappush(similarity_heap,(similarity,index))
        elif method==2:
            heapq.heappush(similarity_heap,(-similarity,index))
        else:
            heapq.heappush(similarity_heap,(similarity,index))
    return similarity_heap
  
# question here
user_question = "How to increase mortgage amount ? Do I need more loan sections?"
#define method
method = 3

preprocess = Preprocessing(txt)
cleaned_sentences,cleaned_sentences_with_stopwords,tfidf_vectors = preprocess.doall()
question = preprocess.clean_sentence(user_question, stopwords=True)
question_embedding = preprocess.TFIDF_Q(question)
similarity_heap = RetrieveAnswer(question_embedding , tfidf_vectors ,method)

print("Question: ", user_question)

# number of relevant solutions you want
number_of_sentences_to_print = 5
while number_of_sentences_to_print>0 and len(similarity_heap)>0:
    x = similarity_heap.pop(0)
    print(cleaned_sentences_with_stopwords[x[1]])
    number_of_sentences_to_print-=1

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Question:  How to increase mortgage amount ? Do I need more loan sections?
to increase your money loan obvion adds one or more new loan sections
the total money loan may not exceed the amount for which the mortgage was granted
you must arrange to increase your money loan through an intermediary
then you may increase your loan to that amount without having to 
 
then you can increase your loan up to that amount without having to go to the notary
obvion is however not obliged to agree to a request for an increase


