<a href="https://colab.research.google.com/github/Pusse-01/Question-answering-model-with-text-comparision/blob/main/Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install googletrans==4.0.0-rc1

In [None]:
pip install transformers

In [None]:
pip install PyPDF2

In [15]:
from googletrans import Translator
import PyPDF2 
import pandas as pd
import numpy as np
import torch
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer
import math
import string
import sys

# The question-answering model and its tokenizer are loaded from the same
# pretrained checkpoint, so the name is bound once and reused.
_QA_CHECKPOINT = 'bert-large-uncased-whole-word-masking-finetuned-squad'
model = BertForQuestionAnswering.from_pretrained(_QA_CHECKPOINT)
tokenizer = BertTokenizer.from_pretrained(_QA_CHECKPOINT)

In [24]:
# Shared googletrans client used by translateE2S and translateS2E.
translator = Translator()

In [23]:
def pdfExtractor(file, page_index=1):
    """Extract the text of a single page from a PDF file.

    The number of pages in the document is printed as a side effect.

    Args:
        file: Path of the PDF file to open.
        page_index: Zero-based index of the page to read. Defaults to 1
            (the second page), which is the page this function has
            always read.

    Returns:
        The text extracted from the requested page.

    Raises:
        IndexError: If the document has no page at ``page_index``.
    """
    # The context manager closes the file even when parsing fails.
    with open(file, 'rb') as pdf_file:
        # PdfReader / .pages / extract_text() are the current PyPDF2 names;
        # PdfFileReader / numPages / getPage / extractText were removed in
        # PyPDF2 3.x, which an unpinned "pip install PyPDF2" installs.
        reader = PyPDF2.PdfReader(pdf_file)

        # printing number of pages in pdf file
        print(len(reader.pages))

        # Extract exactly once, while the underlying file is still open.
        return reader.pages[page_index].extract_text()

In [39]:
def translateE2S(text):
    """Translate ``text`` to Sinhala (``dest='si'``) and return the translated string."""
    return translator.translate(text, dest='si').text

def translateS2E(text):
    """Translate ``text`` to English (``dest='en'``) and return the translated string."""
    return translator.translate(text, dest='en').text

In [45]:
# Capitalized answers produced so far, in the order the questions were asked.
predicted_answers = []


def question_answer(question, text):
    """Answer ``question`` from ``text`` with the QA model and record the answer.

    The predicted answer is printed, translated with ``translateE2S`` and
    printed again, and its capitalized English form is appended to the
    module-level ``predicted_answers`` list.

    Args:
        question: The question, in English.
        text: The English passage the answer is extracted from.

    Returns:
        None. Results are reported through ``print`` and ``predicted_answers``.
    """
    input_ids = tokenizer.encode(question, text)
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # Everything up to and including the first separator token is segment 0;
    # the remaining tokens are segment 1.
    sep_idx = input_ids.index(tokenizer.sep_token_id)
    num_seg_a = sep_idx + 1
    num_seg_b = len(input_ids) - num_seg_a
    segment_ids = [0] * num_seg_a + [1] * num_seg_b

    # Inference only: no gradients are needed for the forward pass.
    with torch.no_grad():
        output = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([segment_ids]))

    answer_start = int(torch.argmax(output.start_logits))
    answer_end = int(torch.argmax(output.end_logits))

    no_answer = "Unable to find the answer to your question."
    if answer_end >= answer_start:
        answer = tokens[answer_start]
        for token in tokens[answer_start + 1:answer_end + 1]:
            if token.startswith("##"):
                # Word-piece continuation: glue it to the previous piece.
                answer += token[2:]
            else:
                answer += " " + token
        # A span starting at [CLS] is treated as "no answer found".
        if answer.startswith("[CLS]"):
            answer = no_answer
    else:
        # The predicted end precedes the start, so there is no valid span.
        # Previously `answer` was left unbound here and the function crashed.
        answer = no_answer

    print("\nPredicted answer:\n{}".format(answer.capitalize()))
    sin = translateE2S(answer.capitalize())
    print(sin)
    predicted_answers.append(answer.capitalize())

In [46]:
# Module-level translation table used by get_words_from_line_list:
# every punctuation character is mapped to a space and every ASCII
# upper-case letter to its lower-case counterpart.
translation_table = str.maketrans({
    **{mark: " " for mark in string.punctuation},
    **{upper: upper.lower() for upper in string.ascii_uppercase},
})


def get_words_from_line_list(text):
    """Return the list of words in ``text``.

    The text is first passed through ``translation_table`` and then split
    on whitespace, so an empty or punctuation-only text yields ``[]``.
    """
    return text.translate(translation_table).split()
  
  
def count_frequency(word_list):
    """Count how often each word occurs in ``word_list``.

    Args:
        word_list: Iterable of hashable words.

    Returns:
        A plain ``dict`` mapping each distinct word to its number of
        occurrences, with keys in order of first appearance.
    """
    # Imported locally so this function does not depend on the import cell.
    from collections import Counter

    # Counter does the tallying; convert back so callers still get a dict.
    return dict(Counter(word_list))
  
def word_frequencies_for_file(text):
    """Return a dictionary mapping each word of ``text`` to its frequency.

    Words are obtained with ``get_words_from_line_list`` and tallied with
    ``count_frequency``.
    """
    return count_frequency(get_words_from_line_list(text))
  
  
def dotProduct(D1, D2):
    """Return the dot product of two frequency dictionaries as a float.

    Only keys present in both ``D1`` and ``D2`` contribute; the result is
    ``0.0`` when the dictionaries share no key.
    """
    shared_terms = (count * D2[word] for word, count in D1.items() if word in D2)
    return sum(shared_terms, 0.0)
  
def vector_angle(D1, D2):
    """Return the angle in radians between two frequency vectors.

    Args:
        D1: Dictionary mapping words to frequencies for the first document.
        D2: Dictionary mapping words to frequencies for the second document.

    Returns:
        ``acos`` of the cosine similarity of the two vectors. When either
        vector has zero length (for example, a document with no words) the
        angle is undefined; ``pi / 2`` is returned so that such a document
        is reported as sharing nothing with the other one.
    """
    numerator = dotProduct(D1, D2)
    denominator = math.sqrt(dotProduct(D1, D1) * dotProduct(D2, D2))

    # Previously an empty document caused a ZeroDivisionError here.
    if denominator == 0:
        return math.pi / 2

    # Clamp so that floating-point rounding can never push the cosine
    # outside acos's [-1, 1] domain.
    cosine = max(-1.0, min(1.0, numerator / denominator))
    return math.acos(cosine)
  
  
def documentSimilarity(text1, text2):
    """Print the angle, in radians, between the word-frequency vectors of two texts."""
    frequencies = [word_frequencies_for_file(document) for document in (text1, text2)]
    angle = vector_angle(*frequencies)

    print("The distance between the documents is: % 0.6f (radians)" % angle)
    

In [47]:
def read_file(filename, encoding="utf-8"):
    """Return the entire contents of a text file as a single string.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            so that non-ASCII text decodes the same way on every platform
            instead of depending on the locale's default encoding.

    Returns:
        The file contents as one string (not a list of lines).

    Raises:
        SystemExit: If the file cannot be opened or read; an error message
            is printed first.
    """
    try:
        with open(filename, 'r', encoding=encoding) as f:
            return f.read()

    except IOError:
        print("Error opening or reading input file: ", filename)
        sys.exit()

In [48]:
# Interactive driver: read a passage and a question, print the predicted
# answer, and keep answering follow-up questions about the same passage
# until the user declines.

# NOTE: the path is still requested but is unused while PDF extraction
# (text = pdfExtractor(file)) is disabled; the passage is typed in instead.
file = input("Enter your file path : ")

text = input("\nPlease enter your text: \n")
question = input("\nPlease enter your question: \n")
en_text = translateS2E(text)
# Translate the first question too, so it is handled exactly like the
# follow-up questions below (previously only follow-ups were translated).
question = translateS2E(question)

while True:
    question_answer(question, en_text)

    # Re-prompt until the reply starts with Y or N. Case is ignored and an
    # empty reply simply repeats the prompt instead of raising IndexError.
    choice = ""
    while choice not in ("Y", "N"):
        response = input("\nDo you want to ask another question based on this text (Y/N)? ")
        choice = response.strip()[:1].upper()

    if choice == "N":
        print("\nEnd of the program!")
        break

    question = translateS2E(input("\nPlease enter your question: \n"))

Enter your file path : j

Please enter your text: 
කොරෝනා වෛරසය පාලනය සඳහා ෆයිසර් එන්නත භාවිතාවේදී අවශ්‍ය හානිකර තත්ත්වයන් ඇති වීමේ අවදානමක් ඇතැයි ශ්‍රී ලංකා පොදුජන පෙරමුණේ පාර්ලිමේන්තු මන්ත්‍රී මහාචාර්ය තිස්ස විතාරණ මහතා සඳහන් කරයි.  එම එන්නතේ ක‍්‍රියාකාරීත්වය පිළිබඳ විද්‍යානුකූල සැකයක් ඇතිව තිබෙන බවද ඔහු පෙන්වා දෙයි.  එනිසා එම එන්නත මෙරටට ගෙන්වීම හැකිතාක් සීමා කර විකල්ප එන්නත් ගෙන්වීමට කටයුතු කරන්නැයි ද ඔහු ඉල්ලා සිටී.  මිනිස් ශරීරයට සෘජුව ජාන ඇතුළු කිරීම සම්බන්ධයෙන් බැලූ විට කිසිවෙකුට අනාවකි කළ නොහැකි බවද පැවසූ ඔහු අනවශ්‍ය හානිකර දේ සිදුවීමට ඉඩ ඇතැයි ද පැවසීය.

Please enter your question: 
What did Thissa say?

Predicted answer:
The korona virus is at risk of improving the necessary harmful conditions in pbiser vaccine . he pointed out that there is a scientific suspicion of action
කොරෝනා වෛරසය PBISER එන්නතෙහි අත්යවශ්ය හානිකර තත්වයන් වැඩිදියුණු කිරීමේ අවදානමට ලක්ව ඇත.ඔහු පෙන්වා දුන්නේ ක්රියාව පිළිබඳ විද්යාත්මක සැකයක් ඇති බවයි

Do you want to ask another question based on this text (

In [50]:
# Compare each predicted answer with the answer typed in for it.
for predicted in predicted_answers:
    answer_given = translateS2E(input("Child's answer"))
    documentSimilarity(predicted, answer_given)

Child's answerකොරෝනා වෛරසය පාලනය සඳහා ෆයිසර් එන්නත භාවිතාවේදී අවශ්‍ය හානිකර තත්ත්වයන් ඇති වීමේ අවදානමක් ඇතැයි 
The distance between the documents is:  0.755332 (radians)
