#**DocyQA**

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
!pwd

/content


In [3]:
import os 
os.chdir('gdrive/My Drive/Colab Notebooks')

In [13]:
!pip install pdfplumber
!pip install nltk
!pip install -U gensim
!pip install flask-ngrok
!pip install werkzeug
!pip install numpy



In [14]:
def pdf_extract(file_name):
  import pdfplumber
  directory = "docs"
  pdf_txt = ""
  for file in os.listdir(directory):
      filename = os.fsdecode(file)
      if(filename == file_name):
          pdf_txt = '' # new line
          with pdfplumber.open(directory + '/' + filename) as pdf:
              for pdf_page in pdf.pages:
                single_page_text = pdf_page.extract_text()
                pdf_txt = pdf_txt + single_page_text
  return pdf_txt

##**Naive Approch**

In [33]:
import re
import gensim
from gensim.parsing.preprocessing import remove_stopwords

def clean_sentence(sentence, stopwords=False):
  sentence = sentence.lower().strip()
  sentence = re.sub(r'[^a-z0-9\s]', '', sentence)
  if stopwords:
    sentence = remove_stopwords(sentence)
  return sentence

def get_cleaned_sentences(tokens, stopwords=False):
  cleaned_sentences = []
  for row in tokens:
    cleaned = clean_sentence(row, stopwords)
    cleaned_sentences.append(cleaned)
  return cleaned_sentences

In [34]:
def retrieveAndPrintFAQAnswer(question_embedding, sentence_embeddings, sentences):
  import sklearn
  from sklearn.metrics.pairwise import cosine_similarity
  max_sim = -1
  index_sim = -1
  for index, embedding in enumerate(sentence_embeddings):
    sim = cosine_similarity(embedding, question_embedding)[0][0]
    if sim > max_sim:
      max_sim = sim
      index_sim = index
  
  return index_sim

In [19]:
def naive_drive(file_name, question):
  pdf_txt = pdf_extract(file_name)
  import nltk
  import numpy
  import pprint
  nltk.download('punkt')
  tokens = nltk.sent_tokenize(pdf_txt)
  cleaned_sentences = get_cleaned_sentences(tokens, stopwords=True)
  cleaned_sentences_with_stopwords = get_cleaned_sentences(tokens, stopwords=False)
  sentences = cleaned_sentences_with_stopwords
  sentence_words = [[word for word in document.split()]
                    for document in sentences]

  from gensim import corpora
  dictionary = corpora.Dictionary(sentence_words)
  bow_corpus = [dictionary.doc2bow(text) for text in sentence_words]

  question = clean_sentence(question, stopwords=False)
  question_embedding = dictionary.doc2bow(question.split())

  index = retrieveAndPrintFAQAnswer(question_embedding, bow_corpus, sentences)
  #return len(sentences)
  return sentences[index]

##**Word2Vec Approach**

In [20]:
from gensim.models import Word2Vec
import gensim.downloader as api

v2w_model = None
try:
  v2w_model = gensim.models.Keyedvectors.load('./w2vecmodel.mod')
  print("w2v Model Successfully loaded")
except:
  v2w_model = api.load('word2vec-google-news-300')
  v2w_model.save("./w2vecmodel.mod")
  print("w2v Model Saved")

w2vec_embedding_size = len(v2w_model['pc']) 

w2v Model Saved


In [21]:
def getWordVec(word, model):
  import numpy
  samp = model['pc']
  vec = [0]*len(samp)
  try:
    vec = model[word]
  except:
    vec = [0]*len(samp)
  return (vec)


def getPhraseEmbedding(phrase, embeddingmodel):
  import numpy
  samp = getWordVec('computer', embeddingmodel)
  vec = numpy.array([0]*len(samp))
  den = 0;
  for word in phrase.split():
    den = den+1
    vec = vec+numpy.array(getWordVec(word, embeddingmodel))
  return vec.reshape(1, -1)

In [22]:
def word2vec_drive(file_name, question):
  pdf_txt = pdf_extract(file_name)

  import nltk
  import numpy
  import pprint
  
  nltk.download('punkt')
  tokens = nltk.sent_tokenize(pdf_txt)
  cleaned_sentences = get_cleaned_sentences(tokens, stopwords=True)
  cleaned_sentences_with_stopwords = get_cleaned_sentences(tokens, stopwords=False)
  sentences = cleaned_sentences_with_stopwords
  sentence_words = [[word for word in document.split()]
                    for document in sentences]

  sent_embeddings = []
  for sent in sentences:
    sent_embeddings.append(getPhraseEmbedding(sent, v2w_model))

  question_embedding = getPhraseEmbedding(question, v2w_model)
  index = retrieveAndPrintFAQAnswer(question_embedding, sent_embeddings, cleaned_sentences_with_stopwords)
  return cleaned_sentences_with_stopwords[index]

##**Glove Approach**

In [35]:
from gensim.models import Word2Vec
import gensim.downloader as api

glove_model = None
try:
  glove_model = gensim.models.Keyedvectors.load('./glovemodel.mod')
  print("Glove Model Successfully loaded")
except:
  glove_model = api.load('glove-twitter-25')
  glove_model.save("./glovemodel.mod")
  print("Glove Model Saved")

glove_embedding_size = len(glove_model['pc'])

Glove Model Saved


In [51]:
def glove_drive(file_name, question):
  pdf_txt = pdf_extract(file_name)

  import nltk
  import numpy
  import pprint

  nltk.download('punkt')
  tokens = nltk.sent_tokenize(pdf_txt)
  cleaned_sentences = get_cleaned_sentences(tokens, stopwords=True)
  cleaned_sentences_with_stopwords = get_cleaned_sentences(tokens, stopwords=False)
  sentences = cleaned_sentences_with_stopwords
  sentence_words = [[word for word in document.split()]
                    for document in sentences]

  sent_embeddings = []
  for sent in cleaned_sentences:
    sent_embeddings.append(getPhraseEmbedding(sent, glove_model))

  question_embedding = getPhraseEmbedding(question, glove_model)
  index = retrieveAndPrintFAQAnswer(question_embedding, sent_embeddings, cleaned_sentences_with_stopwords)
  return cleaned_sentences_with_stopwords[index]

In [52]:
# answer = glove_drive("1637834603676.pdf", "What was the highest fdi inflow in 2019?");
# print(answer)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
as a result of the measures taken to improve the countrys investment climate india 
jumped to 63rd place in world banks ease of doing business ranking as per world banks doing 
business report dbr 2020


#**Simple Transformer**

In [None]:
!pip install simpletransformers



In [None]:
import json


with open('data/SQuAD_1.1/train-v1.1.json', 'r') as f:
    train_data = json.load(f)

train_data = [item for topic in train_data['data'] for item in topic['paragraphs'] ]

In [None]:
with open('data/SQuAD_1.1/dev-v1.1.json', 'r') as f:
    test_data = json.load(f)

test_data = [item for topic in test_data['data'] for item in topic['paragraphs'] ]

In [None]:
train_data[0]

{'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'qas': [{'answers': [{'answer_start': 515,
     'text': 'Saint Bernadette Soubirous'}],
   'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
   'id': '5733be284776f41900661182'},
  {'answers': [{'answer_start': 188, 'text': 'a copper statue of Christ

In [None]:
test_data[0]

{'context': 'Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi\'s Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50.',
 'qas': [{'answers': [{'answer_start': 177, 'text': 'Denver Broncos'},
    {'answer_start': 177, 'text': 'Denver Broncos'},
    {'answer_start': 177, 'text': 'Denver Broncos'}],
   'question': 'Which NFL team

In [None]:
import logging

from simpletransformers.question_answering import QuestionAnsweringModel, QuestionAnsweringArgs

In [None]:
model_type="bert"
model_name= "bert-base-cased"

In [None]:
train_args = {
    'learning_rate': 3e-5,
    'num_train_epochs': 2,
    'max_seq_length': 384,
    'doc_stride': 128,
    'overwrite_output_dir': True,
    'reprocess_input_data': False,
    'train_batch_size': 6,
    'gradient_accumulation_steps': 8,
}


In [None]:
model = QuestionAnsweringModel(
    model_type,model_name, use_cuda=False, args=train_args
)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForQuestionAnswering: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and a

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

In [None]:
!rm -rf outputs

In [None]:
model.train_model(train_data[:50], eval_data=test_data[:50])

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/41 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/41 [00:00<?, ?it/s]

(10, 5.612370938062668)

In [None]:
result, texts = model.eval_model(test_data[:50])

convert squad examples to features: 100%|██████████| 754/754 [00:08<00:00, 91.69it/s]
add example index and unique id: 100%|██████████| 754/754 [00:00<00:00, 714690.44it/s]


Running Evaluation:   0%|          | 0/97 [00:00<?, ?it/s]

In [None]:
result

{'correct': 5,
 'similar': 181,
 'incorrect': 568,
 'eval_loss': -0.191129378269945}

In [None]:
texts

{'correct_text': {'56bebd713aeaaa14008c9330': '$2 million',
  '56d9b43edc89441400fdb700': '$2 million',
  '56bebec43aeaaa14008c9349': '25',
  '56d7205e0d65d21400198391': 'Verizon',
  '56d9bc13dc89441400fdb75f': 'Verizon'},
 'similar_text': {'56be4db0acb8001400a502ec': {'truth': 'Denver Broncos',
   'predicted': '(AFC) champion Denver Broncos defeated the National Football Conference',
   'question': 'Which NFL team represented the AFC at Super Bowl 50?'},
  '56be4db0acb8001400a502ef': {'truth': 'Denver Broncos',
   'predicted': '(AFC) champion Denver Broncos defeated the National Football Conference',
   'question': 'Which NFL team won Super Bowl 50?'},
  '56beace93aeaaa14008c91df': {'truth': 'Denver Broncos',
   'predicted': '(AFC) champion Denver Broncos defeated the National Football Conference',
   'question': 'Who won Super Bowl 50?'},
  '56beace93aeaaa14008c91e3': {'truth': '2015',
   'predicted': '(NFL) for the 2015 season. The American Football',
   'question': 'Super Bowl 50 d

In [None]:
to_predict = [
    {
        "context": "Vin is a Mistborn of great power and skill.",
        "qas": [
            {
                "question": "What is Vin's speciality?",
                "id": "0",
            }
        ],
    }
]

In [None]:
answers, probabilities = model.predict(to_predict)


convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 169.36it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 8473.34it/s]


Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

[{'id': '0', 'answer': ['Mistborn of great power', 'power', 'Mistborn of great power and skill', 'is a Mistborn of great power', 'skill', 'Mistborn of great power and', 'power and skill', 'Vin is a Mistborn of great power', 'Mi', 'power and', 'is a Mistborn of great power and skill', 'Vin', 'stborn of great power', 'is a Mistborn of great power and', 'Vin is a Mistborn of great power and skill', 'is a Mi', 'Mistborn of great power and skill.', 'Vin is a Mistborn of great power and', 'stborn of great power and skill', 'skill.']}]
[{'id': '0', 'probability': [0.05736599194874784, 0.05406497241962602, 0.053004244496729705, 0.05250881889239606, 0.05219853223253319, 0.0516830207555171, 0.04995421362885291, 0.04908136691613107, 0.048904401319102266, 0.0487090172554923, 0.04851638018032756, 0.04790397874361128, 0.04781864996233897, 0.047307024327026034, 0.04534952998948529, 0.044763670332764925, 0.044433970255127686, 0.044219113430512595, 0.04418282205198882, 0.04375853387976895]}]


In [None]:
len(answers[0]['answer'])

20

In [None]:
len(probabilities[0]['probability'])

20

In [None]:
max_prob = 0;
correct_answer = " ";
for i in range(len(answers[0]['answer'])):
  if probabilities[0]['probability'][i] > max_prob:
    max_prob = probabilities[0]['probability'][i]
    correct_answer = answers[0]['answer'][i]



In [None]:
print(max_prob)
print(correct_answer)

0.05736599194874784
Mistborn of great power


#**Bert**

In [4]:
!pip install transformers==3.1.0
import torch
from transformers import BertForQuestionAnswering
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')



In [5]:
def answer_question_bert(question, answer_text):

    input_ids = tokenizer.encode(question, answer_text, max_length=512, truncation=True)

    print('Query has {:,} tokens.\n'.format(len(input_ids)))

    sep_index = input_ids.index(tokenizer.sep_token_id)

    num_seg_a = sep_index + 1

    num_seg_b = len(input_ids) - num_seg_a

    segment_ids = [0]*num_seg_a + [1]*num_seg_b

    assert len(segment_ids) == len(input_ids)

    start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([segment_ids]))

    all_tokens = tokenizer.convert_ids_to_tokens(input_ids)

    #print(' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1]))
    #print(f'score: {torch.max(start_scores)}')
    score = float(torch.max(start_scores))
    answer_start = torch.argmax(start_scores)
    answer_end = torch.argmax(end_scores)
    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    answer = tokens[answer_start]

    for i in range(answer_start + 1, answer_end + 1):

        if tokens[i][0:2] == '##':
          answer += tokens[i][2:]
        else:
          answer += ' ' + tokens[i]
        #if tokens[i][0:2] == ' ':
         #   answer += tokens[i][2:]

        #else:
           # answer += ' ' + tokens[i]
    return answer, score, start_scores, end_scores, tokens
    #print('Answer: "' + answer + '"')

In [6]:
def expand_split_sentences(pdf_txt):
  import nltk
  nltk.download('punkt')
  new_chunks = nltk.sent_tokenize(pdf_txt)
  length = len(new_chunks)
  #for i in range(length):
    #tmp_token = tokenizer.encode(new_chunks[i])
    #print('The input has a total of {:} tokens.'.format(len(tmp_token)))

  new_df = [];
  for i in range(length):
    paragraph = ""
    for j in range(i, length):
      #tmp_str = paragraph + new_chunks[j]
      tmp_token = tokenizer.encode(paragraph + new_chunks[j])
      length_token = len(tmp_token)
      if length_token < 510:
        #print(length_token)
        paragraph = paragraph + new_chunks[j]
      else:
        #print(length_token)
        break;
    #print(len(tokenizer.encode(paragraph)))
    new_df.append(paragraph)
  return new_df
  #for i in new_df:
    #print(i)

In [7]:
def bert_drive(file_name, question):
  import numpy
  text = pdf_extract(file_name)
  max_score = 0;
  final_answer = ""
  new_df = expand_split_sentences(text)
  tokens = []
  s_scores = numpy.array([])
  e_scores = numpy.array([])
  for new_context in new_df:
    #new_paragrapgh = new_paragrapgh + answer_question(question, answer_text)
    ans, score, start_score, end_score, token = answer_question_bert(question, new_context)
    if score > max_score:
      max_score = score
      s_scores = start_score.detach().numpy().flatten()
      e_scores = end_score.detach().numpy().flatten()
      tokens = token
      final_answer = ans
  return final_answer, s_scores, e_scores, tokens

##**Flask Server**

In [8]:
!pip install Flask
!pip install pyrebase
!pip install flask-ngrok
!pip install werkzeug



In [9]:
!pip install crypto #install due to error occured in pyrebase
!pip install pycrypto ##install due to error occured in pyrebase

Collecting pycrypto
  Using cached pycrypto-2.6.1.tar.gz (446 kB)
Building wheels for collected packages: pycrypto
  Building wheel for pycrypto (setup.py) ... [?25l[?25hdone
  Created wheel for pycrypto: filename=pycrypto-2.6.1-cp37-cp37m-linux_x86_64.whl size=499932 sha256=46062d1f9e9c96670a2e5a7d087c434d46801ff277e8443eb511aeb41d330b61
  Stored in directory: /root/.cache/pip/wheels/cf/85/ba/bbd7c96add459de7598fb424e5ff2309baf2095c844ac0f191
Successfully built pycrypto
Installing collected packages: pycrypto
Successfully installed pycrypto-2.6.1


In [53]:
import pyrebase
import os
import glob


config = {
  "apiKey": "AIzaSyC5FBYgV2gML7C1bqUEjiIbrkMCqlJ9Ba8",
  "authDomain": "docyqa-b6c3c.firebaseapp.com",
  "projectId": "docyqa-b6c3c",
  "databaseURL" : "",
  "storageBucket": "docyqa-b6c3c.appspot.com",
  "messagingSenderId": "152415243594",
  "appId": "1:152415243594:web:77f52516470de3642005b3",
  "measurementId": "G-9985TB71HF"
}

firebase = pyrebase.initialize_app(config)
storage = firebase.storage()

def clear_folder():
  files = glob.glob('docs/*')
  for f in files:
      os.remove(f)

def fetching_pdf(file_name):
  clear_folder()
  storage.child("docs").child(file_name).download("docs/" + file_name)

In [56]:
from flask import Flask
from flask import request
from flask_ngrok import run_with_ngrok

app = Flask(__name__)
run_with_ngrok(app)

@app.route('/', methods = ['POST'])
def question_answering_system():
  approach = request.form['approach']
  file_name = request.form['file_name']
  question = request.form['question']
  fetching_pdf(file_name)
  if(approach == "1"):
    answer = naive_drive(file_name, question)
    return answer
  elif(approach == "2"):
    answer = word2vec_drive(file_name, question)
    return answer
  elif(approach == "3"):
    answer = glove_drive(file_name, question)
    return answer
  elif(approach == "4"):
    answer, s_scores, e_scores, tokens = bert_drive(file_name, question)
    return answer
  else:
    return "null"



In [57]:
app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)


 * Running on http://fc4f-35-230-161-168.ngrok.io
 * Traffic stats available on http://127.0.0.1:4040


127.0.0.1 - - [25/Nov/2021 10:34:36] "[37mPOST / HTTP/1.1[0m" 200 -


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Token indices sequence length is longer than the specified maximum sequence length for this model (522 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (574 > 512). Running this sequence through the model will result in indexing errors


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Token indices sequence length is longer than the specified maximum sequence length for this model (556 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (533 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (526 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (525 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (515 > 512). Running this sequence through the model will result in indexing errors


Query has 504 tokens.

Query has 495 tokens.

Query has 477 tokens.

Query has 454 tokens.

Query has 512 tokens.

Query has 512 tokens.

Query has 512 tokens.

Query has 502 tokens.

Query has 464 tokens.

Query has 437 tokens.

Query has 393 tokens.

Query has 342 tokens.

Query has 323 tokens.

Query has 276 tokens.

Query has 249 tokens.

Query has 228 tokens.

Query has 198 tokens.

Query has 166 tokens.

Query has 73 tokens.

Query has 50 tokens.

Query has 29 tokens.



127.0.0.1 - - [25/Nov/2021 10:36:20] "[37mPOST / HTTP/1.1[0m" 200 -
