## Question Answering - Transformer Approach

In [1]:
import pandas as pd
import numpy as np
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

In [3]:
model_name = "deepset/electra-base-squad2"

In [2]:
question_answer = pipeline("question-answering", model="deepset/electra-base-squad2")

Downloading:   0%|          | 0.00/635 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/415M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/200 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [4]:
df=pd.read_csv("../Dataset/final_paraphrased_df.csv")

In [5]:
df.head()

Unnamed: 0,text
0,The Bus starts from Rwanda at 02:51 AM and goe...
1,The bus arrives at Switzerland by 8:13 AM afte...
2,The Bus starts from United Kingdom at 09:58 AM...
3,The bus arrives at United States Minor Outlyin...
4,The Bus starts from United Arab Emirates at 06...


In [6]:
context = ""
for sent in df.iterrows():
    context += sent[1]['text'] + ". "

In [106]:
with open('../Dataset/context.txt', 'w') as f:
    f.write(context)

In [8]:
QA_input = {
    'question': 'Where does the bus from Rwanda reach to if it starts from there at 02:51 AM',
    'context': context
}

In [9]:
res1 = question_answer(QA_input)
print(res1)

  tensor = as_tensor(value)
  p_mask = np.asarray(


{'score': 0.9962866306304932, 'start': 51, 'end': 62, 'answer': 'Switzerland'}


In [13]:
QA_input = {
    'question': 'At what time does the bus which reaches south georgia from mexico leave at',
    'context': context,
}
res1 = nlp(QA_input)
print(res1)

{'score': 0.9991130232810974, 'start': 69522, 'end': 69530, 'answer': '02:45 PM'}


This takes a lot of time as our context is very big. We can extract keywords and reduce this context.

## Performing NER and reducing context

In [10]:
t = "Where does the bus from Rwanda reach to if it starts from there at 02:51 AM"

In [11]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

In [12]:
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

Downloading:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/829 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/413M [00:00<?, ?B/s]

In [19]:
ner = pipeline("ner", model=model, tokenizer=tokenizer)

In [20]:
ner_results = ner(t)
ner_results

[{'entity': 'B-LOC',
  'score': 0.99984074,
  'index': 6,
  'word': 'Rwanda',
  'start': 24,
  'end': 30}]

## Keyword Extractor

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [22]:
import numpy as np
import itertools
import re

In [27]:
def max_sum_sim(doc_embedding, candidate_embeddings, candidates, top_n, nr_candidates):
    # Calculate distances and extract keywords
    distances = cosine_similarity(doc_embedding, candidate_embeddings)
    distances_candidates = cosine_similarity(candidate_embeddings, 
                                            candidate_embeddings)

    # Get top_n words as candidates based on cosine similarity
    words_idx = list(distances.argsort()[0][-nr_candidates:])
    words_vals = [candidates[index] for index in words_idx]
    distances_candidates = distances_candidates[np.ix_(words_idx, words_idx)]

    # Calculate the combination of words that are the least similar to each other
    min_sim = np.inf
    candidate = None
    for combination in itertools.combinations(range(len(words_idx)), top_n):
        sim = sum([distances_candidates[i][j] for i in combination for j in combination if i != j])
        if sim < min_sim:
            candidate = combination
            min_sim = sim

    if(candidate):  
        return [words_vals[idx] for idx in candidate]
    else:
        return []

In [28]:
def mmr(doc_embedding, word_embeddings, words, top_n, diversity):

    # Extract similarity within words, and between words and the document
    word_doc_similarity = cosine_similarity(word_embeddings, doc_embedding)
    word_similarity = cosine_similarity(word_embeddings)

    # Initialize candidates and already choose best keyword/keyphras
    keywords_idx = [np.argmax(word_doc_similarity)]
    candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]

    for _ in range(top_n - 1):
        # Extract similarities within candidates and
        # between candidates and selected keywords/phrases
        candidate_similarities = word_doc_similarity[candidates_idx, :]
        target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1)

        # Calculate MMR
        mmr = (1-diversity) * candidate_similarities - diversity * target_similarities.reshape(-1, 1)
        if(mmr.size > 0):
            mmr_idx = candidates_idx[np.argmax(mmr)]

            # Update keywords & candidates
            keywords_idx.append(mmr_idx)
            candidates_idx.remove(mmr_idx)

    return [words[idx] for idx in keywords_idx]

In [29]:
def get_keywords_keyBert(sentences, model_name='distilbert-base-nli-mean-tokens', n_gram_range=(1, 2), stop_words="english", top_n=10, diversification=None, nr_candidates=15, diversity=0.5):
    #Get candidate phrases
    count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit([sentences])
    candidates = count.get_feature_names_out()
    
    #Load Model
    model = SentenceTransformer(model_name)
    doc_embedding = model.encode([sentences])
    candidate_embeddings = model.encode(candidates)
    
    #Calculate distance between embedding to find similarty
    if(diversification == None):
        distances = cosine_similarity(doc_embedding, candidate_embeddings)
        keywords = [candidates[index] for index in distances.argsort()[0][-top_n:]]
    elif(diversification == 'max_sum_sim'):
        keywords = max_sum_sim(doc_embedding, candidate_embeddings, candidates, top_n=top_n, nr_candidates=nr_candidates)
    elif(diversification == 'mmr'):
        keywords = mmr(doc_embedding, candidate_embeddings, candidates, top_n=top_n, diversity=diversity)
    
    return list(set(keywords))

In [70]:
get_keywords_keyBert(t, model_name='all-MiniLM-L6-v2', n_gram_range=(1, 1), diversification='mmr', top_n=3, diversity=0.8)

['rwanda', 'bus', '02']

In [72]:
possible_context = set()
for keyword in ['02', 'rwanda']:
    for sent in context.split('. '):
        if keyword in sent.lower():
            possible_context.add(sent)

In [73]:
len(possible_context)

201

In [74]:
len(context.split('. '))

1717

In [87]:
def get_short_context(question, context):
    keywords = get_keywords_keyBert(question, model_name='all-MiniLM-L6-v2', n_gram_range=(1, 1), diversification='mmr', top_n=3, diversity=0.8)
    possible_context = set()
    for keyword in keywords:
        for sent in context.split('. '):
            if keyword in sent.lower():
                possible_context.add(sent)
    possible_context = list(possible_context)
    possible_context = '. '.join(possible_context)
    
    return possible_context

In [88]:
quest1 = 'Where does the bus from Rwanda reach to if it starts from there at 02:51 AM'

In [89]:
short_context1 = get_short_context(quest1, context)

In [113]:
QA_input = {
    'question': 'Where does the bus from Rwanda reach to if it starts from there at 02:51 AM',
    'context': short_context1
}

res = question_answer(QA_input)
print(res)

  tensor = as_tensor(value)
  p_mask = np.asarray(


{'score': 0.9983903765678406, 'start': 31592, 'end': 31603, 'answer': 'Switzerland'}


In [140]:
def get_answers(question, context):
    short_context = get_short_context(question, context)
    qa_input = {
        'question': question,
        'context': short_context
    }
    res = question_answer(qa_input)
    
    final_answers = set()
    for r in res:
        if(r['score'] > 0.99):
            final_answers.add(r['answer'])
        
    return list(final_answers)

## Top k sentences as answers based on score

In [99]:
question_answer2 = pipeline("question-answering", model="deepset/electra-base-squad2", topk=5)



In [100]:
quest2 = "Where does the bus reach as 4:30 A.M."

In [104]:
get_answers(question_answer2, quest2, context)

  tensor = as_tensor(value)
  p_mask = np.asarray(


[{'score': 0.9983903765678406,
  'start': 31592,
  'end': 31603,
  'answer': 'Switzerland'},
 {'score': 0.9956767559051514,
  'start': 77798,
  'end': 77809,
  'answer': 'Switzerland'},
 {'score': 0.991287887096405,
  'start': 61969,
  'end': 61977,
  'answer': 'Kiribati'},
 {'score': 0.9760979413986206,
  'start': 36098,
  'end': 36103,
  'answer': 'Kenya'},
 {'score': 0.9745321869850159,
  'start': 117837,
  'end': 117842,
  'answer': 'Kenya'}]

In [130]:
quest3 = "At what time does the bus which reache Finland from United Kingdom"
t = get_answers(pipeline, quest3, context)

In [131]:
short_context3 = get_short_context(quest3, context)

In [141]:
qa_input = {
    'question': quest3,
    'context': short_context3
}

res = question_answer2(qa_input)
print(res)

  tensor = as_tensor(value)
  p_mask = np.asarray(


[{'score': 0.9998883605003357, 'start': 69999, 'end': 70006, 'answer': '8:26 PM'}, {'score': 0.9998598694801331, 'start': 114148, 'end': 114155, 'answer': '4:03 AM'}, {'score': 0.9993850588798523, 'start': 114148, 'end': 114155, 'answer': '4:03 AM'}, {'score': 0.9993388056755066, 'start': 65663, 'end': 65670, 'answer': '3:22 AM'}, {'score': 0.9992659687995911, 'start': 54771, 'end': 54778, 'answer': '5:27 AM'}]


## Final Testing

In [153]:
def get_answers(question, context):
    short_context = get_short_context(question, context)
    qa_input = {
        'question': question,
        'context': context
    }
    res = question_answer2(qa_input)
    print(res)
    final_answers = set()
    for r in res:
        if(r['score'] > 0.99):
            final_answers.add(r['answer'])
        
    answers = list(final_answers)
    if(len(answers) == 0):
        return 'No buses available for this route and time'
    else:
        return ', '.join(answer)

In [149]:
q = "All busses that Bus starts at 09:31 PM and goes to Angola"
get_answers(q, context)

  tensor = as_tensor(value)
  p_mask = np.asarray(


[{'score': 0.026512378826737404, 'start': 1459, 'end': 1477, 'answer': 'starts from Rwanda'}, {'score': 0.02013932168483734, 'start': 1451, 'end': 1477, 'answer': 'The Bus starts from Rwanda'}, {'score': 0.013502407819032669, 'start': 1471, 'end': 1477, 'answer': 'Rwanda'}, {'score': 0.005046140402555466, 'start': 15403, 'end': 15459, 'answer': 'The bus arrives at Montenegro by 3:07 PM after departing'}, {'score': 0.0028550198767334223, 'start': 15436, 'end': 15459, 'answer': '3:07 PM after departing'}]


[]

In [151]:
answers = list(final_answers)
answers

['Kiribati', 'Switzerland']