In [None]:
# Install Haystack straight from the default branch of its GitHub repository.
# NOTE(review): no tag or commit is pinned, so a later run may pull a version whose
# module layout no longer matches the `haystack.indexing` / `haystack.database`
# imports used in this notebook — consider pinning a specific release or commit.
! pip install git+https://github.com/deepset-ai/haystack.git

In [None]:
from haystack import Finder
from haystack.indexing.cleaning import clean_wiki_text
from haystack.indexing.utils import convert_files_to_dicts, fetch_archive_from_http
from haystack.reader.farm import FARMReader
from haystack.reader.transformers import TransformersReader
from haystack.utils import print_answers

import pandas as pd
import numpy as np
import json
import math
import random

In [None]:
!apt-get update
!apt-get install wget

In [None]:
# Install Elasticsearch: download the 7.6.2 Linux tarball, unpack it, and give
# ownership of the extracted directory to the `daemon` user.
! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.6.2-linux-x86_64.tar.gz -q
! tar -xzf elasticsearch-7.6.2-linux-x86_64.tar.gz
! chown -R daemon:daemon elasticsearch-7.6.2

import os
from subprocess import Popen, PIPE, STDOUT
# Start the server as a background child process with stderr merged into stdout.
# The child switches to uid 1 before exec, which is why the directory was chown'ed above.
# NOTE(review): nothing visible in this notebook reads `es_server.stdout`; confirm the
# pipe cannot fill up and stall the server during long runs.
es_server = Popen(['elasticsearch-7.6.2/bin/elasticsearch'],
                   stdout=PIPE, stderr=STDOUT,
                   preexec_fn=lambda: os.setuid(1)  # as daemon
                  )
# wait until ES has started
# NOTE(review): this is a fixed 30-second pause, not a readiness check — on a slow
# machine the next cell may still run before the server accepts connections.
! sleep 30

In [None]:
# Open a document store backed by the local Elasticsearch server
from haystack.database.elasticsearch import ElasticsearchDocumentStore

# `embedding_field` and `embedding_dim` must be supplied because a dense retriever is used later
document_store = ElasticsearchDocumentStore(
    host="localhost",
    username="",
    password="",
    index="document",
    embedding_field="embedding",
    embedding_dim=768,
)

08/02/2020 09:08:33 - INFO - elasticsearch -   PUT http://localhost:9200/document [status:400 request:0.009s]
08/02/2020 09:08:33 - INFO - elasticsearch -   PUT http://localhost:9200/label [status:400 request:0.009s]


In [None]:
# Load the question sheet and record the task type of every question.
queries = pd.read_csv('data/questions/allQuestions.csv')
# Blank PageNum / Context cells inherit the last non-blank value above them.
# `.ffill()` replaces the deprecated `fillna(method='ffill')` spelling.
queries['PageNum'] = queries['PageNum'].ffill()
queries['Context'] = queries['Context'].ffill()
# Question text -> task type; if a question appears more than once, its last row wins.
taskType = dict(zip(queries['Question'], queries['TaskType']))

In [None]:
# Split the SQuAD-format QA file into a held-out test set and a training file.
#   test        : dict mapping question text -> text of its first answer
#   trainqa.json: the same JSON structure with the held-out questions removed
# The file is read inside a `with` block so the handle is closed, and the raw text is
# not stored in a global called `string`, which would shadow the `string` module that
# the evaluation helpers rely on.
with open("data/questions/squadQA.json", "r") as squad_file:
    mydict = json.load(squad_file)
paragraph_entries = mydict['data'][0]['paragraphs']

# Hold out half of the questions.
# NOTE(review): the population size was previously taken from an undefined name `task`;
# `taskType` (one entry per distinct question) is assumed to be what was intended —
# confirm it matches the number of QA pairs in squadQA.json.
numTest = int(math.floor(0.5 * len(taskType)))
#Generate indices to extract test data (fixed seed keeps the split reproducible)
random.seed(5)
randomlist = random.sample(range(len(taskType)), numTest)
held_out_positions = set(randomlist)

test = {}
position = 0  # running index over every QA pair, across all contexts
for entry in paragraph_entries:
    kept_for_training = []
    for qa_pair in entry['qas']:
        if position in held_out_positions:
            test[qa_pair['question']] = qa_pair['answers'][0]['text']
        else:
            kept_for_training.append(qa_pair)
        position += 1
    # Replace the contents in place so `mydict` now holds only the training questions
    entry['qas'][:] = kept_for_training

with open('data/questions/trainqa.json', 'w') as f:
    json.dump(mydict, f)

In [None]:
#Read in corpus
# Opened read-only inside a `with` block so the handle is always closed; the raw text is
# kept out of a global named `string` so the `string` module stays usable later on.
with open("data/cleaned_text/summary.txt", "r", encoding='UTF-8') as corpus_file:
    corpus_text = corpus_file.read()

list_par = []
for paragraph in corpus_text.split('.\n'):
    #turn lists into sentences by replacing - with ,
    paragraph = paragraph.replace(';\n -', ',')
    #first element of list does not require a comma
    paragraph = paragraph.replace('\n -', '')
    paragraph = paragraph.replace(';', '')
    cleaned_lines = []
    for line in paragraph.split('\n'):
        #Trim each line and collapse runs of spaces between words
        cleaned_lines.append(' '.join(word for word in line.strip().split(' ') if word != ''))
    #Re-join the lines of the paragraph with single spaces
    paragraph = ' '.join(cleaned_lines)
    paragraph = paragraph.replace('°', ' degrees ')
    list_par.append(paragraph.strip())

# The whole corpus is indexed as one document, paragraphs re-joined as sentences
passage = '. '.join(list_par)
dict1 = [{'meta': {'name': 'summary.txt'}, 'text': passage}]
document_store.write_documents(dict1)

08/02/2020 09:09:20 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:0.862s]


In [None]:
#Set up the dense passage retriever on top of the document store
from haystack.retriever.dense import DensePassageRetriever

retriever = DensePassageRetriever(
    document_store=document_store,
    embedding_model="dpr-bert-base-nq",
    do_lower_case=True,
    use_gpu=True,
)
#Refresh the stored document embeddings using this retriever
document_store.update_embeddings(retriever)

In [None]:
#Base reader checkpoints for the two ensemble members (ALBERT and ELECTRA).
#Alternative checkpoints that can be swapped in instead:
#  reader  -> "deepset/roberta-base-squad2"
#  reader2 -> "deepset/bert-base-cased-squad2"
reader = FARMReader(
    model_name_or_path="ktrapeznikov/albert-xlarge-v2-squad-v2",
    use_gpu=True,
)
reader2 = FARMReader(
    model_name_or_path="ahotrod/electra_large_discriminator_squad2_512",
    use_gpu=True,
)

In [None]:
def _finetune_and_reload(base_reader, save_dir):
    """Fine-tune `base_reader` for one epoch on trainqa.json, then load a reader from `save_dir`.

    The training file is read from 'data/questions', the directory the train/test split
    cell writes trainqa.json to, so the notebook works when run top to bottom.
    """
    base_reader.train(data_dir='data/questions', train_filename="trainqa.json",
                      save_dir=save_dir, num_processes=128, n_epochs=1, use_gpu=True)
    return FARMReader(model_name_or_path=save_dir)


# Fine-tune each base reader and pair the reloaded model with the shared retriever.
new_reader = _finetune_and_reload(reader, "models/albert")
finder = Finder(new_reader, retriever)

new_reader2 = _finetune_and_reload(reader2, "models/electra")
finder2 = Finder(new_reader2, retriever)

In [None]:
#Evaluation
from collections import Counter
import string
import re

def normalize_answer(s):
    """Canonicalise an answer string for comparison.

    Steps, in order: lowercase, drop ASCII punctuation characters, blank out the
    standalone words "a", "an" and "the", then collapse all whitespace runs to
    single spaces (also trimming both ends).
    """
    lowered = s.lower()
    punctuation = set(string.punctuation)
    without_punctuation = "".join(ch for ch in lowered if ch not in punctuation)
    without_articles = re.sub(r"\b(a|an|the)\b", " ", without_punctuation)
    return " ".join(without_articles.split())

def f1_score(prediction, ground_truth):
    """Token-level F1 between a predicted answer and the reference answer.

    Both strings are normalised with `normalize_answer` and split on whitespace;
    the overlap counts each shared token at most as often as it occurs in both.
    Returns 0 when the two token bags share nothing.
    """
    predicted = normalize_answer(prediction).split()
    expected = normalize_answer(ground_truth).split()
    shared = sum((Counter(predicted) & Counter(expected)).values())
    if not shared:
        return 0
    precision = shared / len(predicted)
    recall = shared / len(expected)
    return (2 * precision * recall) / (precision + recall)
def best_prediction(predictions, ground_truth):
    """Pick the candidate answer with the highest F1 against `ground_truth`.

    `predictions` is a result dict whose 'answers' entry is a list of dicts, each
    with an 'answer' string. Returns `(answer, f1)`; on ties the later candidate
    wins, and an empty candidate list yields `('', 0)`.
    """
    best_answer, best_f1 = '', 0
    for candidate in predictions['answers']:
        # Score each candidate once and reuse the value for both the test and the update
        candidate_f1 = f1_score(candidate['answer'], ground_truth)
        if candidate_f1 >= best_f1:
            best_answer, best_f1 = candidate['answer'], candidate_f1
    return (best_answer, best_f1)
def ensemble_prediction(input1, input2, ground_truth,
                        weights=(0.5744746037802809, 0.5471326379596643)):
    """Combine two models' answers and return the best `(answer, f1)` among three candidates.

    `input1` / `input2` are result dicts whose 'answers' entry is a list of dicts with
    'answer' and 'score' keys, best first. The candidates are:
      1. the top answer of whichever model has the larger weighted top score,
      2. the first remaining answer of model 1,
      3. the first remaining answer of model 2.
    The candidate with the highest F1 against `ground_truth` is returned; on ties the
    later candidate wins.

    `weights` are the per-model multipliers applied to the top scores (model 1, model 2).

    The inputs are not modified, and a model that supplies too few answers simply
    contributes fewer candidates. With no answers at all the result is `('', 0)`.
    """
    # Copy the lists so that removing the winner does not alter the caller's results
    remaining1 = list(input1['answers'])
    remaining2 = list(input2['answers'])
    if not remaining1 and not remaining2:
        return ('', 0)

    #Select the most likely prediction, weighted by the f1-score of each model
    model1_wins = bool(remaining1) and (
        not remaining2
        or (weights[0] * remaining1[0]['score']) > (weights[1] * remaining2[0]['score'])
    )
    top = remaining1.pop(0) if model1_wins else remaining2.pop(0)
    candidates = [top] + remaining1[:1] + remaining2[:1]

    best_answer, best_f1 = '', 0
    for candidate in candidates:
        candidate_f1 = f1_score(candidate['answer'], ground_truth)
        if candidate_f1 >= best_f1:
            best_answer, best_f1 = candidate['answer'], candidate_f1
    return (best_answer, best_f1)

    

In [None]:
#Spot-check the ensemble on one question from the held-out set
qn = 'What is the normal permit fee for 10 return flights?'
query_args = dict(question=qn, top_k_retriever=30, top_k_reader=3)
input1 = finder.get_answers(**query_args)
input2 = finder2.get_answers(**query_args)
ensemble_prediction(input1, input2, test[qn])

08/02/2020 09:44:15 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.006s]
08/02/2020 09:44:15 - INFO - haystack.finder -   Reader is looking for detailed answer in 177632 chars ...
Inferencing Samples: 100%|██████████| 2/2 [00:09<00:00,  4.91s/ Batches]
Inferencing Samples: 100%|██████████| 2/2 [00:09<00:00,  4.91s/ Batches]
Inferencing Samples: 100%|██████████| 2/2 [00:09<00:00,  4.94s/ Batches]
Inferencing Samples: 100%|██████████| 2/2 [00:10<00:00,  5.01s/ Batches]
08/02/2020 09:44:55 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.006s]
08/02/2020 09:44:55 - INFO - haystack.finder -   Reader is looking for detailed answer in 177632 chars ...
Inferencing Samples: 100%|██████████| 2/2 [00:02<00:00,  1.13s/ Batches]
Inferencing Samples: 100%|██████████| 2/2 [00:02<00:00,  1.13s/ Batches]
Inferencing Samples: 100%|██████████| 2/2 [00:02<00:00,  1.14s/ Batches]
Inferencing Samples: 100%|██████████|

[['S$810', 12.293428421020508], ['iv. S$810', 1.7623662948608398], ['S$810', 11.129851341247559]]
[['S$810', 12.293428421020508], ['iv. S$810', 1.7623662948608398], ['S$810', 11.129851341247559]]
[['S$810', 12.293428421020508], ['iv. S$810', 1.7623662948608398], ['S$810', 11.129851341247559]]





('S$810', 0)

In [None]:
#Run both models and the ensemble over every held-out question and collect the results
qns = list(test.keys())
test1 = []
preds1, scores1 = [], []
preds2, scores2 = [], []
combinedPreds, combinedScores = [], []
for qn in qns:
    expected = test[qn]
    #top_k_retriever caps how many documents the retriever shortlists;
    #top_k_reader is the number of predictions each reader returns
    prediction1, prediction2 = [
        model.get_answers(question=qn, top_k_retriever=30, top_k_reader=3)
        for model in (finder, finder2)
    ]
    pred1, score1 = best_prediction(prediction1, expected)
    pred2, score2 = best_prediction(prediction2, expected)
    combinedPred, combinedScore = ensemble_prediction(prediction1, prediction2, expected)
    for bucket, value in ((preds1, pred1), (preds2, pred2),
                          (scores1, score1), (scores2, score2),
                          (combinedPreds, combinedPred), (combinedScores, combinedScore)):
        bucket.append(value)

08/02/2020 09:45:30 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.006s]
08/02/2020 09:45:30 - INFO - haystack.finder -   Reader is looking for detailed answer in 177632 chars ...
Inferencing Samples: 100%|██████████| 2/2 [00:09<00:00,  4.92s/ Batches]
Inferencing Samples: 100%|██████████| 2/2 [00:09<00:00,  4.96s/ Batches]
Inferencing Samples: 100%|██████████| 2/2 [00:09<00:00,  4.97s/ Batches]
Inferencing Samples: 100%|██████████| 2/2 [00:10<00:00,  5.05s/ Batches]
08/02/2020 09:46:10 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.008s]
08/02/2020 09:46:10 - INFO - haystack.finder -   Reader is looking for detailed answer in 177632 chars ...
Inferencing Samples: 100%|██████████| 2/2 [00:02<00:00,  1.16s/ Batches]
Inferencing Samples: 100%|██████████| 2/2 [00:02<00:00,  1.16s/ Batches]
Inferencing Samples: 100%|██████████| 2/2 [00:02<00:00,  1.16s/ Batches]
Inferencing Samples: 100%|██████████|

In [None]:
#Task type of every held-out question, in the same order as `qns`
types = [taskType[qn] for qn in qns]
#Reference answer from the question sheet (first row whose Question matches)
answers = [
    queries['Answer'][queries[queries['Question'] == question].index[0]]
    for question in test
]

result_columns = {
    'Question': qns,
    'Type': types,
    'Answer': answers,
    'Prediction1': preds1,
    'Score1': scores1,
    'Prediction2': preds2,
    'Score2': scores2,
    'OverallPrediction': combinedPreds,
    'OverallScore': combinedScores,
}
#Lowest ensemble scores come first in the saved file
df = pd.DataFrame(result_columns).sort_values(by=['OverallScore'])
df.to_csv('results.csv', index=False)

In [None]:
#Only two models can be used at once, so the csv files from separate runs were combined
#and uploaded again before being processed here.
#Mean F1 per model, overall and broken down by task type.
models = ['Albert', 'Electra', 'Ensemble']
score_columns = ['Score1', 'Score2', 'OverallScore']
qTypes = {1: 'Single Supporting Fact', 6: 'Yes/No', 8: 'Lists', 9: 'Simple Negation'}

scores_by_model = {model: {'Overall': []} for model in models}
#Rows are addressed by their index label (0..n-1), which survives the earlier sort
for row_label in range(len(df)):
    # A local name is used for the task label so the question->type dict `taskType`
    # built earlier in the notebook is not overwritten.
    task_name = qTypes[df.loc[row_label, 'Type']]
    for model, column in zip(models, score_columns):
        score = df.loc[row_label, column]
        per_task = scores_by_model[model]
        per_task.setdefault(task_name, []).append(score)
        per_task['Overall'].append(score)

scoreBreakdown = {
    model: {task_name: np.mean(task_scores) for task_name, task_scores in per_task.items()}
    for model, per_task in scores_by_model.items()
}
scoreBreakdown

In [None]:
#Choosing models to be combined into an ensemble.
#Picking the models with the best individual performance is not enough: a good pair is one
#whose members compensate for each other's weaknesses on particular task types.
#Each pair is therefore scored by summing, over the task types, the better of the two
#models' scores, and the pairs are listed from most to least compatible.

comparison = {'ALBERT':{'Overall':0.5745,1:0.5952,6:0.2899,8:0.5931,9:0.8488},'ROBERTA':{'Overall':0.5781,1:0.7437,6:0.2222,8:0.4477,9:0.3140},
              'ELECTRA':{'Overall': 0.5471,1:0.6708,6:0.6667,8:0.1333,9:0.2465},'BERT':{'Overall':0.3472,1:0.4400,6:0.3939,8:0.1200,9:0.0612}}

models = list(comparison.keys())
scored_pairs = []
for position, first in enumerate(models[:-1]):
    for second in models[position + 1:]:
        # 'Overall' is excluded: only the per-task scores express complementarity
        pair_score = sum(max(comparison[first][task], comparison[second][task])
                         for task in comparison[first] if task != 'Overall')
        scored_pairs.append((pair_score, first, second))
# Rank on the numeric score itself instead of parsing it back out of the message text;
# the sort is stable, so tied pairs keep their generation order.
scored_pairs.sort(key=lambda pair: pair[0], reverse=True)
output = ['{}, {} have an expected compatibility of {}'.format(first, second, pair_score)
          for pair_score, first, second in scored_pairs]
output

['ALBERT, ELECTRA have an expected compatibility of 2.7794',
 'ALBERT, ROBERTA have an expected compatibility of 2.4755000000000003',
 'ALBERT, BERT have an expected compatibility of 2.431',
 'ROBERTA, ELECTRA have an expected compatibility of 2.1721',
 'ROBERTA, BERT have an expected compatibility of 1.8993',
 'ELECTRA, BERT have an expected compatibility of 1.7172999999999998']