# Load Libs

In [1]:
import os
import time

from IPython.display import clear_output

from bot import BertGoggles

from logistic import Logistic

import pickle as pkl
import numpy as np

import sklearn as sk
from sklearn import linear_model
from sklearn import preprocessing as pre

import matplotlib.pyplot as plt

import tensorflow as tf

from squad_test import compute_exact, compute_f1

## To run with CPU

In [None]:
# tf.config.experimental.set_visible_devices(devices=[], device_type='GPU')

# Load Models

### Index Dir

In [None]:
# Handed to BertGoggles as its fourth positional argument when the model is
# built; presumably the path of the search index — TODO confirm.
INDEX = 'Index'

### Model Dir

In [None]:
# Handed to BertGoggles as its first positional argument; presumably the
# directory of the pretrained BERT model — TODO confirm.
BERT_MODEL = 'pretrained/model'
# Bare expression so the notebook echoes the configured value.
BERT_MODEL

### Checkpoint Dir

In [None]:
# Handed to BertGoggles as its second positional argument; presumably the
# fine-tuned checkpoint location — TODO confirm.
BERT_CHKPT = 'fine-tuned/checkpoint'
# Bare expression so the notebook echoes the configured value.
BERT_CHKPT

### BERT length

In [None]:
# Handed to BertGoggles as its third positional argument; presumably the
# maximum BERT input length — TODO confirm the unit (tokens vs. characters).
MAXLEN = 350

### Logistic Dir

In [None]:
# Forwarded to BertGoggles through its `logistic_dir` keyword argument.
LOGISTIC = 'log_1.pkl'

### Load Model

In [None]:
# Build the combined system from the settings configured above. Later cells
# read its `answerini`, `bert_model` and `logistic` attributes.
bert_goggles = BertGoggles(BERT_MODEL, BERT_CHKPT, MAXLEN, INDEX, logistic_dir=LOGISTIC, top_n=10)

# Load Questions

In [None]:
# Read the pickled question collection from the working directory.
# NOTE: unpickling can execute arbitrary code, so only load a file you trust.
with open('squad_val_questions.pkl', 'rb') as file:
    questions = pkl.load(file)

### Keep last half of questions

In [None]:
# Slice from the integer midpoint to the end; with an odd number of questions
# the kept half is the larger one.
questions = questions[len(questions)//2:]

### Remove impossible questions

In [None]:
# Keep only the entries whose 'label' value is falsy (per the heading above,
# a truthy label presumably marks an impossible question — TODO confirm).
new_questions = [entry for entry in questions if not entry['label']]

questions = new_questions

# Anserini Results

In [None]:
def get_anserini_results(anserini, collection, top_n=10):
    """Search every question and record where its reference id shows up.

    Args:
        anserini: Object exposing ``search(question, top_n=...)`` that returns
            a sequence of hits, each carrying an ``id`` attribute.
        collection: Sequence of dicts with a ``'question'`` and an ``'id'``
            entry. Ids are compared to hit ids after ``int()`` conversion.
        top_n: Forwarded unchanged to ``anserini.search``.

    Returns:
        A tuple ``(results, not_found, positions, times)`` where
        ``results`` holds the hits returned for each question,
        ``not_found`` holds the indices of questions whose id matched no hit,
        ``positions`` holds the index of the matching hit (``-1`` if none), and
        ``times`` holds the wall-clock seconds spent in each search call.
    """
    results = []
    not_found = []
    times = []
    positions = []

    # Number of questions between two progress reports.
    wait_iter = 50

    for i, c in enumerate(collection):

        question = c['question']
        correct_id = c['id']

        # Only the search call itself is timed.
        start = time.time()
        hits = anserini.search(question, top_n=top_n)
        times.append(time.time() - start)

        # -1 doubles as the "not found" marker; if several hits share the id,
        # the last matching index is the one that is kept.
        position = -1
        for j, hit in enumerate(hits):
            if int(hit.id) == int(correct_id):
                position = j

        positions.append(position)

        # The hits are stored for every question, found or not.
        results.append(hits)
        if position == -1:
            not_found.append(i)

        done = i + 1
        if done % wait_iter == 0:
            from IPython.display import clear_output
            clear_output(wait=True)

            # Report on the questions actually completed (i + 1, not i).
            total = len(collection)
            print('{0:.2f}%'.format(round(done / total, 4) * 100))

            avg = np.mean(times)
            time_left = (total - done) * avg / 60

            print('Time remaining: {0:.2f} mins'.format(time_left))

    return results, not_found, positions, times

In [None]:
# Run the retriever over every remaining question, asking for 10 hits each.
# NOTE(review): the attribute is spelled `answerini` — confirm it matches the
# name BertGoggles actually defines.
ans_results, ans_not_found, ans_positions, ans_times = \
    get_anserini_results(bert_goggles.answerini, questions, top_n=10)

# BERT Results

In [None]:
def get_bert_results(bert, anserini_results, collection):
    """Run the reader over each question's retrieved hits and locate the reference id.

    Args:
        bert: Object exposing ``search(question, hits)`` that returns a
            sequence of hits, each with an ``id`` attribute and an
            ``_asdict()`` method (as a namedtuple provides).
        anserini_results: Per-question retrieval results, paired with
            ``collection`` by position.
        collection: Sequence of dicts with a ``'question'`` and an ``'id'``
            entry. Ids are compared to hit ids after ``int()`` conversion.

    Returns:
        A tuple ``(results, not_found, positions, times)`` where
        ``results`` holds, per question, the hits converted to dicts,
        ``not_found`` holds the indices of questions whose id matched no hit,
        ``positions`` holds the index of the matching hit (``-1`` if none), and
        ``times`` holds the wall-clock seconds spent in each search call.
    """
    results = []
    not_found = []
    times = []
    positions = []

    # Number of questions between two progress reports.
    wait_iter = 5

    for i, (c, r) in enumerate(zip(collection, anserini_results)):

        question = c['question']
        correct_id = c['id']

        # Only the reader call itself is timed.
        start = time.time()
        hits = bert.search(question, r)
        times.append(time.time() - start)

        # -1 doubles as the "not found" marker; if several hits share the id,
        # the last matching index is the one that is kept.
        position = -1
        dict_hits = []
        for j, hit in enumerate(hits):
            if int(hit.id) == int(correct_id):
                position = j
            # Plain dicts are stored instead of the hit objects themselves.
            dict_hits.append(hit._asdict())

        if position == -1:
            not_found.append(i)

        # Running count of misses, printed after every question.
        print(len(not_found))

        positions.append(position)
        results.append(dict_hits)

        done = i + 1
        if done % wait_iter == 0:
            from IPython.display import clear_output
            clear_output(wait=True)

            # Report on the questions actually completed (i + 1, not i).
            total = len(collection)
            print('{0:.2f}%'.format(round(done / total, 4) * 100))

            avg = np.mean(times)
            time_left = (total - done) * avg / 60

            print('Time remaining: {0:.2f} mins'.format(time_left))

    return results, not_found, positions, times

In [None]:
# Holds the whole (results, not_found, positions, times) tuple returned by
# get_bert_results; it is pickled as one object in the next cell.
BERT_results = get_bert_results(bert_goggles.bert_model, ans_results, questions)

### Save BERT results

In [None]:
# Persist the BERT_results tuple so the reader pass need not be repeated.
with open('bert_results.pkl', 'wb') as file:
    pkl.dump(BERT_results, file)

### Load BERT results

In [None]:
# Reload the saved tuple and unpack it into its four components.
# NOTE: unpickling can execute arbitrary code, so only load a file you trust.
with open('bert_results.pkl', 'rb') as file:
    bert_results, bert_not_found, bert_positions, bert_times = pkl.load(file)

### BERT Without Impossible Questions Test

In [None]:
EM = []
F1 = []

for hits, gold_pos, entry in zip(bert_results, bert_positions, questions):

    # No hit carried the reference id: count the question as zero on both metrics.
    if gold_pos == -1:
        F1.append(0)
        EM.append(0)
        continue

    prediction = hits[gold_pos]['text']

    # One (exact match, F1) pair per reference answer, scored in order.
    pair_scores = [
        (compute_exact(reference, prediction), compute_f1(reference, prediction)[0])
        for reference in entry['answers']
    ]

    # Best score over all references; the leading 0 is the floor when none beats it.
    F1.append(max([0, *(f1_value for _, f1_value in pair_scores)]))
    EM.append(max([0, *(em_value for em_value, _ in pair_scores)]))

### F1 Score

In [None]:
# Mean of the per-question F1 values, rounded to four decimals.
round(np.mean(F1), 4)

### EM Score

In [None]:
# Mean of the per-question exact-match values, rounded to four decimals.
round(np.mean(EM), 4)

# System Scores

In [None]:
def rScore(correct_index):
    """Return the rank score for a zero-based position in a result list.

    Positions 0-4 score ``1 - 0.1 * correct_index``, positions 5-9 score a
    flat ``0.1``, and every other position scores ``0``.

    Args:
        correct_index: Zero-based position of the correct item. Negative
            values, such as the ``-1`` "not found" marker used elsewhere in
            this notebook, score ``0`` instead of a value above 1.

    Returns:
        The score as a number between 0 and 1.
    """
    # Outside the scored window, including negative "not found" markers.
    if correct_index < 0 or correct_index >= 10:
        return 0

    if correct_index >= 5:
        return 0.1

    return 1 - 0.1 * correct_index

### EM and F1 Scores

In [None]:
EM = []
F1 = []

for p, q, b in zip(bert_positions, questions, bert_results):

    # Questions whose reference id was not among the hits get no entry, so
    # EM and F1 hold one value per found question only.
    if p == -1:
        continue

    predicted_answer = b[p]['text']
    answers = q['answers']

    f1 = 0
    em = 0

    # Keep the best score over all reference answers.
    for ans in answers:
        new_f1, _ = compute_f1(ans, predicted_answer)
        f1 = max(f1, new_f1)
        em = max(em, compute_exact(ans, predicted_answer))

    # Append the values computed above; the names `rf1` / `rem` used here
    # before were never defined and raised NameError.
    F1.append(f1)
    EM.append(em)

EM = np.asarray(EM)
F1 = np.asarray(F1)

### rScores for Anserini

In [None]:
# Array form allows boolean masking; later cells rely on this conversion too.
bert_positions = np.asarray(bert_positions)

# Positions of the questions whose reference id was found (-1 marks a miss).
bert_positions_present = bert_positions[bert_positions != -1]

# NOTE(review): the heading says Anserini, but the scores are computed from
# bert_positions — confirm the two position lists are interchangeable here.
ans_rScores = [rScore(found_at) for found_at in bert_positions_present]

# Normalised by every question, so misses contribute zero.
np.sum(ans_rScores) / len(bert_positions)

### Logistic Regression Input

In [None]:
log_input = []

# Built once so the skip test below is a constant-time lookup instead of a
# scan through the whole list for every question.
skipped = set(ans_not_found)

for i, (bert, ans) in enumerate(zip(bert_results, ans_results)):

    # Questions the retriever missed contribute no feature rows.
    if i in skipped:
        continue

    # One [retriever score, reader score, reader null score] row per paired hit.
    content_score = [
        [a.score, b['score'], b['null_score']]
        for b, a in zip(bert, ans)
    ]

    log_input.append(content_score)

log_input = np.asarray(log_input)

### Logistic Regression Scores

In [None]:
# For each question, the hit indices ordered by the logistic model's score.
# NOTE(review): np.argsort orders ascending — confirm that a lower
# `logistic.score` value is meant to rank first.
log_sorted = [
    np.argsort(bert_goggles.logistic.score(features))
    for features in log_input
]

### Logistic Ranking

In [None]:
# Recomputed here so the cell does not depend on the rScore cell having run.
bert_positions_present = bert_positions[bert_positions != -1]

# Rank of each found position = where its index sits in the logistic ordering.
log_ranks = np.asarray([
    np.where(order == found_at)[0][0]
    for found_at, order in zip(bert_positions_present, log_sorted)
])

### Net Position Improvement

In [None]:
# Total of (original position - logistic rank) over the found questions;
# a positive total means the logistic ranks are lower on balance.
np.sum(bert_positions_present - log_ranks)

### System rScore

In [None]:
# Score every logistic rank with the same rank-score function.
system_rScore = [rScore(rank) for rank in log_ranks]

# Normalised by every question, so misses contribute zero.
np.sum(system_rScore) / len(bert_positions)

### System REM

In [None]:
# Element-wise product of EM and the system rank scores, summed and divided
# by the number of entries in bert_positions (misses included).
np.sum(np.asarray(EM) * np.asarray(system_rScore)) / len(bert_positions)

### System RF1

In [None]:
# Element-wise product of F1 and the system rank scores, summed and divided
# by the number of entries in bert_positions (misses included).
np.sum(np.asarray(F1) * np.asarray(system_rScore)) / len(bert_positions)

### System Latency

In [None]:
# Mean seconds per reader call plus mean seconds per retriever call.
np.mean(bert_times) + np.mean(ans_times)