In [None]:
%%capture
!pip3 install pickle5
import re
import json
import numpy as np
import pickle5 as pickle # used for colab
from sklearn.metrics.pairwise import cosine_similarity as sk_cosine_similarity

# E2E Model

## Text embedding

In [None]:
class TextEmbedder(object):
  '''
  Interface for document embedding.

  Subclasses implement `embed`; instances are callable, and calling one
  simply forwards the argument to `embed`.
  '''
  def embed(self, text):
    '''Returns the embedding of `text`; must be overridden by subclasses.'''
    # NotImplementedError is still an Exception, so existing handlers keep working
    raise NotImplementedError('Not implemented!')

  def __call__(self, question):
    '''Shortcut for `embed(question)`.'''
    return self.embed(question)

class LSAModel(TextEmbedder):
  '''Wrapper for the Gensim LSA model to be easily used in the E2E model'''
  def __init__(self, path, vocabulary, frequency_threshold=None):
    '''
    path - file from which the LsiModel is loaded
    vocabulary - file from which the vocabulary object is loaded; `embed`
                 calls `doc2bow` on it
    frequency_threshold - optional value, only stored on the instance
                          (it was previously read from an undefined name,
                          which made the constructor raise NameError)
    '''
    from gensim.models import LsiModel
    from gensim.utils import ClippedCorpus
    self.model = LsiModel.load(path)
    self.frequency_threshold = frequency_threshold
    # NOTE(review): presumably the file holds a saved gensim Dictionary and
    # ClippedCorpus.load is only used as a generic loader - TODO confirm
    self.vocabulary = ClippedCorpus.load(vocabulary)

  def embed(self, text):
    '''Returns the list of values the model yields for the bag of words of `text`.'''
    return [ value for _, value in self.model[self.vocabulary.doc2bow(text)] ]

class Doc2VecModel(TextEmbedder):
  '''Adapter that exposes a saved Gensim Doc2Vec model through the TextEmbedder interface'''
  def __init__(self, path, epochs=200):
    '''
    path - file from which the Doc2Vec model is loaded
    epochs - number of epochs passed to `infer_vector` on every `embed` call
    '''
    from gensim.models.doc2vec import Doc2Vec
    self.epochs = epochs
    self.model = Doc2Vec.load(path)

  def embed(self, text):
    '''Infers and returns the vector of `text` using the stored number of epochs.'''
    inferred = self.model.infer_vector(text, epochs=self.epochs)
    return inferred

## Question Answering

In [None]:
class TextPreprocessor(object):
  '''
  Interface for lemmatization and ner tagging.

  Both methods receive a question and must be overridden by subclasses.
  '''
  def lemmatize(self, question):
    '''Returns the processed tokens of `question`; must be overridden.'''
    # NotImplementedError is still an Exception, so existing handlers keep working
    raise NotImplementedError('Not implemented!')

  def ner(self, question):
    '''Returns the NER tags of `question`; must be overridden.'''
    raise NotImplementedError('Not implemented!')

class QuestionAnswer(TextPreprocessor):
  '''
  Interface for Question Answering task.

  Subclasses implement `answer`; instances are callable, and calling one
  simply forwards the arguments to `answer`.
  '''
  def answer(self, question, paragraph):
    '''Returns the answer to `question` found in `paragraph`; must be overridden.'''
    # NotImplementedError is still an Exception, so existing handlers keep working
    raise NotImplementedError('Not implemented!')

  def __call__(self, question, paragraph):
    '''Shortcut for `answer(question, paragraph)`.'''
    return self.answer(question, paragraph)

class QAModel(QuestionAnswer):
  '''
  Wrapper for the Question Answer neural model to be easily used in the E2E model.

  Besides answering questions, it implements the TextPreprocessor methods
  (`lemmatize` and `ner`) on top of the same spaCy pipeline.
  '''
  # models cache: maps a model path to the loaded model, shared by all instances
  MODELS = dict()

  def __init__(self, path, tokenizer_path, ner_path, pos_path, question_padding=60, paragraph_padding=400):
    '''
    path - file of the saved model (loaded once per path, then cached in MODELS)
    tokenizer_path - pickle file of the word tokenizer (must offer `texts_to_sequences`)
    ner_path - pickle file of the mapping used to encode entity types (read with `.get`)
    pos_path - pickle file of the mapping used to encode POS tags (read with `.get`)
    question_padding - fixed length of the question features
    paragraph_padding - fixed length of the paragraph features
    '''
    from tensorflow.keras.models import load_model
    # NOTE(review): to_categorical is imported but never used in this class
    from tensorflow.keras.utils import to_categorical
    import en_core_web_sm
    import pickle5 as pickle
    
    # if the path is not in the cache load the model from the file system
    if path not in QAModel.MODELS: QAModel.MODELS[path] = load_model(path)

    # load all the model dependencies
    self.model = QAModel.MODELS[path]
    self.preprocessor = en_core_web_sm.load()
    with open(tokenizer_path, 'rb') as handle: self.tokenizer = pickle.load(handle)
    with open(ner_path, 'rb') as handle: self.ner_codes = pickle.load(handle)
    with open(pos_path, 'rb') as handle: self.pos_codes = pickle.load(handle)
    self.question_padding = question_padding
    self.paragraph_padding = paragraph_padding
    # the position in this list (plus one) is the class id of a question, see __classify_question
    self.question_keys = ['what','how','why','where','when','which','who','whose','whom']

  def lemmatize(self, question):
    '''
    returns the text of every token of the question that is neither
    punctuation nor a stop word.
    NOTE(review): despite the name, the surface text is returned (token.text),
    not the lemma.
    '''
    return [
      token.text
      for token in self.preprocessor(question)
      if not (token.is_punct or token.is_stop)
    ]

  def ner(self, question):
    '''
    returns the entity type of every token of the question that is neither
    punctuation nor a stop word; the filter is the same as in `lemmatize`,
    so the two outputs are aligned position by position.
    '''
    return [
      token.ent_type_
      for token in self.preprocessor(question)
      if not (token.is_punct or token.is_stop)
    ]

  def __tokenize(self, word):
    '''
    uses the tokenizer dependency to convert a word into a numerical token;
    returns 0 when the tokenizer produces no id for the word
    '''
    r = self.tokenizer.texts_to_sequences([ word ])[0]
    if len(r): return r[0]
    return 0

  def __pad(self, tokens, length, padding=0):
    '''
    computes the pre padding (the original author notes it is faster than tensorflow):
    the padding value is prepended until `length` items are reached, while
    longer inputs are cut to their first `length` items
    '''
    # a negative repetition count yields an empty list, so long inputs get no padding
    return np.array([ padding ] * (length - len(tokens)) + list(tokens)[:length])

  def __preprocess(self, text, padding=None):
    '''
    preprocess a text converting it in an input suitable for the model.
    Returns a tuple of five arrays (word ids, POS codes, entity codes,
    like-number flags, stop-word flags), each reshaped to (1, padding, 1),
    together with the list of the raw token texts. Punctuation is skipped.
    '''
    tokens = self.preprocessor(text)
    raw = []
    texts = []
    tags  = []
    ents  = []
    nums  = []
    stops = []
    for token in tokens:
      if token.is_punct: continue
      raw.append(token.text)
      texts.append(self.__tokenize(token.text))
      # unknown POS tags and entity types are encoded as 0
      tags.append(self.pos_codes.get(token.tag_, 0))
      ents.append(self.ner_codes.get(token.ent_type_, 0))
      nums.append(token.like_num)
      stops.append(token.is_stop)

    features = [ self.__pad(feature, length=padding) for feature in (texts, tags, ents, nums, stops) ]
    # add a leading and a trailing axis of size 1 to every feature
    features = [ feature.reshape((1,) + feature.shape + (1, )) for feature in features ]
    return tuple(features), raw

  def __presence_flag(self, question_tokens, paragraph_tokens):
    '''
    creates an array of flags where 1 means that a paragraph token is
    present in the question, while 0 means that is absent.
    Tokens equal to 0 (padding or words without an id) are never flagged.
    '''
    question_tokens = list(question_tokens.squeeze())
    paragraph_tokens = list(paragraph_tokens.squeeze())
    result = np.array([ bool(token and token in question_tokens) for token in paragraph_tokens ])
    # same layout as the other features: leading and trailing axis of size 1
    return result.reshape((1, ) + result.shape + (1, ))
  
  def __classify_question(self, question):
    '''
    computes the classification id of a question.
    If the question contains only one keyword (like 'what', 'where', etc.)
    then the id is assigned. If no keywords or more then one are found, then
    the classification outputs the 0 id (absent).
    NOTE(review): `answer` passes the raw question string, so `key in question`
    is a case-sensitive substring test; 'who' is contained in 'whose' and
    'whom', so such questions match two keys and get the 0 id - confirm that
    this matches the preprocessing used at training time before changing it.
    '''
    result = 0
    found = False
    for key_id, key in enumerate(self.question_keys):
      if key in question:
        if found:
          # second keyword found: the question is ambiguous
          result = 0
          break
        else:
          found = True
          result = key_id + 1
    return np.array([[ result ]])

  def answer(self, question, paragraph):
    '''
    computes the answer given a question and a paragraph.
    Returns ((start, end), (start_confidence, end_confidence), answer_text),
    where start and end are offsets in the non-punctuation tokens of the
    paragraph (end is exclusive) and answer_text joins those tokens with spaces.
    '''
    pre_qst, _ = self.__preprocess(question, padding=self.question_padding)
    qst_class = self.__classify_question(question)

    pre_par, raw_par = self.__preprocess(paragraph, padding=self.paragraph_padding)
    
    # shift is needed to align padded output and start/end indexes 
    shift = max(self.paragraph_padding - len(raw_par), 0)

    X = dict(
      # the question word ids are passed without the trailing axis, unlike the other inputs
      input_question=pre_qst[0].reshape(pre_qst[0].shape[:-1]),
      input_question_pos_tag=pre_qst[1],
      input_question_ner_tag=pre_qst[2],
      input_question_is_num=pre_qst[3],
      input_question_is_stop=pre_qst[4],
      input_question_class=qst_class,

      input_paragraph=pre_par[0],
      input_paragraph_pos_tag=pre_par[1],
      input_paragraph_ner_tag=pre_par[2],
      input_paragraph_is_num=pre_par[3],
      input_paragraph_is_stop=pre_par[4],
      input_paragraph_flag=self.__presence_flag(pre_qst[0], pre_par[0])
    )
    Y = self.model.predict(X)

    # extract start and end from probabilities and align with the shift
    start, end = np.argmax(Y[0], axis=-1).item(), np.argmax(Y[1], axis=-1).item()
    # NOTE(review): when a predicted index falls inside the padding the offsets
    # become negative and the slice below counts from the end of raw_par - TODO confirm
    start_offset, end_offset = start - shift, end - shift + 1

    return (start_offset, end_offset), (Y[0][0][start], Y[1][0][end]), str.join(' ', raw_par[start_offset:end_offset])

## E2E Wrapper

### IR Utilities

In [None]:
#infer vector from Doc2Vec model
def get_Doc2Vec_vector(model, text, **kwargs):
  '''
  Infers the vector of `text` with `model.infer_vector`.
  The number of epochs is read from the 'doc2vec_infer_epochs' keyword
  argument and defaults to 200; any other keyword argument is ignored.
  '''
  epochs = kwargs.get('doc2vec_infer_epochs', 200)
  return model.infer_vector(text, epochs=epochs)

#infer vector from LSA model
def get_LSA_vector(model, text, **kwargs):
  '''
  Converts `text` to a bag of words with the mandatory 'dictionary' keyword
  argument, looks it up in `model` and returns the list of the values of
  the resulting (id, value) pairs.
  '''
  bag_of_words = kwargs['dictionary'].doc2bow(text)
  projection = model[bag_of_words]
  return [weight for _, weight in projection]

#compute similarity using sklearn library
def cosine_similarity(question_vector, doc_vector):
  '''
  Returns, as a plain scalar, the sklearn cosine similarity between the
  two vectors (each one is wrapped in a list to form a one-row matrix).
  '''
  similarity_matrix = sk_cosine_similarity([question_vector], [doc_vector])
  return similarity_matrix.item()

def sort_dict_by_value(dictionary, reverse=True):
  '''
  Returns a new dictionary with the items of `dictionary` ordered by value,
  from the highest to the lowest unless `reverse` is False.
  '''
  ordered_items = sorted(dictionary.items(), key=lambda item: item[1], reverse=reverse)
  return dict(ordered_items)

#check if one of the two vectors has dimension 0
def not_null(vector1, vector2):
  '''
  Tells whether both vectors have a non-zero length.
  When `vector1` is exactly a tuple the two arguments are compared
  component by component and a boolean is returned; otherwise the product
  of the two lengths is returned (0 when one of them is empty).
  '''
  if type(vector1) == tuple:
    return all(len(vector1[i]) * len(vector2[i]) for i in range(len(vector1)))
  return len(vector1) * len(vector2)

def is_word_in_doc(word, doc):
  '''Tells whether `word` is contained in `doc` (plain `in` membership test).'''
  found = word in doc
  return found

def is_question_word_in_doc(question, doc):
  '''Tells whether at least one word of `question` is contained in `doc`.'''
  return any(is_word_in_doc(word, doc) for word in question)

#select the tokens inside the param text that are tagged with a NER tag
def select_ner_tokens(text, ner_tagged_text):
  '''
  Returns the tokens of `text` whose tag in `ner_tagged_text` marks an entity.

  The two sequences are walked in parallel. A tag marks "no entity" when it
  is falsy: this covers the numeric code 0 as well as the empty string that
  QAModel.ner returns for untagged tokens (the previous `!= 0` test treated
  the empty string as a tag, so every token was selected).
  '''
  return [ token for token, ner_tag in zip(text, ner_tagged_text) if ner_tag ]

#compute with the 'models' the vectors of the 'documents' using the functions 'get_vector_functions'
def compute_documents_vectors(models, get_vector_functions, documents, **kwargs):
  '''
  Builds, for every model, a dictionary { title: (doc, vector) } over the
  (title, doc) pairs of `documents`. The vector of a document is computed by
  the function of `get_vector_functions` found at the same position as the
  model, called as function(model, doc, **kwargs).
  Returns the list of those dictionaries, one per model.
  '''
  return [
    {
      title: (doc, get_vector_functions[position](model, doc, **kwargs))
      for title, doc in documents
    }
    for position, model in enumerate(models)
  ]

#compute the similarity for each of the doc-question pairs
def compute_scores_by_vectors(doc_vectors, question_vector):
  '''
  Returns a dictionary { title: score } for the { title: (doc, vector) }
  entries of `doc_vectors`. The score is the cosine similarity between the
  question vector and the document vector, or 0 when `not_null` reports
  that one of the two is empty.
  '''
  return {
    title: cosine_similarity(question_vector, doc_vector) if not_null(question_vector, doc_vector) else 0
    for title, (_, doc_vector) in doc_vectors.items()
  }

#return the documents with the scores, sorted by the score
def compute_documents_ranking(models, get_vector_functions, doc_vectors, question, question_ner=None, score_normalization=True, exact_match_bonus=0, ner_exact_match_bonus=0, **kwargs):
  '''
  Ranks the documents against a question.

  models - list of models used to embed the question
  get_vector_functions - list of functions, aligned with `models`, called as
                         function(model, question, **kwargs) to get the question vector
  doc_vectors - list of dictionaries { title: (doc, vector) }, aligned with `models`
  question - the question, as the sequence of words passed to the vector functions
  question_ner - optional tags aligned with `question`, used for the NER bonus
  score_normalization - when true the scores of each model are min-max
                        normalized before being averaged
  exact_match_bonus - value added to a document that contains at least one word of the question
  ner_exact_match_bonus - value added to a document that contains at least one tagged word of the question

  Returns (ranked_docs, em_bonus_counter, ner_em_bonus_counter): the
  dictionary { title: score } sorted from the best score (the final scores
  are always min-max normalized) and the number of documents that received
  each bonus. With no models or no documents an empty ranking is returned
  instead of failing on an empty sequence.
  '''
  em_bonus_counter = 0
  ner_em_bonus_counter = 0

  #compute question vector and score between question and docs
  model_scores = []
  for i, model in enumerate(models):
    question_vector = get_vector_functions[i](model, question, **kwargs)
    model_scores.append(compute_scores_by_vectors(doc_vectors[i], question_vector))

  #nothing to rank: min()/max() below would raise on an empty sequence
  if not model_scores or not model_scores[0]:
    return dict(), em_bonus_counter, ner_em_bonus_counter

  #perform the normalization of the scores
  if score_normalization:
    model_scores = [ normalize_dict_values(single_model_scores) for single_model_scores in model_scores ]

  #average scores if more models are passed
  scores = dict()
  for title in model_scores[0]:
    score_sum = 0
    for single_model_scores in model_scores:
      score_sum += single_model_scores[title]
    scores[title] = score_sum / len(model_scores)

  #if the question vector is null, then all the scores will be equal to 0. If this condition doesn't hold compute the exact match's bonuses
  all_scores_null = min(scores.values()) == 0 and max(scores.values()) == 0
  if not all_scores_null and (exact_match_bonus or ner_exact_match_bonus):
    #select tokens in the question with a NER tag assigned
    use_ner_bonus = bool(question_ner and ner_exact_match_bonus)
    ner_tokens = select_ner_tokens(question, question_ner) if use_ner_bonus else []
    #the document contents are read from the entries of the first model
    for title, (doc, _) in doc_vectors[0].items():
      keyword_present = is_question_word_in_doc(question, doc) if exact_match_bonus else False
      ner_keyword_present = is_question_word_in_doc(ner_tokens, doc) if use_ner_bonus else False
      #add the bonuses; the result may exceed 1 here, the final normalization brings it back in range
      scores[title] = scores[title] + (exact_match_bonus * keyword_present) + (ner_exact_match_bonus * ner_keyword_present)
      #count how many pairs question-docs are affected by the exact match bonus
      if keyword_present:
        em_bonus_counter += 1
      if ner_keyword_present:
        ner_em_bonus_counter += 1

  ranked_docs = sort_dict_by_value(normalize_dict_values(scores))
  return ranked_docs, em_bonus_counter, ner_em_bonus_counter

def normalize_dict_values(dictionary):
  '''
  Returns a new dictionary with the values of `dictionary` min-max
  normalized: (value - min) / (max - min).

  When all the values are equal every value becomes 1, and an empty
  dictionary yields an empty dictionary. The input is never modified
  (it used to be updated in place only when the values differed), so
  callers must use the returned dictionary.
  '''
  #min() and max() raise on an empty sequence
  if not dictionary:
    return dict()
  min_value = min(dictionary.values())
  max_value = max(dictionary.values())
  #all the values are equal: the range is 0 and the division is not possible
  if min_value == max_value:
    return { k: 1 for k in dictionary }
  value_range = max_value - min_value
  return { k: (v - min_value) / value_range for k, v in dictionary.items() }

#return the results of the metrics for a specific model (or a set of models combined together)
def evaluate_model(models, get_vector_functions, documents, questions, questions_ner=None, score_normalization=True, exact_match_bonus=0, ner_exact_match_bonus=0, **kwargs):
  '''
  Computes the retrieval metrics of a model (or of a set of models combined together).

  models, get_vector_functions - aligned lists used to embed documents and questions
  documents - list of (title, doc) pairs to rank
  questions - iterable of (target_doc, doc_questions) pairs, where target_doc
              is the title expected for every question of doc_questions
  questions_ner - optional list of (title, tags_of_each_question) pairs,
                  aligned with the questions of the same title
  score_normalization, exact_match_bonus, ner_exact_match_bonus - forwarded
              to compute_documents_ranking
  kwargs - forwarded to the vector functions; 'rank_paragraph_from_doc'
           restricts the ranking to the entries sharing with the target the
           text before the last '-' of the title

  Returns (similarity_score, ranking_score, positions, first_ratio,
  non_valid_samples, (em_bonus_counter, ner_em_bonus_counter, total_pairs)):
  the score, the reciprocal rank and the position of the target for every
  valid question, the ratio of questions whose target is ranked first (0
  when there are no questions), the number of questions whose ranking has
  all equal scores, and the bonus counters with the number of
  document-question pairs.
  '''
  #definition of variables
  similarity_score = []
  ranking_score = []
  positions = []
  target_doc_first = 0
  num_questions = 0

  #variables used for debugging and for better understanding some behaviours
  em_bonus_counter = 0
  ner_em_bonus_counter = 0
  non_valid_samples = 0

  #compute the document vectors
  doc_vectors = compute_documents_vectors(models, get_vector_functions, documents, **kwargs)

  #the NER bonus needs both the tags and a non-zero bonus
  #(this test used to read the undefined name ner_em_bonus)
  use_ner_bonus = bool(questions_ner and ner_exact_match_bonus)

  #compute the doc-question scores
  ner_tagged_question = None
  for (target_doc, doc_questions) in questions:
    if use_ner_bonus:
      doc_questions_ner = [tags for doc_title, tags in questions_ner if doc_title == target_doc][0]
    #for each question compute the ranking of the docs and update the metrics
    for i, question in enumerate(doc_questions):
      if use_ner_bonus:
        ner_tagged_question = doc_questions_ner[i]
      num_questions += 1
      #compute documents ranking
      ranked_docs, exact_match_bonuses, ner_exact_match_bonuses = compute_documents_ranking(models, get_vector_functions, doc_vectors, question, ner_tagged_question, score_normalization, exact_match_bonus, ner_exact_match_bonus, **kwargs)
      #update the counters to understand how many doc-question pairs are affected by the bonuses
      em_bonus_counter += exact_match_bonuses
      ner_em_bonus_counter += ner_exact_match_bonuses

      #when testing the paragraph search knowing the target document, remove the paragraphs of the other documents
      if kwargs.get('rank_paragraph_from_doc', False):
        target_prefix = target_doc[:target_doc.rfind('-')]
        ranked_docs = sort_dict_by_value({k: v for k, v in ranked_docs.items() if k[:k.rfind('-')] == target_prefix})

      #some questions in the dataset are mapped in a null vector: in that case the documents' scores are all equal
      if not ranked_docs or min(ranked_docs.values()) == max(ranked_docs.values()):
        #just a counter for debugging
        non_valid_samples += 1
      else:
        #compute metrics
        similarity_score.append(ranked_docs[target_doc])
        docs_positions = {k: (position + 1) for position, k in enumerate(ranked_docs)}
        target_position = docs_positions[target_doc]
        ranking_score.append(1 / target_position)
        positions.append(target_position)
        if target_position == 1:
          target_doc_first += 1

  #avoid the division by zero when no question is given
  first_ratio = target_doc_first / num_questions if num_questions else 0
  #multiply the lengths instead of building a repeated list just to measure it
  total_pairs = len(documents) * num_questions
  return similarity_score, ranking_score, positions, first_ratio, non_valid_samples, (em_bonus_counter, ner_em_bonus_counter, total_pairs)

def print_score(model_name, similarity_score, ranking_score, position_score, first_score, non_valid_samples, em_multiplier_counter):
  '''
  Prints the summary of the metrics of a model: mean similarity, mean,
  standard deviation, median and 75%/90%/98% quantiles of the positions,
  mean ranking score, and the remaining values as they are received.
  '''
  mean_similarity = np.mean(similarity_score)
  mean_position = np.mean(position_score)
  std_position = np.std(position_score)
  median_position = np.median(position_score)
  position_quantiles = np.quantile(position_score, [0.75, 0.90, 0.98])
  mean_ranking = np.mean(ranking_score)
  print(
    model_name,
    ":\nSimilarity score: ", mean_similarity,
    "\nAverage position: ", mean_position,
    "\nStandard deviation of positions: ", std_position,
    "\nMediana, First 75%/90%/98% documents' position: : ", median_position, ", ", position_quantiles,
    "\nRanking score: ", mean_ranking,
    "\nTarget doc as first: ", first_score,
    "\nNon valid test samples: ", non_valid_samples,
    "\nExact match bonus (done, total):", em_multiplier_counter,
    "\n"
  )

### E2E Model

In [None]:
class E2E:
  '''End-to-end model: retrieves the best paragraphs for a question and extracts the answer from them'''
  def __init__(
    self,
    answer_model,
    preprocessor,
    models=None,
    documents=None,
    paragraphs=None,
    paragraph_texts=None,
    n_docs=None,
    n_pars=None
  ):
    '''
    Model that wrap IR and QA models:
    answer_model - a callable that takes question and paragraph and returns
                   offsets, confidences and text of the answer
                answer_model(question, paragraph) -> ((start, end), (start_confidence, end_confidence), answer)
    preprocessor - an object offering lemmatize(question) and ner(question)
    models - a list of callables that take the preprocessed question and return its vector
                models[i](text) -> vector
    documents - a list, aligned with models, of paths of pickle files to
                load, alternatively a list of dictionaries of the following shape:
                { title: (document, vector) }
    paragraphs - a list of paths where to load vectors, alternatively a list of
                 dictionaries of the same shape as documents. The id of a
                 paragraph is expected to be the title of its document
                 followed by '_' and a number.
    paragraph_texts - a dictionary of all possible paragraphs - { paragraph_id: paragraph_text },
                      alternatively the path of a pickle file holding it or
                      a list of (paragraph_id, paragraph_text) pairs
    n_docs - default number of documents to keep during question-document comparison
    n_pars - default number of paragraph to keep during question-paragraph comparison

    NOTE: the paths are read with pickle, so only trusted files must be passed.
    '''
    self.answer_model = answer_model
    self.preprocessor = preprocessor
    # None replaces the mutable default arguments, which are shared between calls
    self.models = [] if models is None else models
    self.documents = []
    self.paragraphs = []
    self.paragraph_texts = dict()
    self.n_best_docs = n_docs
    self.n_best_pars = n_pars

    # load documents
    for document in ([] if documents is None else documents):
      if isinstance(document, str): self.load_documents(document)
      else: self.documents.append(document)

    # load paragraphs (the ones already in memory used to end up in self.documents)
    for paragraph in ([] if paragraphs is None else paragraphs):
      if isinstance(paragraph, str): self.load_paragraphs(paragraph)
      else: self.paragraphs.append(paragraph)

    # load paragraph texts
    if isinstance(paragraph_texts, str):
      self.load_paragraph_texts(paragraph_texts)
    elif isinstance(paragraph_texts, dict):
      self.paragraph_texts.update(paragraph_texts)
    elif isinstance(paragraph_texts, (list, tuple)):
      self.paragraph_texts.update({ title: value for title, value in paragraph_texts })

  def __read_file(self, path):
    '''returns the object stored in the pickle file at the given path'''
    with open(path, 'rb') as handle:
      result = pickle.load(handle)
    return result

  def load_paragraph_texts(self, path):
    '''merges the paragraph texts stored in the file into the known ones; returns self'''
    self.paragraph_texts.update(self.__read_file(path))
    return self

  def load_paragraphs(self, path):
    '''appends the paragraph vectors stored in the file; returns self'''
    self.paragraphs.append(self.__read_file(path))
    return self

  def load_documents(self, path):
    '''appends the document vectors stored in the file; returns self'''
    self.documents.append(self.__read_file(path))
    return self

  def predict(self, questions, n_docs=None, n_pars=None, doc_norm=False, par_norm=False):
    '''
    Given a list of questions returns a list of predictions, matching the first
    prediction available from question answer model:
    - extract the best paragraphs matching each question
    - find start and end with the question answer model

    n_docs and n_pars default to the values given to the constructor; a
    negative value, or None in both places, means "no limit".
    doc_norm and par_norm enable the per-model score normalization in the
    document and in the paragraph ranking.

    The result is a list of the same size of the questions input; every item
    is the list of the predictions of a question, sorted from the most
    confident one, with this shape:
    [ ((start, end), (start_confidence, end_confidence), answer, paragraph), ... ]
    If a single question is passed as a string, only its list is returned.
    '''
    if n_docs is None: n_docs = self.n_best_docs
    if n_pars is None: n_pars = self.n_best_pars

    # None cannot be compared with 0: it already means "no limit"
    if n_docs is not None and n_docs < 0: n_docs = None
    if n_pars is not None and n_pars < 0: n_pars = None

    # if just one question is passed, return the output as a flat answer
    flat_return = isinstance(questions, str)
    if flat_return: questions = [ questions ]

    # every model is itself the function that embeds the question
    def embed(model, text): return model(text)
    vector_functions = [ embed for _ in self.models ]

    result = []
    for question in questions:

      # question preprocessing
      question_lemmatized = self.preprocessor.lemmatize(question)
      question_ner = self.preprocessor.ner(question)
      # document ranking
      best_docs, _, _ = compute_documents_ranking(
        models=self.models,
        get_vector_functions=vector_functions,
        doc_vectors=self.documents,
        question=question_lemmatized,
        question_ner=question_ner,
        score_normalization=doc_norm,
        exact_match_bonus=0.2,
        ner_exact_match_bonus=0.2
      )
      # take just the n_docs best documents
      best_docs_titles = set(list(best_docs)[:n_docs])

      # paragraphs extraction: keep the paragraphs whose id, without the
      # trailing '_<number>', is the title of one of the best documents
      best_docs_pars = [
        { t: v for t, v in model_pars.items() if re.sub(r'_\d+$', '', t) in best_docs_titles }
        for model_pars in self.paragraphs
      ]

      # paragraph ranking
      best_pars, _, _ = compute_documents_ranking(
        models=self.models,
        get_vector_functions=vector_functions,
        doc_vectors=best_docs_pars,
        question=question_lemmatized,
        question_ner=question_ner,
        score_normalization=par_norm,
        exact_match_bonus=0.2,
        ner_exact_match_bonus=0.2
      )
      best_par_ids = list(best_pars)[:n_pars]
      best_pars_text = [
        self.paragraph_texts[paragraph_id]
        for paragraph_id in best_par_ids
      ]

      # question answering
      values = []
      for paragraph_text in best_pars_text:
        (s, e), (s_p, e_p), text = self.answer_model(question, paragraph_text)
        values.append(((s, e), (s_p, e_p), text, paragraph_text))

      # sort the answers by the sum of the confidence scores
      # given by the question answering system
      values = sorted(values, key=lambda v: -sum(v[1]))
      result.append(values)

    return result[0] if flat_return else result

  def ask(self, questions, n_docs=None, n_pars=None, doc_norm=False, par_norm=False):
    '''
    Given a list of questions returns a list of answers, matching the first
    prediction available from question answer model:
    - extract the best paragraphs matching each question
    - find start and end with the question answer model
    - extract the answer from the predictions

    The result is a list of textual answers, or a single textual answer if
    a single question is passed as a string. An IndexError is raised when a
    question has no prediction.
    '''
    # if just one question is passed, return the output as a flat answer
    flat_return = isinstance(questions, str)
    if flat_return: questions = [ questions ]
    predictions = self.predict(questions, n_docs, n_pars, doc_norm, par_norm)
    # output alignment with the flat return
    return predictions[0][0][2] if flat_return else [ pred[0][2] for pred in predictions ]

# E2E Evaluation

In [None]:
# from google.colab import drive, files
# drive.mount('/content/gdrive')

In [None]:
# Base folder of the project; every other path of the notebook is derived from it
# (presumably on the Google Drive mounted under /content/gdrive - confirm)
root_path = '/content/gdrive/My Drive/NLP/Project'
# Folder of the SQuAD material: pickled vectors, tokenizers and dataset files are read from here
squad_path = f'{root_path}/SQUAD MATERIAL'
# Folder of the saved models
model_path = f'{root_path}/models'

In [None]:
# Doc2Vec embedder with the pickle files of its document and paragraph vectors
d2v_model = Doc2VecModel(f'{model_path}/doc2vec.d2v')
d2v_docs = f'{squad_path}/docs_doc2vec.pkl'
d2v_pars = f'{squad_path}/paragraphs_doc2vec.pkl'

# LSA embedder (model plus its dictionary) with the pickle files of its vectors
lsa_model = LSAModel(
  f'{model_path}/lsa_model.lsa',
  f'{model_path}/lsa_model_dictionary.dic'
)
lsa_docs = f'{squad_path}/docs_lsa.pkl'
lsa_pars = f'{squad_path}/paragraphs_lsa.pkl'

# pickle file of the paragraph texts
par_texts = f'{squad_path}/paragraph_texts.pkl'

# question answering model with the tokenizers it depends on
qas_model = QAModel(
  path=f'{model_path}/squad-model.h5',
  tokenizer_path=f'{squad_path}/word_tokenizer.pkl',
  ner_path=f'{squad_path}/ner_tokenizer.pkl',
  pos_path=f'{squad_path}/pos_tokenizer.pkl'
)

# the QA model is used both to answer and to preprocess the questions;
# documents and paragraphs must follow the same order as models, because the
# ranking pairs the i-th model with the i-th set of vectors
e2e = E2E(
  answer_model=qas_model,
  preprocessor=qas_model,
  models=[ d2v_model, lsa_model ],
  documents=[ d2v_docs, lsa_docs ],
  paragraphs=[ d2v_pars, lsa_pars ],
  paragraph_texts=par_texts,
  n_docs=10,
  n_pars=10
)

### E2E Testing

In [None]:
def _collect_test_questions(dataset, test_titles):
  '''
  Returns one tuple per question of the documents whose title is in `test_titles`:
  (question id, question, text of the first answer, paragraph context,
   document title, text of the whole document)
  The text of the document (its paragraph contexts joined by a space) is
  built once per document and shared by its questions, instead of being
  joined again for every question.
  '''
  collected = []
  for document in dataset['data']:
    # skip the whole document at once when it is not part of the test split
    if document['title'] not in test_titles: continue
    document_text = str.join(' ', (p['context'] for p in document['paragraphs']))
    for paragraph in document['paragraphs']:
      for question in paragraph['qas']:
        collected.append(
          (question['id'], question['question'], question['answers'][0]['text'], paragraph['context'], document['title'], document_text)
        )
  return collected

with open(f'{squad_path}/titles_split.pkl', 'rb') as f: titles_split = pickle.load(f)
with open(f'{squad_path}/training_set.json', 'r') as f: dataset = json.load(f)
questions = _collect_test_questions(dataset, titles_split['test'])

In [None]:
import tqdm.autonotebook as tqdm
import ipywidgets as pyw
from IPython.display import display, clear_output
 
# output widget where the statistics are redrawn after every question
displayer = pyw.Output()

# number of documents and paragraphs kept by the E2E model for every question
N_DOCS = 20
N_PARS = 1
# newline kept in a variable so it can be used inside the f-string expressions below
slash_n = '\n'

# counters of the questions with an exact (EM) or contained (CM) answer,
# overall and limited to the first prediction
first_exact = 0
first_contained = 0
exact = 0
contained = 0
# counters of the questions for which the right paragraph / document was retrieved
catched_pars = 0
catched_docs = 0

# number of questions already processed
count = 0

# running averages of the position of the matching prediction; -1 until the
# first match (each one is now guarded by its own counter, not by `exact`)
exact_avg = (exact / count) if exact and count else -1
contained_avg = (contained / count) if contained and count else -1
catched_pars_avg = (catched_pars / count) if catched_pars and count else -1
catched_docs_avg = (catched_docs / count) if catched_docs and count else -1

exact_matches = []
contained_matches = []

total = len(questions)
with tqdm.tqdm(questions[count:]) as bar:

  display(displayer)

  # NOTE: `id` shadows the builtin for the rest of the notebook; the name is kept as it was
  for id, question, right_answer, right_par, right_title, right_doc in bar:

    predictions = e2e.predict(
      question,
      n_docs=N_DOCS,
      n_pars=N_PARS
    )

    r = 'BEST OPTION - ' + predictions[0][2]

    # position (from 1) of the first paragraph that belongs to the right document
    for position, (_, _, _, guess_par) in enumerate(predictions, start=1):
      if guess_par in right_doc:
        catched_docs_avg = (catched_docs_avg * catched_docs + position) / (catched_docs + 1)
        catched_docs += 1
        break

    # position (from 1) of the right paragraph
    for position, (_, _, _, guess_par) in enumerate(predictions, start=1):
      if guess_par == right_par:
        catched_pars_avg = (catched_pars_avg * catched_pars + position) / (catched_pars + 1)
        catched_pars += 1
        break

    # position (from 1) of the first prediction that matches the right answer,
    # either exactly or by containment
    for position, (_, _, guess_answer, _) in enumerate(predictions, start=1):
      if guess_answer == right_answer:
        exact_avg = (exact_avg * exact + position) / (exact + 1)
        exact += 1
        if position == 1: first_exact += 1
        r = 'EXACT - ' + guess_answer
        exact_matches.append(f'\t{question} - {guess_answer}')
        break

      if guess_answer and (guess_answer in right_answer or right_answer in guess_answer):
        contained_avg = (contained_avg * contained + position) / (contained + 1)
        contained += 1
        # positions start from 1: the test against 0 could never succeed
        if position == 1: first_contained += 1
        r = 'CATCH - ' + guess_answer
        contained_matches.append(f'\t{question} - {guess_answer} ({right_answer})')
        break

    count += 1
    with displayer:
      clear_output()
      print(f'''--- STATS ---

EM FIRST: {first_exact} ({first_exact / count:.2%})
CM FIRST: {first_contained} ({first_contained / count:.2%})

EM: {exact} ({exact / count:.2%}) - {exact_avg} avg position
CM: {contained} ({contained / count:.2%}) - {contained_avg} avg position

CATCHED DOCUMENTS: {catched_docs} ({catched_docs / count:.2%}) - {catched_docs_avg} avg position
CATCHED PARAGRAPHS: {catched_pars} ({catched_pars / count:.2%}) - {catched_pars_avg} avg position

COUNT: {count} / {total}

QUESTION: {question}
  MATCH:  {r}
  ANSWER: {right_answer}

-----------

EXACT MATCHES:
{slash_n.join(str(x) for x in exact_matches)}

-----------

CONTAINED MATCHES:
{slash_n.join(str(x) for x in contained_matches)}

''')