## Import

In [1]:
!pip install --upgrade spacy
!python -m spacy download en_core_web_trf
!pip3 install pickle5

Requirement already up-to-date: spacy in /usr/local/lib/python3.7/dist-packages (3.0.6)
2021-06-02 20:08:18.511451: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_trf')


In [2]:
import numpy as np
import json
import pandas as pd
import pickle5 as pickle
import math
import time
from typing import *
import re
import tqdm.autonotebook as tqdm

  if __name__ == '__main__':


## Mount drive

In [3]:
from google.colab import drive, files
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


##Dataset

In [None]:
data_path = '/content/gdrive/My Drive/NLP/Project/SQUAD MATERIAL/'
models_path = '/content/gdrive/My Drive/NLP/Project/models/'

### Load data

In [4]:
with open(data_path + 'doc_dic2.pkl', 'rb') as handle:
  preprocessed_documents = pickle.load(handle)
with open(data_path + 'qst_dic2.pkl', 'rb') as handle:
  preprocessed_questions = pickle.load(handle)

### Split dataset

In [5]:
#load the partition of the documents in the training/validaton/test sets
titles_split = pickle.load(open(data_path + 'titles_split.pkl', 'rb'))
paragraphs_as_docs = True

In [6]:
def split_set(dataset, split_part):
  index = math.floor((1 - split_part) * len(dataset))
  return dataset[:index], dataset[index:]

def is_title_in_title_set(title, title_set):
  return re.sub(r'_\d+$', '', title) in title_set

#split the dataset according to the partiotioning described in titles_split
def split_dataset_by_titles(dataset, titles_split):
  res = dict()
  for key in titles_split.keys():
    res[key] = []
  for elem in dataset:
    for key,title_set in titles_split.items():
      if is_title_in_title_set(elem[0], title_set):
        res[key].append(elem)
  return res['train'], res['validation'], res['test']

#selecting specific tokens (no punctuation and stopwords) in a certain form (plain text, lemma, ner, etc.) specified by the parameter 'token_form' of question and documents 
def build_dataset(docs, questions, token_form, map=str.lower, split_paragraphs=False):
  
  map = (lambda x:x) if map is None else map
  if split_paragraphs:
    documents = [
      (f'{title}_{par_id}', [
        map(token[token_form])
        for token in paragraph if not (token['is_punct'] or token['is_stop'])
      ])
      for title, paragraphs in docs.items()
      for par_id, paragraph in paragraphs.items()
    ]
    
    questions = [
      (f'{title}_{par_id}', [
        [
          map(token[token_form])
          for token in question if not (token['is_punct'] or token['is_stop'])
        ]
        for question in qsts_paragraph.values()
      ])
      for title, qst_paragraphs in questions.items()
      for par_id, qsts_paragraph in qst_paragraphs.items()
    ]
  else:
    documents = [
      (title, [ 
          map(token[token_form])
          for par_id, paragraph in paragraphs.items()
          for token in paragraph if not (token['is_punct'] or token['is_stop'])
      ])
      for title, paragraphs in docs.items()
    ]
    
    questions = [
      (title, [
        [
            map(token[token_form])
            for token in question if not (token['is_punct'] or token['is_stop'])
        ]
        for par_id, qst_paragraph in qst_paragraphs.items()
        for question in qst_paragraph.values()
        ])
      for title, qst_paragraphs in questions.items()
    ]

  return documents, questions

# Build datasets
dataset_documents, dataset_questions = build_dataset(
  preprocessed_documents,
  preprocessed_questions,
  token_form='lemma_',
  split_paragraphs=paragraphs_as_docs
)

training_docs, validation_docs, test_docs = split_dataset_by_titles(dataset_documents, titles_split)
training_questions, validation_questions, test_questions = split_dataset_by_titles(dataset_questions, titles_split)

_, dataset_questions_ner = build_dataset(
  preprocessed_documents,
  preprocessed_questions,
  token_form='ent_type',
  map=None,
  split_paragraphs=paragraphs_as_docs
)

training_questions_ner, validation_questions_ner, test_questions_ner = split_dataset_by_titles(dataset_questions_ner, titles_split)

##Models

In [12]:
from collections import defaultdict
from gensim import corpora
from gensim.models import LsiModel
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import ClippedCorpus
from sklearn.metrics.pairwise import cosine_similarity as sk_cosine_similarity

###Functions

In [11]:
#infer vector from Doc2Vec model
def get_Doc2Vec_vector(model, text, **kwargs):
  return model.infer_vector(text, epochs=kwargs.get('doc2vec_infer_epochs', 200))

#infer vector from LSA model
def get_LSA_vector(model, text, **kwargs):
  return [v for d,v in model[kwargs['dictionary'].doc2bow(text)]]

#compute similarity using sklearn library
def cosine_similarity(question_vector, doc_vector):
  return sk_cosine_similarity([question_vector], [doc_vector]).item()

def sort_dict_by_value(dictionary, reverse=True):
  return {k: v for k, v in sorted(dictionary.items(), reverse=reverse, key=lambda item: item[1])}

#check if one of the two vectors has dimension 0
def not_null(vector1, vector2):
  if type(vector1) != tuple:
    return len(vector1)*len(vector2)
  for i in range(len(vector1)):
    if not len(vector1[i])*len(vector2[i]):
      return False
  return True

def is_word_in_doc(word, doc):
  return word in doc

def is_question_word_in_doc(question, doc):
  for word in question:
    if is_word_in_doc(word, doc):
      return True
  return False

#select the tokens inside the param text that are tagged with a NER tag
def select_ner_tokens(text, ner_tagged_text):
  entity_tokens = []
  for token, ner_tag in zip(text, ner_tagged_text):
    if ner_tag != 0:
      entity_tokens.append(token)
  return entity_tokens

#compute with the 'models' the vectors of the 'documents' using the functions 'get_vector_functions'
def compute_documents_vectors(models, get_vector_functions, documents, **kwargs):
  doc_vectors = []
  model_scores = []
  for i,model in enumerate(models):
    doc_vectors.append(dict())
    #compute document vectors
    for title,doc in documents:
      doc_vectors[i][title] = (doc, get_vector_functions[i](model, doc, **kwargs))
  return doc_vectors

#compute the similarity for each of the doc-question pairs
def compute_scores_by_vectors(doc_vectors, question_vector):
  scores = dict()
  #for title,(_,doc_vector) in tqdm.tqdm(doc_vectors.items()):
  for title,(_,doc_vector) in doc_vectors.items():
    #compute the score according to the cosine similarity received as parameter 
    scores[title] = cosine_similarity(question_vector, doc_vector) if not_null(question_vector, doc_vector) else 0
  return scores

#return the documents with the scores, sorted by the score
def compute_documents_ranking(models, get_vector_functions, doc_vectors, question, question_ner=None, score_normalization=True, exact_match_bonus=0, ner_exact_match_bonus=0, **kwargs):
  #definition of variables
  model_scores = []
  em_bonus_counter = 0
  ner_em_bonus_counter = 0
  
  #compute question vector and score between question and docs
  for i,model in enumerate(models):
    question_vector = get_vector_functions[i](model, question, **kwargs)
    model_scores.append(compute_scores_by_vectors(doc_vectors[i], question_vector))   
  #perform the normalization of the scores
  if score_normalization:
    for i in range(len(model_scores)):
      model_scores[i] = normalize_dict_values(model_scores[i])

  #average scores if more models are passed
  scores = dict()
  for title in model_scores[0].keys():
    score_sum = 0
    for i in range(len(model_scores)):
      score_sum += model_scores[i][title]
    scores[title] = score_sum / len(model_scores)

  #if the question vector is null, then all the scores will be equal to 0. If this condition doesn't hold compute the exact match's bonuses
  if not (min(scores.values()) == 0 and max(scores.values()) == 0):
    #compute exact match bonus
    if exact_match_bonus or ner_exact_match_bonus:
      #select tokens in the question with a NER tag assigned 
      if question_ner and ner_exact_match_bonus:
        ner_tokens = select_ner_tokens(question, question_ner)
      for title,(doc,_) in doc_vectors[0].items():
        #add score bonus based on exact match if the doc contains the the words of the question
        keyword_present = is_question_word_in_doc(question, doc) if exact_match_bonus else False
        ner_keyword_present = is_question_word_in_doc(ner_tokens, doc) if question_ner and ner_exact_match_bonus else False
        #add exact match bonuses, if present, without exceed 1
        scores[title] = scores[title] + (exact_match_bonus * keyword_present) + (ner_exact_match_bonus * ner_keyword_present)
        #count how many pairs question-docs are affected by the exact match bonus
        if keyword_present:
          em_bonus_counter += 1
        if ner_keyword_present:
          ner_em_bonus_counter += 1
  #else:
    #print("Question: ", question, "\n")
  ranked_docs = sort_dict_by_value(normalize_dict_values(scores))
  return ranked_docs, em_bonus_counter, ner_em_bonus_counter

def normalize_dict_values(dictionary):
  #if all the values are not equal, perform the normalization
  #find min and max
  min_value = min(dictionary.values())
  max_value = max(dictionary.values())
  if min_value != max_value: 
    #normalize: (value - min) / max - min
    for k,v in dictionary.items():
      dictionary[k] = (dictionary[k] - min_value) / (max_value - min_value)
  else:
    dictionary = {k:1 for k,_ in dictionary.items()}
  return dictionary

#return the results of the metrics for a specific model (or a set of models combined together)
def evaluate_model(models, get_vector_functions, documents, questions, questions_ner=None, score_normalization=True, exact_match_bonus=0, ner_exact_match_bonus=0, **kwargs):
  #definition of variables
  similarity_score = []
  ranking_score = []
  positions = []
  target_doc_first = 0
  num_questions = 0
  
  #var used for debugging and for understaind better some behaviours
  em_bonus_counter = 0
  ner_em_bonus_counter = 0
  non_valid_samples = 0
  
  #compute the document vectors
  doc_vectors = compute_documents_vectors(models, get_vector_functions, documents, **kwargs)

  #compute the doc-question scores
  ner_tagged_question = None
  #for (target_doc, doc_questions) in tqdm.tqdm(questions):
  for (target_doc, doc_questions) in questions:
    #check if EM_NER_bonus is enabled
    if questions_ner and ner_em_bonus:
      doc_questions_ner = [doc_questions_ner for doc_title,doc_questions_ner in questions_ner if doc_title==target_doc][0]
    #for each questions compute the ranking of the docs and update the metrics
    for i,question in enumerate(doc_questions):
      if questions_ner and ner_em_bonus:
        ner_tagged_question = doc_questions_ner[i]
      num_questions += 1
      #compute documents ranking
      ranked_docs, exact_match_bonuses, ner_exact_match_bonuses = compute_documents_ranking(models, get_vector_functions, doc_vectors, question, ner_tagged_question, score_normalization, exact_match_bonus, ner_exact_match_bonus, **kwargs)
      #update a counter to understand how many doc-questions paris are affected by the bonuses
      em_bonus_counter += exact_match_bonuses
      ner_em_bonus_counter += ner_exact_match_bonuses

      #When testing paragraph serach knowing the target document, removing the paragraphs of the others paragraphs 
      if kwargs.get('rank_paragraph_from_doc', False):
        ranked_docs = sort_dict_by_value({k:v for k,v in ranked_docs.items() if k[:k.rfind('-')] == target_doc[:target_doc.rfind('-')]})

      #some questions in the dataset are mapped in a null vector: in that case the documents' scores are all equal to 0 
      if min(ranked_docs.values()) == max(ranked_docs.values()): 
        #just a counte for debugging
        non_valid_samples += 1
      else:
        #compute metrics
        similarity_score.append(ranked_docs[target_doc])
        docs_positions = {k:(i+1) for i,(k,v) in enumerate(ranked_docs.items())}
        ranking_score.append(1/docs_positions[target_doc])
        positions.append(docs_positions[target_doc])
        if (docs_positions[target_doc] == 1):
          target_doc_first += 1      

  return similarity_score, ranking_score, positions, target_doc_first/num_questions, non_valid_samples, (em_bonus_counter, ner_em_bonus_counter, len(documents*num_questions))

def print_score(model_name, similarity_score, ranking_score, position_score, first_score, non_valid_samples, em_multiplier_counter):
  print(model_name, ":\nSimilarity score: ", np.mean(similarity_score), "\nAverage position: ", np.mean(position_score), 
        "\nStandard deviation of positions: ", np.std(position_score), 
        "\nMediana, First 75%/90%/98% documents' position: : ", np.median(position_score), ", ", np.quantile(position_score, [0.75, 0.90, 0.98]), "\nRanking score: ", np.mean(ranking_score), "\nTarget doc as first: ", first_score, "\nNon valid test samples: ", non_valid_samples, "\nExact match bonus (done, total):", em_multiplier_counter, "\n")

###LSA model

####Preparation corpus, definition of the dictionary

In [None]:
FREQUENCY_TRESHOLD = 1

def remove_infrequent_words(corpus, frequency_threshold):
  # Count word frequencies
  frequency = defaultdict(int)
  for doc in corpus:
    for token in doc:
      frequency[token] += 1
  #Select words with frequence > N
  return [[token for token in doc if frequency[token] > frequency_threshold] for doc in corpus]

def compute_dictionary_and_bow(dataset_docs):
  #Removing unfrequent words, initializing the dictionary and construct the bag of words of the document
  processed_corpus = remove_infrequent_words([doc for title,doc in dataset_docs], FREQUENCY_TRESHOLD)
  dictionary = corpora.Dictionary(processed_corpus)
  id2word = {v:k for k,v in dictionary.token2id.items()}
  bow_documents = [dictionary.doc2bow(doc) for doc in processed_corpus]
  return dictionary, id2word, bow_documents

dictionary, id2word, bow_documents = compute_dictionary_and_bow(training_docs)

####Search for the best parameter

In [None]:
lsa_models = dict()
for latent_dimension in range(50, 375, 50):
  lsa_models[latent_dimension] = LsiModel(bow_documents, num_topics=latent_dimension, id2word=id2word)

In [None]:
for dim in range(100, 375, 50):
  print_score("LSA model (topic=" + str(dim) + ")", *evaluate_model([lsa_models[dim]], [get_LSA_vector], validation_docs, validation_questions, score_normalization=False, dictionary=dictionary))  

#### Best model

In [None]:
best_lsa_param = 350

In [None]:
dictionary, id2word, bow_documents = compute_dictionary_and_bow(training_docs+validation_docs)
best_lsa_model = LsiModel(bow_documents, num_topics=best_lsa_param, id2word=id2word)

In [None]:
best_lsa_model.save(models_path + "lsa_model.lsa")
dictionary.save(models_path + "lsa_model_dictionary.dic")

###Doc2Vec model

####Dataset preparation

In [None]:
vdoc_train = [ TaggedDocument(words=doc, tags=[title]) for title, doc in training_docs ]
vdoc_val   = [ TaggedDocument(words=doc, tags=[title]) for title, doc in validation_docs ]

####Hyperparameters

In [None]:
max_epochs = 100   #@param { type: "number" }
val_epochs = 150   #@param { type: "number" }
vec_size = 100     #@param { type: "number" }
alpha = 0.025      #@param { type: "number" }
print_every = 1    #@param { type: "number" }

####Search the best hyperparameters

In [None]:
doc2vec_model = Doc2Vec(
  vector_size=vec_size,
  alpha=alpha,
  min_alpha=0.00025,
  min_count=1,
  dm=1,
  epochs=1,
  workers=8
)

doc2vec_model.build_vocab(vdoc_train)

for epoch in range(max_epochs):
    start_time = time.time()   
    doc2vec_model.train(
        vdoc_train,
        total_examples=doc2vec_model.corpus_count,
        epochs=doc2vec_model.epochs,
        queue_factor=16
    )
    if print_every and (epoch + 1) % print_every == 0:
      print_score("Doc2Vec model, epoch " + str(epoch) + ":", *evaluate_model([doc2vec_model], [get_Doc2Vec_vector], validation_docs, validation_questions, score_normalization=False))  

In [None]:
## One cell training
doc2vec_model = Doc2Vec(
  vector_size=100,
  alpha=0.025,
  min_alpha=0.00025,
  min_count=1,
  dm=1,
  epochs=26,
  workers=8
)
doc2vec_model.build_vocab(vdoc_train)
doc2vec_model.train(
  vdoc_train,
  total_examples=doc2vec_model.corpus_count,
  epochs=doc2vec_model.epochs,
  queue_factor=16
)

In [None]:
print_score("Doc2Vec model, epoch " + str(doc2vec_model.epochs), *evaluate_model([doc2vec_model], [get_Doc2Vec_vector], validation_docs, validation_questions, score_normalization=False))

Doc2Vec model, epoch 26 :
Similarity score:  0.8455138808374248 
Average position:  6.520539013257987 
Standard deviation of positions:  9.77978910580268 
Mediana, First 75%/90%/98% documents' position: :  2.0 ,  [ 7. 21. 39.] 
Ranking score:  0.583815509652554 
Target doc as first:  0.46815909584872856 
Non valid test samples:  0 
Exact match bonus (done, total): (0, 0, 404888) 



####Best model

In [None]:
## One cell training
best_doc2vec_model = Doc2Vec(
  vector_size=100,
  alpha=0.025,
  min_alpha=0.00025,
  min_count=1,
  dm=1,
  epochs=26,
  workers=8
)
best_doc2vec_model.build_vocab(vdoc_train+vdoc_val)
best_doc2vec_model.train(
  vdoc_train,
  total_examples=best_doc2vec_model.corpus_count,
  epochs=best_doc2vec_model.epochs,
  queue_factor=16
)

In [None]:
best_doc2vec_model.save(models_path + 'doc2vec.d2v')

### Load models

In [9]:
best_lsa_model = LsiModel.load(models_path + "lsa_model.lsa")
dictionary = ClippedCorpus.load(models_path + "lsa_model_dictionary.dic")

In [10]:
best_doc2vec_model = Doc2Vec.load(models_path + "doc2vec.d2v") 

###Exact match bonus 

####Search the best parameters

In [None]:
print("Exact match bonus exploration:\n")
normalization = False

for em_bonus in np.arange(0, 0.35, 0.1):
  for ner_em_bonus in np.arange(0, 0.35, 0.1):
    print_score("Bonus = " + str(em_bonus) + ", Ner_bonus = " + str(ner_em_bonus), 
            *evaluate_model([lsa_models[best_lsa_param], doc2vec_model], [get_LSA_vector, get_Doc2Vec_vector], 
            validation_docs, validation_questions, validation_questions_ner, score_normalization=normalization, exact_match_bonus=em_bonus, 
            ner_exact_match_bonus=ner_em_bonus, dictionary=dictionary)) 

In [None]:
print("Exact match bonus exploration:\n")
normalization = True

for em_bonus in np.arange(0, 0.35, 0.1):
  for ner_em_bonus in np.arange(0, 0.35, 0.1):
    print_score("Bonus = " + str(em_bonus) + ", Ner_bonus = " + str(ner_em_bonus), 
            *evaluate_model([lsa_models[best_lsa_param], doc2vec_model], [get_LSA_vector, get_Doc2Vec_vector], 
            validation_docs, validation_questions[:10], validation_questions_ner[:10], score_normalization=normalization, exact_match_bonus=em_bonus, 
            ner_exact_match_bonus=ner_em_bonus, dictionary=dictionary)) 

####Best models

In [None]:
em_bonus = 0.2
ner_em_bonus = 0.2
normalization = False

###Comparison of the models

####LSA

In [None]:
#LSA without bonuses
em_bonus = 0
ner_em_bonus = 0
normalization = False

print_score("Best LSA, Bonus = " + str(em_bonus) + ", Ner_bonus = " + str(ner_em_bonus), 
            *evaluate_model([best_lsa_model], [get_LSA_vector], 
            test_docs, test_questions, score_normalization=normalization, exact_match_bonus=em_bonus, 
            ner_exact_match_bonus=ner_em_bonus, dictionary=dictionary))

Best LSA, Bonus = 0, Ner_bonus = 0 :
Similarity score:  0.7488419794729161 
Average position:  5.299173743401423 
Standard deviation of positions:  7.329525762679216 
Mediana, First 75%/90%/98% documents' position: :  2.0 ,  [ 6. 15. 31.] 
Ranking score:  0.557485289484027 
Target doc as first:  0.41282725505887735 
Non valid test samples:  33 
Exact match bonus (done, total): (0, 0, 384868) 



#### Dov2Vec

In [None]:
#Doc2Vec without bonuses
em_bonus = 0
ner_em_bonus = 0
normalization = False

print_score("Best Doc2Vec, Bonus = " + str(em_bonus) + ", Ner_bonus = " + str(ner_em_bonus), 
            *evaluate_model([best_doc2vec_model], [get_Doc2Vec_vector], 
            test_docs, test_questions, score_normalization=normalization, exact_match_bonus=em_bonus, 
            ner_exact_match_bonus=ner_em_bonus, dictionary=dictionary)) 

Best LSA, Bonus = 0, Ner_bonus = 0 :
Similarity score:  0.8246534663639014 
Average position:  7.423573796730308 
Standard deviation of positions:  10.564839294679748 
Mediana, First 75%/90%/98% documents' position: :  2.0 ,  [ 9. 24. 41.] 
Ranking score:  0.5573304164419389 
Target doc as first:  0.4483823025037156 
Non valid test samples:  0 
Exact match bonus (done, total): (0, 0, 384868) 



####LSA + Doc2Vec

In [None]:
#LSA + Doc2Vec without bonuses
em_bonus = 0
ner_em_bonus = 0
normalization = False

print_score("Best LSA, Bonus = " + str(em_bonus) + ", Ner_bonus = " + str(ner_em_bonus), 
            *evaluate_model([best_lsa_model, best_doc2vec_model], [get_LSA_vector, get_Doc2Vec_vector], 
            test_docs, test_questions, score_normalization=normalization, exact_match_bonus=em_bonus, 
            ner_exact_match_bonus=ner_em_bonus, dictionary=dictionary)) 

Best LSA, Bonus = 0, Ner_bonus = 0 :
Similarity score:  0.8690171713822491 
Average position:  3.987081285012004 
Standard deviation of positions:  6.784803531102837 
Mediana, First 75%/90%/98% documents' position: :  1.0 ,  [ 3. 11. 30.] 
Ranking score:  0.6887884447597129 
Target doc as first:  0.5732251057505431 
Non valid test samples:  0 
Exact match bonus (done, total): (0, 0, 384868) 



In [None]:
#LSA + Doc2Vec with bonuses
em_bonus = 0.2
ner_em_bonus = 0.2
normalization = False

print_score("Best LSA + Doc2Vec, Bonus = " + str(em_bonus) + ", Ner_bonus = " + str(ner_em_bonus), 
            *evaluate_model([best_lsa_model, best_doc2vec_model], [get_LSA_vector, get_Doc2Vec_vector], 
            test_docs, test_questions, test_questions_ner, score_normalization=normalization, exact_match_bonus=em_bonus, 
            ner_exact_match_bonus=ner_em_bonus, dictionary=dictionary)) 

Best LSA + Doc2Vec, Bonus = 0.2, Ner_bonus = 0.2 :
Similarity score:  0.9468445855839863 
Average position:  2.6630844861095233 
Standard deviation of positions:  4.396311345284151 
Mediana, First 75%/90%/98% documents' position: :  1.0 ,  [ 2.  6. 18.] 
Ranking score:  0.7649002064625927 
Target doc as first:  0.6598833885903739 
Non valid test samples:  0 
Exact match bonus (done, total): (301001, 88139, 384868) 



In [None]:
#LSA + Doc2Vec with bonuses upon all the dataset
em_bonus = 0.2
ner_em_bonus = 0.2
normalization = False

print_score("Best LSA + Doc2Vec, Bonus = " + str(em_bonus) + ", Ner_bonus = " + str(ner_em_bonus), 
            *evaluate_model([best_lsa_model, best_doc2vec_model], [get_LSA_vector, get_Doc2Vec_vector], 
            training_docs+validation_docs+test_docs, test_questions, test_questions_ner, 
            score_normalization=normalization, exact_match_bonus=em_bonus, 
            ner_exact_match_bonus=ner_em_bonus, dictionary=dictionary)) 

Best LSA + Doc2Vec, Bonus = 0.2, Ner_bonus = 0.2 :
Similarity score:  0.8458246658113037 
Average position:  18.0380701954956 
Standard deviation of positions:  43.99980814500418 
Mediana, First 75%/90%/98% documents' position: :  3.0 ,  [ 10.  48. 176.] 
Ranking score:  0.4279793900115041 
Target doc as first:  0.25140048016462785 
Non valid test samples:  0 
Exact match bonus (done, total): (2948366, 796748, 3831186) 



#### Upon paragraphs as docs

In [40]:
#LSA + Doc2Vec with bonuses, knowing the target doc
em_bonus = 0.2
ner_em_bonus = 0.2
normalization = False

print_score("Best LSA + Doc2Vec, Bonus = " + str(em_bonus) + ", Ner_bonus = " + str(ner_em_bonus), 
            *evaluate_model([best_lsa_model, best_doc2vec_model], [get_LSA_vector, get_Doc2Vec_vector], 
            test_docs, test_questions, test_questions_ner, score_normalization=normalization, exact_match_bonus=em_bonus, 
            ner_exact_match_bonus=ner_em_bonus, dictionary=dictionary, rank_paragraph_from_doc=True)) 

Best LSA + Doc2Vec, Bonus = 0.2, Ner_bonus = 0.2 :
Similarity score:  0.9136729301210788 
Average position:  1.5725936135998162 
Standard deviation of positions:  1.3136011548604263 
Mediana, First 75%/90%/98% documents' position: :  1.0 ,  [2. 3. 6.] 
Ranking score:  0.8388753970077194 
Target doc as first:  0.7375100034297474 
Non valid test samples:  41 
Exact match bonus (done, total): (2116325, 512363, 16759252) 



In [13]:
#LSA + Doc2Vec with bonuses, searching on all the paragraphs of the test set
em_bonus = 0.2
ner_em_bonus = 0.2
normalization = False

print_score("Best LSA + Doc2Vec, Bonus = " + str(em_bonus) + ", Ner_bonus = " + str(ner_em_bonus), 
            *evaluate_model([best_lsa_model, best_doc2vec_model], [get_LSA_vector, get_Doc2Vec_vector], 
            test_docs, test_questions, test_questions_ner, score_normalization=normalization, exact_match_bonus=em_bonus, 
            ner_exact_match_bonus=ner_em_bonus, dictionary=dictionary)) 

Best LSA + Doc2Vec, Bonus = 0.2, Ner_bonus = 0.2 :
Similarity score:  0.9136127635988304 
Average position:  15.597347662055562 
Standard deviation of positions:  66.16842461451894 
Mediana, First 75%/90%/98% documents' position: :  2.0 ,  [  9.  30. 128.] 
Ranking score:  0.5397222038376489 
Target doc as first:  0.42151594832514006 
Non valid test samples:  0 
Exact match bonus (done, total): (2116325, 512363, 16759252) 



### Store data

In [None]:
# lsa_docs = compute_documents_vectors([best_lsa_model], [get_LSA_vector], dataset_documents, dictionary=dictionary)
# doc2vec_docs = compute_documents_vectors([best_doc2vec_model], [get_Doc2Vec_vector], dataset_documents)

# with open('/content/gdrive/My Drive/NLP/Project/SQUAD MATERIAL/docs_lsa.pkl', 'wb') as lsa_output:
#   pickle.dump(lsa_docs[0], lsa_output, protocol=pickle.HIGHEST_PROTOCOL)
# with open('/content/gdrive/My Drive/NLP/Project/SQUAD MATERIAL/docs_doc2vec.pkl', 'wb') as doc2vec_output:
#   pickle.dump(doc2vec_docs[0], doc2vec_output, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# paragraphs_data = {
#     f'{title}_{par_id}': tuple(
#       [ [ token['text'] for token in paragraph ] ] +
#       [
#         np.array([ token[key] for token in paragraph ])
#         for key in ('tag', 'ent_type', 'like_num', 'is_stop')
#       ]
#     )
#     for title, document in preprocessed_documents.items()
#     for par_id, paragraph in document.items()
# }
# with open('/content/gdrive/My Drive/NLP/Project/SQUAD MATERIAL/paragraphs_data.pkl', 'wb') as par_data_out:
#   pickle.dump(paragraphs_data, par_data_out, protocol=pickle.HIGHEST_PROTOCOL)