In [1]:
# Steps to do:

# 1. Build a system that takes a question and returns the top-k documents (which should contain the correct answer).

# 2. Build a system that extracts the answer from those documents.

In [2]:
!pip install beir datasets transformers

Collecting beir
  Downloading beir-0.2.3.tar.gz (52 kB)
[K     |████████████████████████████████| 52 kB 1.2 MB/s 
[?25hCollecting datasets
  Downloading datasets-1.15.1-py3-none-any.whl (290 kB)
[K     |████████████████████████████████| 290 kB 10.0 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 36.2 MB/s 
[?25hCollecting sentence-transformers
  Downloading sentence-transformers-2.1.0.tar.gz (78 kB)
[K     |████████████████████████████████| 78 kB 7.1 MB/s 
[?25hCollecting pytrec_eval
  Downloading pytrec_eval-0.5.tar.gz (15 kB)
Collecting faiss_cpu
  Downloading faiss_cpu-1.7.1.post2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.4 MB)
[K     |████████████████████████████████| 8.4 MB 33.1 MB/s 
[?25hCollecting elasticsearch
  Downloading elasticsearch-7.15.2-py2.py3-none-any.whl (379 kB)
[K     |████████████████████████████████| 379 kB 38.9 MB/s 
Collecting tensorflo

In [3]:
from utils_index import *
from utils import *

  from tqdm.autonotebook import tqdm


In [4]:
# Load the dbpedia dataset (test split) through the BEIR helpers.

dataset = "dbpedia"
# The archive on the server is named "<dataset>-entity.zip". The name is interpolated
# with an f-string: calling `.format(dataset)` on a URL without a `{}` placeholder
# does nothing, so `dataset` would otherwise never reach the URL.
url = f"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{dataset}-entity.zip"
# Download (if needed) and unpack the archive into the local "datasets" folder.
data_path = util.download_and_unzip(url, "datasets")
corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")

datasets/dbpedia-entity.zip:   0%|          | 0.00/610M [00:00<?, ?iB/s]

  0%|          | 0/4635922 [00:00<?, ?it/s]

In [5]:
# Batch size handed to TrainingArguments as both the per-device train and eval batch size.
batch_size = 16

In [6]:
from datasets import load_dataset, load_metric, DatasetDict
# Selects the "squad_v2" dataset when True and "squad" otherwise.
squad_v2 = True
squad_name = "squad_v2" if squad_v2 else "squad"

# Only the first 10% of the training split is requested; validation is loaded in full.
train, validation = load_dataset(squad_name, split=['train[:10%]', 'validation'])

# Bundle both splits under their usual names.
datasets = DatasetDict({"train": train, "validation": validation})

Downloading:   0%|          | 0.00/1.87k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading and preparing dataset squad_v2/squad_v2 (download: 44.34 MiB, generated: 122.41 MiB, post-processed: Unknown size, total: 166.75 MiB) to /root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d...


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/9.55M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/801k [00:00<?, ?B/s]

  0%|          | 0/2 [00:00<?, ?it/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset squad_v2 downloaded and prepared to /root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
  
# Checkpoint shared by the tokenizer and the question-answering model.
qa_checkpoint = "mvonwyl/distilbert-base-uncased-finetuned-squad2"

tokenizer = AutoTokenizer.from_pretrained(qa_checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(qa_checkpoint)

Downloading:   0%|          | 0.00/333 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/561 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/253M [00:00<?, ?B/s]

In [8]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

# The literal contains no "/" so taking the last "/"-separated part leaves it unchanged;
# it is written out directly here.
model_name = "distilbert-base-uncased-finetuned-squad2"

# First positional argument of TrainingArguments (the output directory).
output_dir = f"{model_name}-finetuned-squad2"

training_options = dict(
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
)
args = TrainingArguments(output_dir, **training_options)

In [9]:
from transformers import default_data_collator

# Collator passed to the Trainer; this is simply transformers' default_data_collator.
data_collator = default_data_collator

In [10]:
# No train/eval datasets are attached: in the cells visible in this notebook the
# trainer is only used through `trainer.predict(...)`.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [11]:
# True when the tokenizer is configured to pad on the right-hand side.
pad_on_right = tokenizer.padding_side == "right"
max_length = 384 # The maximum length of a feature (question and context)
# Passed to prepare_validation_features together with max_length; presumably the
# overlap between consecutive chunks of a long context — TODO confirm in utils.
doc_stride = 128

In [12]:
# Turn the whole validation split into model inputs, batch by batch; the original
# columns are removed from the result.
validation_split = datasets["validation"]
validation_features = validation_split.map(
    lambda batch: prepare_validation_features(batch, tokenizer, pad_on_right, max_length, doc_stride),
    remove_columns=validation_split.column_names,
    batched=True,
)

  0%|          | 0/12 [00:00<?, ?ba/s]

In [13]:
# Collect DBpedia passages to use as additional retrieval contexts.

# A passage needs at least this many space-separated tokens to be kept.
MIN_CONTEXT_WORDS = 50

all_good_entries = []
# Set mirror of `all_good_entries`: makes the duplicate check O(1) instead of a scan
# of the whole list for every candidate passage.
seen_contexts = set()

for query_id in qrels:
  # Check if the query is a question, because we want a question-answer system,
  # so it feels natural to only take contexts related to questions.
  if "QALD2" not in query_id:
    continue
  # Every document judged for this question is a candidate context.
  for doc_id in qrels[query_id]:
    context = corpus[doc_id]['text']
    if context in seen_contexts:
      continue
    # Keep only passages that are long enough (the order of first appearance is preserved).
    if len(context.split(' ')) >= MIN_CONTEXT_WORDS:
      seen_contexts.add(context)
      all_good_entries.append(context)

In [14]:
# Transform the squadv2 validation set into a dictionnary
# To make it easier to manipulate

# Maps each distinct context to the title, question and answers of the FIRST
# validation example that uses it; later questions on the same context are not stored.
dico = {}
# Walk the rows once. Looking up datasets['validation'][i] separately for every field
# fetches the same row again for each access.
for example in datasets['validation']:
  context = example['context']
  if context not in dico:
    dico[context] = {
      'title': example['title'],
      'question': example['question'],
      'answers': example['answers'],
    }

In [15]:
# Create a list for all questions, contexts, titles and answers
# from our dataset

# Four parallel lists, all in the insertion order of `dico`:
# position k of each list describes the same context.
all_contexts = list(dico)
all_questions = [entry['question'] for entry in dico.values()]
all_titles = [entry['title'] for entry in dico.values()]
all_answers = [entry['answers'] for entry in dico.values()]

In [16]:
# Add DBPedia context to SQuADv2 dataset

# Rebuilt from the keys of `dico` (the SQuAD contexts, in the same order) instead of
# appending to itself, so running this cell again does not add the DBpedia passages twice.
all_contexts = list(dico) + all_good_entries

In [17]:
# Put it into a DataFrame 

# The columns do not all have the same length (the context list also holds the DBpedia
# passages). Wrapping each list in a Series lets pandas align them by position and
# leave the missing cells empty.
columns = {
    'question': all_questions,
    'context': all_contexts,
    'title': all_titles,
    'answers': all_answers,
}
df = pd.DataFrame({name: pd.Series(values) for name, values in columns.items()})
df

Unnamed: 0,question,context,title,answers
0,In what country is Normandy located?,The Normans (Norman: Nourmands; French: Norman...,Normans,"{'text': ['France', 'France', 'France', 'Franc..."
1,Who was the duke in the battle of Hastings?,"The Norman dynasty had a major political, cult...",Normans,"{'text': ['William the Conqueror', 'William th..."
2,What is the original meaning of the word Norman?,"The English name ""Normans"" comes from the Fren...",Normans,"{'text': ['Viking', 'Norseman, Viking', 'Norse..."
3,When was the Duchy of Normandy founded?,"In the course of the 10th century, the initial...",Normans,"{'text': ['911', '911', '911'], 'answer_start'..."
4,Who upon arriving gave the original viking set...,"Before Rollo's arrival, its populations did no...",Normans,"{'text': ['Rollo', 'Rollo', 'Rollo'], 'answer_..."
...,...,...,...,...
11847,,The World Meteorological Organization (WMO) is...,,
11848,,The World Veterans Federation (WVF) is the wor...,,
11849,,World Vision International is an Evangelical C...,,
11850,,ZF Electronics GmbH (formerly known as Cherry ...,,


In [18]:
# Collect all unique questions from our dataset

# De-duplicate the questions while keeping the order in which they first appear.
unique_questions = list(dict.fromkeys(all_questions))

# For every unique question, the row labels of `df` whose 'question' equals it.
q_a = {
  question: list(df.index[df['question'] == question])
  for question in unique_questions
}

In [19]:
%%time
# Load the sentence-embedding model, build an index over the contexts of `df`, search it
# with every unique question and evaluate the returned rankings.
# NOTE(review): create_index, doc_search and MMR_test are not defined in this notebook
# (presumably they come from utils / utils_index); the description above is inferred
# from their names and arguments — confirm.
model_distilbert_tas = SentenceTransformer('msmarco-distilbert-base-tas-b') # best performance overall
index = create_index(df.context.to_list(), model_distilbert_tas, df)
D, I = doc_search(unique_questions, model_distilbert_tas, index, num_results=10)
MMR_test(I, unique_questions, q_a)
# To measure the MRR over the complete ranking, pass the total number of contexts
# (a len(...)) as num_results — presumably len(df); confirm.
# Otherwise, specify 10 for a top-10 search.

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.95k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/548 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/547 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

loading configuration file /root/.cache/torch/sentence_transformers/sentence-transformers_msmarco-distilbert-base-tas-b/config.json
Model config DistilBertConfig {
  "_name_or_path": "old_models/msmarco-distilbert-base-tas-b/0_Transformer",
  "activation": "gelu",
  "architectures": [
    "DistilBertModel"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.12.5",
  "vocab_size": 30522
}

loading weights file /root/.cache/torch/sentence_transformers/sentence-transformers_msmarco-distilbert-base-tas-b/pytorch_model.bin
All model checkpoint weights were used when initializing DistilBertModel.

All the weights of DistilBertModel were initialized from the model checkpoint at 

Batches:   0%|          | 0/371 [00:00<?, ?it/s]

Batches:   0%|          | 0/38 [00:00<?, ?it/s]

CPU times: user 1min 43s, sys: 2.67 s, total: 1min 46s
Wall time: 1min 50s


In [20]:
# Retrieve contexts for the first unique question (helper not defined in this notebook,
# presumably imported from utils / utils_index).
top10 = get_top10_context(unique_questions[0], model_distilbert_tas, index, df.context.to_list())

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [21]:
# Same retrieval as in the previous cell, followed by a display of the first returned context.
top10 = get_top10_context(unique_questions[0], model_distilbert_tas, index, df.context.to_list())
top10[0]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

'The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.'

In [22]:
# Display the first validation example, for comparison with the retrieved context.
datasets["validation"][0]

{'answers': {'answer_start': [159, 159, 159, 159],
  'text': ['France', 'France', 'France', 'France']},
 'context': 'The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.',
 'id': '56ddde6b9a695914005b9628',
 'question': 'In what country is Normandy located?',
 'title': 'Normans'}

In [23]:
# select([0]) returns a one-row Dataset (not a plain dict), as the output below shows.
datasets["validation"].select([0])

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 1
})

In [24]:
# Prepare model inputs for the first validation example only; the original columns
# are removed from the result.
first_example = datasets["validation"].select([0])
validation_features = first_example.map(
    lambda batch: prepare_validation_features(batch, tokenizer, pad_on_right, max_length, doc_stride),
    remove_columns=datasets["validation"].column_names,
    batched=True,
)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [25]:
# Run the QA model on the prepared features (prediction only).
raw_predictions = trainer.predict(validation_features)

The following columns in the test set  don't have a corresponding argument in `DistilBertForQuestionAnswering.forward` and have been ignored: offset_mapping, example_id.
***** Running Prediction *****
  Num examples = 1
  Batch size = 16


In [32]:
from tqdm import tqdm
def postprocess_qa_prediction(examples, features, raw_predictions, tokenizer, squad_v2, n_best_size = 20, max_answer_length = 30):
    """Turn raw start/end logits into one answer per example.

    Args:
        examples: The original examples. Must support ``examples["id"]`` (all ids),
            ``len()`` and iteration over rows exposing ``"id"`` and ``"context"``.
        features: The tokenized features. Must support ``len()``, iteration and integer
            indexing; each row exposes ``"example_id"``, ``"input_ids"`` and
            ``"offset_mapping"``. An ``offset_mapping`` entry that is ``None`` marks a
            token that may not be part of an answer.
        raw_predictions: A pair ``(all_start_logits, all_end_logits)``, both indexed
            by feature position.
        tokenizer: Only ``tokenizer.cls_token_id`` is read, to locate the token whose
            logits score the "no answer" prediction.
        squad_v2: When true, the empty string is returned for an example whose best
            span does not score higher than its null score.
        n_best_size: How many of the highest start logits and end logits are combined
            into candidate spans for each feature.
        max_answer_length: Maximum length of a candidate span, in tokens.

    Returns:
        An ``OrderedDict`` keyed by example id. Each value is the answer text when
        ``squad_v2`` is false, otherwise a tuple ``(answer_text, best_span_score)``
        where ``answer_text`` is ``""`` for "no answer" and ``best_span_score`` is
        always the score of the best non-null span.
    """
    # Imported here so the function does not depend on which other cells ran before it.
    import collections
    import numpy as np

    all_start_logits, all_end_logits = raw_predictions

    # Build a map from each example to the positions of its features.
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    # The dictionary we have to fill.
    predictions = collections.OrderedDict()

    # Logging.
    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    for example_index, example in enumerate(tqdm(examples)):
        context = example["context"]
        min_null_score = None  # Only used if squad_v2 is True.
        valid_answers = []

        # Loop through all the features associated with the current example.
        for feature_index in features_per_example[example_index]:
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            # Fetch the feature row once and read both fields from it.
            feature = features[feature_index]
            # Maps token positions in the logits to character spans of the original context.
            offset_mapping = feature["offset_mapping"]

            # Keep the MINIMUM null score over the features of this example. The
            # comparison must be "new score is smaller", otherwise the maximum is kept
            # and answers are rejected too often.
            cls_index = feature["input_ids"].index(tokenizer.cls_token_id)
            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            if min_null_score is None or feature_null_score < min_null_score:
                min_null_score = feature_null_score

            # Positions of the `n_best_size` largest start and end logits, best first.
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip out-of-scope answers: indices out of bounds, or tokens that
                    # are not part of the context.
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    # Skip spans with negative length or longer than max_answer_length.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )

        if valid_answers:
            # max() returns the first of several equally scored spans, like a stable
            # descending sort would.
            best_answer = max(valid_answers, key=lambda x: x["score"])
        else:
            # In the very rare edge case where there is not a single non-null
            # prediction, use a fake prediction to avoid failure.
            best_answer = {"text": "", "score": 0.0}

        # Pick the final answer: the best span, or the null answer (only for squad_v2).
        if not squad_v2:
            predictions[example["id"]] = best_answer["text"]
        else:
            # An example without any feature has no null score to compare against;
            # its fake prediction (empty text) is returned as is.
            beats_null = min_null_score is None or best_answer["score"] > min_null_score
            answer = best_answer["text"] if beats_null else ""
            predictions[example["id"]] = (answer, best_answer["score"])

    return predictions

In [None]:
# Scratch: one-row dataset holding the first validation example.
first = datasets["validation"].select([0])

In [None]:
# NOTE(review): this assigns into the object returned by first["context"], not into the
# dataset itself — presumably it leaves `first` unchanged, which would explain why the
# notebook later switches to replace_context + map. TODO confirm.
first["context"][0] = "hi"

In [None]:
# Check what the context of the single example is after the assignment attempt.
first["context"][0]

In [None]:
# Display the summary of the validation split.
datasets["validation"]

In [28]:
def replace_context(example, context):
  """Overwrite the 'context' field of `example` in place and return that same object."""
  example.update({'context': context})
  return example

In [None]:
# Copy of the first validation example whose context is replaced by the string "hi".
new_dataset = datasets["validation"].select([0]).map(lambda x: replace_context(x, "hi"))

In [None]:
# Check that the context was actually replaced.
new_dataset["context"]

In [None]:
# Prepare model inputs for the single modified example; the original columns are
# removed from the result.
validation_features = new_dataset.map(
    lambda batch: prepare_validation_features(batch, tokenizer, pad_on_right, max_length, doc_stride),
    remove_columns=datasets["validation"].column_names,
    batched=True,
)

In [None]:
# Run the QA model on the features of the modified example (prediction only).
raw_predictions = trainer.predict(validation_features)

In [None]:
import collections
# Convert the raw logits of the modified example into an answer per example id.
final_predictions = postprocess_qa_prediction(new_dataset, validation_features, raw_predictions.predictions, tokenizer, squad_v2)

In [39]:
from io import StringIO 
import sys

class Capturing(list):
    """List that, used as a context manager, collects what is written to sys.stdout.

    While the ``with`` block runs, ``sys.stdout`` points to an in-memory buffer.
    On exit the buffered text is split into lines, the lines are appended to this
    list, and the previous ``sys.stdout`` is put back.
    """

    def __enter__(self):
        # Remember the current stream so it can be restored on exit.
        self._stdout = sys.stdout
        self._stringio = StringIO()
        sys.stdout = self._stringio
        return self

    def __exit__(self, *args):
        captured_lines = self._stringio.getvalue().splitlines()
        self.extend(captured_lines)
        del self._stringio    # free up some memory
        sys.stdout = self._stdout

In [43]:
# Display the one-row dataset holding the second validation example.
datasets["validation"].select([1])

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 1
})

In [48]:
import collections
from IPython.display import clear_output
# Retrieve-then-read over the validation set: for every question, fetch candidate
# contexts and run the QA model on each (question, retrieved context) pair.
# `predictions` receives one single-element list per pair.
predictions = []

# Both values are the same for every question, so they are built once here instead of
# once per iteration (each is a full copy of a column).
# NOTE(review): assumes get_top10_context does not modify the list it receives — confirm.
validation_questions = datasets["validation"]["question"]
retrieval_contexts = df.context.to_list()

for i, question in enumerate(validation_questions):
  top10 = get_top10_context(question, model_distilbert_tas, index, retrieval_contexts)

  for context in top10:
    clear_output(wait=True)
    # Text printed to stdout by the steps below is collected in `output` instead of
    # being shown in the cell.
    with Capturing() as output:
      # Same example (id, question, answers) but with the retrieved context swapped in.
      new_dataset = datasets["validation"].select([i]).map(lambda x: replace_context(x, context))
      validation_features = new_dataset.map(
        lambda x: prepare_validation_features(x, tokenizer, pad_on_right, max_length, doc_stride),
        batched=True,
        remove_columns=datasets["validation"].column_names
      )
      raw_predictions = trainer.predict(validation_features)
      final_predictions = postprocess_qa_prediction(new_dataset, validation_features, raw_predictions.predictions, tokenizer, squad_v2)
      if squad_v2:
        # Values are (answer_text, score) tuples.
        formatted_predictions = [{"id": k, "prediction_text": v[0], "no_answer_probability": 0.0, "score": v[1]} for k, v in final_predictions.items()]
      else:
        # Values are plain answer strings. Without this branch `formatted_predictions`
        # would be undefined (or left over from an earlier run).
        formatted_predictions = [{"id": k, "prediction_text": v} for k, v in final_predictions.items()]
      predictions.append(formatted_predictions)

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

KeyboardInterrupt: ignored

In [41]:
# Display the predictions collected so far (one single-element list per question/context pair).
predictions

[[{'id': '56ddde6b9a695914005b9628',
   'no_answer_probability': 0.0,
   'prediction_text': 'France',
   'score': 15.975548}],
 [{'id': '56ddde6b9a695914005b9628',
   'no_answer_probability': 0.0,
   'prediction_text': '',
   'score': 1.72787}],
 [{'id': '56ddde6b9a695914005b9628',
   'no_answer_probability': 0.0,
   'prediction_text': '',
   'score': 2.977882}],
 [{'id': '56ddde6b9a695914005b9628',
   'no_answer_probability': 0.0,
   'prediction_text': 'Frankish kingdom of Neustria',
   'score': 7.6986885}],
 [{'id': '56ddde6b9a695914005b9628',
   'no_answer_probability': 0.0,
   'prediction_text': '',
   'score': 2.9426007}],
 [{'id': '56ddde6b9a695914005b9628',
   'no_answer_probability': 0.0,
   'prediction_text': '',
   'score': -0.67294085}],
 [{'id': '56ddde6b9a695914005b9628',
   'no_answer_probability': 0.0,
   'prediction_text': '',
   'score': -0.14271984}],
 [{'id': '56ddde6b9a695914005b9628',
   'no_answer_probability': 0.0,
   'prediction_text': '',
   'score': 2.086495}]

In [36]:
# Display the first validation example, to compare its reference answers with the predictions.
datasets["validation"][0]

{'answers': {'answer_start': [159, 159, 159, 159],
  'text': ['France', 'France', 'France', 'France']},
 'context': 'The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.',
 'id': '56ddde6b9a695914005b9628',
 'question': 'In what country is Normandy located?',
 'title': 'Normans'}