In [None]:
# Steps done:

# 1. System that takes a question and returns the top-k documents, which should contain the correct answer.

# 2. System that extracts the answer.

In [1]:
!pip install beir datasets transformers

Collecting beir
  Downloading beir-0.2.3.tar.gz (52 kB)
[K     |████████████████████████████████| 52 kB 644 kB/s 
[?25hCollecting datasets
  Downloading datasets-1.15.1-py3-none-any.whl (290 kB)
[K     |████████████████████████████████| 290 kB 17.4 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 56.6 MB/s 
[?25hCollecting sentence-transformers
  Downloading sentence-transformers-2.1.0.tar.gz (78 kB)
[K     |████████████████████████████████| 78 kB 6.9 MB/s 
[?25hCollecting pytrec_eval
  Downloading pytrec_eval-0.5.tar.gz (15 kB)
Collecting faiss_cpu
  Downloading faiss_cpu-1.7.1.post2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.4 MB)
[K     |████████████████████████████████| 8.4 MB 20.2 MB/s 
[?25hCollecting elasticsearch
  Downloading elasticsearch-7.15.2-py2.py3-none-any.whl (379 kB)
[K     |████████████████████████████████| 379 kB 66.8 MB/s 
Collecting tensorflo

In [2]:
from utils_index import *
from utils import *
from utils_addcontext import *

In [3]:
# Load the dbpedia dataset (BEIR "dbpedia-entity" archive, test split)

dataset = "dbpedia"
# The original called .format(dataset) on a string without any placeholder, so `dataset`
# was silently ignored; build the URL from it explicitly (the resulting URL is unchanged).
url = f"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{dataset}-entity.zip"
data_path = util.download_and_unzip(url, "datasets")
corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")

datasets/dbpedia-entity.zip:   0%|          | 0.00/610M [00:00<?, ?iB/s]

  0%|          | 0/4635922 [00:00<?, ?it/s]

In [4]:
# Per-device batch size; reused for both training and evaluation in TrainingArguments.
batch_size = 16

In [5]:
from datasets import load_dataset, load_metric, DatasetDict
# Switch between the "squad_v2" and "squad" datasets (and, later, the matching metric).
squad_v2 = True

# Only the first 10% of the training split is requested; the validation split is loaded in full.
train, validation = load_dataset(
    "squad_v2" if squad_v2 else "squad",
    split=["train[:10%]", "validation"],
)
datasets = DatasetDict({"train": train, "validation": validation})

Downloading:   0%|          | 0.00/1.87k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading and preparing dataset squad_v2/squad_v2 (download: 44.34 MiB, generated: 122.41 MiB, post-processed: Unknown size, total: 166.75 MiB) to /root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d...


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/9.55M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/801k [00:00<?, ?B/s]

  0%|          | 0/2 [00:00<?, ?it/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset squad_v2 downloaded and prepared to /root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
  
# Load the tokenizer first, then the QA model, both from the same hub checkpoint.
tokenizer, model = (
    loader.from_pretrained("mvonwyl/distilbert-base-uncased-finetuned-squad2")
    for loader in (AutoTokenizer, AutoModelForQuestionAnswering)
)

Downloading:   0%|          | 0.00/333 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/561 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/253M [00:00<?, ?B/s]

In [7]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

# Keep the part after the last "/" (this literal contains none, so it is used unchanged).
model_name = "distilbert-base-uncased-finetuned-squad2".rsplit("/", 1)[-1]

# The first TrainingArguments parameter is the output directory.
args = TrainingArguments(
    output_dir=model_name + "-finetuned-squad2",
    evaluation_strategy="epoch",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=False,
)

In [8]:
from transformers import default_data_collator

# Use the library's default collator unchanged; it is handed to the Trainer below.
data_collator = default_data_collator

In [9]:
# No train/eval datasets are attached here (they are commented out); in the cells shown
# the trainer is only used through trainer.predict(...).
trainer = Trainer(
    model=model,
    args=args,
    # train_dataset=tokenized_datasets["train"],
    # eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [10]:
# Settings forwarded to prepare_validation_features in the cells below.
max_length = 384  # The maximum length of a feature (question and context)
doc_stride = 128
# True when the tokenizer pads on the right-hand side.
pad_on_right = (tokenizer.padding_side == "right")

In [11]:
# Tokenize the whole validation split in batches; the raw columns are dropped so that only
# the columns produced by prepare_validation_features remain.
validation_features = datasets["validation"].map(
    lambda batch: prepare_validation_features(batch, tokenizer, pad_on_right, max_length, doc_stride),
    remove_columns=datasets["validation"].column_names,
    batched=True,
)

  0%|          | 0/12 [00:00<?, ?ba/s]

In [12]:
# Add DBPedia context into our dataset

# NOTE(review): presumably the corpus passages that qrels marks as relevant — confirm in the utils module.
all_good_entries = get_all_good_entries(qrels, corpus)
# Intermediate representation of the SQuAD DatasetDict; its exact shape is defined by transform_into_dico.
dico = transform_into_dico(datasets)
all_questions, all_contexts, all_titles, all_answers = create_list_of_all(dico)
# Put the DBpedia entries after the SQuAD contexts (assumes both are lists — TODO confirm).
# The name is rebound to a new object rather than extended in place.
all_contexts = all_contexts + all_good_entries

In [13]:
# Put it into a DataFrame
# Each column is wrapped in a Series so the columns may differ in length: the extra
# context-only rows end up with missing question/title/answers values.
df = pd.DataFrame({
    column: pd.Series(values)
    for column, values in (
        ("question", all_questions),
        ("context", all_contexts),
        ("title", all_titles),
        ("answers", all_answers),
    )
})
df

Unnamed: 0,question,context,title,answers
0,In what country is Normandy located?,The Normans (Norman: Nourmands; French: Norman...,Normans,"{'text': ['France', 'France', 'France', 'Franc..."
1,Who was the duke in the battle of Hastings?,"The Norman dynasty had a major political, cult...",Normans,"{'text': ['William the Conqueror', 'William th..."
2,What is the original meaning of the word Norman?,"The English name ""Normans"" comes from the Fren...",Normans,"{'text': ['Viking', 'Norseman, Viking', 'Norse..."
3,When was the Duchy of Normandy founded?,"In the course of the 10th century, the initial...",Normans,"{'text': ['911', '911', '911'], 'answer_start'..."
4,Who upon arriving gave the original viking set...,"Before Rollo's arrival, its populations did no...",Normans,"{'text': ['Rollo', 'Rollo', 'Rollo'], 'answer_..."
...,...,...,...,...
11847,,The World Meteorological Organization (WMO) is...,,
11848,,The World Veterans Federation (WVF) is the wor...,,
11849,,World Vision International is an Evangelical C...,,
11850,,ZF Electronics GmbH (formerly known as Cherry ...,,


In [14]:
# Collect all unique questions from our dataset
# NOTE(review): q_a is later passed to MMR_test together with the search results; presumably it maps
# each question to its expected context/answer — confirm in the utils module.

unique_questions, q_a = get_all_unique_questions(all_questions, df)

In [15]:
%%time
# Build an index over every context in df, search it with all unique questions
# (10 results per question), then score the returned ids with MMR_test.
model_distilbert_tas = SentenceTransformer('msmarco-distilbert-base-tas-b') # best performance overall
index = create_index(df.context.to_list(), model_distilbert_tas, df)
D, I = doc_search(unique_questions, model_distilbert_tas, index, num_results=10)
MMR_test(I, unique_questions, q_a)
# NOTE(review): to measure MRR over the full ranking, num_results should presumably be the number of
# indexed contexts ("len"); otherwise, specify 10 for a top-10 search.
# NOTE(review): the helper is spelled MMR_test while this note talks about MRR — confirm which is meant.

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.95k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/548 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

KeyboardInterrupt: ignored

In [None]:
# Retrieve candidate contexts for the first unique question (ten of them, judging by the helper's name).
top10 = get_top10_context(unique_questions[0], model_distilbert_tas, index, df.context.to_list())

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Same retrieval as the previous cell, then display the first returned context.
top10 = get_top10_context(unique_questions[0], model_distilbert_tas, index, df.context.to_list())
top10[0]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

'The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.'

In [None]:
# Show the first raw validation example.
datasets["validation"][0]

{'answers': {'answer_start': [159, 159, 159, 159],
  'text': ['France', 'France', 'France', 'France']},
 'context': 'The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.',
 'id': '56ddde6b9a695914005b9628',
 'question': 'In what country is Normandy located?',
 'title': 'Normans'}

In [None]:
# select([0]) gives a one-row Dataset (not a plain dict) holding only example 0.
datasets["validation"].select([0])

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 1
})

In [None]:
# Tokenize only the first validation example; the raw columns are dropped afterwards.
validation_features = datasets["validation"].select([0]).map(
    lambda batch: prepare_validation_features(batch, tokenizer, pad_on_right, max_length, doc_stride),
    remove_columns=datasets["validation"].column_names,
    batched=True,
)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
# Run the QA model on the tokenized features; the .predictions attribute is what gets post-processed later.
raw_predictions = trainer.predict(validation_features)

The following columns in the test set  don't have a corresponding argument in `DistilBertForQuestionAnswering.forward` and have been ignored: offset_mapping, example_id.
***** Running Prediction *****
  Num examples = 1
  Batch size = 16


In [None]:
from tqdm import tqdm
def postprocess_qa_prediction(examples, features, raw_predictions, tokenizer, squad_v2, n_best_size = 20, max_answer_length = 30):
    """Convert raw start/end logits into one predicted answer per example.

    Args:
        examples: The untokenized examples. Must support ``examples["id"]``, ``len()`` and
            iteration over records exposing ``"id"`` and ``"context"``.
        features: The tokenized features. Each one must expose ``"example_id"``,
            ``"offset_mapping"`` and ``"input_ids"``.
        raw_predictions: A pair ``(all_start_logits, all_end_logits)`` indexed by feature position.
        tokenizer: Only ``tokenizer.cls_token_id`` is read, to find the CLS position whose
            logits score the "no answer" option.
        squad_v2: When true, the best span is compared against the null score and the
            returned values are ``(answer_text, best_span_score)`` tuples; otherwise the
            values are plain answer strings.
        n_best_size: How many of the highest start logits and end logits are combined.
        max_answer_length: Maximum span length, counted in tokens.

    Returns:
        An ``OrderedDict`` keyed by example id, in the order of ``examples``.
    """
    # Local import: the notebook only imports `collections` in a later cell.
    import collections

    all_start_logits, all_end_logits = raw_predictions
    # Build a map example to its corresponding features.
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    # The dictionaries we have to fill.
    predictions = collections.OrderedDict()

    # Logging.
    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    # Let's loop over all the examples!
    for example_index, example in enumerate(tqdm(examples)):
        # Those are the indices of the features associated to the current example.
        feature_indices = features_per_example[example_index]

        min_null_score = None # Only used if squad_v2 is True.
        valid_answers = []

        context = example["context"]
        # Looping through all the features associated to the current example.
        for feature_index in feature_indices:
            # We grab the predictions of the model for this feature.
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            # This is what will allow us to map some the positions in our logits to span of texts in the original
            # context.
            offset_mapping = features[feature_index]["offset_mapping"]

            # Update minimum null prediction. The previous comparison was inverted
            # (`min_null_score < feature_null_score`), which kept the *maximum* null score.
            cls_index = features[feature_index]["input_ids"].index(tokenizer.cls_token_id)
            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            if min_null_score is None or feature_null_score < min_null_score:
                min_null_score = feature_null_score

            # Go through all possibilities for the `n_best_size` greater start and end logits.
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
                    # to part of the input_ids that are not in the context.
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    # Don't consider answers with a length that is either < 0 or > max_answer_length.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )
        if len(valid_answers) > 0:
            # sorted() rather than the builtin max(): another cell of this notebook rebinds
            # the global name `max`, so the builtin may not be callable here.
            best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
        else:
            # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid
            # failure.
            best_answer = {"text": "", "score": 0.0}

        # Let's pick our final answer: the best one or the null answer (only for squad_v2)
        if not squad_v2:
            predictions[example["id"]] = best_answer["text"]
        else:
            # An example without any feature has no null score to compare against; keep the
            # (empty) best answer instead of failing on a comparison with None.
            beats_null = min_null_score is None or best_answer["score"] > min_null_score
            answer = best_answer["text"] if beats_null else ""
            predictions[example["id"]] = (answer, best_answer["score"])

    return predictions

In [None]:
# One-row Dataset holding only the first validation example.
first = datasets["validation"].select([0])

In [None]:
# NOTE(review): this assigns into the object returned by first["context"]; presumably that is a
# temporary Python list, so the dataset itself stays unchanged — confirm. The replace_context()
# + map() approach in the following cells is what is actually used to swap the context.
first["context"][0] = "hi"

In [None]:
# Check whether the assignment in the previous cell changed the stored context.
first["context"][0]

In [None]:
# Display the validation split.
datasets["validation"]

In [None]:
def replace_context(example, context):
  """Overwrite the ``'context'`` entry of ``example`` with ``context`` and return ``example``.

  The mapping is modified in place; the same object is handed back so the function
  can be used directly as a ``Dataset.map`` callback.
  """
  example.update({'context': context})
  return example

In [None]:
# Build a one-row copy of validation example 0 whose context is replaced by the string "hi".
new_dataset = datasets["validation"].select([0]).map(lambda x: replace_context(x, "hi"))

In [None]:
# Inspect the context column of the mapped dataset.
new_dataset["context"]

In [None]:
# Tokenize the context-swapped example; the raw columns are dropped afterwards.
validation_features = new_dataset.map(
    lambda batch: prepare_validation_features(batch, tokenizer, pad_on_right, max_length, doc_stride),
    remove_columns=datasets["validation"].column_names,
    batched=True,
)

In [None]:
# Run the QA model on the features of the context-swapped example.
raw_predictions = trainer.predict(validation_features)

In [None]:
import collections
# Turn the raw logits into answers keyed by example id (with squad_v2 set, each value is an (answer, score) tuple).
final_predictions = postprocess_qa_prediction(new_dataset, validation_features, raw_predictions.predictions, tokenizer, squad_v2)

In [None]:
from io import StringIO 
import sys

class Capturing(list):
    """A list that, used as a context manager, collects the lines written to stdout.

    Inside the ``with`` block ``sys.stdout`` points at an in-memory buffer; on exit the
    buffered text is split into lines, appended to this list, and the previous stdout
    is put back.
    """

    def __enter__(self):
        """Redirect ``sys.stdout`` to a fresh buffer and return this list."""
        buffer = StringIO()
        self._stdout = sys.stdout
        self._stringio = buffer
        sys.stdout = buffer
        return self

    def __exit__(self, *args):
        """Store the captured lines, drop the buffer and restore the saved stdout."""
        for line in self._stringio.getvalue().splitlines():
            self.append(line)
        del self._stringio    # free up some memory
        sys.stdout = self._stdout

In [None]:
# One-row Dataset holding only the second validation example (index 1).
datasets["validation"].select([1])

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 1
})

In [None]:
def get_max(preds):
  """Return the highest-scoring prediction among the per-context candidates.

  Args:
    preds: Iterable of non-empty sequences; the first item of each sequence is a dict
      that contains a numeric ``"score"`` key.

  Returns:
    A copy of the best dict without its ``"score"`` key, or ``{}`` when ``preds`` is
    empty. On ties the earliest candidate wins.

  The previous version started from a baseline score of 0, so it raised ``KeyError``
  whenever every score was <= 0 (or ``preds`` was empty). It also removed ``"score"``
  from the caller's dict and shadowed the builtin ``max``.
  """
  best = None
  for candidates in preds:
    top = candidates[0]
    if best is None or top["score"] > best["score"]:
      best = top
  if best is None:
    return {}
  print(best)
  # Build a new dict so the input predictions keep their score.
  return {key: value for key, value in best.items() if key != "score"}

In [None]:
import collections
from IPython.display import clear_output

# For every validation question: retrieve candidate contexts, run the QA model on each
# of them, and keep the answer with the highest span score.
if not squad_v2:
  # postprocess_qa_prediction only returns a score when squad_v2 is set, and the ranking
  # below needs it; previously this case failed later on an undefined or stale
  # `formatted_predictions`.
  raise NotImplementedError("Ranking answers across retrieved contexts requires squad_v2=True.")

preds = []
validation_set = datasets["validation"]
# Loop-invariant work hoisted out of the loop: the original re-materialised the whole
# question column and the whole context list on every iteration.
validation_questions = validation_set["question"]
candidate_contexts = df.context.to_list()

for i in range(len(validation_set)):
  top10 = get_top10_context(validation_questions[i], model_distilbert_tas, index, candidate_contexts)
  predictions = []
  for context in top10:
    clear_output(wait=True)
    # Swallow everything printed to stdout while predicting on a single context.
    with Capturing():
      new_dataset = validation_set.select([i]).map(lambda x: replace_context(x, context))
      validation_features = new_dataset.map(
        lambda x: prepare_validation_features(x, tokenizer, pad_on_right, max_length, doc_stride),
        batched=True,
        remove_columns=validation_set.column_names
      )
      raw_predictions = trainer.predict(validation_features)
      final_predictions = postprocess_qa_prediction(new_dataset, validation_features, raw_predictions.predictions, tokenizer, squad_v2)
      formatted_predictions = [{"id": k, "prediction_text": v[0], "no_answer_probability": 0.0, "score": v[1]} for k, v in final_predictions.items()]
      predictions.append(formatted_predictions)
  # Not named `max` any more: that rebound the builtin for every later cell.
  best_prediction = get_max(predictions)
  preds.append(best_prediction)


Loading cached processed dataset at /root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d/cache-27c364bc2154e27b.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d/cache-8a332e3ca914a8b9.arrow
The following columns in the test set  don't have a corresponding argument in `DistilBertForQuestionAnswering.forward` and have been ignored: offset_mapping, example_id.
***** Running Prediction *****
  Num examples = 1
  Batch size = 16


100%|██████████| 1/1 [00:00<00:00, 353.62it/s]

{'id': '56ddde6b9a695914005b962a', 'prediction_text': 'Denmark, Iceland and Norway', 'no_answer_probability': 0.0, 'score': 17.268557}





In [None]:
# Best prediction per question collected so far.
preds

[{'id': '56ddde6b9a695914005b9628',
  'no_answer_probability': 0.0,
  'prediction_text': 'France'},
 {'id': '56ddde6b9a695914005b9629',
  'no_answer_probability': 0.0,
  'prediction_text': '10th and 11th centuries'},
 {'id': '56ddde6b9a695914005b962a',
  'no_answer_probability': 0.0,
  'prediction_text': 'Denmark, Iceland and Norway'}]

In [None]:
metric = load_metric("squad_v2" if squad_v2 else "squad")
# Score only the examples that have a prediction, so that a partial `preds` list (for
# example after interrupting the loop above) still lines up one-to-one with the references.
predicted_ids = {prediction["id"] for prediction in preds}
# The original list comprehension had a stray ")" and did not parse.
references = [
    {"id": ex["id"], "answers": ex["answers"]}
    for ex in datasets["validation"]
    if ex["id"] in predicted_ids
]
metric.compute(predictions=preds, references=references)

{'HasAns_exact': 100.0,
 'HasAns_f1': 100.0,
 'HasAns_total': 3,
 'best_exact': 100.0,
 'best_exact_thresh': 0.0,
 'best_f1': 100.0,
 'best_f1_thresh': 0.0,
 'exact': 100.0,
 'f1': 100.0,
 'total': 3}