## Preprocess BR SQuAD v2

In [1]:
from html.entities import name2codepoint
import re

# Matches only the *name* of a named HTML entity (the text between '&' and ';').
# Kept unchanged in case other code refers to it.
REGEX_PATTERN = re.compile(r'(?<=\&)\w+(?=\;)')

# Matches a whole named entity ("&name;") and captures the name.
_ENTITY_PATTERN = re.compile(r'&(\w+);')


def replace_htmlcodecs(text):
    """Replace named HTML entities in ``text`` with the characters they denote.

    Only names listed in ``html.entities.name2codepoint`` are decoded
    (e.g. ``&amp;`` -> ``&``). Anything else of the form ``&word;`` is left
    untouched instead of raising ``KeyError``. Numeric references such as
    ``&#39;`` are not matched by the pattern and are therefore left as-is.

    Args:
        text: Input string, possibly containing ``&name;`` sequences.

    Returns:
        A new string with every recognised named entity decoded.
    """
    def _decode(match_obj):
        codepoint = name2codepoint.get(match_obj.group(1))
        # Unknown entity name: keep the original "&name;" text verbatim.
        return match_obj.group(0) if codepoint is None else chr(codepoint)

    # A single pass over the input: text produced by one replacement is never
    # re-scanned, so "&amp;lt;" decodes to "&lt;" rather than all the way to "<".
    return _ENTITY_PATTERN.sub(_decode, text)


In [2]:
def preprocess_data(data_collection):
    """Flatten SQuAD-v2-style articles into one record per answerable question.

    For every question that is not flagged ``is_impossible``, the first answer
    whose (entity-decoded) text is found in the context exactly at
    ``answer_start`` is used, and the remaining answers of that question are
    ignored. Questions with no such answer produce no record.

    Args:
        data_collection: Iterable of article dicts. Each has a ``"paragraphs"``
            list; each paragraph has ``"context"`` and ``"qas"``; each qa has
            ``"question"``, ``"answers"`` (dicts with ``"text"`` and
            ``"answer_start"``) and optionally ``"is_impossible"``.

    Returns:
        A list of dicts with the keys ``paragraph``, ``paragraph_id``,
        ``question``, ``answer``, ``paragraph_answer`` (the context with the
        answer wrapped in ``" <h1> "`` markers) and ``paragraph_question``.
    """
    highlight = " <h1> "
    examples = []

    for example in data_collection:
        for paragraph in example["paragraphs"]:
            context = paragraph["context"]

            for qa in paragraph["qas"]:
                # A missing flag is treated as "answerable".
                if qa.get("is_impossible", False):
                    continue

                for answer in qa["answers"]:
                    answer_start = answer["answer_start"]
                    answer_text = replace_htmlcodecs(answer["text"])
                    answer_end = answer_start + len(answer_text)

                    # NOTE(review): answers located at offset 0 are skipped, as in
                    # the original code. Presumably 0 is treated as an unreliable
                    # offset -- confirm, since the slice comparison below already
                    # validates the alignment.
                    if answer_start == 0:
                        continue
                    # The decoded answer must match the raw context at the given offset.
                    if context[answer_start:answer_end] != answer_text:
                        continue

                    # Wrap the answer span in highlight markers.
                    answer_ctx = (
                        context[:answer_start]
                        + highlight
                        + context[answer_start:answer_end]
                        + highlight
                        + context[answer_end:]
                    )
                    question = qa["question"]
                    examples.append({
                        "paragraph": replace_htmlcodecs(context),
                        # The id of the paragraph's first qa entry serves as the paragraph id.
                        "paragraph_id": paragraph["qas"][0]["id"],
                        "question": question,
                        "answer": answer_text,
                        "paragraph_answer": replace_htmlcodecs(answer_ctx),
                        # NOTE(review): unlike "paragraph", this uses the raw context
                        # (HTML entities not decoded) -- confirm this is intended.
                        "paragraph_question": f"question: {question} paragraph: {context}",
                    })
                    # Only the first matching answer per question is kept.
                    break

    return examples

In [3]:
import json
import os
from pathlib import Path

# Directory holding the dataset JSON files. The default is the original
# location; set BRQUAD_DATA_DIR to run the notebook on another machine.
DATA_DIR = Path(os.environ.get("BRQUAD_DATA_DIR", "/home/tiagoblima/repos/br-quad-2.0/data"))


def load_examples(path):
    """Read one SQuAD-style JSON file and return its preprocessed examples.

    Args:
        path: Path to a JSON file whose top-level ``"data"`` key holds the articles.

    Returns:
        The list produced by ``preprocess_data`` for that file.
    """
    # Explicit UTF-8 so the accented text does not depend on the platform default.
    with open(path, encoding="utf-8") as f:
        articles = json.load(f)["data"]
    # The file is closed before the (slower) preprocessing starts.
    return preprocess_data(articles)


train_examples = load_examples(DATA_DIR / "brquad-gte-train-v2.0.json")
dev_examples = load_examples(DATA_DIR / "brquad-gte-dev-v2.0.json")

In [4]:
import pandas as pd
import datasets as dts
from datasets import DatasetDict

# Map each split name to its list of example dicts; validation first, then
# train, which is the order the splits appear in the resulting DatasetDict.
split_examples = {
    "validation": dev_examples,
    "train": train_examples,
}

# Each list of dicts goes through a DataFrame and is then converted to a Dataset.
dataset = DatasetDict({
    split_name: dts.Dataset.from_pandas(pd.DataFrame(rows))
    for split_name, rows in split_examples.items()
})
dataset

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    validation: Dataset({
        features: ['paragraph', 'paragraph_id', 'question', 'answer', 'paragraph_answer', 'paragraph_question'],
        num_rows: 1946
    })
    train: Dataset({
        features: ['paragraph', 'paragraph_id', 'question', 'answer', 'paragraph_answer', 'paragraph_question'],
        num_rows: 55135
    })
})

In [5]:
import os

# SECURITY: an access token was previously hardcoded on this line. Treat that
# token as leaked and revoke it in the Hugging Face account settings.
# The token is now read from the HF_TOKEN environment variable. When it is unset
# the value is None, which (per the `datasets` documentation) makes the library
# fall back to the credentials stored by `huggingface-cli login`.
HF_TOKEN = os.environ.get("HF_TOKEN")
dataset.push_to_hub("tiagoblima/br_squadv2", token=HF_TOKEN)

Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 50.35ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:07<00:00,  7.83s/it]
Creating parquet from Arrow format: 100%|██████████| 56/56 [00:00<00:00, 159.53ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:47<00:00, 47.34s/it]
README.md: 100%|██████████| 607/607 [00:00<00:00, 1.55MB/s]


In [7]:
# Load the dataset back from the Hub repository it was pushed to.
processed_dataset = dts.load_dataset(
    path="tiagoblima/br_squadv2",
    token=HF_TOKEN,
)
processed_dataset

DatasetDict({
    validation: Dataset({
        features: ['paragraph', 'paragraph_id', 'question', 'answer', 'paragraph_answer', 'paragraph_question'],
        num_rows: 1946
    })
    train: Dataset({
        features: ['paragraph', 'paragraph_id', 'question', 'answer', 'paragraph_answer', 'paragraph_question'],
        num_rows: 55135
    })
})

In [13]:
from preprocess import utils

# Spot-check the project's `utils.nlp` pipeline on the first training example.
first_train_row = processed_dataset["train"][0]
paragraph_answer_text = first_train_row["paragraph_answer"]
doc = utils.nlp(paragraph_answer_text)

# Print the three custom `doc._` attributes that the next cells rely on.
for extension_name in ("answer_sentence", "sentences", "paragraph_sentence"):
    print(getattr(doc._, extension_name))

  return torch._C._cuda_getDeviceCount() > 0


[Nascida e criada em Houston, Texas, ela se apresentou em várias competições de canto e dança quando criança, e ganhou fama  <h1> no final dos anos 90 <h1>  como vocalista do grupo feminino de R&B Destiny's Child., Nascida e criada em Houston, Texas, ela se apresentou em várias competições de canto e dança quando criança, e ganhou fama  <h1> no final dos anos 90 <h1>  como vocalista do grupo feminino de R&B Destiny's Child.]
[Nascida e criada em Houston, Texas, ela se apresentou em várias competições de canto e dança quando criança, e ganhou fama   no final dos anos 90   como vocalista do grupo feminino de R&B Destiny 's Child., Nascida e criada em Houston, Texas, ela se apresentou em várias competições de canto e dança quando criança, e ganhou fama   no final dos anos 90   como vocalista do grupo feminino de R&B Destiny 's Child.]
[Beyoncé Giselle Knowles-Carter( nascida em 4 de setembro de 1981) é uma cantora, compositora, produtora de discos e atriz norte-americana. <h1>Nascida e cr

In [14]:
def preprocess_spacy(example):
    """Add sentence-level fields derived from the highlighted paragraph.

    Runs ``utils.nlp`` on ``example["paragraph_answer"]`` and stores the text
    of the first element of three custom ``doc._`` attributes on the example.

    Args:
        example: A dataset row (dict-like) with a ``"paragraph_answer"`` key.

    Returns:
        The same ``example`` object, updated in place with the keys
        ``"sentence"``, ``"answer_sentence"`` and ``"paragraph_sentence"``.
    """
    parsed = utils.nlp(example["paragraph_answer"])

    # (output column, name of the custom attribute it is taken from)
    field_sources = (
        ("sentence", "sentences"),
        ("answer_sentence", "answer_sentence"),
        ("paragraph_sentence", "paragraph_sentence"),
    )
    for column, extension_name in field_sources:
        # Only the first item of each attribute is kept.
        example[column] = getattr(parsed._, extension_name)[0].text

    return example


In [15]:
# Apply the sentence-extraction step to every row of every split.
processed_dataset_spacy = processed_dataset.map(
    function=preprocess_spacy,
)
processed_dataset_spacy

Map: 100%|██████████| 1946/1946 [00:06<00:00, 293.31 examples/s]
Map: 100%|██████████| 55135/55135 [03:08<00:00, 291.84 examples/s]


DatasetDict({
    validation: Dataset({
        features: ['paragraph', 'paragraph_id', 'question', 'answer', 'paragraph_answer', 'paragraph_question', 'sentence', 'answer_sentence', 'paragraph_sentence'],
        num_rows: 1946
    })
    train: Dataset({
        features: ['paragraph', 'paragraph_id', 'question', 'answer', 'paragraph_answer', 'paragraph_question', 'sentence', 'answer_sentence', 'paragraph_sentence'],
        num_rows: 55135
    })
})

In [16]:
# Publish the dataset with the added sentence columns to its own Hub repository.
processed_dataset_spacy.push_to_hub(
    repo_id='tiagoblima/qg_br_squadv2',
    token=HF_TOKEN,
)

Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 55.91ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:07<00:00,  7.62s/it]
Creating parquet from Arrow format: 100%|██████████| 56/56 [00:00<00:00, 91.44ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [03:05<00:00, 185.22s/it]
