# Data Preprocessing

This notebook gathers the preprocessing stages applied to each dataset before the analysis carried out in our experiments.

In [1]:
# Root folders for the raw datasets and for every generated artifact.
DATASET_DIR = "../datasets"
OUTPUT_DIR = "../outputs"

# Dataset/split to preprocess: keep exactly one of the assignments below active.
DATASET_NAME, SPLIT_NAME = "squad", "validation"
# DATASET_NAME, SPLIT_NAME = "newsqa", "validation"
# DATASET_NAME, SPLIT_NAME = ('squadshifts', 'new_wiki'), "test"
# DATASET_NAME, SPLIT_NAME = ('squadshifts', 'nyt'), "test"
# DATASET_NAME, SPLIT_NAME = ('squadshifts', 'amazon'), "test"
# DATASET_NAME, SPLIT_NAME = ('squadshifts', 'reddit'), "test"

# Tuple names are collapsed into a single string so they can be used in paths.
# NOTE(review): the parts are concatenated without any separator
# (e.g. "squadshifts" + "new_wiki" -> "squadshiftsnew_wiki") -- confirm this
# is the name the loading helper expects.
if isinstance(DATASET_NAME, tuple):
    DATASET_NAME = "".join(DATASET_NAME)

# Prefix shared by the files generated for this dataset/split.
BASE_FILENAME = "_".join((DATASET_NAME, SPLIT_NAME))
# Flag later forwarded as the `local` loading argument; only the datasets
# listed in the tuple are marked as local.
IS_LOCAL_FS_DATASET = DATASET_NAME in ("newsqa",)

import os

# Every artifact for this dataset/split lives under ROOT_DIR; the
# preprocessed matrix gets its own subfolder.
ROOT_DIR = f"{OUTPUT_DIR}/results/{DATASET_NAME}/{SPLIT_NAME}"
MATRIX_DIR = f"{ROOT_DIR}/matrix"
# Pure-Python equivalent of `mkdir -p`: creates missing parents, is a no-op
# when the directory already exists, and does not pass the path through a
# shell (so names with spaces or special characters are safe).
os.makedirs(MATRIX_DIR, exist_ok=True)

# Target path of the preprocessed matrix.
MATRIX_FILEPATH = f"{MATRIX_DIR}/{BASE_FILENAME}_preprocessed.csv.gz"
print("Writing matrix at filepath:", MATRIX_FILEPATH)

# Random seed kept alongside the rest of the configuration.
SEED = 42

# Keyword arguments shared by the calls that read/write the CSV files on disk.
csv_kwargs = dict(compression="gzip", encoding="utf-8")

# ----------------------------------------
# Column names
# ----------------------------------------
# Column holding the identifier of each example.
UNIQUE_ID_COL = "id"
print(f"Using {UNIQUE_ID_COL} as the unique column")

# Names of the text columns referenced by the preprocessing steps.
QUESTION_COLNAME, CONTEXT_COLNAME, ANSWER_COLNAME = "question", "context", "answers"

# Columns later passed as `features` when generating the per-answer identifier.
UUID_FEATURES = [UNIQUE_ID_COL, ANSWER_COLNAME]
UUID_FEATURES

Writing matrix at filepath: ../outputs/results/squad/validation/matrix/squad_validation_preprocessed.csv.gz
Using id as the unique column


['id', 'answers']

In [2]:
from utils.datasets import load_dataset, unfold_multiple_answers, create_metadata

In [3]:
# Pool of keyword arguments for the loading/unfolding helpers. Each helper
# receives only a filtered subset of these (see the `filter_params` calls).
LOAD_KWARGS = dict(
    dataset=DATASET_NAME,
    split=SPLIT_NAME,
    local=IS_LOCAL_FS_DATASET,
    local_dir=DATASET_DIR,
    # Extra arguments keyed by the name of the answers column.
    fn_kwargs=dict(answer_col=ANSWER_COLNAME),
)

In [4]:
from utils_generic import filter_params, generate_uuid

# Keep only the entries of LOAD_KWARGS selected by `filter_params` for
# `load_dataset` (in the recorded run: dataset, split and local).
load_kwargs = filter_params(LOAD_KWARGS, load_dataset)
print(f"\n--> Loading dataset with arguments: {load_kwargs}")

data = load_dataset(**load_kwargs)
print(f"Loaded dataset with {len(data)} examples: {data}")


--> Loading dataset with arguments: {'dataset': 'squad', 'split': 'validation', 'local': False}


Reusing dataset squad (/home/kat/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


Loaded dataset with 10570 examples: Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 10570
})


In [5]:
# Step 1: unfold the answers. In the recorded run the dataset grows from
# 10570 to 18015 rows and gains an `answers_multi_way` column.
# NOTE(review): `unfold_kwargs` is empty in the recorded run, so the
# `fn_kwargs` entry of LOAD_KWARGS is not forwarded -- confirm this is intended.
unfold_kwargs = filter_params(LOAD_KWARGS, unfold_multiple_answers)
print("\n--> Unfolding (aka flattening) dataset with arguments:", unfold_kwargs)
data = data.map(unfold_multiple_answers, batched=True, **unfold_kwargs)
print("Resulting dataset has", len(data), "examples:", data)

# Step 2: add the `<answers>_id` column built from UUID_FEATURES.
print("\n--> Generate unique identifier using", UUID_FEATURES)
data = create_metadata(data, col=f"{ANSWER_COLNAME}_id", features=UUID_FEATURES, **unfold_kwargs)

# Step 3: rename the columns to the names used downstream. The keys are derived
# from the configured column constants (instead of a hard-coded "id") so the
# mapping stays in sync with UNIQUE_ID_COL / ANSWER_COLNAME.
COLS_NAMES = {
    UNIQUE_ID_COL: "example_id",
    ANSWER_COLNAME: "labels",
    f"{ANSWER_COLNAME}_multi_way": "multi_way_labels",
}

print("\n--> Renaming column names", COLS_NAMES)
data = data.rename_columns(COLS_NAMES)
print(data)

Loading cached processed dataset at /home/kat/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453/cache-fb4084db8cbaec02.arrow
Loading cached processed dataset at /home/kat/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453/cache-dd4ac4f01f4647df.arrow



--> Unfolding (aka flattening) dataset with arguments: {}
Resulting dataset has 18015 examples: Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'answers_multi_way'],
    num_rows: 18015
})

--> Generate unique identifier using ['id', 'answers']

--> Renaming column names {'id': 'example_id', 'answers': 'labels', 'answers_multi_way': 'multi_way_labels'}
Dataset({
    features: ['example_id', 'title', 'context', 'question', 'labels', 'multi_way_labels', 'answers_id'],
    num_rows: 18015
})


In [6]:
# Spot-check the serialized multi-way labels of a single example (row 2).
data[2]["multi_way_labels"]

'["Levi\'s Stadium", "Levi\'s Stadium in the San Francisco Bay Area at Santa Clara, California.", \'Santa Clara, California\']'

### Dump matrix

For some reason, we are not able to store the matrix in a compressed format using the `datasets` library, so it is written as a plain CSV (the target path without the `.gz` suffix).

In [15]:
# The slice drops the trailing ".gz" from the target path.
print(f"Storing matrix at: {MATRIX_FILEPATH[:-3]}")

Storing matrix at: ../outputs/results/squad/validation/matrix/squad_validation_preprocessed.csv


In [16]:
# Write the preprocessed matrix as a plain CSV (target path minus ".gz").
# The `compression` entry of `csv_kwargs` is dropped on purpose: in the
# recorded run it had no effect on the written file (same ~18M size, plus a
# pandas warning), so forwarding it was only misleading.
# The call returns the number of bytes written, which is shown as cell output.
data.to_csv(
    MATRIX_FILEPATH[:-3],
    index=False,
    **{key: value for key, value in csv_kwargs.items() if key != "compression"},
)

Creating CSV from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

  ioargs = _get_filepath_or_buffer(


18097200

In [18]:
import pandas as pd

# Sanity check: read the stored CSV back and look at the multi-way labels
# of its last rows.
d = pd.read_csv(MATRIX_FILEPATH[:-3]).tail()
d["multi_way_labels"]

18010    ['metric slug', 'slug', 'the metric slug']
18011    ['metric slug', 'slug', 'the metric slug']
18012    ['metric slug', 'slug', 'the metric slug']
18013                                       ['kip']
18014                                    ['sthène']
Name: multi_way_labels, dtype: object

In [19]:
# List the contents of the matrix folder (human-readable sizes, sorted by time) to check the written files.
!ls -latrh {MATRIX_DIR}

total 37M
-rw-rw-r-- 1 kat kat 1.6M Mar  1 20:34 allenaiunifiedqa-t5-small.csv.gz
-rwxrwxr-x 1 kat kat    0 Mar  1 20:37 allenaiunifiedqa-t5-small.csv.gz.lock
drwxrwxr-x 5 kat kat 4.0K Mar  2 04:53 ..
drwxrwxr-x 2 kat kat 4.0K Mar  2 07:00 .
-rw-rw-r-- 1 kat kat  18M Mar  2 07:00 squad_validation_preprocessed.csv.gz
-rw-rw-r-- 1 kat kat  18M Mar  2 07:01 squad_validation_preprocessed.csv
