# Data Preprocessing

This notebook gathers the preprocessing stages of each dataset for the analysis during out experiments.

In [1]:
DATASET_DIR = "../datasets"
OUTPUT_DIR = "../outputs"

# name of the dataset to preprocess
# DATASET_NAME, SPLIT_NAME = "squad", "validation"
# DATASET_NAME, SPLIT_NAME = "newsqa", "dev"
DATASET_NAME, SPLIT_NAME = ('squadshifts', 'new_wiki'), "test"
# DATASET_NAME, SPLIT_NAME = ('squadshifts', 'nyt'), "test"
# DATASET_NAME, SPLIT_NAME = ('squadshifts', 'amazon'), "test"
# DATASET_NAME, SPLIT_NAME = ('squadshifts', 'reddit'), "test"

IS_LOCAL_FS_DATASET = True if DATASET_NAME in ("newsqa",) else False

if isinstance(DATASET_NAME, tuple):
    NORMALIZED_DATASET_NAME = "".join(DATASET_NAME)
else:
    NORMALIZED_DATASET_NAME = DATASET_NAME

BASE_FILENAME = f"{NORMALIZED_DATASET_NAME}_{SPLIT_NAME}"

ROOT_DIR = f"{OUTPUT_DIR}/results/{NORMALIZED_DATASET_NAME}/{SPLIT_NAME}"
MATRIX_DIR = f"{ROOT_DIR}/matrix"
!mkdir -p {MATRIX_DIR}

MATRIX_FILEPATH = f"{MATRIX_DIR}/{BASE_FILENAME}_preprocessed.csv.gz"
print("Writing matrix at filepath:", MATRIX_FILEPATH)

SEED = 42
# Arguments used to read the files from disk
csv_kwargs = {
   "compression": "gzip",
   "encoding": "utf-8",
}

# ----------------------------------------
## Columns names
# ----------------------------------------
UNIQUE_ID_COL = "id"
print("Using", UNIQUE_ID_COL, "as the unique column")

QUESTION_COLNAME = "question"
CONTEXT_COLNAME = "context"
ANSWER_COLNAME = "answers"

UUID_FEATURES = [UNIQUE_ID_COL, ANSWER_COLNAME]
UUID_FEATURES

Writing matrix at filepath: ../outputs/results/squadshiftsnew_wiki/test/matrix/squadshiftsnew_wiki_test_preprocessed.csv.gz
Using id as the unique column


['id', 'answers']

In [2]:
from utils.datasets import load_dataset, unfold_multiple_answers, create_metadata

In [3]:
LOAD_KWARGS = {
    "dataset": DATASET_NAME,
    "split": SPLIT_NAME,
    "local": IS_LOCAL_FS_DATASET,
    "local_dir": DATASET_DIR,
    
    "fn_kwargs": {
        "answer_col": ANSWER_COLNAME,
    },
}

In [4]:
from utils_generic import filter_params, generate_uuid

load_kwargs = LOAD_KWARGS
print("\n--> Loading dataset with arguments:", load_kwargs)
data = load_dataset(**load_kwargs)
print("Loaded dataset with", len(data), "examples:", data)


--> Loading dataset with arguments: {'dataset': ('squadshifts', 'new_wiki'), 'split': 'test', 'local': False, 'local_dir': '../datasets', 'fn_kwargs': {'answer_col': 'answers'}}


Reusing dataset squad_shifts (/home/kat/.cache/huggingface/datasets/squad_shifts/new_wiki/1.0.0/8303de6ce29bd28061c984dc50d04351a73bc3c344d5efe46f38b9948c2e3aca)


Loaded dataset with 7938 examples: Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 7938
})


In [5]:
unfold_kwargs = filter_params(LOAD_KWARGS, unfold_multiple_answers)
print("\n--> Unfolding (aka flattening) dataset with arguments:", unfold_kwargs)
data = data.map(unfold_multiple_answers, batched=True, **unfold_kwargs)
print("Resulting dataset has", len(data), "examples:", data)

print("\n--> Generate unique identifier using", UUID_FEATURES)
data = create_metadata(data, col=f"{ANSWER_COLNAME}_id", features=UUID_FEATURES, **unfold_kwargs)


COLS_NAMES = {
    "id": "example_id",
    ANSWER_COLNAME: "labels",
    f"{ANSWER_COLNAME}_multi_way": "multi_way_labels", 
}

print("\n--> Renaming column names", COLS_NAMES)
data = data.rename_columns(COLS_NAMES)
print(data)

Loading cached processed dataset at /home/kat/.cache/huggingface/datasets/squad_shifts/new_wiki/1.0.0/8303de6ce29bd28061c984dc50d04351a73bc3c344d5efe46f38b9948c2e3aca/cache-8d35d8dadbb0a282.arrow
Loading cached processed dataset at /home/kat/.cache/huggingface/datasets/squad_shifts/new_wiki/1.0.0/8303de6ce29bd28061c984dc50d04351a73bc3c344d5efe46f38b9948c2e3aca/cache-bc9054b5f96ad2d9.arrow



--> Unfolding (aka flattening) dataset with arguments: {}
Resulting dataset has 14160 examples: Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'answers_multi_way'],
    num_rows: 14160
})

--> Generate unique identifier using ['id', 'answers']

--> Renaming column names {'id': 'example_id', 'answers': 'labels', 'answers_multi_way': 'multi_way_labels'}
Dataset({
    features: ['example_id', 'title', 'context', 'question', 'labels', 'multi_way_labels', 'answers_id'],
    num_rows: 14160
})


In [11]:
data["multi_way_labels"][:10]

["['Each brotherhood elects two delegates who take part in the National Ecclesiastical Assembly', 'two delegates']",
 "['Each brotherhood elects two delegates who take part in the National Ecclesiastical Assembly', 'two delegates']",
 "['Each brotherhood elects two delegates who take part in the National Ecclesiastical Assembly.', 'National Ecclesiastical Assembly', 'The brotherhood makes decisions concerning the inner affairs of the monastery', 'concerning the inner affairs of the monastery']",
 "['Each brotherhood elects two delegates who take part in the National Ecclesiastical Assembly.', 'National Ecclesiastical Assembly', 'The brotherhood makes decisions concerning the inner affairs of the monastery', 'concerning the inner affairs of the monastery']",
 "['Each brotherhood elects two delegates who take part in the National Ecclesiastical Assembly.', 'National Ecclesiastical Assembly', 'The brotherhood makes decisions concerning the inner affairs of the monastery', 'concerning the 

### Dump matrix

For some reason, we're not being able to store with compression format using the datasets.

In [7]:
print("Storing matrix at:", MATRIX_FILEPATH[:-3])

Storing matrix at: ../outputs/results/squadshiftsnew_wiki/test/matrix/squadshiftsnew_wiki_test_preprocessed.csv


In [8]:
data.to_csv(
    MATRIX_FILEPATH[:-3],
    index=False,
    **csv_kwargs,
)

Creating CSV from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

  ioargs = _get_filepath_or_buffer(


14440952

In [9]:
import pandas as pd
d = pd.read_csv(MATRIX_FILEPATH[:-3]).tail()
d.multi_way_labels

14155                             ['The Nutrition Source']
14156                                       ['eight-year']
14157    ['increased risk of obesity and diabetes', 'ri...
14158    ['increased risk of obesity and diabetes', 'ri...
14159                                           ['49,000']
Name: multi_way_labels, dtype: object

In [10]:
!ls -latrh {MATRIX_DIR}

total 14M
drwxrwxr-x 3 kat kat 4.0K Mar  2 07:11 ..
drwxrwxr-x 2 kat kat 4.0K Mar  2 07:17 .
-rw-rw-r-- 1 kat kat  14M Mar  2 07:17 squadshiftsnew_wiki_test_preprocessed.csv
