# Data Preprocessing

This notebook gathers the preprocessing stages applied to each dataset before the analysis carried out in our experiments.

In [1]:
import os

# Base directories, relative to the notebook's working directory.
DATASET_DIR = "../datasets"
OUTPUT_DIR = "../outputs"

# Name of the dataset (and split) to preprocess: keep exactly one line active.
# DATASET_NAME, SPLIT_NAME = "squad", "validation"
DATASET_NAME, SPLIT_NAME = "newsqa", "dev"
# DATASET_NAME, SPLIT_NAME = ('squadshifts', 'new_wiki'), "test"
# DATASET_NAME, SPLIT_NAME = ('squadshifts', 'nyt'), "test"
# DATASET_NAME, SPLIT_NAME = ('squadshifts', 'amazon'), "test"
# DATASET_NAME, SPLIT_NAME = ('squadshifts', 'reddit'), "test"

if isinstance(DATASET_NAME, tuple):
    # NOTE(review): the parts are concatenated without a separator, e.g.
    # ('squadshifts', 'nyt') becomes "squadshiftsnyt" -- confirm this is the
    # name expected by the loader and by downstream path conventions.
    DATASET_NAME = "".join(DATASET_NAME)

BASE_FILENAME = f"{DATASET_NAME}_{SPLIT_NAME}"
# Datasets listed here are flagged as local; the flag is passed to the loader
# as its "local" argument.
IS_LOCAL_FS_DATASET = DATASET_NAME in ("newsqa",)

ROOT_DIR = f"{OUTPUT_DIR}/results/{DATASET_NAME}/{SPLIT_NAME}"
MATRIX_DIR = f"{ROOT_DIR}/matrix"
# Pure-Python equivalent of `mkdir -p`: creates missing parents and does not
# fail when the directory already exists.
os.makedirs(MATRIX_DIR, exist_ok=True)

MATRIX_FILEPATH = f"{MATRIX_DIR}/{BASE_FILENAME}_preprocessed.csv.gz"
print("Writing matrix at filepath:", MATRIX_FILEPATH)

SEED = 42
# Arguments used to read the files from disk
csv_kwargs = {
   "compression": "gzip",
   "encoding": "utf-8",
}

# ----------------------------------------
## Columns names
# ----------------------------------------
UNIQUE_ID_COL = "id"
print("Using", UNIQUE_ID_COL, "as the unique column")

QUESTION_COLNAME = "question"
CONTEXT_COLNAME = "context"
ANSWER_COLNAME = "answers"

# Columns combined to derive a unique identifier per (example, answer) pair.
UUID_FEATURES = [UNIQUE_ID_COL, ANSWER_COLNAME]
UUID_FEATURES

Writing matrix at filepath: ../outputs/results/newsqa/dev/matrix/newsqa_dev_preprocessed.csv.gz
Using id as the unique column


['id', 'answers']

In [2]:
from utils.datasets import load_dataset, unfold_multiple_answers, create_metadata

In [3]:
# Keyword arguments later forwarded to `load_dataset`; the nested "fn_kwargs"
# entry carries the name of the answer column.
LOAD_KWARGS = dict(
    dataset=DATASET_NAME,
    split=SPLIT_NAME,
    local=IS_LOCAL_FS_DATASET,
    local_dir=DATASET_DIR,
    fn_kwargs=dict(answer_col=ANSWER_COLNAME),
)

In [11]:
from utils_generic import filter_params, generate_uuid

# `load_kwargs` is an alias of LOAD_KWARGS (same dict object, not a copy).
load_kwargs = LOAD_KWARGS
print(
    "\n--> Loading dataset with arguments:",
    load_kwargs,
)

# Load the configured dataset split and report how many examples it holds.
data = load_dataset(**load_kwargs)
print(
    "Loaded dataset with",
    len(data),
    "examples:",
    data,
)


--> Loading dataset with arguments: {'dataset': 'newsqa', 'split': 'dev', 'local': True, 'local_dir': '../datasets', 'fn_kwargs': {'answer_col': 'answers'}}
Loaded dataset with 4341 examples: Dataset({
    features: ['id', 'title', 'question', 'context', 'answers'],
    num_rows: 4341
})


In [12]:
# Select from LOAD_KWARGS the arguments to forward to `unfold_multiple_answers`.
# NOTE(review): the recorded run printed `{}` here, so nothing from LOAD_KWARGS
# (including the nested "fn_kwargs") was forwarded -- confirm this is intended.
unfold_kwargs = filter_params(LOAD_KWARGS, unfold_multiple_answers)
print("\n--> Unfolding (aka flattening) dataset with arguments:", unfold_kwargs)
data = data.map(
    unfold_multiple_answers,
    batched=True,
    **unfold_kwargs,
)
print("Resulting dataset has", len(data), "examples:", data)

# Add an identifier column named "<answer column>_id" built from UUID_FEATURES.
print("\n--> Generate unique identifier using", UUID_FEATURES)
answer_id_col = f"{ANSWER_COLNAME}_id"
data = create_metadata(
    data,
    col=answer_id_col,
    features=UUID_FEATURES,
    **unfold_kwargs,
)


# Mapping from current column names to the names used in the stored matrix.
COLS_NAMES = dict(
    [
        ("id", "example_id"),
        (ANSWER_COLNAME, "labels"),
        (f"{ANSWER_COLNAME}_multi_way", "multi_way_labels"),
    ]
)

print("\n--> Renaming column names", COLS_NAMES)
data = data.rename_columns(COLS_NAMES)
print(data)


--> Unfolding (aka flattening) dataset with arguments: {}


  0%|          | 0/5 [00:00<?, ?ba/s]

Resulting dataset has 4341 examples: Dataset({
    features: ['id', 'title', 'question', 'context', 'answers', 'answers_multi_way'],
    num_rows: 4341
})

--> Generate unique identifier using ['id', 'answers']


0ex [00:00, ?ex/s]


--> Renaming column names {'id': 'example_id', 'answers': 'labels', 'answers_multi_way': 'multi_way_labels'}
Dataset({
    features: ['example_id', 'title', 'question', 'context', 'labels', 'multi_way_labels', 'answers_id'],
    num_rows: 4341
})


In [13]:
# Spot-check one example: show the multi-way labels of the row at index 2.
data[2]["multi_way_labels"]

"['Wednesday']"

### Dump matrix

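For some reason, we are not able to store the matrix in a compressed format using the `datasets` library, so it is written as a plain `.csv` file (the trailing `.gz` is stripped from the target path).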

In [14]:
# Show the actual output path: MATRIX_FILEPATH without its trailing ".gz".
print("Storing matrix at:", MATRIX_FILEPATH[: -len(".gz")])

Storing matrix at: ../outputs/results/newsqa/dev/matrix/newsqa_dev_preprocessed.csv


In [15]:
# The target path has its ".gz" suffix stripped, so the file is meant to be a
# plain-text CSV. Drop the "compression" option for this call: in the recorded
# run it had no effect (a 16M uncompressed file was produced, with a warning),
# and if a library version did honour it, gzip bytes would be written into a
# ".csv" file that the read-back check below could not parse.
# NOTE(review): behaviour of `compression` with this writer depends on the
# installed datasets/pandas versions -- confirm against the pinned versions.
uncompressed_csv_kwargs = {
    key: value for key, value in csv_kwargs.items() if key != "compression"
}

# The call is the last expression so its return value is displayed (the
# recorded run showed an integer here).
data.to_csv(
    MATRIX_FILEPATH[:-3],
    index=False,
    **uncompressed_csv_kwargs,
)

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

  ioargs = _get_filepath_or_buffer(


16008694

In [16]:
import pandas as pd

# Sanity check: read the stored CSV back from disk, keep its last rows, and
# display their multi-way labels.
stored_matrix_path = MATRIX_FILEPATH[:-3]
d = pd.read_csv(stored_matrix_path).tail()
d["multi_way_labels"]

4336                                    ['the coalition']
4337                                     ['restrictions']
4338           ['Iran of trying to build nuclear bombs,']
4339                                             ['Iran']
4340    ['Iran test-launched a rocket capable of carry...
Name: multi_way_labels, dtype: object

In [17]:
# List the contents of the matrix directory (IPython shell escape; {MATRIX_DIR}
# is interpolated from the Python variable) to confirm the file was written.
!ls -latrh {MATRIX_DIR}

total 16M
drwxrwxr-x 3 kat kat 4.0K Mar  2 07:06 ..
drwxrwxr-x 2 kat kat 4.0K Mar  2 07:10 .
-rw-rw-r-- 1 kat kat  16M Mar  2 07:10 newsqa_dev_preprocessed.csv
