In [1]:
from datasets import load_dataset, load_metric

import os
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


## Swag Dataset

### Swag

In [None]:
# Load the "regular" configuration of the "swag" dataset, caching under ./swag_cache.
swag = load_dataset(
    "swag",
    "regular",
    cache_dir="./swag_cache",
)

In [None]:
# Display the object returned by load_dataset above.
swag

In [None]:
# swag["train"][0]

In [2]:
# Read the three local CSV splits; the first column of each file becomes the index.
data_path = './swag/'
swag_train, swag_val, swag_test = (
    pd.read_csv(os.path.join(data_path, csv_name), index_col=0)
    for csv_name in ('train.csv', 'val.csv', 'test.csv')
)

In [3]:
from datasets import Dataset, DatasetDict

# Wrap each DataFrame as a Dataset without carrying the pandas index along.
train = Dataset.from_pandas(swag_train, preserve_index=False)
val = Dataset.from_pandas(swag_val, preserve_index=False)
test = Dataset.from_pandas(swag_test, preserve_index=False)

# Bundle the splits under the keys 'train', 'val' and 'test', then display the summary.
datasets = DatasetDict({'train': train, 'val': val, 'test': test})
datasets

DatasetDict({
    train: Dataset({
        features: ['video-id', 'fold-ind', 'startphrase', 'sent1', 'sent2', 'gold-source', 'ending0', 'ending1', 'ending2', 'ending3', 'label'],
        num_rows: 73546
    })
    val: Dataset({
        features: ['video-id', 'fold-ind', 'startphrase', 'sent1', 'sent2', 'gold-source', 'ending0', 'ending1', 'ending2', 'ending3', 'label'],
        num_rows: 20006
    })
    test: Dataset({
        features: ['video-id', 'fold-ind', 'startphrase', 'sent1', 'sent2', 'gold-source', 'ending0', 'ending1', 'ending2', 'ending3'],
        num_rows: 20005
    })
})

In [4]:
# Inspect the last record of the 'val' split.
datasets["val"][-1]

{'video-id': 'lsmdc3090_YOUNG_ADULT-43925',
 'fold-ind': 10185,
 'startphrase': 'Someone sits at a table in the center of the room. A server',
 'sent1': 'Someone sits at a table in the center of the room.',
 'sent2': 'A server',
 'gold-source': 'gold',
 'ending0': 'crosses a courtyard at the foot of the steps.',
 'ending1': 'hands the table to someone.',
 'ending2': 'reads the name of the television.',
 'ending3': 'sets down a napkin and silverware.',
 'label': 3}

In [None]:
# Split name -> local file; the loader name passed to load_dataset is the
# file extension of the training file ("json" here).
data_files = {"train": "train.json", "valid": "valid.json"}
extension = data_files["train"].rsplit('.', 1)[-1]
raw_datasets = load_dataset(extension, data_files=data_files)
raw_datasets

### Swag formatter: convert the local train/valid/test JSON into SWAG-style multiple-choice records

In [15]:
import json

def swag_formatter(folder="dataset/"):
    """Convert the raw train/valid/test JSON files into SWAG-style multiple-choice JSON.

    Reads ``context.json``, ``train.json``, ``valid.json`` and ``test.json``
    from ``folder`` and writes ``swag_train.json``, ``swag_valid.json`` and
    ``swag_test.json`` into the same folder (UTF-8, 2-space indent,
    non-ASCII characters kept as-is).

    Each output record contains, in this order:

    * ``video-id``: the input record's ``id``
    * ``sent1``: the input record's ``question``
    * ``sent2``: always the empty string
    * ``ending0`` .. ``ending3``: ``context.json`` entries selected by the
      record's ``paragraphs`` list, in list order
    * ``label``: position of ``relevant`` inside ``paragraphs`` for the train
      and valid splits; for the test split ``relevant`` is not read and the
      label is the placeholder ``0``

    Args:
        folder: Directory holding the input files and receiving the output
            files. A trailing path separator is optional.

    Raises:
        FileNotFoundError: If one of the four input files is missing.
        KeyError: If a record lacks a required field.
        ValueError: If ``relevant`` is not listed in the record's ``paragraphs``.
        IndexError: If a record lists more than four paragraphs.
    """
    def _read_json(name):
        # Close the handle deterministically and decode as UTF-8 regardless of
        # the platform locale (the outputs are written as UTF-8 as well).
        with open(os.path.join(folder, name), encoding="utf-8") as fh:
            return json.load(fh)

    corpus = _read_json("context.json")
    # An explicit name -> records mapping replaces eval() on the split name.
    splits = {name: _read_json(f"{name}.json") for name in ("train", "valid", "test")}
    ending_names = [f"ending{i}" for i in range(4)]

    for split, records in splits.items():
        labelled = split != "test"
        results = []
        for element in records:
            paragraphs = element['paragraphs']
            pairs = {
                'video-id': element['id'],
                'sent1': element['question'],
                'sent2': '',
            }
            # Indexing ending_names (rather than zip) keeps the IndexError for
            # records with more than four candidates instead of dropping them.
            for i, num in enumerate(paragraphs):
                pairs[ending_names[i]] = corpus[num]
            pairs['label'] = paragraphs.index(element['relevant']) if labelled else 0
            results.append(pairs)
        with open(os.path.join(folder, f"swag_{split}.json"), "w", encoding="utf-8") as file:
            json.dump(results, file, indent=2, ensure_ascii=False)
    
# Run the conversion; writes swag_train.json, swag_valid.json and swag_test.json under dataset/.
swag_formatter()

## SQuAD Dataset

### SQuAD

In [2]:
# Flag selecting which dataset name is loaded: "squad_v2" when set, otherwise "squad".
squad_v2 = False
if squad_v2:
    SQuAD = load_dataset("squad_v2")
else:
    SQuAD = load_dataset("squad")

Found cached dataset squad (/home/hykao/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)
100%|██████████| 2/2 [00:00<00:00, 46.05it/s]


In [3]:
# Display the splits, features and row counts of the loaded SQuAD object.
SQuAD

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [4]:
# Inspect the first record of the train split.
SQuAD["train"][0]

{'id': '5733be284776f41900661182',
 'title': 'University_of_Notre_Dame',
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}

In [None]:
# Inspect the last record of the train split.
SQuAD["train"][-1]

In [None]:
import json
# Load the raw SQuAD JSON files from ./SQuAD/.
# Context managers close the handles promptly, and the explicit encoding avoids
# depending on the platform's default locale when decoding the text.
data_path = './SQuAD/'
with open(os.path.join(data_path, 'train-v1.1.json'), encoding='utf-8') as fh:
    SQuAD_train = json.load(fh)
with open(os.path.join(data_path, 'dev-v1.1.json'), encoding='utf-8') as fh:
    SQuAD_valid = json.load(fh)

In [None]:
# Pretty-print the dev set for inspection. `formatted` still holds the complete
# dump, but only a bounded preview is printed so the cell output stays readable
# instead of being flooded with the whole file.
formatted = json.dumps(SQuAD_valid, indent=2)
PREVIEW_CHARS = 2000  # raise this (or slice `formatted` directly) to see more
print(formatted[:PREVIEW_CHARS])

In [None]:
# Inspect the first entry under "paragraphs" of the first entry under "data" in the raw training JSON.
SQuAD_train["data"][0]["paragraphs"][0]

### SQuAD formatter: convert the local train/valid/test JSON into SQuAD-style question-answering records

In [6]:
import json

# Raw inputs for the SQuAD conversion. Each file is opened in a context manager
# so the handle is closed right after parsing, and decoded explicitly as UTF-8
# (the converted files are written as UTF-8 too).
folder = "dataset/"
with open(f"{folder}context.json", encoding="utf-8") as fh:
    corpus = json.load(fh)
with open(f"{folder}train.json", encoding="utf-8") as fh:
    train = json.load(fh)
with open(f"{folder}valid.json", encoding="utf-8") as fh:
    valid = json.load(fh)
with open(f"{folder}test.json", encoding="utf-8") as fh:
    test = json.load(fh)

def squad_formatter(folder="dataset/"):
    """Convert the raw train/valid/test JSON files into SQuAD-style QA JSON.

    Reads ``context.json``, ``train.json``, ``valid.json`` and ``test.json``
    from ``folder`` and writes ``squad_train.json``, ``squad_valid.json`` and
    ``squad_test.json`` into the same folder (UTF-8, 2-space indent,
    non-ASCII characters kept as-is). The name of each split is printed as it
    is processed.

    The inputs are loaded inside the function rather than looked up by name
    in the notebook namespace: other cells rebind ``valid`` and ``test``, so a
    name-based lookup would silently convert the wrong data on a re-run.

    Each output record contains, in this order:

    * ``id`` and ``question``: copied from the input record
    * ``context``: for train/valid, the ``context.json`` entry selected by
      ``relevant``; for test, the entry selected by the LAST item of
      ``paragraphs``
    * ``answers`` (train/valid only): built from the record's ``answer``
      mapping. The ``text`` entry keeps its key, every other key ``k``
      becomes ``answer_k``, and each value is wrapped in a one-element list.

    Args:
        folder: Directory holding the input files and receiving the output
            files. A trailing path separator is optional.

    Raises:
        FileNotFoundError: If one of the four input files is missing.
        KeyError: If a record lacks a required field.
    """
    def _read_json(name):
        # Close the handle deterministically and decode as UTF-8 regardless of
        # the platform locale.
        with open(os.path.join(folder, name), encoding="utf-8") as fh:
            return json.load(fh)

    corpus = _read_json("context.json")
    splits = {name: _read_json(f"{name}.json") for name in ("train", "valid", "test")}

    for split, records in splits.items():
        print(split)
        labelled = split != "test"
        results = []
        for element in records:
            pairs = {'id': element['id'], 'question': element['question']}
            if labelled:
                pairs['context'] = corpus[element['relevant']]
                pairs['answers'] = {
                    (k if k == "text" else "answer_" + k): [v]
                    for k, v in element['answer'].items()
                }
            else:
                # NOTE(review): the test split takes the last candidate paragraph
                # as its context; presumably a placeholder until a selection
                # model picks one. Confirm against the inference pipeline.
                pairs['context'] = corpus[element['paragraphs'][-1]]
            results.append(pairs)
        with open(os.path.join(folder, f"squad_{split}.json"), "w", encoding="utf-8") as file:
            json.dump(results, file, indent=2, ensure_ascii=False)
    
# Run the conversion; writes squad_train.json, squad_valid.json and squad_test.json under dataset/.
squad_formatter()

train
valid
test


## Validate: context-selection accuracy and QA exact match on the validation split

In [2]:
# for swag evaluation
# A prediction counts as correct when the context it carries equals the
# context of the validation record at the same position.
import json

with open("dataset/squad_valid.json", encoding="utf-8") as fh:
    valid = json.load(fh)  # validation
with open("format_test.json", encoding="utf-8") as fh:
    test = json.load(fh)  # prediction

# The comparison is positional, so a length mismatch would either crash
# part-way through or score misaligned pairs; fail early with a clear message.
if len(test) != len(valid):
    raise ValueError(
        f"prediction count ({len(test)}) does not match validation count ({len(valid)})"
    )

total = len(valid)
correct = sum(
    predicted['context'] == gold['context'] for predicted, gold in zip(test, valid)
)
print(f"accuracy: {(correct/total*100):.4f}")

accuracy: 95.6132


In [6]:
# for squad evaluation
# Gold answer per question id: the first entry of each record's answers['text'].
# The comprehension and generator below keep their loop variables local, so the
# notebook-level `val` split defined earlier is no longer overwritten here.
ans_dict = {record['id']: record['answers']['text'][0] for record in valid}
# json_obj = json.dumps(ans_dict, ensure_ascii=False, indent=2)
# with open("ground_truths.json", "w", encoding="utf-8") as file:
#     file.write(json_obj)

# Predictions are an {id: answer} mapping; read with an explicit encoding and
# close the file as soon as it is parsed.
with open("output/valid_qa/predict_predictions.json", encoding="utf-8") as fh:
    pred = json.load(fh)

total = len(ans_dict)
# Strict string equality; an id missing from the validation set raises KeyError.
correct = sum(1 for qid, answer in pred.items() if ans_dict[qid] == answer)
print(f"exact_match: {(correct/total*100):.4f}")

exact_match: 81.2230


In [19]:
# The cell calls `pd`, so import pandas under that alias; the previous `pds`
# alias was never used and the cell only worked because of an earlier import.
import pandas as pd

# Read the prediction file as a Series, move its index into a regular column,
# and write the two columns as id/answer to submit.csv without a row index.
df_pred = pd.read_json("output/test_qa21/predict_predictions.json", typ="series").reset_index()
df_pred.columns = ['id', 'answer']
df_pred.to_csv("submit.csv", index=False)