# Create RACE Dataset
<b>Date:</b> October 23, 2023 \
<b>Author:</b> Dimitris Lymperopoulos \
<b>Description:</b> A notebook that generates the training and testing CSV files from the folder containing the RACE data.

## Imports

In [1]:
import os
import json
import pandas as pd

from itertools import chain
from pathlib import Path

## Parameters

In [29]:
# Input directories; each one is searched for '*.txt' files further down.
TRAIN_HIGH = '../../Data/RACE/train/high/'
TRAIN_MID = '../../Data/RACE/train/middle/'
TEST_HIGH = '../../Data/RACE/test/high/'
TEST_MID = '../../Data/RACE/test/middle/'

# Separator used to join the options of one question into a single string.
INTERNAL_SEP = '~'

# Output locations of the generated CSV files.
TRAIN_CSV = '../../Data/RACE/train/race_train.csv'
TEST_CSV = '../../Data/RACE/test/race_test.csv'


## Clean Text Function

In [3]:
def clean_text(text, special_chars=("\n", "\t")):
    """Replace every occurrence of each special character with a single space.

    Args:
        text (str): The text to clean.
        special_chars (iterable of str): Substrings to replace with " ".
            Defaults to newline and tab. The default is an immutable tuple so
            it cannot be accidentally mutated and shared between calls.

    Returns:
        str: A copy of ``text`` in which each occurrence of every entry of
        ``special_chars`` has been replaced by a space.
    """
    for char in special_chars:
        text = text.replace(char, " ")
    return text

## Create Training CSV

In [32]:
# Letter answers are converted to zero-based option indices.
answer_mapping = {"A": 0, "B": 1, "C": 2, "D": 3}

# Path.glob gives no ordering guarantee, so each directory listing is sorted
# to keep the row order of the resulting CSV reproducible across runs.
train_path = chain(
    sorted(Path(TRAIN_HIGH).glob('*.txt')),
    sorted(Path(TRAIN_MID).glob('*.txt'))
)

# Parallel lists: position i of every list describes the same question.
ids = []
articles_lst = []
questions_lst = []
options_lst = []
answers_idx = []

for p in train_path:
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which differs between operating systems.
    data = json.loads(p.read_text(encoding="utf-8"))

    # The article is shared by all questions of the file: clean it only once.
    article = clean_text(data["article"])
    # Drops the last four characters of the id
    # (presumably a ".txt" extension -- TODO confirm against the data).
    id_prefix = data["id"][:-4]

    iterator = enumerate(zip(data["answers"],
                             data["options"],
                             data["questions"]))
    for idx, (answer, options, question) in iterator:
        qid = f"{id_prefix}_{idx}"
        ids.append(qid)
        articles_lst.append(article)
        questions_lst.append(question)
        options_lst.append(options)
        answers_idx.append(answer_mapping[answer])

In [33]:
# Convert the list of options of each question into a single string
# (one "cell"), using INTERNAL_SEP as delimiter.
# Entries that are already strings are left untouched, so re-running this cell
# does not join an already-joined string character by character.
# NOTE(review): an option that itself contains INTERNAL_SEP would make the
# joined string ambiguous to split later -- confirm the data never contains it.
options_lst = [
    opts if isinstance(opts, str) else INTERNAL_SEP.join(opts)
    for opts in options_lst
]

print({type(opt) for opt in options_lst})   # should be <class 'str'>

{<class 'str'>}


In [35]:
# Assemble the parallel lists built above into one table, one column per list.
train_df = pd.DataFrame(
    data={
        'ids': ids,
        'articles': articles_lst,
        'questions': questions_lst,
        'options': options_lst,
        'answers': answers_idx,
    }
)

# Preview the first rows, then report the (rows, columns) shape.
print(
    train_df.head(),
    "\nTraining Dataframe has shape: {}".format(train_df.shape),
    sep="\n",
)

        ids                                           articles  \
0   high1_0  My husband is a born shopper. He loves to look...   
1   high1_1  My husband is a born shopper. He loves to look...   
2   high1_2  My husband is a born shopper. He loves to look...   
3   high1_3  My husband is a born shopper. He loves to look...   
4  high10_0  Tea drinking was common in China for nearly on...   

                                           questions  \
0          The husband likes shopping because   _  .   
1      They never go shopping together because  _  .   
2    Jimmy can't do the shopping well because   _  .   
3  Jimmy didn't buy what his mother wanted becaus...   
4  Which of the following is true of the introduc...   

                                             options  answers  
0  he has much money.~he likes the shops.~he like...        2  
1  their ways of shopping are quite different~the...        0  
2  he is young~he is absent-minded~he often loses...        1  
3  the sho

In [36]:
# Write the training table to TRAIN_CSV; index=False omits the DataFrame index column.
train_df.to_csv(TRAIN_CSV, index=False)

## Create Testing CSV

In [37]:
# Path.glob gives no ordering guarantee, so each directory listing is sorted
# to keep the row order of the resulting CSV reproducible across runs.
test_path = chain(
    sorted(Path(TEST_HIGH).glob('*.txt')),
    sorted(Path(TEST_MID).glob('*.txt'))
)

# Parallel lists: position i of every list describes the same question.
ids = []
articles_lst = []
questions_lst = []
options_lst = []
answers_idx = []

for p in test_path:
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which differs between operating systems.
    data = json.loads(p.read_text(encoding="utf-8"))

    # The article is shared by all questions of the file: clean it only once.
    article = clean_text(data["article"])
    # Drops the last four characters of the id
    # (presumably a ".txt" extension -- TODO confirm against the data).
    id_prefix = data["id"][:-4]

    iterator = enumerate(zip(data["answers"],
                             data["options"],
                             data["questions"]))
    for idx, (answer, options, question) in iterator:
        qid = f"{id_prefix}_{idx}"
        ids.append(qid)
        articles_lst.append(article)
        questions_lst.append(question)
        options_lst.append(options)
        # answer_mapping (letter -> index) is defined in the training section.
        answers_idx.append(answer_mapping[answer])

In [38]:
# Convert the list of options of each question into a single string
# (one "cell"), using INTERNAL_SEP as delimiter.
# Entries that are already strings are left untouched, so re-running this cell
# does not join an already-joined string character by character.
# NOTE(review): an option that itself contains INTERNAL_SEP would make the
# joined string ambiguous to split later -- confirm the data never contains it.
options_lst = [
    opts if isinstance(opts, str) else INTERNAL_SEP.join(opts)
    for opts in options_lst
]

print({type(opt) for opt in options_lst})   # should be <class 'str'>

{<class 'str'>}


In [39]:
# Assemble the parallel lists built above into one table, one column per list.
test_df = pd.DataFrame(
    data={
        'ids': ids,
        'articles': articles_lst,
        'questions': questions_lst,
        'options': options_lst,
        'answers': answers_idx,
    }
)

# Preview the first rows, then report the (rows, columns) shape.
print(
    test_df.head(),
    "\nTesting Dataframe has shape: {}".format(test_df.shape),
    sep="\n",
)

           ids                                           articles  \
0  high10001_0  Studies show that you may be lied to every day...   
1  high10001_1  Studies show that you may be lied to every day...   
2  high10001_2  Studies show that you may be lied to every day...   
3  high10001_3  Studies show that you may be lied to every day...   
4  high10001_4  Studies show that you may be lied to every day...   

                                           questions  \
0     From Para.1 we learn that lying is very   _  .   
1  According to the passage, a lie works when   _  .   
2                    Lying is complex because   _  .   
3  The examples of kids lying in the passage show...   
4  What will the writer most likely talk about if...   

                                             options  answers  
0                    harmful~easy~interesting~common        3  
1  the liar's words are sweet enough~it is given ...        2  
2  it is practiced by clever ones~people are for ...    

In [40]:
# Write the testing table to TEST_CSV; index=False omits the DataFrame index column.
test_df.to_csv(TEST_CSV, index=False)