In [1]:
# Library Imports
import pandas as pd
import json
import ast
import numpy as np
from sklearn.model_selection import train_test_split

# The dataset selector: the report year interpolated into the input CSV and
# output JSON file names used by the cells below.
year = 2042

In [2]:
# Read the csv file (semicolon-separated export for the selected year).
# Forward slashes are used so the relative path resolves on every OS and the
# literal contains no backslash sequences (`\d`, `\c`, `\s`) that Python would
# try to interpret as string escapes.
data = pd.read_csv(f'../data/clean/sustainability-report-{year}-squad-format.csv', sep=";")

In [3]:
# Expand the answers column into multiple columns.
# Each `answers` cell is parsed with ast.literal_eval (presumably a dict literal
# stored as text -- confirm against the CSV) and every key becomes its own
# `answers.<key>` column.
# The guard keeps the cell safe to re-run: once `answers` has been expanded,
# `data` no longer has that column and the cell becomes a no-op instead of
# raising KeyError.
if 'answers' in data.columns:
    parsed_answers = data['answers'].apply(ast.literal_eval)
    # Reuse the source index so the column-wise concat lines rows up exactly.
    df1 = pd.DataFrame(parsed_answers.values.tolist(), index=data.index)
    df1.columns = 'answers.' + df1.columns
    # Every original column except `answers` (Index.difference returns them sorted).
    col = data.columns.difference(['answers'])
    data = pd.concat([data[col], df1], axis=1)

In [4]:
# Split into train and dev sets: 15% of the rows go to dev, the rest to train.
# random_state is fixed so the shuffled split is reproducible across runs.
train, dev = train_test_split(data, test_size=0.15, random_state=42, shuffle=True)

In [5]:
# Create the boilerplate json files: one empty container per split, each tagged
# with its split name under "version" and holding an initially empty "data" list.
json_data_train = dict(version="train", data=[])
json_data_dev = dict(version="dev", data=[])

In [6]:
# Fill in the json files with the SQuAD data
for row in train.iterrows():
    row_data = row[1]
    id = row_data["id"]
    question = row_data["question"]
    context = row_data["context"]
    answer = row_data["answers.text"][0]
    answer_start = row_data["answers.answer_start"][0]
    single_row = {
        "title": f"Title {id}",
        "paragraphs": [
            {
                "context": f"{context}",
                "qas": [
                    {
                        "id": f"{id}",
                        "question": f"{question}",
                        "answers": [
                            {
                                "text": f"{answer}",
                                "answer_start": answer_start
                            }
                        ]
                    }
                ]
            }
        ]
    }
    json_data_train["data"].append(single_row)

In [7]:
# Serialize json
json_object_train = json.dumps(json_data_train, indent=4)
json_object_dev = json.dumps(json_data_train, indent=4)
 
# Write to files
with open(f'..\data\clean\json\sustainability-report-{year}-squad-format-train.json', "w") as outfile:
    outfile.write(json_object_train)
with open(f'..\data\clean\json\sustainability-report-{year}-squad-format-dev.json', "w") as outfile:
    outfile.write(json_object_train)