In [1]:
import json
import pandas as pd
from sklearn.model_selection import StratifiedKFold

In [2]:
def read_json(filename):
    """Load and return the JSON document stored at ``filename``.

    The file is decoded explicitly as UTF-8 instead of the platform's
    locale encoding: ``write_json`` emits raw non-ASCII characters
    (``ensure_ascii=False``), so a locale-dependent decode could fail or
    silently corrupt text on systems whose default is not UTF-8.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The deserialized Python object produced by ``json.load``.
    """
    with open(filename, encoding='utf-8') as f:
        return json.load(f)
    
def write_json(data, filename):
    """Serialize ``data`` as pretty-printed JSON into ``filename``.

    Non-ASCII characters are written verbatim (``ensure_ascii=False``), so
    the file is opened explicitly as UTF-8; relying on the platform's locale
    encoding could raise ``UnicodeEncodeError`` or produce a file that is
    not valid UTF-8 JSON.

    Args:
        data: Any JSON-serializable Python object.
        filename: Destination path; an existing file is overwritten.
    """
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

In [10]:
# Load the task-1 DBpedia training questions; the path is relative, so it is
# resolved against the directory the notebook kernel is running in.
train = read_json("../../data/at/dbpedia/task1_dbpedia_train.json")

In [11]:
# Peek at the first record to see which fields each question carries.
train[0]

{'id': 0,
 'question': 'Was Jacqueline Kennedy Onassis a follower of Melkite Greek Catholic Church?',
 'category': 'boolean',
 'type': ['boolean']}

In [12]:
# Pull each field out of the question records into its own list; the four
# lists stay aligned because every one is built by walking `train` in order.
ids, questions, categories, types = (
    [record[field] for record in train]
    for field in ('id', 'question', 'category', 'type')
)

In [13]:
# Assemble the parallel lists into one table with a row per question.
train_df = pd.DataFrame(
    data={
        'id': ids,
        'question': questions,
        'category': categories,
        'type': types,
    }
)
# Report (rows, columns), then show the first rows as the cell output.
print(train_df.shape)
train_df.head()

(36670, 4)


Unnamed: 0,id,question,category,type
0,0,Was Jacqueline Kennedy Onassis a follower of M...,boolean,[boolean]
1,1,What is the name of the opera based on Twelfth...,resource,"[dbo:Opera, dbo:MusicalWork, dbo:Work]"
2,2,When did Lena Horne receive the Grammy Award f...,literal,[date]
3,3,Do Prince Harry and Prince William have the sa...,boolean,[boolean]
4,5,Which is the hierarchical BrainInfo ID of the ...,literal,[string]


In [14]:
# Keep only rows in which category, type and question are all present.
# .copy() yields an independent frame, so the column assignment below does not
# write into a slice of the previous frame (avoids SettingWithCopyWarning).
train_df = train_df.dropna(subset=['category', 'type', 'question']).copy()

# Strip curly braces from the question text. Casting to str first means a
# non-string question value is converted instead of raising AttributeError;
# regex=False makes '{' and '}' literal characters rather than patterns.
train_df['question'] = (
    train_df['question']
    .astype(str)
    .str.replace('{', '', regex=False)
    .str.replace('}', '', regex=False)
)

# Drop rows whose question is the literal placeholder 'n/a'.
train_df = train_df[train_df['question'] != 'n/a']
print(train_df.shape)

(36670, 4)


In [15]:
# Persist the cleaned frame as a '$'-delimited text file, without the row index.
# NOTE(review): the `type` column appears to hold Python lists (see the preview
# above); to_csv would write those as their string form — confirm that whatever
# reads this file parses them back.
train_df.to_csv("../../data/at/dbpedia/smarttask_dbpedia_train_cleaned.csv", sep='$', index=False)

In [8]:
# Shuffled stratified k-fold splitter; the fixed random_state keeps the fold
# assignment the same from run to run.
n_splits = 3
skf = StratifiedKFold(
    n_splits=n_splits,
    shuffle=True,
    random_state=42,
)
# Echo the number of folds the splitter will produce as the cell output.
skf.get_n_splits(X=train_df['id'], y=train_df['category'])

3

In [10]:
# One (train, test) pair of frames per fold, collected in fold order.
train_list, test_list = [], []

fold_indices = skf.split(train_df['id'], train_df['category'])
for train_index, test_index in fold_indices:
    # Select rows by position using the index arrays handed back by the splitter.
    train_list += [train_df.iloc[train_index]]
    test_list += [train_df.iloc[test_index]]