In [1]:
import json
import pandas as pd
from sklearn.model_selection import StratifiedKFold

In [2]:
def read_json(filename):
    """Load and return the JSON document stored at ``filename``.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The deserialised Python object (whatever ``json.load`` produces).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Decode as UTF-8 explicitly instead of relying on the platform's locale
    # encoding, so non-ASCII text is read the same way on every machine.
    with open(filename, encoding='utf-8') as f:
        return json.load(f)
    
def write_json(data, filename):
    """Serialise ``data`` as indented JSON and write it to ``filename``.

    Args:
        data: Any JSON-serialisable Python object.
        filename: Destination path; an existing file is overwritten.

    Raises:
        OSError: If the file cannot be opened for writing.
        TypeError: If ``data`` contains objects ``json.dump`` cannot serialise.
    """
    # ensure_ascii=False emits non-ASCII characters verbatim, so the handle
    # must be opened as UTF-8; with the locale default the write can fail or
    # produce a file that is not valid UTF-8 JSON.
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

In [3]:
# Parse the training-split JSON file into memory.
train = read_json(filename="../../data/at/wikidata/task1_wikidata_train.json")

In [4]:
# Display the first record to inspect which fields each entry carries.
train[0]

{'id': 0,
 'question': 'What periodical literature does Delta Air Lines use as a moutpiece?',
 'category': 'resource',
 'type': ['publication',
  'recurring',
  'intellectual work',
  'text',
  'communication medium',
  'serial']}

In [5]:
# Pull each field of the records out into its own parallel list.
ids = [record['id'] for record in train]
questions = [record['question'] for record in train]
categories = [record['category'] for record in train]
types = [record['type'] for record in train]

In [6]:
# Combine the parallel lists into one frame with a column per field.
train_df = pd.DataFrame(
    {
        'id': ids,
        'question': questions,
        'category': categories,
        'type': types,
    }
)
print(train_df.shape)
train_df.head()

(43554, 4)


Unnamed: 0,id,question,category,type
0,0,What periodical literature does Delta Air Line...,resource,"[publication, recurring, intellectual work, te..."
1,1,Who is the child of Ranavalona I's husband?,resource,"[person, omnivore, natural person]"
2,2,Is it true Jeff_Bridges occupation Lane Chandl...,boolean,[boolean]
3,3,Which is the operating income for Qantas?,literal,[number]
4,4,which cola starts with the letter p,resource,"[soft drink, trademark, carbonated beverage, n..."


In [7]:
# Drop rows where the category, type or question is missing.
# (isna and isnull are aliases, so a single check per column is enough.)
train_df = train_df.dropna(subset=['category', 'type', 'question'])

# Remove curly braces from the question text. assign() returns a new frame,
# which avoids writing a column into the filtered slice produced above (the
# pattern that triggers pandas' SettingWithCopyWarning). Casting to str
# *before* replacing keeps a non-string value from raising AttributeError.
train_df = train_df.assign(
    question=(
        train_df.question
        .astype(str)
        .str.replace('{', '', regex=False)
        .str.replace('}', '', regex=False)
    )
)

# Discard rows whose question is the literal placeholder 'n/a'.
train_df = train_df[train_df.question != 'n/a']
print(train_df.shape)

(43554, 4)


In [14]:
# Persist the cleaned frame as a '$'-separated file without the index column.
train_df.to_csv(
    "../../data/at/wikidata/lcquad2_anstype_wikidata_train_cleaned.csv",
    sep='$',
    index=False,
)

In [8]:
# Shuffled stratified splitter; the fixed seed keeps the folds reproducible.
n_splits = 3
skf = StratifiedKFold(
    n_splits=n_splits,
    shuffle=True,
    random_state=42,
)
# Show how many folds the splitter will yield (stratification label: category).
skf.get_n_splits(X=train_df.id, y=train_df.category)

3

In [10]:
# Collect the train and held-out frames of every fold, in fold order.
train_list, test_list = [], []

fold_indices = skf.split(X=train_df.id, y=train_df.category)
for train_index, test_index in fold_indices:
    # The splitter yields positional indices, hence iloc rather than loc.
    train_list += [train_df.iloc[train_index]]
    test_list += [train_df.iloc[test_index]]