In [1]:
import json
import pandas as pd
import numpy as np

In [2]:
with open('data/airbus_helicopters_train_set.json', 'r') as f :
    dataset = json.load(f)

In [3]:
length_tab = []
for key in dataset.keys() :
    len_text = len(dataset[key]['original_text'].split())
    len_sum = len(dataset[key]['reference_summary'].split())
    length_tab.append((key, len_text, len_sum, len_sum/len_text))

pdf_length = pd.DataFrame(length_tab, columns=['uid', 'len_text', 'len_sum', 'ratio'])
pdf_length

Unnamed: 0,uid,len_text,len_sum,ratio
0,train_sum01,49,25,0.510204
1,train_sum010,132,25,0.189394
2,train_sum0100,15,16,1.066667
3,train_sum0101,36,23,0.638889
4,train_sum0102,30,27,0.900000
...,...,...,...,...
408,train_sum095,75,14,0.186667
409,train_sum096,24,17,0.708333
410,train_sum097,54,15,0.277778
411,train_sum098,33,26,0.787879


In [4]:
mask = pdf_length['ratio'] <= 1
pdf_length_masked = pdf_length[mask]
pdf_length_masked.reset_index(inplace=True, drop=True)
pdf_length_masked

Unnamed: 0,uid,len_text,len_sum,ratio
0,train_sum01,49,25,0.510204
1,train_sum010,132,25,0.189394
2,train_sum0101,36,23,0.638889
3,train_sum0102,30,27,0.900000
4,train_sum0103,30,23,0.766667
...,...,...,...,...
403,train_sum094,47,29,0.617021
404,train_sum095,75,14,0.186667
405,train_sum096,24,17,0.708333
406,train_sum097,54,15,0.277778


In [5]:
sorted_ratio = pdf_length_masked.sort_values('ratio')['ratio']
decile = pd.qcut(sorted_ratio, 10, labels=False).sort_index()
pdf_length_masked['decile'] = decile
pdf_length_masked

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pdf_length_masked['decile'] = decile


Unnamed: 0,uid,len_text,len_sum,ratio,decile
0,train_sum01,49,25,0.510204,3
1,train_sum010,132,25,0.189394,0
2,train_sum0101,36,23,0.638889,6
3,train_sum0102,30,27,0.900000,9
4,train_sum0103,30,23,0.766667,8
...,...,...,...,...,...
403,train_sum094,47,29,0.617021,5
404,train_sum095,75,14,0.186667,0
405,train_sum096,24,17,0.708333,7
406,train_sum097,54,15,0.277778,0


In [6]:
idx_val = []
idx_test = []
idx_extracted = []
for i in range(10) :
    mask_decile = pdf_length_masked['decile']==i
    idx = np.random.choice(pdf_length_masked[mask_decile].index, 3, replace=False)
    idx_val += list(idx[:2])
    idx_test += list(idx[2:])
    idx_extracted += list(idx)
print(idx_test)

[22, 390, 305, 285, 226, 403, 90, 319, 345, 31]


In [7]:
pdf_val = pdf_length_masked.iloc[idx_val]
pdf_test = pdf_length_masked.iloc[idx_test]
pdf_train = pdf_length_masked.drop(idx_extracted)

In [8]:
def df_to_json(pdf, dataset) :
    res = {}
    for uid in pdf['uid'] :
        res[uid] = dataset[uid]
    return(res)

In [10]:
val_dict = df_to_json(pdf_val, dataset)
with open('data/data_val.json', 'w', encoding='utf-8') as f:
    json.dump(val_dict, f, ensure_ascii=False, indent=4)

test_dict = df_to_json(pdf_test, dataset)
with open('data/data_test.json', 'w', encoding='utf-8') as f:
    json.dump(test_dict, f, ensure_ascii=False, indent=4)

train_dict = df_to_json(pdf_train, dataset)
with open('data/data_train.json', 'w', encoding='utf-8') as f:
    json.dump(train_dict, f, ensure_ascii=False, indent=4)