In [15]:
import json
import pandas as pd
from poprogress import simple_progress as simp
import random

In [16]:
# Location of the JSON corpus, relative to the notebook's working directory.
file_path = "../data/conll-balanced.json"

# Pass the encoding explicitly: without it open() falls back to the platform's
# locale encoding, which can mis-decode or reject non-ASCII sentences.
with open(file_path, encoding="utf-8") as f:
    text = json.load(f)

# Report the corpus size once the file handle has been released.
print("-"*30)
print("numbers of sentences: ", len(text))
print("-"*30)

------------------------------
numbers of sentences:  21363
------------------------------


In [80]:
def get_df(data):
    '''
    Build a DataFrame with one row per (sentence, annotation) pair.

    param:
        - data: iterable of (sentence, annotation) pairs
    return:
        DataFrame with the columns "raw_sentence" (the sentence as given) and
        "labels" (the tag list produced by get_sent_labels for that sentence).
        Both columns are present even when data is empty.
    '''
    columns = ["raw_sentence", "labels"]

    # Collect plain records and build the frame once at the end; concatenating
    # a one-row frame per sentence re-copies every previous row each time.
    records = []
    for _, (sentence, annotation) in simp(enumerate(data)):
        records.append({
            "raw_sentence": sentence,
            "labels": get_sent_labels(sentence, annotation),
        })

    return pd.DataFrame(records, columns=columns)

def get_sent_labels(sent, anns):
    '''
    Build one BIO tag per token of a sentence.

    param:
        - sent: sentence; tokenised with get_words()
        - anns: iterable of annotation dicts, each providing "label" and "text"
    return:
        list with one tag per token: "O" for untagged tokens, "B-<label>" for
        the first word of an annotation text and "I-<label>" for its other words.

    The search only moves forward through the sentence, so an annotation whose
    words occur before the previous match is left untagged.
    '''
    words = get_words(sent)
    words_length = len(words)
    label_list = ["O"] * words_length

    # Index of the first token not yet consumed by a match. It has to start at
    # 0: starting at -1 made the first scan read words[-1] (the LAST token),
    # which could tag the wrong word and raised IndexError on an empty sentence.
    anchor = 0
    for ann in anns:
        label = ann["label"]
        text_word = ann["text"].split()

        for i, target in enumerate(text_word):
            # NOTE(review): every word of the annotation is located on its own,
            # so the tagged tokens are not guaranteed to be adjacent.
            for j in range(anchor, words_length):
                if words[j] == target:
                    prefix = 'B-' if i == 0 else 'I-'
                    label_list[j] = prefix + label
                    anchor = j + 1
                    break
    return label_list

def get_words(sent):
    '''
    Tokenise a sentence on whitespace.

    Every token is kept as is, punctuation included: the punctuation filter
    this helper was written for is currently disabled.
    '''
    return [token for token in sent.split()]

In [81]:
def data_split(full_list, ratio, shuffle=False, seed=None):
    '''
    Split a sequence into two parts at the given ratio.

    param:
        - full_list: sequence to split; it is never modified
        - ratio: share of the items that goes into the first returned part
        - shuffle: shuffle a copy of the items before cutting
        - seed: optional seed for a private random generator, making the
                shuffle repeatable; when None the module-level generator of
                `random` is used
    return:
        (first_part, rest) where first_part holds int(len(full_list) * ratio)
        items. When that count is below 1, ([], full_list) is returned.
    '''
    n_total = len(full_list)
    offset = int(n_total * ratio)

    if n_total == 0 or offset < 1:
        return [], full_list

    items = full_list
    if shuffle:
        # Shuffle a copy: shuffling full_list itself would silently reorder
        # the caller's data as a side effect of asking for a split.
        items = list(full_list)
        if seed is None:
            random.shuffle(items)
        else:
            random.Random(seed).shuffle(items)

    return items[:offset], items[offset:]

In [94]:
# Fix the random state so the shuffled splits (and the CSV files written from
# them) come out the same on every Restart & Run All.
random.seed(42)

# data_split returns (first `ratio` share, remainder):
# 35% of the corpus is held out, the remaining 65% is the training set.
res_data, train_data = data_split(text, ratio=0.35, shuffle=True)
# 35% of the held-out part becomes the validation set ...
res_data, val_data = data_split(res_data, ratio=0.65, shuffle=True)
# ... and the rest is cut into two halves for the two test sets.
test2_data, test1_data = data_split(res_data, ratio=0.5, shuffle=True)

# Every sentence has to land in exactly one split.
assert len(train_data) + len(val_data) + len(test1_data) + len(test2_data) == len(text)

print(len(train_data))
print(len(val_data))
print(len(test1_data))
print(len(test2_data))

13886
2617
2430
2430


In [98]:
names = ["train", "val", "test1", "test2"]
data_list = [train_data, val_data, test1_data, test2_data]
# Pair each split with its file name directly rather than indexing by position.
for name, data in zip(names, data_list):
    df_data = get_df(data)
    # index=False keeps the row index out of the file (the parameter is a bool).
    # NOTE(review): the "labels" cells are Python lists, so they are written as
    # their string form and have to be parsed back when the CSV is read.
    df_data.to_csv(f"{name}.csv", index=False)

13886it [00:14, 945.38it/s]
2617it [00:01, 1549.00it/s]
2430it [00:01, 1856.16it/s]
2430it [00:01, 1750.40it/s]
