# Load Data

In [1]:
import pandas as pd
import numpy as np
import json
import os
from glob import glob

from tqdm.notebook import tqdm

In [2]:
# Root directory of the dialogue dataset, relative to the notebook's working directory.
# All later paths (Training/Validation globs) are joined onto this value.
# NOTE(review): "dialouge" is presumably the actual on-disk directory spelling —
# verify before "correcting" it, since changing the string changes the path.
DATA_DIR = "../dataset/kor-dialouge-data/"

In [3]:
# Glob patterns (relative to DATA_DIR) selecting every entry directly under
# the Training/ and Validation/ split folders.
TRAIN_DIR, VALIDATION_DIR = "Training/*", "Validation/*"

In [4]:
# Anchor each split's glob pattern at the dataset root.
train_path, validation_path = (
    os.path.join(DATA_DIR, split_pattern)
    for split_pattern in (TRAIN_DIR, VALIDATION_DIR)
)

In [5]:
# Expand the split patterns one level deeper to get the individual JSON files.
# glob() returns matches in arbitrary (filesystem-dependent) order, so sort them:
# the row order of the exported tables is then identical on every run and machine.
train_paths = sorted(glob(os.path.join(train_path,"*")))
validation_paths = sorted(glob(os.path.join(validation_path,"*")))

In [6]:
# Build the training table: one row per dialogue, collected from every JSON
# file listed in train_paths.
#
# Columns:
#   ID, num_of_people, num_of_talk, type, topic -> copied from header.dialogueInfo
#   person_info -> "participantID:gender,age" for each participant, space-joined
#   utterances  -> "<participantID>utterance</participantID>" for each turn, space-joined
train_columns = ["ID","num_of_people","num_of_talk","type","topic","person_info","utterances"]
train_rows = []

for p in tqdm(train_paths):

    # JSON is UTF-8 by specification; pass the encoding explicitly so parsing
    # does not depend on the platform's default text encoding.
    with open(p, 'r', encoding='utf-8') as f:
        corpus = json.load(f)

    # Single pass over the file's dialogues (instead of re-walking
    # corpus['data'] once per output column).
    for data in corpus['data']:
        info = data['header']['dialogueInfo']
        train_rows.append({
            "ID"             :info['dialogueID'],
            "num_of_people"  :info['numberOfParticipants'],
            "num_of_talk"    :info['numberOfUtterances'],
            "type"           :info['type'],
            "topic"          :info['topic'],
            "person_info"    :" ".join(
                f"{x['participantID']}:{x['gender']},{x['age']}"
                for x in data['header']['participantsInfo']
            ),
            "utterances"     :" ".join(
                "<"+x['participantID']+">"+x["utterance"]+"</"+x['participantID']+">"
                for x in data['body']['dialogue']
            ),
        })

# Materialise the frame once. Growing a DataFrame with pd.concat inside the loop
# copies all accumulated rows on every iteration, and concatenating onto an
# empty frame is deprecated behaviour in recent pandas. Passing `columns=` keeps
# the expected schema (and column order) even when no files were found.
train = pd.DataFrame(train_rows, columns=train_columns)

  0%|          | 0/9 [00:00<?, ?it/s]

In [7]:
# Build the validation table: one row per dialogue, collected from every JSON
# file listed in validation_paths. Same schema as the training table.
#
# Columns:
#   ID, num_of_people, num_of_talk, type, topic -> copied from header.dialogueInfo
#   person_info -> "participantID:gender,age" for each participant, space-joined
#   utterances  -> "<participantID>utterance</participantID>" for each turn, space-joined
validation_columns = ["ID","num_of_people","num_of_talk","type","topic","person_info","utterances"]
validation_rows = []

for p in tqdm(validation_paths):

    # JSON is UTF-8 by specification; pass the encoding explicitly so parsing
    # does not depend on the platform's default text encoding.
    with open(p, 'r', encoding='utf-8') as f:
        corpus = json.load(f)

    # Single pass over the file's dialogues (instead of re-walking
    # corpus['data'] once per output column).
    for data in corpus['data']:
        info = data['header']['dialogueInfo']
        validation_rows.append({
            "ID"             :info['dialogueID'],
            "num_of_people"  :info['numberOfParticipants'],
            "num_of_talk"    :info['numberOfUtterances'],
            "type"           :info['type'],
            "topic"          :info['topic'],
            "person_info"    :" ".join(
                f"{x['participantID']}:{x['gender']},{x['age']}"
                for x in data['header']['participantsInfo']
            ),
            "utterances"     :" ".join(
                "<"+x['participantID']+">"+x["utterance"]+"</"+x['participantID']+">"
                for x in data['body']['dialogue']
            ),
        })

# Materialise the frame once. Growing a DataFrame with pd.concat inside the loop
# copies all accumulated rows on every iteration, and concatenating onto an
# empty frame is deprecated behaviour in recent pandas. Passing `columns=` keeps
# the expected schema (and column order) even when no files were found.
validation = pd.DataFrame(validation_rows, columns=validation_columns)

  0%|          | 0/9 [00:00<?, ?it/s]

In [14]:
# Export both splits as tab-separated files under <DATA_DIR>/data_v1/.
# The directory is derived from DATA_DIR (rather than repeating the dataset
# path as a literal) and created up front: to_csv does not create missing
# parent directories and would otherwise fail on a fresh checkout.
output_dir = os.path.join(DATA_DIR, "data_v1")
os.makedirs(output_dir, exist_ok=True)

train.to_csv(os.path.join(output_dir, "train_v1.tsv"),sep="\t",index=False)
validation.to_csv(os.path.join(output_dir, "validation_v1.tsv"),sep="\t",index=False)