# Load Data

In [1]:
import pandas as pd
import numpy as np
import json
import os
from glob import glob

from tqdm.notebook import tqdm

In [2]:
# Root folder of the dialogue dataset, given relative to the notebook's working directory.
# NOTE(review): the "dialouge" spelling presumably mirrors the on-disk folder name (the
# export cell at the end uses the same spelling) -- confirm before renaming.
DATA_DIR = "../dataset/kor-dialouge-data/"

In [3]:
# Glob patterns (relative to DATA_DIR) selecting every entry directly under each split folder.
TRAIN_DIR = "Training/*"
VALIDATION_DIR = "Validation/*"

In [4]:
# Prefix each split pattern with the dataset root; both results are still glob patterns.
train_path, validation_path = (
    os.path.join(DATA_DIR, split_pattern)
    for split_pattern in (TRAIN_DIR, VALIDATION_DIR)
)

In [5]:
# Expand one more level below each split pattern to collect the individual data files.
# glob() returns matches in arbitrary, filesystem-dependent order, so the lists are
# sorted to keep the row order of the resulting frames reproducible across runs.
train_paths = sorted(glob(os.path.join(train_path,"*")))
validation_paths = sorted(glob(os.path.join(validation_path,"*")))

In [7]:
# Build the training frame with one row per dialogue.
# Each file is read as JSON with a top-level "data" list; every item provides
# header.dialogueInfo, header.participantsInfo and body.dialogue.
train_columns = ["ID","num_of_people","num_of_talk","type","topic","person_info","utterances"]
train_records = []

for path in tqdm(train_paths):

    # Decode explicitly as UTF-8 instead of relying on the platform default encoding
    # (assumes the corpus files are UTF-8, the JSON default -- TODO confirm).
    with open(path, 'r', encoding='utf-8') as fp:
        dialogues = json.load(fp)['data']

    for dialogue in dialogues:
        info = dialogue['header']['dialogueInfo']
        train_records.append({
            "ID"             :info['dialogueID'],
            "num_of_people"  :info['numberOfParticipants'],
            "num_of_talk"    :info['numberOfUtterances'],
            "type"           :info['type'],
            "topic"          :info['topic'],
            # Participants flattened to "ID:gender,age" tokens separated by spaces.
            "person_info"    :" ".join(
                f"{x['participantID']}:{x['gender']},{x['age']}"
                for x in dialogue['header']['participantsInfo']
            ),
            # Each utterance wrapped in <participantID>...</participantID> tags, space-separated.
            "utterances"     :" ".join(
                f"<{x['participantID']}>{x['utterance']}</{x['participantID']}>"
                for x in dialogue['body']['dialogue']
            ),
        })

# Build the frame once from all records: repeated pd.concat inside the loop copies the
# accumulated frame on every iteration. Passing columns= keeps the expected schema even
# when no files were found.
train = pd.DataFrame(train_records, columns=train_columns)

  0%|          | 0/9 [00:00<?, ?it/s]

In [13]:
# Display the assembled training frame (pandas truncates long frames in the rendered output).
train

Unnamed: 0,ID,num_of_people,num_of_talk,type,topic,person_info,utterances
0,94f286eb-309e-5657-afe9-40ad840a1071,2,10,일상 대화,개인 및 관계,"P01:여성,20대 P02:여성,20대",<P01>근데 왜 막말을 할까...</P01> <P01>이유가 뭘까...</P01>...
1,8f4355c8-f324-5376-9746-90bdcb843458,2,16,일상 대화,개인 및 관계,"P01:여성,20대 P02:여성,20대",<P01>대박</P01> <P01>#@소속# 다니던</P01> <P01>#@이름# ...
2,dbc95770-d1da-5921-8ea8-14e2b4f0f9a8,2,9,일상 대화,개인 및 관계,"P01:남성,20대 P02:남성,20대",<P01>너는 대중적이지 않은 새로 만들어진 종교나 단체를 쉽게 신뢰하는 성격이야?...
3,d91f77ae-caf4-5c85-bf4b-1626e13d6809,2,11,일상 대화,개인 및 관계,"P01:여성,30대 P02:남성,30대","<P01>여봉</P01> <P01>바쁘오?</P01> <P02>응...쪼끔,.야근할..."
4,a2b26381-cc6d-5140-a71b-2299b37cbd4c,2,8,일상 대화,개인 및 관계,"P01:여성,20대 P02:남성,20대",<P01>5시에저나</P01> <P01>가넝 한 가</P01> <P02>응응</P0...
...,...,...,...,...,...,...,...
279987,53116443-eb8c-5d1a-9386-6f392edda706,2,10,일상 대화,여가 생활,"P01:여성,20대 P02:여성,20대",<P01>#@시스템#사진#</P01> <P01>이거바</P01> <P02>디올에서 ...
279988,a1df98c0-aa40-550a-bbfa-6045051633b8,2,8,일상 대화,여가 생활,"P01:남성,30대 P02:여성,20대",<P01>응응 그렇지...</P01> <P02>오빠...</P02> <P01>응응?...
279989,1de04e2e-6664-5e27-b1f7-a76d498226be,2,11,일상 대화,여가 생활,"P01:여성,20대 P02:여성,20대",<P01>아 와인 생각보다 도수가 높네요</P01> <P02>아 와인 조아요</P0...
279990,dceba922-e328-5196-b85c-8d929366410a,2,9,일상 대화,여가 생활,"P01:여성,30대 P02:남성,30대",<P01>뭐해</P01> <P02>컴퓨터</P02> <P02>영화보려고 티빙 아이디...


In [7]:
# Build the validation frame with one row per dialogue.
# Each file is read as JSON with a top-level "data" list; every item provides
# header.dialogueInfo, header.participantsInfo and body.dialogue.
validation_columns = ["ID","num_of_people","num_of_talk","type","topic","person_info","utterances"]
validation_records = []

for path in tqdm(validation_paths):

    # Decode explicitly as UTF-8 instead of relying on the platform default encoding
    # (assumes the corpus files are UTF-8, the JSON default -- TODO confirm).
    with open(path, 'r', encoding='utf-8') as fp:
        dialogues = json.load(fp)['data']

    for dialogue in dialogues:
        info = dialogue['header']['dialogueInfo']
        validation_records.append({
            "ID"             :info['dialogueID'],
            "num_of_people"  :info['numberOfParticipants'],
            "num_of_talk"    :info['numberOfUtterances'],
            "type"           :info['type'],
            "topic"          :info['topic'],
            # Participants flattened to "ID:gender,age" tokens separated by spaces.
            "person_info"    :" ".join(
                f"{x['participantID']}:{x['gender']},{x['age']}"
                for x in dialogue['header']['participantsInfo']
            ),
            # Each utterance wrapped in <participantID>...</participantID> tags, space-separated.
            "utterances"     :" ".join(
                f"<{x['participantID']}>{x['utterance']}</{x['participantID']}>"
                for x in dialogue['body']['dialogue']
            ),
        })

# Build the frame once from all records: repeated pd.concat inside the loop copies the
# accumulated frame on every iteration. Passing columns= keeps the expected schema even
# when no files were found.
validation = pd.DataFrame(validation_records, columns=validation_columns)

  0%|          | 0/9 [00:00<?, ?it/s]

In [14]:
# Export both splits as tab-separated files under <DATA_DIR>/data_v1/.
output_dir = os.path.join(DATA_DIR, "data_v1")
# to_csv does not create missing parent folders, so make sure the target exists first.
os.makedirs(output_dir, exist_ok=True)

train.to_csv(os.path.join(output_dir, "train_v1.tsv"),sep="\t",index=False)
validation.to_csv(os.path.join(output_dir, "validation_v1.tsv"),sep="\t",index=False)