In [14]:
import json
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [15]:
# Directory of scene and voice data
main_dir = 'D:/training_data/senrenbanka/'
path_to_json = main_dir + 'scenes/'
# Speaker whose voiced lines are extracted from the scene files
character_name = 'ムラサメ'

# Get all scene files.
# os.listdir returns entries in arbitrary order, so sort the names: the order of
# the collected lines (and therefore which rows the seeded shuffle puts in each
# split) must not depend on the file system.
json_files = sorted(
    scene_fname
    for scene_fname in os.listdir(path_to_json)
    if scene_fname.endswith('.json')
)

print(json_files)

['001・アーサー王ver1.07.ks.json', '002・祟り神ver1.08.ks.json', '003・巫女の秘密ver1.11.ks.json', '004・学院初日ver1.04.ks.json', '005・鍛錬ver1.04.ks.json', '006・レナ登場ver1.03.ks.json', '007・レナ転入ver1.03.ks.json', '008・仕切り直しver1.02.ks.json', '009・謎の欠片ver1.03.ks.json', '010・リフレッシュver1.04.ks.json', '011・襲来ver1.04.ks.json', '012・病床ver1.06.ks.json', '013・欠片集めver1.06.ks.json', '014・合体編ver1.03.ks.json', '015・ノーマルendver1.01.ks.json', '100・芳乃－語らいver1.01.ks.json', '101・芳乃－弁当ver1.02.ks.json', '102・芳乃－告白ver1.03.ks.json', '103・芳乃－氷解ver1.04.ks.json', '104・芳乃－記者会見ver1.05.ks.json', '105・芳乃－少女漫画ver1.01.ks.json', '106・芳乃－キスver1.01.ks.json', '107・芳乃－獣耳ver1.02.ks.json', '108・芳乃－解呪ver1.01.ks.json', '109・芳乃－平穏ver1.01.ks.json', '110・芳乃－初体験ver1.01.ks.json', '111・芳乃－その後ver1.01.ks.json', '112・芳乃－２回目ver1.01.ks.json', '113・芳乃－母上様ver1.01.ks.json', '114・芳乃－ＥＰver1.01.ks.json', '200・茉子－語らいver1.00.ks.json', '201・茉子－婚約解消ver1.01.ks.json', '202・茉子－実家ver1.01.ks.json', '203・茉子－子犬ver1.01.ks.json', '204・茉子－気持ちver1.00.ks.json', '205・茉子－デートver1.00.ks

In [16]:
def readFile(fname):
    """Load one scene file from ``path_to_json`` and return the parsed JSON.

    Parameters
    ----------
    fname : str
        File name relative to the notebook-level ``path_to_json`` directory
        (the two are joined by plain string concatenation).

    Returns
    -------
    object
        Whatever the JSON document decodes to.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file content is not valid JSON.
    """
    # The context manager closes the handle even when parsing fails.
    with open(path_to_json + fname, encoding='UTF-8') as scene_file:
        return json.load(scene_file)

def writeFile(fname, text):
    """Write ``text`` to ``fname`` as UTF-8, replacing any existing file.

    Parameters
    ----------
    fname : str
        Path of the file to create or overwrite.
    text : str
        Content to write.

    Raises
    ------
    OSError
        If the file cannot be opened or written.
    """
    # The context manager closes (and flushes) the handle even if write() raises.
    with open(fname, "w", encoding='UTF-8') as out_file:
        out_file.write(text)

def yuzusoft_scene_parser(scenes, character_name, out=None):
    """Collect ``wavs/<voice>.wav|<text>`` entries for lines spoken by one character.

    Parameters
    ----------
    scenes : iterable
        Scene records (the value under a scene file's ``'scenes'`` key). Records
        without a ``'texts'`` key are skipped.
    character_name : str
        Speaker name compared against the first element of each text entry.
    out : list, optional
        List that receives the entries. When omitted, entries are appended to the
        notebook-level ``filelist`` list, which must already exist; this keeps
        existing two-argument calls working.

    Returns
    -------
    list
        The list the entries were appended to.
    """
    if out is None:
        out = filelist  # notebook global
    for scene in scenes:
        if 'texts' not in scene:
            continue
        for text in scene['texts']:
            if text[0] != character_name:
                continue
            # Entry layout as indexed here: text[2][0][1] is the spoken line and
            # text[3][0]['voice'] the voice clip name.
            # NOTE(review): assumes every matching entry carries both — confirm
            # against the scene data.
            jp_line = text[2][0][1]
            # str.replace returns a new string; the result must be re-bound or
            # the full-width space (U+3000) is never actually replaced.
            jp_line = jp_line.replace("\u3000", " ")
            voice = text[3][0]['voice']
            # [1:-1] drops the first and last character of the line (presumably
            # the enclosing quote brackets — verify against the scene data).
            out.append("wavs/" + voice + '.wav|' + jp_line[1:-1])
    return out

In [17]:
# For each scene file: load it, hand its 'scenes' list to the parser, and let the
# parser collect the lines spoken by character_name.
# sample entry: wavs/mur001_001.wav|ふむ。お主が、吾輩のご主人か？

# Rebind to an empty list first so re-running this cell does not duplicate
# entries; yuzusoft_scene_parser appends to this notebook-level name.
filelist = []
for json_file in json_files:
    data = readFile(json_file)
    scenes = data['scenes']
    yuzusoft_scene_parser(scenes, character_name)

In [18]:
# Wrap the collected entries in a single-column DataFrame (column label 0) so the
# rows can be shuffled and split below.
filelist_df = pd.DataFrame(data=filelist)
print(filelist_df.shape)
filelist_df.head(n=5)

(3510, 1)


Unnamed: 0,0
0,wavs/mur001_001.wav|ふむ。お主が、吾輩のご主人か？
1,wavs/mur001_002.wav|こっちだ、こっち
2,wavs/mur001_003.wav|お？　その驚き様、ちゃんと見えておるし、聞こえておるな
3,wavs/mur001_004.wav|違うっ！　吾輩は断じて幽霊などではない！
4,wavs/mur001_005.wav|吾輩の名前はムラサメ。『叢雨丸』の管理者……まあ、『...


In [19]:
# train / validate / test split: 80% / 10% / 10% of the shuffled entries.
# Sliced with .iloc instead of np.split: np.split on a DataFrame goes through
# DataFrame.swapaxes, which pandas has deprecated.
shuffled_df = filelist_df.sample(frac=1, random_state=0)  # fixed seed -> same shuffle every run
train_end = int(.8 * len(filelist_df))
validate_end = int(.9 * len(filelist_df))
train = shuffled_df.iloc[:train_end]
validate = shuffled_df.iloc[train_end:validate_end]
test = shuffled_df.iloc[validate_end:]

In [20]:
# Sanity check on the held-out split: its shape, then its first five rows.
print(test.shape)
test.head(n=5)

(351, 1)


Unnamed: 0,0
1429,wavs/mur302_020.wav|安晴だけではない、穂織で吾輩のことを知っている者は皆...
3242,wavs/mur414_079.wav|だいいち、勝つための作戦なのであろう？
2141,wavs/mur307_051.wav|なっ、何でもないっ！
809,wavs/mur107_006.wav|いきなり青空の下でとは……やるのう、お主ら
433,wavs/mur010_042.wav|な、なんだと！？　わ、吾輩をたばかったのか、ご主人っ


In [21]:
def dfToFilelist(df, fname):
    """Write column 0 of ``df`` to ``main_dir + fname``, one entry per line.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labelled 0 holds the filelist entries (strings).
    fname : str
        Output file name, appended to the notebook-level ``main_dir``.
    """
    # Iterate the frame that was passed in. Reading the global `test` here would
    # make every output file a copy of the test split.
    entries = [row[0] for _, row in df.iterrows()]
    writeFile(main_dir + fname, '\n'.join(entries))

In [22]:
# Export each split to its own filelist under main_dir (test, validation, train).
for split_df, split_fname in (
    (test, "mur_test_filelist.txt"),
    (validate, "mur_val_filelist.txt"),
    (train, "mur_train_filelist.txt"),
):
    dfToFilelist(split_df, split_fname)