In [1]:
import numpy as np
import re
from collections import Counter
from dataset import get_vocab, index_sents
from konlpy.tag import Twitter
from hangul_romanize import Transliter
from hangul_romanize.rule import academic
from separator import Separator
from sklearn.model_selection import train_test_split

## Load the crawled corpus and collect candidate words (the top-N selection happens at the start of the next section)

In [2]:
# Read the crawled corpus into memory, one list entry per line of the file.
# The encoding is pinned so the Hangul text decodes the same way on every platform
# instead of depending on the locale's default encoding.
# NOTE(review): assumes data/kwatchtower is UTF-8 encoded — confirm.
with open('data/kwatchtower', encoding='utf-8') as f:
    # strip('\n') removes newline characters only; any other whitespace is kept.
    lines = [line.strip('\n') for line in f]
len(lines)

414455

In [3]:
# Instantiate the KoNLPy Twitter tagger once; the tokenisation loop below reuses it
# for every line of the corpus.
# NOTE(review): newer KoNLPy releases expose this tagger as `Okt` and deprecate the
# `Twitter` name — confirm against the installed version before upgrading.
twitter = Twitter()

In [4]:
%%time
# Collect every token that contains at least one Hangul syllable and is 2-4
# characters long. Duplicates are kept on purpose: frequencies are counted later.
tokens = []
for line_no, sentence in enumerate(lines):
    for morph in twitter.pos(sentence, stem=False):
        # The first element of each entry returned by pos() is used as the token text.
        surface = morph[0]
        if not 1 < len(surface) < 5:
            continue
        if re.search(r'[가-힣]+', surface):
            tokens.append(surface)
    # Lightweight progress report for this long-running cell.
    if line_no % 50000 == 0:
        print(line_no, "lines processed...")

0 lines processed...
50000 lines processed...
100000 lines processed...
150000 lines processed...
200000 lines processed...
250000 lines processed...
300000 lines processed...
350000 lines processed...
400000 lines processed...
CPU times: user 4min 14s, sys: 520 ms, total: 4min 15s
Wall time: 4min 4s


## process data into sequences

In [5]:
# Count how often each token occurs and keep up to 100000 of the most frequent
# ones, ordered from most to least common.
token_counts = Counter(tokens)
x_words = [word for word, _count in token_counts.most_common(100000)]
# Total number of collected tokens vs. number of distinct words kept.
print(len(tokens), len(x_words))

3913912 41616


In [6]:
# Build the romanizer, configured with the `academic` rule imported from hangul_romanize.rule.
transliterizer = Transliter(academic)

In [7]:
%%time
# Romanize each Korean word, then drop every character that is not a lowercase
# ASCII letter so the target side only contains a-z.
not_lowercase_ascii = re.compile(r'[^a-z]')
y_words = [not_lowercase_ascii.sub('', transliterizer.translit(word)) for word in x_words]

CPU times: user 348 ms, sys: 4 ms, total: 352 ms
Wall time: 352 ms


In [8]:
# Preview the first (Korean word, romanization) pairs as a sanity check.
# Zipping slices instead of indexing with range(10) keeps this cell from raising
# IndexError when fewer than 10 words are available.
for kor_word, rom_word in zip(x_words[:10], y_words[:10]):
    print(kor_word, '\t', rom_word)

습니다 	 seubnida
사람 	 salam
우리 	 uli
니다 	 nida
여호와 	 yeohowa
에서 	 eseo
하느님 	 haneunim
으로 	 eulo
입니 	 ibni
에게 	 ege


In [9]:
%%time
# Decompose every Korean word into its sequence of symbols via Separator(...).sep_all.
x_data = [Separator(word).sep_all for word in x_words]
print(x_data[0])
# Distinct input symbols. sorted() gives the saved vocabulary a stable order; the
# iteration order of a set of strings can differ between interpreter runs because
# string hashing is randomized by default.
# NOTE(review): assumes the symbols are mutually comparable (e.g. all str) — confirm.
x_vocab = sorted({symbol for sequence in x_data for symbol in sequence})
print(len(x_vocab))

['ㅅ', 'ㅡ', 'ㅂ', 'ㄴ', 'ㅣ', 'ㄷ', 'ㅏ']
50
CPU times: user 252 ms, sys: 4 ms, total: 256 ms
Wall time: 255 ms


In [10]:
%%time
# Split every romanized word into its individual characters.
y_data = [list(word) for word in y_words]
print(y_data[0])
# Distinct output characters. sorted() gives the saved vocabulary a stable order; the
# iteration order of a set of strings can differ between interpreter runs because
# string hashing is randomized by default.
y_vocab = sorted({char for sequence in y_data for char in sequence})
print(len(y_vocab))

['s', 'e', 'u', 'b', 'n', 'i', 'd', 'a']
20
CPU times: user 24 ms, sys: 4 ms, total: 28 ms
Wall time: 27.4 ms


In [11]:
# Build the vocabulary lookups for the Korean (input) side from the decomposed words.
# get_vocab returns two objects, bound here as kor2idx and idx2kor.
# NOTE(review): by their names these are the forward and reverse symbol/index lookups,
# and the "+2" presumably leaves room for special tokens — confirm in dataset.get_vocab.
kor2idx, idx2kor = get_vocab(x_data, len(x_vocab)+2)


total vocab size: 50 


trunc vocab size: 50 



In [12]:
# Build the vocabulary lookups for the romanized (output) side from the character lists.
# get_vocab returns two objects, bound here as eng2idx and idx2eng.
# NOTE(review): by their names these are the forward and reverse character/index lookups,
# and the "+2" presumably leaves room for special tokens — confirm in dataset.get_vocab.
eng2idx, idx2eng = get_vocab(y_data, len(y_vocab)+2)


total vocab size: 20 


trunc vocab size: 20 



In [13]:
%%time
# Encode both sides with their respective lookup tables via dataset.index_sents.
# NOTE(review): presumably each symbol is replaced by its integer index — confirm in
# dataset.index_sents.
x_text_idx = index_sents(x_data, kor2idx)
y_text_idx = index_sents(y_data, eng2idx)

CPU times: user 192 ms, sys: 0 ns, total: 192 ms
Wall time: 192 ms


In [14]:
# Sanity check: the encoded inputs and targets should contain the same number of entries.
len(x_text_idx), len(y_text_idx)

(41616, 41616)

## train-test split and save to numpy binaries

In [21]:
# Split row positions (not the encoded inputs themselves) together with the targets,
# so the chosen positions can be saved and used to select the matching input rows.
indices = list(range(len(x_text_idx)))
# random_state is fixed so that re-running the notebook reproduces the same 90/10
# split; without it every run shuffles differently and the saved train/test
# artifacts are not reproducible.
train_idx, test_idx, y_train, y_test = train_test_split(
    indices, y_text_idx, test_size=0.1, random_state=42)

def get_sublist(lst, indices):
    """Return a new list holding `lst[i]` for each `i` in `indices`, in that order."""
    return [lst[position] for position in indices]

# Select the encoded inputs at the positions the split assigned to train and test,
# so x_train/x_test line up with y_train/y_test.
x_train = get_sublist(x_text_idx, train_idx)
x_test = get_sublist(x_text_idx, test_idx)

In [22]:
# Sanity check: inputs and targets should have matching sizes within each split.
len(x_train), len(x_test), len(y_train), len(y_test)

(37454, 4162, 37454, 4162)

In [23]:
def numpy_save(saves, names):
    """Save each object in `saves` to 'encoded/<name>.npy' with np.save.

    Args:
        saves: objects to persist, one file per object.
        names: file stems, paired with `saves` by position. Extra trailing
            names are ignored.

    Raises:
        IndexError: if there are fewer names than objects. The check runs before
            anything is written, so a mismatch never leaves a partial set of files.

    Note:
        Objects that NumPy cannot store as a plain array (e.g. dicts) are written
        by np.save as pickled object arrays and need
        np.load(..., allow_pickle=True) to be read back.
    """
    import os  # local import: keeps the helper usable without touching the import cell

    saves = list(saves)
    if len(names) < len(saves):
        raise IndexError(
            'numpy_save got {0} objects but only {1} names'.format(len(saves), len(names)))

    # np.save does not create missing directories, so make sure the target exists.
    os.makedirs('encoded', exist_ok=True)
    for name, item in zip(names, saves):
        np.save('encoded/{0}.npy'.format(name), item)

In [24]:
# Every artifact to persist, declared once as (file stem, object) pairs so a name
# can never drift out of step with its object.
artifacts = [
    ('x_words', x_words), ('y_words', y_words),
    ('x_vocab', x_vocab), ('y_vocab', y_vocab),
    ('kor2idx', kor2idx), ('idx2kor', idx2kor),
    ('eng2idx', eng2idx), ('idx2eng', idx2eng),
    ('train_idx', train_idx), ('test_idx', test_idx),
    ('x_train', x_train), ('x_test', x_test),
    ('y_train', y_train), ('y_test', y_test),
]

# Unzip into the two parallel lists that numpy_save expects.
saves = [obj for _stem, obj in artifacts]
names = [stem for stem, _obj in artifacts]

numpy_save(saves, names)

In [26]:
# Per-word sequence lengths on both sides; the printed maxima are the longest
# input and output sequences.
x_len = list(map(len, x_data))
y_len = list(map(len, y_data))
print(max(x_len), max(y_len))

12 22
