In [51]:
import os
import sys
sys.path.append(os.pardir) 
import nltk, re
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Embedding, Dense, Input, Bidirectional, LSTM
from keras.models import Model
from keras.callbacks import EarlyStopping, ModelCheckpoint

import konlpy
from konlpy.tag import Komoran
from konlpy.tag import Twitter
from konlpy.tag import Hannanum
from gensim.models import Word2Vec

In [52]:
TEXT_DATA_DIR = './newsData'
MAX_SEQUENCE_LENGTH = 1000
MAX_FEATURES = 20000
EMBEDDING_DIM = 100
TEST_SPLIT = 0.15


In [56]:
trainTexts = []  # list of text samples
testTexts = [] 
labels_index = {}  # dictionary mapping label name to numeric id
trainLabels = []  # list of label ids
testLabels = []
seq = []
komo = Komoran()

for name in sorted(os.listdir(TEXT_DATA_DIR)):
    path = os.path.join(TEXT_DATA_DIR, name)
    if os.path.isdir(path):
        label_id = len(labels_index)
        labels_index[name] = label_id
        for fname in sorted(os.listdir(path)):
            fpath = os.path.join(path, fname)
  
            args = {'encoding': 'utf-8'}
            if(re.findall('.1[6789][0-9]NewsData',fpath)):
                text = []
                with open(fpath, **args) as f:
                    t = f.read()
                    pos = komo.pos(t.strip())
                    for pair in pos :
                        if (re.findall('^[NV]',pair[1])):
                            morpheme = pair[0]+'/'+pair[1]
                            text.append(morpheme)
                    testTexts.append(text)
                testLabels.append(label_id)
            else :
                text = []
                with open(fpath, **args) as f:
                    t = f.read()
                    pos = komo.pos(t.strip())
                    for pair in pos :
                        if (re.findall('^[NV]',pair[1])):
                            morpheme = pair[0]+'/'+pair[1]
                            text.append(morpheme)
                    trainTexts.append(text)
                trainLabels.append(label_id)
            seq.append(text)
print('Found %s texts.' % len(trainTexts))
print('Found %s texts.' % len(testTexts))
print(len(trainLabels))
print(len(testLabels))

Found 1280 texts.
Found 320 texts.
1280
320


In [57]:
li  = list()
for i in range(len(trainTexts)):
    print(len(trainTexts[i]))

180
125
512
333
505
201
685
278
261
330
484
327
215
384
293
419
235
383
203
303
367
295
236
327
205
291
378
200
265
336
232
411
562
251
169
200
375
308
293
445
445
390
454
243
303
209
187
363
122
288
293
434
195
257
289
399
190
195
607
340
292
292
223
303
193
172
323
267
217
392
374
371
160
226
323
443
176
394
201
170
301
248
160
205
282
434
493
529
317
171
610
184
436
253
295
269
176
225
428
434
129
163
318
229
282
321
205
502
245
129
179
166
126
243
514
229
214
282
206
277
321
423
312
445
258
367
201
178
430
278
265
260
417
285
370
569
206
378
195
321
133
206
251
235
321
211
302
256
195
444
367
212
129
334
148
183
296
416
223
537
474
271
247
288
221
532
250
361
237
351
117
190
610
182
252
319
254
166
397
222
103
429
385
299
465
452
434
249
297
341
484
430
395
249
265
174
268
257
254
167
271
229
531
260
208
222
213
250
545
210
175
286
272
241
220
118
333
364
211
282
212
610
249
218
394
204
257
280
223
274
326
353
241
339
362
405
434
256
338
224
429
198
206
187
402
180
438
439
211
417


In [58]:
print(trainTexts[1])
print(len(seq))

['예결/NNG', '위/NNG', '추경/NNG', '막바지/NNG', '심사/NNG', '진통/NNG', '여야/NNG', '충돌/NNG', '서울/NNP', '연합뉴스/NNP', '김/NNP', '남/NNG', '기자/NNG', '국회 예산결산특별위원회/NNP', '일/NNB', '추가경정예산/NNP', '안/NNG', '막바지/NNG', '심사/NNG', '돌입/NNG', '여야/NNG', '간/NNB', '이견/NNG', '진통/NNG', '겪/VV', '있/VX', '예결/NNG', '위/NNG', '이날/NNG', '오전/NNG', '시/NNB', '위원회/NNG', '열/VV', '전날/NNG', '심사/NNG', '보류/NNG', '사업/NNG', '건/NNB', '감액/NNG', '심사/NNG', '하/VV', '이/VCP', '개/NNB', '교섭단체/NNP', '예결/NNG', '위/NNG', '간사/NNG', '참석/NNG', '소위/NNG', '심사/NNG', '지/NNB', '시간/NNG', '만/NNB', '여야/NNG', '간/NNB', '충돌/NNG', '정회/NNG', '자유/NNG', '한국당/NNP', '예결/NNG', '위/NNG', '간사/NNG', '이/VCP', '김도읍/NNP', '의원/NNG', '감액/NNG', '사업/NNG', '많/VA', '소위/NNG', '정회/NNG', '상태/NNG', '말/NNG', '예결/NNG', '위/NNG', '소위/NNG', '감액/NNG', '심사/NNG', '완료/NNG', '증액/NNG', '작업/NNG', '등/NNB', '거치/VV', '수정/NNG', '추경안/NNG', '전체/NNG', '회의/NNG', '올리/VV', '예정/NNG', '이/VCP', '현재/NNG', '전체/NNG', '회의/NNG', '개/NNB', '상정/NNG', '전망/NNG', '불투명/NNG', '상태/NNG', '이/VCP', '예결/NNG', '위/NNG', '전체/NNG', 

In [59]:
model_ko = Word2Vec(seq, min_count=1, size=300) 
tokenizer = Tokenizer(num_words=30000)
tokenizer.fit_on_texts(trainTexts)  # train on `texts (list of sample texts)`
tokenizer.fit_on_texts(testTexts) 
trainSequences = tokenizer.texts_to_sequences(trainTexts)  # return list of sequences (one per text)
testSequences = tokenizer.texts_to_sequences(testTexts) 

In [60]:
print(sequences[0])

[4713, 975, 10805, 773, 1210, 1211, 1091, 2619, 596, 1092, 773, 8941, 606, 1430, 429, 6152, 1210, 138, 5084, 939, 879, 35, 180, 157, 1761, 1762, 908, 30, 219, 10806, 745, 975, 10805, 35, 1999, 773, 5, 71, 1210, 14105, 1231, 132, 812, 1092, 773, 48, 167, 2330, 845, 6868, 802, 181, 1210, 14106, 1211, 1092, 773, 456, 6153, 423, 1922, 429, 1706, 4, 9, 35, 1407, 78, 1231, 2169, 180, 157, 5, 378, 82, 4713, 745, 2000, 1, 1092, 773, 157, 271, 6, 24, 125, 350, 10807, 1, 1091, 606, 340, 2, 1092, 773, 26, 21, 5569, 14107, 14108, 1, 1091, 606, 2001, 225, 160, 21, 85, 5569, 99, 1270, 3256, 3630, 6, 341, 288, 2, 74, 6869, 187, 180, 77, 1659, 225, 1248, 41, 408, 5569, 291, 1338, 107, 774, 35, 1707, 7734, 14109, 2520, 955, 2961, 2962, 2521, 41, 1092, 773, 2520, 2962, 7735, 457, 67, 341, 4, 1210, 2243, 1092, 773, 38, 131, 745, 975, 33, 46, 1091, 57, 44, 3, 33, 1, 1091, 57, 157, 50, 278, 582, 271, 6, 24, 3, 1, 44, 12]


In [61]:
words = list(model_ko.wv.vocab)
print(words)

['동남아/NNP', '담당/NNG', '희철/NNP', '부상/NNG', '베이징/NNP', '도착/NNG', '싱가포르/NNP', '행/NNB', '주목/NNG', '최/NNP', '행선지/NNG', '방문/NNG', '목적/NNG', '질문/NNG', '묵묵부답/NNG', '연합뉴스/NNP', '김진/NNP', '방/NNG', '특파원/NNG', '북한/NNP', '북미/NNP', '정상회담/NNP', '무산/NNG', '거론/NNG', '태도/NNG', '보이/VV', '가운데/NNG', '동남아시아/NNP', '외교/NNG', '외무성/NNG', '일/NNB', '중국/NNP', '서우/NNP', '공항/NNG', '모습/NNG', '드러내/VV', '이날/NNG', '오전/NNG', '평양/NNP', '발/NNG', '고려항공/NNP', '편/NNB', '이용/NNG', '서우두 공항/NNP', '최종/NNG', '목적지/NNG', '묻/VV', '취재진/NNG', '답변/NNG', '하/VV', '않/VX', '대사관/NNG', '관계자/NNG', '빠져나가/VV', '앞두/VV', '상황/NNG', '통/NNB', '이/VCP', '준비/NNG', '등/NNB', '위하/VV', '회담/NNG', '개최/NNG', '예정지/NNG', '제기/NNG', '있/VX', '지나/VV', '월/NNB', '아세안/NNP', '동남아시아국가연합/NNP', '의장국/NNG', '양국/NNG', '관계/NNG', '올해/NNG', '열리/VV', '지역/NNG', '안보/NNG', '포럼/NNG', '의제/NNG', '논의/NNG', '바/NNB', '지난해/NNG', '북핵 문제/NNP', '두/VV', '간/NNB', '긴장/NNG', '형성/NNG', '때/NNG', '참석/NNG', '상대/NNG', '여론/NNG', '전/NNG', '펼치/VV', '초청/NNG', '비자/NNP', '쿠마르/NNP', '싱/NNP', '인도/NNP', '외교부/NN