In [2]:
# import library
import os
import sys
sys.path.append(os.pardir) 
import nltk, re
import numpy as np
import matplotlib.pyplot as plt

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Embedding, Dense, Input, Bidirectional, LSTM, Dropout
from keras.models import Model
from keras.callbacks import EarlyStopping, ModelCheckpoint

import konlpy
from konlpy.tag import Komoran
from konlpy.tag import Twitter
from konlpy.tag import Hannanum
from gensim.models import Word2Vec

import pickle
import codecs
import csv
import struct

Using TensorFlow backend.


In [3]:
TEXT_DATA_DIR = './newsData'
directory_label = {0: '정치', 1: '경제', 2: '사회', 3 : '생활/문화', 4 : '세계', 5 : '기술/IT', 6 : '연예', 7 : '스포츠'}

In [4]:
#WORD_EMBEDDING_WORD2VEC = './wiki.ko/wiki.ko.bin'
#WORD_EMBEDDING_WORD2VEC = './wiki.ko/wiki.ko.vec'

In [5]:
# parameter
EMBEDDING_DIM = 100

In [6]:
TEST_SPLIT = 0.15
VALIDATION_SPLIT = 0.15

In [7]:
# konlp Objects
komo = Komoran()
twitter = Twitter()
hannanum = Hannanum()

In [8]:
# list of texts
Texts = []  
trainTexts = []
validationTexts = []
testTexts = [] 

In [9]:
# list of label ids
Labels = []
trainLabels = []  
validationLabels = []
testLabels = []

In [10]:
# dictionary mapping label name to numeric id
labels_index = {}

In [11]:
# sequence of texts
seq = []

In [12]:
print(Texts)

[]


In [13]:
# load texts data
for name in sorted(os.listdir(TEXT_DATA_DIR)):
    path = os.path.join(TEXT_DATA_DIR, name)
    if os.path.isdir(path):
        label_id = len(labels_index)
        labels_index[directory_label[int(name)]] = label_id
        for fname in sorted(os.listdir(path)):
            fpath = os.path.join(path, fname)
            args = {'encoding': 'utf-8'}
            if(re.findall('.1[6789][0-9]NewsData',fpath)):
                # test texts
                text = []
                with open(fpath, **args) as f:
                    t = f.read()
                    pos = komo.pos(t.strip())
                    for pair in pos :
                        #['NNG', 'NNP', 'VV', 'VA']: part-of-speech of content word
                        if (re.findall('^[NV]',pair[1])):
                            morpheme = pair[0]+'/'+pair[1]
                            text.append(morpheme)
                    testTexts.append(text)
                testLabels.append(label_id)
            else :
                # train & validation texts
                text = []
                with open(fpath, **args) as f:
                    t = f.read()
                    pos = komo.pos(t.strip())
                    for pair in pos :
                        if (re.findall('^[NV]',pair[1])):
                            morpheme = pair[0]+'/'+pair[1]
                            text.append(morpheme)
                    Texts.append(text)
                Labels.append(label_id)
            seq.append(text)
print('Found %s texts.' % len(Texts))
print('Found %s texts.' % len(testTexts))
print(len(Labels))
print(len(testLabels))

Found 1280 texts.
Found 320 texts.
1280
320


In [14]:
import seaborn as sns
sns.set(color_codes=True)

'''
print("mean : ", np.mean(trainLen))
print("max : " ,np.max(trainLen))
print("min : " ,np.min(trainLen))
'''


'\nprint("mean : ", np.mean(trainLen))\nprint("max : " ,np.max(trainLen))\nprint("min : " ,np.min(trainLen))\n'

In [15]:
MAX_FEATURES = 20000

In [16]:
#Word2Vec
model_ko = Word2Vec(seq, min_count=1, size=EMBEDDING_DIM) 
words = list(model_ko.wv.vocab)
embeddings_index = {}
for i in words:
    embeddings_index[i] = model_ko[i]

  


In [17]:
print(len(Texts))

1280


In [18]:
# by default, validation_split in `fit` does not shuffle the data
indices = np.arange(len(Texts), dtype=int)
print(type(indices[0]))

<class 'numpy.int32'>


In [19]:
indices = np.arange(len(Texts))
print(indices)
np.random.shuffle(indices)

Texts = np.array(Texts)
Labels = np.array(Labels)

Texts = Texts[indices]
Labels = Labels[indices]

indices2 = np.arange(len(testTexts))
np.random.shuffle(indices2)

testTexts = np.array(testTexts)
testLabels = np.array(testLabels)

testTexts = testTexts[indices2]
testLabels = testLabels[indices2]

[   0    1    2 ... 1277 1278 1279]


In [20]:
num_validation_samples = int(VALIDATION_SPLIT * len(Texts))

validationTexts = Texts[:num_validation_samples]
validationLabels = Labels[:num_validation_samples]

trainTexts = Texts[num_validation_samples:]
trainLabels = Labels[num_validation_samples:]

In [21]:
print(len(trainTexts))
print(len(validationTexts))
print(len(testTexts))

1088
192
320


In [26]:
MAX_SEQUENCE_LENGTH = 350

In [27]:
# tokenizer 선언
tokenizer = Tokenizer(num_words=MAX_FEATURES)

TEXT = list()
for i in range(len(trainTexts)):
    TEXT.append(trainTexts[i])
    
for i in range(len(validationTexts)):
    TEXT.append(validationTexts[i]) 
    
for i in range(len(testTexts)):
    TEXT.append(testTexts[i])  

In [28]:
# train on `texts token' (list of sample texts)`
# return list of sequences (one per text)

tokenizer.fit_on_texts(TEXT)
sequences = tokenizer.texts_to_sequences(TEXT)
sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
print(len(sequences))
# sequence는 전체?

1600


In [29]:
type(trainLabels)

numpy.ndarray

In [30]:
x_train = sequences[:1088]
y_train = to_categorical(trainLabels)

x_val = sequences[1088:1280]
y_val = to_categorical(validationLabels)

x_test = sequences[1280:]
y_test = to_categorical(testLabels)

In [31]:
print(y_train.shape)
print(y_val.shape)
print(y_test.shape)

(1088, 8)
(192, 8)
(320, 8)


In [32]:
print(trainTexts[0])
print(x_train[0])
print(y_train[0])

['파워에이드/NNP', '한국/NNP', '테니스/NNG', '왕자/NNG', '정현/NNG', '파워/NNG', '풀/NNG', '만남/NNG', '윈터/NNG', '뉴스/NNG', '김태경/NNP', '기자/NNG', '세계/NNG', '주목/NNG', '한국/NNP', '테니스/NNG', '왕자/NNG', '정현/NNP', '한국체대/NNP', '세계/NNG', '이/VCP', '스포츠 음료/NNP', '파워에이드/NNP', '파워/NNG', '풀/NNG', '만남/NNG', '시작/NNG', '코카-콜라/NNP', '사/NNG', '스포츠 음료/NNP', '브랜드/NNG', '파워에이드/NNP', '일/NR', '한국/NNP', '테니스/NNG', '떠오르/VV', '아이콘/NNG', '이/VCP', '정현/NNG', '브랜드/NNG', '모델/NNG', '발탁/NNG', '정현/NNG', '폭발/NNG', '이/VCP', '에너지/NNG', '파워에이드/NNP', '어우러지/VV', '캠페인/NNG', '진행/NNG', '예정/NNG', '이/VCP', '밝히/VV', '지나/VV', '월/NNB', '노박 조코비치/NNP', '로저 페더러/NNP', '연잇/VV', '맞대결/NNG', '년/NNB', '새해/NNG', '열/VV', '정현/NNP', '신드롬/NNG', '일으키/VV', '정현/NNG', '세계/NNG', '랭킹/NNG', '위/NNB', '5월 14일/NNP', '남자/NNG', '테니스/NNG', '단식/NNG', '기준/NNG', '랭크/NNG', '아시아/NNP', '톱/NNG', '랭커/NNG', '자리/NNG', '수성/NNG', '연일/NNG', '승승장구/NNG', '있/VX', '지난해/NNG', '세계/NNG', '강호/NNG', '꺾/VV', '돌풍/NNG', '일으키/VV', '정현/NNG', '올해/NNG', '그랜드슬램/NNP', '대회/NNG', '이/VCP', '호주 오픈/NNP', '한국/NNP', '

In [33]:
# Map: word -> rank/index(int) in text
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
print(word_index)

Found 25196 unique tokens.
{'이/VCP': 1, '것/NNB': 2, '있/VV': 3, '하/VV': 4, '일/NNB': 5, '있/VX': 6, '등/NNB': 7, '수/NNB': 8, '년/NNB': 9, '않/VX': 10, '되/VV': 11, '말/NNG': 12, '지/VX': 13, '대하/VV': 14, '원/NNB': 15, '없/VA': 16, '명/NNB': 17, '만/NR': 18, '이/NP': 19, '받/VV': 20, '기자/NNG': 21, '월/NNB': 22, '밝히/VV': 23, '씨/NNB': 24, '위하/VV': 25, '중/NNB': 26, '지나/VV': 27, '보/VV': 28, '따르/VV': 29, '주/VX': 30, '보이/VV': 31, '서울/NNP': 32, '아니/VCN': 33, '억/NR': 34, '같/VA': 35, '북한/NNP': 36, '시간/NNG': 37, '통하/VV': 38, '크/VA': 39, '때/NNG': 40, '대/NNB': 41, '이번/NNG': 42, '미국/NNP': 43, '시/NNB': 44, '때문/NNB': 45, '대통령/NNG': 46, '관련/NNG': 47, '그/NP': 48, '사람/NNG': 49, '이날/NNG': 50, '나오/VV': 51, '경찰/NNG': 52, '김/NNP': 53, '분/NNB': 54, '조사/NNG': 55, '이후/NNG': 56, '한국/NNP': 57, '하/VX': 58, '많/VA': 59, '번/NNB': 60, '중국/NNP': 61, '전하/VV': 62, '개/NNB': 63, '경우/NNG': 64, '대표/NNG': 65, '문제/NNG': 66, '지난해/NNG': 67, '사실/NNG': 68, '후보/NNG': 69, '관계자/NNG': 70, '정부/NNG': 71, '높/VA': 72, '못하/VX': 73, '의원/NNG': 74, '혐의/NNG':

In [34]:
num_words = min(MAX_FEATURES, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= MAX_FEATURES:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all zeros.
        embedding_matrix[i] = embedding_vector

In [35]:
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix], 
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

In [36]:
# path where all models are saved
BASE_PATH = './newsModel/'
if not os.path.exists(BASE_PATH):
    os.mkdir(BASE_PATH)

In [37]:
def create_checkpoint(model_name):
    # creates a subdirectory under `BASE_PATH`
    MODEL_PATH = os.path.join(BASE_PATH, model_name)
    if not os.path.exists(MODEL_PATH):
        os.mkdir(MODEL_PATH)
    
    return ModelCheckpoint(filepath=os.path.join(MODEL_PATH, '{epoch:02d}-{val_loss:.4f}.hdf5'),
                           monitor='val_loss',
                           verbose=1,
                           save_best_only=True)

In [38]:
batch_size = 128
max_epochs = 1000
early_stopping = EarlyStopping(monitor='val_loss',
                               patience=3)

In [1]:
print('Training...')

# train a 3-layer bi-LSTM model
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

x = Bidirectional(LSTM(128, return_sequences=True))(embedded_sequences)
x = Bidirectional(LSTM(128, return_sequences=True))(x)
x = Bidirectional(LSTM(128))(x)
preds = Dense(len(labels_index), activation='softmax')(x)

model = Model(sequence_input, preds)

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

checkpoint = create_checkpoint('lstm')  # checkpoint callback
history = model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=max_epochs,
          validation_data=(x_val, y_val),
          shuffle=True,
          callbacks= [checkpoint])

Training...


NameError: name 'Input' is not defined

In [247]:
loss, acc = model.evaluate(x_test, y_test,
                           batch_size=batch_size)

print(f'----- Evaluation loss and metrics for {len(y_test)} test samples -----')
print('Test loss:', loss)
print('Test accuracy:', acc)

----- Evaluation loss and metrics for 320 test samples -----
Test loss: 1.8770819187164307
Test accuracy: 0.3
