In [1]:
import pandas as pd
import numpy as np

# preprocessing
from string import punctuation
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# modeling
from keras.layers import Embedding, Dense, LSTM
from keras.models import Sequential

Using TensorFlow backend.


이번 예제는 뉴욕 타임즈 기사의 제목을 사용하여 LSTM을 통해 `텍스트 생성 실습 예제`를 다뤄보겠습니다
- 데이터 출처: https://www.kaggle.com/aashita/nyt-comments

In [2]:
df = pd.read_csv('../../New_York_Times_ArticleApril2017/dataset/ArticlesApril2017.csv')
df.head()

Unnamed: 0,abstract,articleID,articleWordCount,byline,documentType,headline,keywords,multimedia,newDesk,printPage,pubDate,sectionName,snippet,source,typeOfMaterial,webURL
0,,58def1347c459f24986d7c80,716,By STEPHEN HILTNER and SUSAN LEHMAN,article,Finding an Expansive View of a Forgotten Peop...,"['Photography', 'New York Times', 'Niger', 'Fe...",3,Insider,2,2017-04-01 00:15:41,Unknown,One of the largest photo displays in Times his...,The New York Times,News,https://www.nytimes.com/2017/03/31/insider/nig...
1,,58def3237c459f24986d7c84,823,By GAIL COLLINS,article,"And Now, the Dreaded Trump Curse","['United States Politics and Government', 'Tru...",3,OpEd,23,2017-04-01 00:23:58,Unknown,Meet the gang from under the bus.,The New York Times,Op-Ed,https://www.nytimes.com/2017/03/31/opinion/and...
2,,58def9f57c459f24986d7c90,575,By THE EDITORIAL BOARD,article,Venezuela’s Descent Into Dictatorship,"['Venezuela', 'Politics and Government', 'Madu...",3,Editorial,22,2017-04-01 00:53:06,Unknown,A court ruling annulling the legislature’s aut...,The New York Times,Editorial,https://www.nytimes.com/2017/03/31/opinion/ven...
3,,58defd317c459f24986d7c95,1374,By MICHAEL POWELL,article,Stain Permeates Basketball Blue Blood,"['Basketball (College)', 'University of North ...",3,Sports,1,2017-04-01 01:06:52,College Basketball,"For two decades, until 2013, North Carolina en...",The New York Times,News,https://www.nytimes.com/2017/03/31/sports/ncaa...
4,,58df09b77c459f24986d7ca7,708,By DEB AMLEN,article,Taking Things for Granted,['Crossword Puzzles'],3,Games,0,2017-04-01 02:00:14,Unknown,In which Howard Barkin and Will Shortz teach u...,The New York Times,News,https://www.nytimes.com/2017/03/31/crosswords/...


headline 열 중간마다 `Unknown`이라는 단어가 있기때문에 제외한다.

In [3]:
headline = []
headline.extend(list(df.headline.values))
headline[:5]

['Finding an Expansive View  of a Forgotten People in Niger',
 'And Now,  the Dreaded Trump Curse',
 'Venezuela’s Descent Into Dictatorship',
 'Stain Permeates Basketball Blue Blood',
 'Taking Things for Granted']

In [4]:
len(headline)

886

In [5]:
headline = [n for n in headline if n != 'Unknown']
len(headline)

831

string에서 `punctuation`은 아래와 같이 문자이외에 문장부호들이 들어있다.

In [6]:
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

아래 preprocessing 함수를 통해서 문장부호도 모두 제외되고 스펠링도 모두 소문자로 변환되었다

In [7]:
def preprocessing(s):
    return ''.join(c for c in s if c not in punctuation).lower()

text = [preprocessing(x) for x in headline]
text[:5]

['finding an expansive view  of a forgotten people in niger',
 'and now  the dreaded trump curse',
 'venezuela’s descent into dictatorship',
 'stain permeates basketball blue blood',
 'taking things for granted']

In [8]:
t = Tokenizer()
t.fit_on_texts(text)
vocab_size = len(t.word_index) + 1
print('단어 집합의 크기: %d' % vocab_size)

단어 집합의 크기: 2483


In [9]:
sequences = list()
for line in text:
    encoded = t.texts_to_sequences([line])[0]
    for i in range(1, len(encoded)):
        sequence = encoded[:i+1]
        sequences.append(sequence)
sequences[:5]

[[168, 20],
 [168, 20, 652],
 [168, 20, 652, 358],
 [168, 20, 652, 358, 4],
 [168, 20, 652, 358, 4, 2]]

In [10]:
max_len = max(len(l) for l in sequences)
print(max_len)

21


In [11]:
sequences = pad_sequences(sequences, maxlen=max_len, padding='pre')
print(sequences[:3])

[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0 168  20]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
  168  20 652]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 168
   20 652 358]]


In [12]:
sequences = np.array(sequences)
X = sequences[:,:-1]
y = sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)

In [13]:
print('X.shape: ',X.shape)
print('y.shape: ',y.shape)

X.shape:  (4812, 20)
y.shape:  (4812, 2483)


In [14]:
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=max_len-1))
model.add(LSTM(128))
model.add(Dense(vocab_size, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, epochs=200, verbose=2)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Epoch 1/200
 - 5s - loss: 7.4653 - acc: 0.0247
Epoch 2/200
 - 4s - loss: 6.9444 - acc: 0.0297
Epoch 3/200
 - 4s - loss: 6.8245 - acc: 0.0289
Epoch 4/200
 - 4s - loss: 6.7370 - acc: 0.0380
Epoch 5/200
 - 4s - loss: 6.6381 - acc: 0.0457
Epoch 6/200
 - 4s - loss: 6.5145 - acc: 0.0472
Epoch 7/200
 - 4s - loss: 6.3731 - acc: 0.0482
Epoch 8/200
 - 4s - loss: 6.2185 - acc: 0.0528
Epoch 9/200
 - 4s - loss: 6.0647 - acc: 0.0584
Epoch 10/200
 - 4s - loss: 5.9203 - acc: 0.0630
Epoch 11/200
 - 4s - loss: 5.7792 - acc: 0.0653
Epoch 12/200
 - 4s - loss: 5.6462 - acc: 0.0638
Epoch 13/200
 - 4s - loss: 5.5176 - acc: 0.0669
Epoch 14/200
 - 4s - loss: 5.3915 - acc: 0.0707
Epoch 15/200
 - 4s - loss: 5.2683 - acc: 0.0715
Epoch 16/200
 - 4s - loss: 5.1461 - acc: 0.0759
Epoch 17/200
 - 4s - loss: 5.0266 - acc: 0.0819
Epoch 18/200
 - 4s - loss: 4.9082 - acc: 0.0898
Epoch 19/200
 - 4s - loss

<keras.callbacks.History at 0x229ae5b97b8>