In [1]:
import os
import re

import numpy as np
import pandas as pd
from tensorflow.keras import utils
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
nyt_dir = 'nyt_dataset/articles/'

# Pool the `headline` column of every CSV in nyt_dir whose file name contains
# 'Articles'; any other file in the directory is skipped.
all_headlines = []
for filename in os.listdir(nyt_dir):
    if 'Articles' not in filename:
        continue
    headlines_df = pd.read_csv(os.path.join(nyt_dir, filename))
    all_headlines += list(headlines_df.headline.values)
len(all_headlines)

9335

In [3]:
# Peek at the first 20 headlines as loaded from the CSV files.
all_headlines[:20]

['My Beijing: The Sacred City',
 '6 Million Riders a Day, 1930s Technology',
 'Seeking a Cross-Border Conference',
 'Questions for: ‘Despite the “Yuck Factor,” Leeches Are Big in Russian Medicine’',
 'Who Is a ‘Criminal’?',
 'An Antidote to Europe’s Populism',
 'The Cost of a Speech',
 'Degradation of the Language',
 'On the Power of Being Awful',
 'Trump Garbles Pitch on a Revised Health Bill',
 'What’s Going On in This Picture? | May 1, 2017',
 'Unknown',
 'When Patients Hit a Medical Wall',
 'Unknown',
 'For Pregnant Women, Getting Serious About Whooping Cough',
 'Unknown',
 'New York City Transit Reporter in Wonderland: Riding the London Tube',
 'How to Cut an Avocado Without Cutting Yourself',
 'In Fictional Suicide, Health Experts Say They See a Real Cause for Alarm',
 'Claims of Liberal Media Bias Hit ESPN, Too']

In [4]:
# Discard entries whose text is exactly 'Unknown'; all other headlines keep
# their original order.
all_headlines = list(filter(lambda h: h != 'Unknown', all_headlines))
len(all_headlines)

8603

In [5]:
# Re-inspect the first 20 headlines now that the 'Unknown' entries are gone.
all_headlines[:20]

['My Beijing: The Sacred City',
 '6 Million Riders a Day, 1930s Technology',
 'Seeking a Cross-Border Conference',
 'Questions for: ‘Despite the “Yuck Factor,” Leeches Are Big in Russian Medicine’',
 'Who Is a ‘Criminal’?',
 'An Antidote to Europe’s Populism',
 'The Cost of a Speech',
 'Degradation of the Language',
 'On the Power of Being Awful',
 'Trump Garbles Pitch on a Revised Health Bill',
 'What’s Going On in This Picture? | May 1, 2017',
 'When Patients Hit a Medical Wall',
 'For Pregnant Women, Getting Serious About Whooping Cough',
 'New York City Transit Reporter in Wonderland: Riding the London Tube',
 'How to Cut an Avocado Without Cutting Yourself',
 'In Fictional Suicide, Health Experts Say They See a Real Cause for Alarm',
 'Claims of Liberal Media Bias Hit ESPN, Too',
 'Is the dream in Australia crumbling?',
 'Police in Texas Change Account in Officer’s Fatal Shooting of 15-Year-Old',
 'Most Adults Favor Sex Ed. Most Students Don’t Get It.']

In [6]:
# Fit a Keras Tokenizer on the headlines to build the word -> integer id vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_headlines)
# One more than the number of distinct words: Keras never assigns id 0 to a
# word, so word ids run from 1 to len(word_index) and total_words is the size
# of the whole id space (used below as the model's vocabulary/output width).
total_words = len(tokenizer.word_index) + 1
print(f'Total words: {total_words}')

Total words: 11753


In [8]:
# Convert each headline to a list of word ids, then expand it into its growing
# prefixes (first 2 ids, first 3 ids, ... the whole headline). Headlines with
# fewer than 2 tokens contribute no sequences.
input_sequences = [
    token_ids[:end]
    for token_ids in tokenizer.texts_to_sequences(all_headlines)
    for end in range(2, len(token_ids) + 1)
]

print(tokenizer.sequences_to_texts(input_sequences[:5]))
input_sequences[:5]

['my beijing', 'my beijing the', 'my beijing the sacred', 'my beijing the sacred city', '6 million']


[[52, 1616],
 [52, 1616, 1],
 [52, 1616, 1, 1992],
 [52, 1616, 1, 1992, 125],
 [126, 346]]

In [9]:
# The longest prefix sequence sets the common width every row is padded to.
max_sequence_len = max(len(seq) for seq in input_sequences)

# padding='pre': when zeros have to be added they go at the front, so the real
# tokens always sit at the end of each row.
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
)
input_sequences[3]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,   52, 1616,    1, 1992,  125], dtype=int32)

In [10]:
# Each padded row is one training example: every token except the last is the
# model input, and the last token is the word the model must predict.
predictors = input_sequences[:, :-1]
# One-hot encode the target word ids so they line up with a softmax output of
# width total_words. (The former mid-cell `labels[:5]` expression displayed
# nothing and the integer labels were assigned twice; both were dead code.)
labels = utils.to_categorical(input_sequences[:, -1], num_classes=total_words)

In [11]:
# The model sees every token of a padded row except the final (target) one.
input_len = max_sequence_len - 1

# Layers are stacked in order: word ids -> 10-dim embeddings -> 100-unit LSTM
# -> vocabulary-wide ReLU layer -> softmax over the vocabulary.
model = Sequential()
model.add(Embedding(total_words, 10, input_length=input_len))
model.add(LSTM(100))
# NOTE(review): this hidden layer is as wide as the vocabulary, so the softmax
# layer after it holds total_words x total_words weights (138,144,762
# parameters in the recorded summary) -- confirm this width is intentional.
model.add(Dense(total_words, activation='relu'))
model.add(Dense(total_words, activation='softmax'))

In [12]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 27, 10)            117530    
_________________________________________________________________
lstm (LSTM)                  (None, 100)               44400     
_________________________________________________________________
dense (Dense)                (None, 11753)             1187053   
_________________________________________________________________
dense_1 (Dense)              (None, 11753)             138144762 
Total params: 139,493,745
Trainable params: 139,493,745
Non-trainable params: 0
_________________________________________________________________


In [13]:
# categorical_crossentropy pairs with the one-hot labels built by
# utils.to_categorical; accuracy is tracked as an additional metric.
model.compile(loss= 'categorical_crossentropy', optimizer= 'adam', metrics=["accuracy"])

In [14]:
# Train for 30 epochs on the (prefix -> next word) pairs; the returned History
# object is the cell's displayed value.
model.fit(predictors, labels, epochs= 30, verbose= 1)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7fb408170b70>

In [30]:
def predict_next_token(seed_text):
    """Predict the id of the word most likely to follow ``seed_text``.

    Args:
        seed_text: Text to continue. It is converted to word ids with the
            notebook's fitted ``tokenizer`` and left-padded with zeros to the
            model's input length (``max_sequence_len - 1``).

    Returns:
        A 1-element NumPy array holding the predicted word id (the index of
        the largest output probability), ready to be passed to
        ``tokenizer.sequences_to_texts([prediction])``.
    """
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
    # Sequential.predict_classes was removed in TensorFlow 2.6; taking the
    # argmax of the predicted probabilities is its documented replacement and
    # yields the same array of class ids.
    probabilities = model.predict(token_list, verbose=1)
    return np.argmax(probabilities, axis=-1)

In [31]:
# Ask the trained model which word id should follow a sample seed phrase.
prediction = predict_next_token("today in new york")
prediction



array([338])

In [32]:
# Map the predicted word id back to its word via the tokenizer's vocabulary.
tokenizer.sequences_to_texts([prediction])

['schools']

In [34]:
# Matches a letter that directly follows an apostrophe inside a word (e.g. the
# "S" in "Europe’S"), so it can be lower-cased again after str.title().
_APOSTROPHE_CAP = re.compile(r"(?<=[^\W\d_])(['’])([^\W\d_])")


def generate_headline(seed_text, next_words= 1):
    """Extend ``seed_text`` with model-predicted words and title-case the result.

    Args:
        seed_text: Starting text of the headline.
        next_words: Number of words to append. Each word is predicted from the
            seed plus every word appended so far.

    Returns:
        The extended text in title case. Unlike a bare ``str.title()``, a
        letter following an in-word apostrophe stays lower-case, so a
        generated "europe’s" becomes "Europe’s" rather than "Europe’S".
    """
    for _ in range(next_words):
        prediction = predict_next_token(seed_text)
        next_word = tokenizer.sequences_to_texts([prediction])[0]
        seed_text += ' ' + next_word

    # str.title() treats an apostrophe as a word boundary and capitalises the
    # letter after it; undo that for apostrophes that sit inside a word.
    titled = seed_text.title()
    return _APOSTROPHE_CAP.sub(lambda m: m.group(1) + m.group(2).lower(), titled)

In [37]:
# Sample prompts; each one is extended by five predicted words and printed.
seed_texts = [
    'washington dc is',
    'today in new york',
    'the school district has',
    'They Can Hit a',
]
for seed in seed_texts:
    headline = generate_headline(seed, next_words=5)
    print(headline)

Washington Dc Is Not A Car Of A
Today In New York Schools Calls The Anonymous May
The School District Has Supposed To Little 100 Days
They Can Hit A Ball 400 Feet But Play
