In [10]:
import sys
import urllib
import urllib.request

import joblib
import numpy as np
import pandas as pd
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Dropout, LSTM
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.utils import np_utils
from keras.utils import pad_sequences
from nltk.corpus import stopwords
from tensorflow.keras.layers import Embedding

# Download a whitespace-separated stop-word list and keep it as a set.
# NOTE(review): the name suggests Brazilian-Portuguese stop words; the set is
# not referenced by any other cell visible in this notebook.
url = "https://gist.githubusercontent.com/alopes/5358189/raw/2107d809cca6b83ce3d8e04dbd9463283025284f/stopwords.txt"
# The context manager closes the HTTP response once it has been read, and the
# timeout stops a stalled connection from hanging the kernel indefinitely.
with urllib.request.urlopen(url, timeout=30) as response:
    stopwords_list = response.read().decode("utf-8")
stopwords_ptbr = set(stopwords_list.split())

In [11]:
# Load the dataset; the path is resolved relative to the kernel's working directory.
df = pd.read_csv(filepath_or_buffer='../data/database.csv')

In [12]:
# Pull the values of the 'content' column out of the frame as an array.
texts = df.loc[:, 'content'].values

In [13]:
# Largest len() over all entries of `texts`, or 0 when there are none.
# NOTE(review): if the entries are strings, len() counts characters rather
# than words, so the name is misleading - confirm the intent.
max_words = max((len(entry) for entry in texts), default=0)

In [14]:
# Keep the full vocabulary. `num_words` is a cap on how many distinct words
# are kept (ranked by frequency), so a document-length figure such as
# `max_words` is not a meaningful value for it: whenever that figure is
# smaller than the vocabulary, ids are silently dropped from the sequences
# while len(tokenizer.word_index) still counts every word.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
# One list of integer word ids per entry of `texts`.
sequences = tokenizer.texts_to_sequences(texts)

In [15]:
# Number of distinct words the fitted tokenizer has an id for.
vocab_size = len(tokenizer.word_index)
# Concatenate the per-entry id lists into one continuous stream of word ids.
text = [word_id for entry_ids in sequences for word_id in entry_ids]

In [16]:
# Training on 19 words to predict the 20th
sentence_len = 20
pred_len = 1
train_len = sentence_len - pred_len
# Sliding window to generate train data.
# The "+ 1" keeps the final full window (text[-sentence_len:]); without it the
# last token of the stream never appears as a prediction target. When the
# stream is shorter than one window the range is empty and so is `seq`.
seq = [text[start:start + sentence_len]
       for start in range(len(text) - sentence_len + 1)]
# Reverse dictionary to decode tokenized sequences back to words
reverse_word_map = {index: word for word, index in tokenizer.word_index.items()}

In [8]:
# Each row in seq is a sentence_len-word window: the first train_len words are
# the model input and the window's last word is the target to predict.
trainX = [window[:train_len] for window in seq]
trainy = [window[-1] for window in seq]

In [9]:
# define model: train_len word ids in, a distribution over the next word out
model_2 = Sequential([
    # vocab_size + 1 rows so that the largest word id (== vocab_size) is a
    # valid index; gen() below treats class k as word id k + 1, i.e. word ids
    # start at 1.
    Embedding(vocab_size+1, 50, input_length=train_len),
    LSTM(100, return_sequences=True),
    LSTM(100),
    Dense(100, activation='relu'),
    Dropout(0.1),
    # One output class per word: class k stands for word id k + 1.
    Dense(vocab_size, activation='softmax')
])

# Train model with checkpoints.
# Targets are passed as integer class ids (word id - 1) with the sparse loss
# instead of a one-hot frame from pd.get_dummies: get_dummies creates one
# column per label that happens to occur in trainy, so its width - and which
# column means which word - silently depends on the data and can disagree with
# the vocab_size-wide softmax. It also materialises a dense
# (n_samples x vocab_size) matrix. The loss value itself is unchanged.
model_2.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
filepath = "./model_2_weights.hdf5"
# Write a checkpoint whenever the training loss reaches a new minimum.
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]
history = model_2.fit(np.asarray(trainX),
         np.asarray(trainy) - 1,
         epochs = 20,
         batch_size = 64,
         callbacks = callbacks_list,
         verbose = 1)

Epoch 1/20
Epoch 1: loss improved from inf to 6.61868, saving model to .\model_2_weights.hdf5
Epoch 2/20
Epoch 2: loss improved from 6.61868 to 5.16959, saving model to .\model_2_weights.hdf5
Epoch 3/20
Epoch 3: loss improved from 5.16959 to 4.43686, saving model to .\model_2_weights.hdf5
Epoch 4/20
Epoch 4: loss improved from 4.43686 to 4.08097, saving model to .\model_2_weights.hdf5
Epoch 5/20
Epoch 5: loss improved from 4.08097 to 3.83275, saving model to .\model_2_weights.hdf5
Epoch 6/20
Epoch 6: loss improved from 3.83275 to 3.64379, saving model to .\model_2_weights.hdf5
Epoch 7/20
Epoch 7: loss improved from 3.64379 to 3.47986, saving model to .\model_2_weights.hdf5
Epoch 8/20
Epoch 8: loss improved from 3.47986 to 3.33197, saving model to .\model_2_weights.hdf5
Epoch 9/20
Epoch 9: loss improved from 3.33197 to 3.19871, saving model to .\model_2_weights.hdf5
Epoch 10/20
Epoch 10: loss improved from 3.19871 to 3.08041, saving model to .\model_2_weights.hdf5
Epoch 11/20
Epoch 11: 

In [18]:
def gen(model, seq, max_len = 10, context_len = 19):
    """Extend a seed phrase by greedily predicting one word at a time.

    Args:
        model: object whose ``predict`` accepts a ``(1, context_len)`` array
            of word ids and returns per-class scores, where class ``k``
            stands for word id ``k + 1``.
        seq: seed text; it is tokenized with the notebook-level ``tokenizer``.
        max_len: number of words to append to the seed.
        context_len: how many trailing word ids are fed to the model on each
            step. It has to match the input length the model was built with;
            the default mirrors the 19-word training windows.

    Returns:
        The words of the tokenized seed followed by the generated words,
        joined by single spaces (decoded through ``reverse_word_map``).
    """
    tokens = tokenizer.texts_to_sequences([seq])[0]
    # Keep the caller's max_len intact; the stop condition gets its own name.
    target_len = len(tokens) + max_len

    while len(tokens) < target_len:
        # Slice the token list itself. Slicing the one-element batch list that
        # wraps it is a no-op and leaves the truncation to pad_sequences'
        # default behaviour.
        context = pad_sequences([tokens[-context_len:]], maxlen=context_len)
        scores = model.predict(np.asarray(context).reshape(1, -1))
        # Greedy choice: highest-scoring class, shifted back to a word id.
        tokens.append(int(scores.argmax()) + 1)

    return " ".join(reverse_word_map[token] for token in tokens)

In [19]:
# Continue a seed phrase with the trained model, using gen's default length.
gen(model=model_2, seq="O presidente")



'o presidente da bbc news brasil ensaiado disse barrett a bbc co'

In [23]:
# Persist the trained model to disk with joblib.
# NOTE(review): joblib serialises via pickle, so this file should only ever be
# loaded from a trusted location - confirm the consumers of this artefact.
joblib.dump(value=model_2, filename='../models/model_gerador.joblib')

['../models/model_gerador.joblib']

In [8]:
# Persist the fitted tokenizer next to the model so that text can be encoded
# with the same word ids later.
joblib.dump(value=tokenizer, filename='../models/tokenizer_gen.joblib')

['../models/tokenizer_gen.joblib']