In [None]:
from keras.layers.core import Activation, Dense, Dropout, SpatialDropout1D
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
import collections
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
from keras.utils.np_utils import to_categorical
from keras_tqdm import TQDMNotebookCallback
#nltk.download('punkt')
#nltk.download('stopwords')
from tensorflow.python.client import device_lib
# Print the compute devices TensorFlow reports for this machine.
local_devices = device_lib.list_local_devices()
print(local_devices)

In [None]:
file = "../input/news-aggregator-dataset/uci-news-aggregator.csv"

# Only the headline text and its category label are read from the CSV.
data = pd.read_csv(file, usecols=["CATEGORY", "TITLE"])
# Convert the string category into integer codes.
data["CATEGORY"] = pd.Categorical(data["CATEGORY"]).codes

ROWS = 8000
ROWS_PER_CATEGORY = ROWS // 4

# Select 8000 rows balanced across the four categories (codes 0-3), taking the
# first ROWS_PER_CATEGORY rows of each. pd.concat is used instead of chained
# DataFrame.append calls, which were removed in pandas 2.0; the resulting row
# order (category 0 block, then 1, 2, 3) is the same.
data = pd.concat(
    [
        data[data["CATEGORY"] == category_code].head(ROWS_PER_CATEGORY)
        for category_code in range(4)
    ]
)

maxlen = 0                          # longest title, in tokens (stop words included)
word_freqs = collections.Counter()  # stemmed word -> number of occurrences
num_recs = 0                        # number of titles processed
stop_words = set(stopwords.words('english'))
snow_stem = nltk.stem.SnowballStemmer('english')

for sentence in data["TITLE"]:
    words = nltk.word_tokenize(sentence.lower())
    maxlen = max(maxlen, len(words))
    # Stop words are dropped before stemming; only the stems are counted.
    word_freqs.update(
        snow_stem.stem(word) for word in words if word not in stop_words
    )
    num_recs += 1

print("maxlen :", maxlen)
print("len(word_freqs) :", len(word_freqs))

In [None]:
MAX_VOCAB = 2000
MAX_TITLE_LENGTH = 20

# Index 0 is reserved for padding: pad_sequences fills with 0 by default, so
# giving UNK the same index would make unknown words indistinguishable from
# padding. UNK therefore gets index 1 and real words start at index 2.
PAD_INDEX = 0
UNK_INDEX = 1

# Plus 2 for the padding and UNK slots.
vocab_size = min(MAX_VOCAB, len(word_freqs)) + 2
# The most frequent stem gets index 2, the next one index 3, and so on.
word2index = {
    word: rank + 2
    for rank, (word, _) in enumerate(word_freqs.most_common(MAX_VOCAB))
}
word2index["UNK"] = UNK_INDEX

In [None]:
# Encode every title as a list of vocabulary indices, applying the same
# lower-casing, tokenisation, stop-word removal and stemming steps here.
X = []

for sentence in data["TITLE"]:
    seqs = []
    for word in nltk.word_tokenize(sentence.lower()):
        if word in stop_words:
            continue
        # Stems that are not in the vocabulary fall back to the UNK index.
        seqs.append(word2index.get(snow_stem.stem(word), word2index["UNK"]))
    X.append(seqs)

# Pad or truncate every sequence to exactly MAX_TITLE_LENGTH entries.
X = sequence.pad_sequences(X, maxlen=MAX_TITLE_LENGTH)
# One-hot encode the integer category codes.
y = to_categorical(data["CATEGORY"].tolist())

In [None]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split
# identical between runs.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Hyperparameters
EMBEDDING_SIZE = 128
HIDDEN_LAYER_SIZE = 256
BATCH_SIZE = 32
NUM_EPOCH = 5

# Network: embedding -> LSTM -> 4-unit dense layer -> softmax.
model = Sequential(
    [
        Embedding(vocab_size, EMBEDDING_SIZE, input_length=MAX_TITLE_LENGTH),
        LSTM(HIDDEN_LAYER_SIZE, dropout=0.4, recurrent_dropout=0.2),
        Dense(4),
        Activation("softmax"),
    ]
)

# Note: this is a second name for the same object, not a copy.
model_adam = model

model.summary()

model_adam.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [None]:
# Train the model; the held-out split is passed as validation data and a
# notebook progress-bar callback is attached.
fit_options = dict(
    batch_size=BATCH_SIZE,
    epochs=NUM_EPOCH,
    validation_data=(Xtest, ytest),
    verbose=2,
    callbacks=[TQDMNotebookCallback()],
)
history_adam = model_adam.fit(Xtrain, ytrain, **fit_options)

In [None]:
# Plot training vs. validation accuracy (top) and loss (bottom) per epoch.
fig, (ax_accuracy, ax_loss) = plt.subplots(2, 1)

ax_accuracy.set_title("Accuracy")
ax_accuracy.plot(history_adam.history["accuracy"], color="g", label="Train")
ax_accuracy.plot(history_adam.history["val_accuracy"], color="b", label="Validation")
ax_accuracy.legend(loc="best")

ax_loss.set_title("Loss")
ax_loss.plot(history_adam.history["loss"], color="g", label="Train")
ax_loss.plot(history_adam.history["val_loss"], color="b", label="Validation")
ax_loss.set_xlabel("Epoch")
ax_loss.legend(loc="best")

# tight_layout must run before savefig; otherwise the file on disk is written
# with the un-adjusted layout and differs from the figure shown below.
fig.tight_layout()
fig.savefig("grafico")
plt.show()

In [None]:
# Evaluate on the held-out split; the two returned values are unpacked and
# reported as the test score and the accuracy.
test_score, test_accuracy = model.evaluate(
    Xtest,
    ytest,
    batch_size=BATCH_SIZE,
    verbose=0,
)
print("Test score: %.3f, accuracy: %.3f" % (test_score, test_accuracy))