In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras.layers import LSTM, Dense, Dropout, Embedding
from keras.models import Sequential
from keras.preprocessing.text import one_hot
from keras_preprocessing.sequence import pad_sequences

In [2]:
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

nltk.download("stopwords")

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


False

In [3]:
df = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")

In [4]:
df.fillna(" ", inplace=True)
test.fillna(" ", inplace=True)

In [5]:
df["total"] = df["title"] + " " + df["author"]
test["total"] = test["title"] + " " + test["author"]

In [6]:
X = df.drop("label", axis=1)
y = df["label"]
print(X.shape)
print(y.shape)

(20800, 5)
(20800,)


In [7]:
voc_size = 5000
msg = X.copy()
msg_test = test.copy()

In [8]:
ps = PorterStemmer()

In [9]:
corpus = []
for i in range(len(msg)):
    review = re.sub("[^a-zA-Z]", " ", msg["total"][i])
    review = review.lower()
    review = review.split()
    review = [
        ps.stem(word) for word in review if not word in stopwords.words("english")
    ]
    review = " ".join(review)
    corpus.append(review)

In [10]:
corpus_test = []
for i in range(len(msg_test)):
    review = re.sub("[^a-zA-Z]", " ", msg_test["total"][i])
    review = review.lower()
    review = review.split()
    review = [
        ps.stem(word) for word in review if not word in stopwords.words("english")
    ]
    review = " ".join(review)
    corpus_test.append(review)

In [11]:
onehot_rep = [one_hot(words, voc_size) for words in corpus]
onehot_rep_test = [one_hot(words, voc_size) for words in corpus_test]

In [12]:
embedded_docs = pad_sequences(onehot_rep, padding="pre", maxlen=25)
embedded_docs_test = pad_sequences(onehot_rep_test, padding="pre", maxlen=25)

In [13]:
model = Sequential(
    [
        Embedding(voc_size, 40, input_length=25),
        Dropout(0.25),
        LSTM(100),
        Dropout(0.25),
        Dense(64, activation="relu"),
        Dropout(0.25),
        Dense(1, activation="sigmoid"),
    ]
)
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [14]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 25, 40)            200000    
                                                                 
 dropout (Dropout)           (None, 25, 40)            0         
                                                                 
 lstm (LSTM)                 (None, 100)               56400     
                                                                 
 dropout_1 (Dropout)         (None, 100)               0         
                                                                 
 dense (Dense)               (None, 64)                6464      
                                                                 
 dropout_2 (Dropout)         (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 6

In [15]:
X_final = np.array(embedded_docs)
y_final = np.array(y)
test_final = np.array(embedded_docs_test)
X_final.shape, y_final.shape, test_final.shape

((20800, 25), (20800,), (5200, 25))

In [16]:
model.fit(X_final, y_final, epochs=20, batch_size=64)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x22bbf842bb0>

In [17]:
model.save("model.h5")