In [1]:
import os

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
plt.style.use('ggplot')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.wrappers.scikit_learn import KerasClassifier
from keras.callbacks import ModelCheckpoint, EarlyStopping

Using TensorFlow backend.


In [2]:
# Location of the clickbait headlines CSV. Set the CLICKBAIT_DATA_PATH
# environment variable to point at a local copy of the dataset; when it is
# unset, the original author's path is used so existing setups keep working.
FILE_PATH = os.environ.get(
    'CLICKBAIT_DATA_PATH',
    '/run/media/sphericalkat/archive/datasets/clickbait_dataset/clickbait_data.csv',
)
data = pd.read_csv(FILE_PATH)
# Preview the first rows to confirm the `headline` / `clickbait` columns loaded.
data.head()

Unnamed: 0,headline,clickbait
0,Should I Get Bings,1
1,Which TV Female Friend Group Do You Belong In,1
2,"The New ""Star Wars: The Force Awakens"" Trailer...",1
3,"This Vine Of New York On ""Celebrity Big Brothe...",1
4,A Couple Did A Stunning Photo Shoot With Their...,1


In [3]:
# Fixed seed so the train/test partition is identical on every run; without it
# each kernel restart produced a different split and non-comparable metrics.
RANDOM_STATE = 42

# Raw headline strings and their 0/1 clickbait labels as NumPy arrays.
text = data['headline'].values
labels = data['clickbait'].values

# test_size is left at scikit-learn's default, so the split sizes are unchanged.
text_train, text_test, y_train, y_test = train_test_split(
    text, labels, random_state=RANDOM_STATE
)
print(text_train.shape, text_test.shape, y_train.shape, y_test.shape)

(24000,) (8000,) (24000,) (8000,)


In [4]:
vocab_size = 5000     # cap passed to the Tokenizer as num_words
# NOTE(review): 500 tokens looks far longer than the headlines previewed above;
# a smaller value would reduce padding, but the model's input shape depends on
# it -- confirm against the data before changing.
maxlen = 500          # every sequence is padded/truncated to this length
embedding_size = 32   # width of the learned word vectors

tokenizer = Tokenizer(num_words=vocab_size)
# Fit on the training headlines only. Fitting on the full `text` array let the
# held-out headlines influence the vocabulary and word ranking, which leaks
# test-set information into preprocessing.
tokenizer.fit_on_texts(text_train)

# Convert each headline to a list of integer word indices using the
# vocabulary learned from the training split.
X_train = tokenizer.texts_to_sequences(text_train)
x_test = tokenizer.texts_to_sequences(text_test)

# Bring every sequence to exactly `maxlen` entries so they stack into a
# 2-D array the model can consume.
X_train = pad_sequences(X_train, maxlen=maxlen)
x_test = pad_sequences(x_test, maxlen=maxlen)

In [5]:
# Binary clickbait classifier: word embeddings -> a single LSTM layer ->
# one sigmoid unit. Layers are listed in the order they are applied.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_size, input_length=maxlen),
    LSTM(units=128),
    Dense(units=1, activation='sigmoid'),
])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               82432     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 242,561
Trainable params: 242,561
Non-trainable params: 0
_________________________________________________________________


In [6]:
# Halt training once val_accuracy has failed to improve by at least 1e-4
# for 3 consecutive epochs.
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=3,
    min_delta=1e-4,
    verbose=1,
)

# Write the weights (not the full model) to weights.h5, overwriting the file
# only when val_accuracy reaches a new maximum.
best_weights_checkpoint = ModelCheckpoint(
    filepath='weights.h5',
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

callbacks = [early_stopping, best_weights_checkpoint]

In [7]:
# Binary cross-entropy matches the single sigmoid output unit.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# NOTE(review): the test split is passed as validation data, so early stopping
# and checkpoint selection are driven by the same data used to judge the model.
# Consider a separate validation split if an unbiased test score is needed.
model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=64,
    validation_data=(x_test, y_test),
    callbacks=callbacks,
)



Train on 24000 samples, validate on 8000 samples
Epoch 1/20

Epoch 00001: val_accuracy improved from -inf to 0.97462, saving model to weights.h5
Epoch 2/20

Epoch 00002: val_accuracy improved from 0.97462 to 0.97925, saving model to weights.h5
Epoch 3/20

Epoch 00003: val_accuracy did not improve from 0.97925
Epoch 4/20

Epoch 00004: val_accuracy did not improve from 0.97925
Epoch 5/20

Epoch 00005: val_accuracy did not improve from 0.97925
Epoch 6/20

Epoch 00006: val_accuracy did not improve from 0.97925
Epoch 7/20

Epoch 00007: val_accuracy did not improve from 0.97925
Epoch 00007: early stopping


<keras.callbacks.callbacks.History at 0x7f3b1cc99af0>

In [10]:
# Restore the checkpointed weights from weights.h5 before exporting, so the
# saved model uses them rather than the weights left in memory after training.
model.load_weights(filepath='weights.h5')
# NOTE(review): the '.pb' extension suggests a TensorFlow protobuf, but the
# on-disk format is chosen by the Keras version in use -- confirm what is
# actually written before relying on the extension.
model.save(filepath='model.pb')