In [1]:
import nltk
import sys
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, GRU,Conv1D,MaxPooling1D
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
from keras.models import load_model
from keras.utils import np_utils

Using TensorFlow backend.


In [2]:
# Load the train/test CSV splits from the local data directory.
train, test = [pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv')]
# Row counts are reported before incomplete rows are removed.
print("train = ",len(train))
print("test = ",len(test))
# Drop every row that contains a missing value and renumber the index from 0.
train, test = [frame.dropna().reset_index(drop=True) for frame in (train, test)]

train =  1618
test =  100


In [4]:
# Integer class ids taken from the `label` column of each split.
y_train = train['label'].values
y_test = test['label'].values
# Derive the class count once from both splits so the two one-hot matrices always
# have the same width, even if one split happens to lack the highest label.
num_classes = int(max(y_train.max(), y_test.max())) + 1
# One-hot encode the labels to match the softmax output of the network.
y_train, y_test = [np_utils.to_categorical(x, num_classes=num_classes) for x in [y_train, y_test]]

In [5]:
# Raw text column of each split.
X_train = train["text"]
X_test = test["text"]
# Cap passed to the tokenizer as `num_words`.
max_features = 200
# char_level=True makes every character its own token.
tokenizer = Tokenizer(num_words=max_features,char_level=True)
# Build the vocabulary from the training texts only, so nothing about the held-out
# test split influences preprocessing. No oov_token is configured, so characters
# that occur only in the test texts are skipped when the texts are encoded.
tokenizer.fit_on_texts(list(X_train))

In [6]:
# Encode each text as a list of integer token ids using the fitted tokenizer.
# Note: the names X_train / X_test are rebound from raw texts to id sequences here.
X_train, X_test = [tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)]

In [7]:
# Fixed sequence length fed to the network (also read by buid_model for the Input shape).
maxlen = 500
# Bring every id sequence to exactly `maxlen` entries.
x_train, x_test = [pad_sequences(seqs, maxlen=maxlen) for seqs in (X_train, X_test)]

In [8]:
# Size of each embedding vector; passed as the second argument to Embedding in buid_model.
embed_size = 240

In [9]:
def buid_model():
    """Build and compile the Conv1D + bidirectional-GRU text classifier.

    Reads the notebook-level names ``maxlen`` (input sequence length),
    ``tokenizer`` (its ``word_index`` sizes the embedding table) and
    ``embed_size`` (embedding width).

    Returns:
        A compiled Keras ``Model`` that maps integer sequences of length
        ``maxlen`` to a 2-way softmax distribution.
    """
    inp = Input(shape=(maxlen, ))
    # One embedding row per tokenizer index, plus one extra row for index 0.
    x = Embedding(len(tokenizer.word_index)+1, embed_size)(inp)
    # Convolution with 'same' padding keeps the sequence length; pooling then shrinks it 4x.
    x = Conv1D(filters=100,kernel_size=4,padding='same', activation='relu')(x)
    x = MaxPooling1D(pool_size=4)(x)
    # The wrapped layer is a GRU even though its name says 'lstm_layer'; the name is kept unchanged.
    x = Bidirectional(GRU(60, return_sequences=True,name='lstm_layer',dropout=0.2,recurrent_dropout=0.2))(x)
    # Collapse the time axis by taking the maximum of each feature.
    x = GlobalMaxPool1D()(x)
    x = Dense(50, activation="relu")(x)
    x = Dense(2, activation="softmax")(x)
    model = Model(inputs=inp, outputs=x)
    # A softmax output trained against one-hot targets pairs with categorical
    # cross-entropy; binary cross-entropy is meant for independent sigmoid outputs.
    model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
    return model

In [10]:
# Instantiate the compiled network and print its layer-by-layer summary.
model = buid_model()
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 500)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 500, 240)          10320     
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 500, 100)          96100     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 125, 100)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 125, 120)          57960     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 120)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 50)                6050

In [11]:
# Keep 95% of the padded training data for fitting and hold out the rest for
# validation; the fixed random_state makes the split repeatable.
X_tra, X_val, y_tra, y_val = train_test_split(
    x_train,
    y_train,
    train_size=0.95,
    random_state=233,
)

In [12]:
# Checkpoint callback: with save_best_only=True the file at f_path is only
# overwritten when the monitored quantity improves (monitor left at its default).
f_path = 'model.h5'
msave = ModelCheckpoint(filepath=f_path, save_best_only=True)

In [13]:
# Fit the network for a fixed number of epochs, validating on the held-out slice
# after each one; msave is the only callback attached.
epochs, batch_size = 50, 8
training = model.fit(
    X_tra, y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=[msave],
    verbose=1,
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1532 samples, validate on 81 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [16]:
# Score the in-memory model on the test split; evaluate returns the loss followed
# by the compiled metric (accuracy).
# NOTE(review): the checkpoint written to 'model.h5' is not reloaded in the visible
# cells, so this presumably scores the last-epoch weights - confirm that is intended.
score, acc = model.evaluate(x=x_test, y=y_test, batch_size=32)
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 0.10160845609381795
Test accuracy: 0.9800000190734863


In [17]:
# Class probabilities for every test row (one softmax row per sample), predicted
# one sample per batch with a progress bar.
preds = model.predict(x=x_test, batch_size=1, verbose=1)
# Bare expression so the notebook displays the array.
preds



array([[0.00906363, 0.99093634],
       [0.03898832, 0.96101165],
       [0.01690998, 0.98309004],
       [0.13594602, 0.864054  ],
       [0.00829784, 0.99170214],
       [0.0110754 , 0.98892456],
       [0.01729899, 0.982701  ],
       [0.0229989 , 0.97700113],
       [0.02019827, 0.9798018 ],
       [0.01454751, 0.9854525 ],
       [0.00756363, 0.99243635],
       [0.9365979 , 0.06340208],
       [0.01160314, 0.9883968 ],
       [0.01114671, 0.98885334],
       [0.0148629 , 0.9851371 ],
       [0.01334803, 0.986652  ],
       [0.8895358 , 0.11046417],
       [0.01564953, 0.98435044],
       [0.0184247 , 0.98157525],
       [0.9986009 , 0.00139915],
       [0.01048407, 0.9895159 ],
       [0.998355  , 0.001645  ],
       [0.9386358 , 0.06136423],
       [0.01763223, 0.98236775],
       [0.03987497, 0.9601251 ],
       [0.9876324 , 0.01236758],
       [0.06929448, 0.93070555],
       [0.01820971, 0.9817903 ],
       [0.9983956 , 0.00160437],
       [0.01509453, 0.9849054 ],
       [0.

In [18]:
# Collapse the one-hot targets and the softmax outputs to integer class ids with
# one vectorised argmax each instead of a per-row Python loop.
# Only the first len(preds) targets are used, i.e. one target per prediction row.
y = np.argmax(y_test[:len(preds)], axis=1).tolist()
y_pred = np.argmax(preds, axis=1).tolist()

In [20]:
from sklearn import metrics

# Headline scores; precision, recall and F1 use the 'weighted' average over the classes.
print('accuracy::', metrics.accuracy_score(y, y_pred))
print('precision::', metrics.precision_score(y, y_pred, average='weighted'))
print('recall::', metrics.recall_score(y, y_pred, average='weighted'))
print('f_score::', metrics.f1_score(y, y_pred, average='weighted'))
# The per-class report is a multi-line table, so it gets its own label and starts
# on a fresh line to keep the column headers aligned.
print('classification_report::')
print(metrics.classification_report(y, y_pred))

accuracy:: 0.98
precision:: 0.980952380952381
recall:: 0.98
f_score:: 0.9800744109136006
f_score::               precision    recall  f1-score   support

           0       0.95      1.00      0.98        40
           1       1.00      0.97      0.98        60

    accuracy                           0.98       100
   macro avg       0.98      0.98      0.98       100
weighted avg       0.98      0.98      0.98       100

