In [61]:
# Load, explore and plot data
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
%matplotlib inline
# Train test split
from sklearn.model_selection import train_test_split
# Text pre-processing
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
# Modeling
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU, Dense, Embedding, Dropout, GlobalAveragePooling1D, Flatten, SpatialDropout1D, Bidirectional

In [62]:
df = pd.read_csv('gcp_data.csv')
# rename the columns
df = df[['Abstract','Article Classification']]
df.rename(columns={'Abstract':'message', 'Article Classification':'label'}, inplace=True)
df.head()

Unnamed: 0,message,label
0,The aim of this paper is to critically re-appr...,1
1,This study aims to introduce compound glycyrrh...,1
2,<b>Objectives:</b> This study aimed to assess ...,1
3,Though triazole antifungals are the first choi...,1
4,The cultivation of true morels (<i>Morchella</...,0


In [63]:
df.describe()

Unnamed: 0,label
count,5434.0
mean,0.466139
std,0.498898
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [64]:
df.groupby('label').describe().T

Unnamed: 0,label,0,1
message,count,2901,2533
message,unique,2900,2531
message,top,Iron supplementation previously demonstrated a...,Self-inflicted violence is a major and growing...
message,freq,2,2


In [65]:
x_train, x_test, y_train, y_test = train_test_split(df['message'],df['label'], test_size=0.2, random_state=434)

In [155]:
# Defining pre-processing parameters
max_len = 700 
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>' # out of vocabulary token
vocab_size = 3000

In [156]:
tokenizer = Tokenizer(num_words = vocab_size, 
                      char_level = False,
                      oov_token = oov_tok)
tokenizer.fit_on_texts(x_train)

In [167]:
import pickle

# Save the tokenizer
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [157]:
# Get the word_index
word_index = tokenizer.word_index
total_words = len(word_index)
total_words

33920

In [158]:
training_sequences = tokenizer.texts_to_sequences(x_train)
training_padded = pad_sequences(training_sequences,
                                maxlen = max_len,
                                padding = padding_type,
                                truncating = trunc_type)

In [159]:
testing_sequences = tokenizer.texts_to_sequences(x_test)
testing_padded = pad_sequences(testing_sequences,
                               maxlen = max_len,
                               padding = padding_type,
                               truncating = trunc_type)

In [160]:
print('Shape of training tensor: ', training_padded.shape)
print('Shape of testing tensor: ', testing_padded.shape)

Shape of training tensor:  (4347, 700)
Shape of testing tensor:  (1087, 700)


In [161]:
# Define parameter
vocab_size = 3000 
embedding_dim = 16
drop_value = 0.1
n_dense = 24

# Define Dense Model Architecture
model = Sequential()
model.add(Embedding(vocab_size,
                    embedding_dim,
                    input_length = max_len))
model.add(GlobalAveragePooling1D())

# Add more dense layers
model.add(Dense(n_dense, activation='relu'))
model.add(Dropout(drop_value))
model.add(Dense(1, activation='sigmoid'))


In [162]:
model.summary()

Model: "sequential_24"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_24 (Embedding)    (None, 700, 16)           48000     
                                                                 
 global_average_pooling1d_16  (None, 16)               0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dense_48 (Dense)            (None, 24)                408       
                                                                 
 dropout_32 (Dropout)        (None, 24)                0         
                                                                 
 dense_49 (Dense)            (None, 1)                 25        
                                                                 
Total params: 48,433
Trainable params: 48,433
Non-trainable params: 0
_________________________________________________

In [163]:
model.compile(loss = 'binary_crossentropy', optimizer = 'adam' , metrics = ['accuracy'])

In [164]:
num_epochs = 30
early_stop = EarlyStopping(monitor='val_loss', patience=3)
history = model.fit(training_padded,
                    y_train,
                    epochs=num_epochs, 
                    validation_data=(testing_padded, y_test),
                    callbacks =[early_stop],
                    verbose=2)

Epoch 1/30


136/136 - 2s - loss: 0.6796 - accuracy: 0.6246 - val_loss: 0.6461 - val_accuracy: 0.7277 - 2s/epoch - 11ms/step
Epoch 2/30
136/136 - 1s - loss: 0.5974 - accuracy: 0.7235 - val_loss: 0.5390 - val_accuracy: 0.7443 - 819ms/epoch - 6ms/step
Epoch 3/30
136/136 - 1s - loss: 0.5204 - accuracy: 0.7467 - val_loss: 0.4723 - val_accuracy: 0.7856 - 740ms/epoch - 5ms/step
Epoch 4/30
136/136 - 1s - loss: 0.4653 - accuracy: 0.7805 - val_loss: 0.4248 - val_accuracy: 0.8123 - 718ms/epoch - 5ms/step
Epoch 5/30
136/136 - 1s - loss: 0.4262 - accuracy: 0.8038 - val_loss: 0.3909 - val_accuracy: 0.8326 - 719ms/epoch - 5ms/step
Epoch 6/30
136/136 - 1s - loss: 0.4024 - accuracy: 0.8171 - val_loss: 0.3724 - val_accuracy: 0.8344 - 695ms/epoch - 5ms/step
Epoch 7/30
136/136 - 1s - loss: 0.3856 - accuracy: 0.8226 - val_loss: 0.3598 - val_accuracy: 0.8427 - 777ms/epoch - 6ms/step
Epoch 8/30
136/136 - 1s - loss: 0.3710 - accuracy: 0.8325 - val_loss: 0.3522 - val_accuracy: 0.8408 - 739ms/epoch - 5ms/step
Epoch 9/30
13

In [131]:
model.evaluate(testing_padded, y_test)




[0.3596648573875427, 0.8500459790229797]

In [165]:
model.save('ANN')

INFO:tensorflow:Assets written to: ANN\assets


INFO:tensorflow:Assets written to: ANN\assets


# LSTM

In [109]:
# Define parameter
n_lstm = 128
drop_lstm = 0.2
# Define LSTM Model 
model1 = Sequential()
model1.add(Embedding(vocab_size, embedding_dim, input_length=max_len))
model1.add(SpatialDropout1D(drop_lstm))
model1.add(LSTM(n_lstm, return_sequences=False))
model1.add(Dropout(drop_lstm))
model1.add(Dense(1, activation='sigmoid'))

In [110]:
model1.summary()

Model: "sequential_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_14 (Embedding)    (None, 500, 16)           48000     
                                                                 
 spatial_dropout1d_3 (Spatia  (None, 500, 16)          0         
 lDropout1D)                                                     
                                                                 
 lstm_5 (LSTM)               (None, 128)               74240     
                                                                 
 dropout_14 (Dropout)        (None, 128)               0         
                                                                 
 dense_22 (Dense)            (None, 1)                 129       
                                                                 
Total params: 122,369
Trainable params: 122,369
Non-trainable params: 0
_______________________________________________

In [111]:
model1.compile(loss = 'binary_crossentropy',
               optimizer = 'adam',
               metrics = ['accuracy'])

In [113]:
num_epochs = 30
early_stop = EarlyStopping(monitor='val_loss', patience=2)
history = model1.fit(training_padded,
                     y_train,
                     epochs=num_epochs, 
                     validation_data=(testing_padded, y_test),
                     #callbacks =[early_stop],
                     verbose=2)

Epoch 1/30
136/136 - 8s - loss: 0.6912 - accuracy: 0.5282 - val_loss: 0.6901 - val_accuracy: 0.5409 - 8s/epoch - 57ms/step
Epoch 2/30
136/136 - 8s - loss: 0.6917 - accuracy: 0.5321 - val_loss: 0.6901 - val_accuracy: 0.5409 - 8s/epoch - 55ms/step
Epoch 3/30
136/136 - 8s - loss: 0.6915 - accuracy: 0.5321 - val_loss: 0.6899 - val_accuracy: 0.5409 - 8s/epoch - 55ms/step
Epoch 4/30
136/136 - 7s - loss: 0.6915 - accuracy: 0.5321 - val_loss: 0.6903 - val_accuracy: 0.5409 - 7s/epoch - 55ms/step
Epoch 5/30
136/136 - 7s - loss: 0.6913 - accuracy: 0.5319 - val_loss: 0.6899 - val_accuracy: 0.5409 - 7s/epoch - 54ms/step
Epoch 6/30
136/136 - 7s - loss: 0.6914 - accuracy: 0.5321 - val_loss: 0.6900 - val_accuracy: 0.5409 - 7s/epoch - 55ms/step
Epoch 7/30
136/136 - 7s - loss: 0.6912 - accuracy: 0.5323 - val_loss: 0.6899 - val_accuracy: 0.5409 - 7s/epoch - 55ms/step
Epoch 8/30
136/136 - 7s - loss: 0.6911 - accuracy: 0.5321 - val_loss: 0.6901 - val_accuracy: 0.5409 - 7s/epoch - 55ms/step
Epoch 9/30
136/1

KeyboardInterrupt: 

Bi- LSTM

In [114]:
model2 = Sequential()
model2.add(Embedding(vocab_size,
                     embedding_dim,
                     input_length = max_len))
model2.add(Bidirectional(LSTM(n_lstm,
                              return_sequences = False)))
model2.add(Dropout(drop_lstm))
model2.add(Dense(1, activation='sigmoid'))

In [115]:
model2.compile(loss = 'binary_crossentropy',
               optimizer = 'adam',
               metrics=['accuracy'])

In [116]:
num_epochs = 30
early_stop = EarlyStopping(monitor = 'val_loss',
                           patience = 2)
history = model2.fit(training_padded,
                     y_train,
                     epochs = num_epochs,
                     validation_data = (testing_padded, y_test),
                     #callbacks = [early_stop],
                     verbose = 2)

Epoch 1/30


136/136 - 18s - loss: 0.6096 - accuracy: 0.6400 - val_loss: 0.4835 - val_accuracy: 0.8151 - 18s/epoch - 130ms/step
Epoch 2/30
136/136 - 14s - loss: 0.4462 - accuracy: 0.8100 - val_loss: 0.4184 - val_accuracy: 0.8307 - 14s/epoch - 105ms/step
Epoch 3/30
136/136 - 14s - loss: 0.3802 - accuracy: 0.8468 - val_loss: 0.3898 - val_accuracy: 0.8169 - 14s/epoch - 105ms/step
Epoch 4/30
136/136 - 14s - loss: 0.3622 - accuracy: 0.8443 - val_loss: 0.3678 - val_accuracy: 0.8418 - 14s/epoch - 105ms/step
Epoch 5/30
136/136 - 14s - loss: 0.2878 - accuracy: 0.8838 - val_loss: 0.3875 - val_accuracy: 0.8436 - 14s/epoch - 106ms/step
Epoch 6/30
136/136 - 15s - loss: 0.2452 - accuracy: 0.9107 - val_loss: 0.3972 - val_accuracy: 0.8399 - 15s/epoch - 107ms/step
Epoch 7/30
136/136 - 15s - loss: 0.2706 - accuracy: 0.9029 - val_loss: 0.4280 - val_accuracy: 0.8132 - 15s/epoch - 107ms/step
Epoch 8/30
136/136 - 14s - loss: 0.2199 - accuracy: 0.9172 - val_loss: 0.4312 - val_accuracy: 0.8132 - 14s/epoch - 106ms/step
Epo

KeyboardInterrupt: 

In [50]:
new_model = tf.keras.models.load_model('my_model.keras')



INFO:tensorflow:Assets written to: biLSTM\assets


INFO:tensorflow:Assets written to: biLSTM\assets


In [51]:
model3 = Sequential()
model3.add(Embedding(vocab_size,
                     embedding_dim,
                     input_length = max_len))
model3.add(SpatialDropout1D(0.2))
model3.add(GRU(128, return_sequences = False))
model3.add(Dropout(0.2))
model3.add(Dense(1, activation = 'sigmoid'))

In [52]:
model3.compile(loss = 'binary_crossentropy',
                       optimizer = 'adam',
                       metrics=['accuracy'])

In [53]:
num_epochs = 30
early_stop = EarlyStopping(monitor='val_loss', patience=2)
history = model3.fit(training_padded,
                     y_train,
                     epochs=num_epochs, 
                     validation_data=(testing_padded, y_test),
                     callbacks =[early_stop],
                     verbose=2)

Epoch 1/30


136/136 - 4s - loss: 0.6260 - accuracy: 0.6865 - val_loss: 0.5726 - val_accuracy: 0.7111 - 4s/epoch - 29ms/step
Epoch 2/30
136/136 - 2s - loss: 0.5872 - accuracy: 0.7081 - val_loss: 0.5259 - val_accuracy: 0.7443 - 2s/epoch - 14ms/step
Epoch 3/30
136/136 - 2s - loss: 0.5618 - accuracy: 0.7187 - val_loss: 0.4522 - val_accuracy: 0.7820 - 2s/epoch - 14ms/step
Epoch 4/30
136/136 - 2s - loss: 0.4634 - accuracy: 0.7867 - val_loss: 0.4361 - val_accuracy: 0.7994 - 2s/epoch - 14ms/step
Epoch 5/30
136/136 - 2s - loss: 0.4299 - accuracy: 0.8116 - val_loss: 0.4307 - val_accuracy: 0.8086 - 2s/epoch - 14ms/step
Epoch 6/30
136/136 - 2s - loss: 0.4014 - accuracy: 0.8171 - val_loss: 0.4223 - val_accuracy: 0.8040 - 2s/epoch - 15ms/step
Epoch 7/30
136/136 - 2s - loss: 0.3962 - accuracy: 0.8249 - val_loss: 0.4137 - val_accuracy: 0.8160 - 2s/epoch - 15ms/step
Epoch 8/30
136/136 - 2s - loss: 0.3839 - accuracy: 0.8337 - val_loss: 0.4144 - val_accuracy: 0.8151 - 2s/epoch - 15ms/step
Epoch 9/30
136/136 - 2s - l

In [166]:
predict_msg = ["""Pseudomonas aeruginosa is one of the most common pathogens that lead to fatal human infection. This Gram-negative pathogen has evolved complex drug resistance, which poses significant challenges to the current antibiotic-dependent healthcare system. New therapeutic approaches are urgently required to treat infections caused by P. aeruginosa."""]
def predict_spam(predict_msg):
  new_seq = tokenizer.texts_to_sequences(predict_msg)
  padded = pad_sequences(new_seq,
                         maxlen = max_len,
                         padding = padding_type,
                         truncating = trunc_type)
  return(model.predict(padded))
predict_spam(predict_msg)

array([[0.16314809]], dtype=float32)

# Attention 