In [0]:
%tensorflow_version 2.x

from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np
import pandas as pd
import matplotlib.pyplot as plot
import tensorflow as tf
print(tf.__version__)
print(tf.test.gpu_device_name())

from google.colab import drive
drive.mount('/content/drive')

2.2.0-rc2
/device:GPU:0
Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Dropout, Activation
from tensorflow.keras.layers import LSTM, Bidirectional 
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D
from tensorflow.keras.optimizers import Adam
from keras import metrics

Using TensorFlow backend.


In [0]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [0]:
# set random seed for numpy and tensorflow
np.random.seed(93)
tf.random.set_seed(93)

In [0]:
import sys
sys.path.append('/content/drive/My Drive/Adv_PY/Final_Project')
from util import preprocess_text, shuffle_dataset, split_data

In [0]:
filename = '/content/drive/My Drive/Adv_PY/Final_Project/data.txt'

read_file = df = pd.read_table(filename, sep='\t',header=None,names=['label','msg'])
read_file.head()

Unnamed: 0,label,msg
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [0]:
# create train, test sets
with open(filename, encoding='utf-8') as f:
  texts = f.read().splitlines()

labels = []
corpus = []
for text in texts:
    label, msg = preprocess_text(text)
    labels.append(label)
    corpus.append(msg)

train, test = split_data(corpus, labels, 0.2)
y_train = np.asarray(train[1]).astype('int32').reshape((-1,1))
y_test = np.asarray(test[1]).astype('int32').reshape((-1,1))
# Converting training and validation data into sequences

tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts(corpus)

In [0]:
y_train

array([[0],
       [0],
       [0],
       ...,
       [0],
       [1],
       [0]], dtype=int32)

In [0]:
# culculate max length

num_tokens = [len(tokens) for tokens in tokenizer.texts_to_sequences(train[0]) + tokenizer.texts_to_sequences(test[0])]
num_tokens = np.array(num_tokens)

max_tokens = np.mean(num_tokens) + 2 * np.std(num_tokens)
max_tokens = int(max_tokens)

print(max_tokens)

np.sum(num_tokens < max_tokens) / len (num_tokens)

25


0.9608898457122353

In [0]:
np.max(num_tokens)

100

In [0]:
# pad the sequence
max_tokens = 100

train_seq = sequence.pad_sequences(tokenizer.texts_to_sequences(train[0]),maxlen=max_tokens,padding='post',truncating='post')
# train_seq = np.expand_dims(train_seq,-1)
test_seq = sequence.pad_sequences(tokenizer.texts_to_sequences(test[0]),maxlen=max_tokens,padding='post',truncating='post')
# test_seq = np.expand_dims(test_seq,-1)
vocab_size=len(tokenizer.word_counts)

print(train_seq.shape)
print(test_seq.shape)

(4460, 100)
(1114, 100)


In [0]:
# define parameters in the model
epochs =20
embedding_dim = 128
unit_dim = 64
batch_size = 32


In [0]:
def bilstm_model():
    model = tf.keras.Sequential()
    model.add(Embedding(input_dim=vocab_size+1, 
                            output_dim=embedding_dim, 
                            input_length=max_tokens, 
                            name='layer_embedding'))
    model.add(Bidirectional(LSTM(units=unit_dim, return_sequences=True)))
    model.add(Dropout(0.3))
    model.add(Dense(vocab_size, activation='sigmoid'))
    model.add(Dense(1,activation='sigmoid'))
    model.summary()

    model.compile(loss='binary_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])
    return model

In [0]:
def cnn_model():
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size+1,
                        output_dim=embedding_dim,
                        input_length=max_tokens))
    model.add(Dropout(0.2))
    model.add(Conv1D(256,
                     3,
                     padding='valid',
                     activation='relu',
                     strides=1))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(256))
    model.add(Dropout(0.2))
    model.add(Activation('relu'))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))
    model.summary()
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])
    return model

In [0]:
def fit_model(model, x, y):
    model.fit(x, y, batch_size=batch_size, epochs=epochs, verbose=1, validation_split=0.2)

In [0]:
cnn = cnn_model()
fit_model(cnn, train_seq, y_train)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 128)          978048    
_________________________________________________________________
dropout_3 (Dropout)          (None, 100, 128)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 98, 256)           98560     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 256)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 256)               65792     
_________________________________________________________________
dropout_4 (Dropout)          (None, 256)               0         
_________________________________________________________________
activation_2 (Activation)    (None, 256)              

In [0]:
score = cnn.evaluate(test_seq, y_test)
score



[0.056553225964307785, 0.9883303642272949]

In [0]:
bilstm = bilstm_model()
fit_model(bilstm, train_seq, y_train)

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
layer_embedding (Embedding)  (None, 100, 128)          978048    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 128)          98816     
_________________________________________________________________
dropout_5 (Dropout)          (None, 100, 128)          0         
_________________________________________________________________
dense_6 (Dense)              (None, 100, 7640)         985560    
_________________________________________________________________
dense_7 (Dense)              (None, 100, 1)            7641      
Total params: 2,070,065
Trainable params: 2,070,065
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20

In [0]:
score = bilstm.evaluate(test_seq, y_test)
score



[0.10231638699769974, 0.9910771250724792]