In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import MultiHeadAttention, LayerNormalization, Dropout, Layer
from tensorflow.keras.layers import Embedding, Input, GlobalAveragePooling1D, Dense
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential, Model
import numpy as np
import warnings
warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)

2023-10-05 19:18:54.419259: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-10-05 19:18:54.498368: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-10-05 19:18:54.499455: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
class TransformerBlock(Layer):
    """Transformer encoder block: self-attention followed by a feed-forward net.

    Each of the two sub-layers is followed by dropout, a residual connection
    and layer normalization.

    Args:
        embed_dim: Size of the token embeddings; also used as the per-head
            ``key_dim`` of the attention layer and as the output width of the
            feed-forward network.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden layer of the feed-forward network.
        rate: Dropout rate applied after the attention and feed-forward
            sub-layers.
        **kwargs: Standard Keras ``Layer`` arguments (``name``, ``dtype``, ...).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can reproduce the layer.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = Sequential(
            [
                Dense(ff_dim, activation="relu"),
                Dense(embed_dim),
            ]
        )
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to ``inputs``.

        Args:
            inputs: Tensor used as both query and value of the self-attention.
            training: Forwarded to the dropout layers. ``None`` (the default)
                lets Keras decide from the surrounding context.

        Returns:
            The layer-normalized output of the feed-forward sub-layer.
        """
        # Self-attention: the sequence attends to itself.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)  # residual + norm

        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)  # residual + norm

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config

In [4]:
class TokenAndPositionEmbedding(Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: Number of positions in the position-embedding table.
        vocab_size: Number of rows in the token-embedding table.
        embed_dim: Width of both embeddings.
        **kwargs: Standard Keras ``Layer`` arguments (``name``, ``dtype``, ...).
    """

    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can reproduce the layer.
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

        self.token_emb = Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed the token ids in ``x`` and add the position embeddings.

        Args:
            x: Tensor of token ids; its last axis is treated as the sequence
                axis.

        Returns:
            Token embeddings plus the embeddings of positions
            ``0 .. seq_len - 1`` (added by broadcasting).
        """
        # Use the actual length of the incoming sequences rather than the
        # constructor's `maxlen`.
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        position_embeddings = self.pos_emb(positions)
        token_embeddings = self.token_emb(x)
        return token_embeddings + position_embeddings

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update(
            {
                "maxlen": self.maxlen,
                "vocab_size": self.vocab_size,
                "embed_dim": self.embed_dim,
            }
        )
        return config

In [5]:
# Limits shared by the tokenization and model cells below.
vocab_size = 20000  # Only consider the top 20k words
maxlen = 200  # Only consider the first 200 words of each movie review

# Load the IMDB reviews, restricted to the `vocab_size` most frequent words.
imdb_train, imdb_val = imdb.load_data(num_words=vocab_size)
x_train, y_train = imdb_train
x_val, y_val = imdb_val

# Report the size of each split.
for _split, _caption in ((x_train, "Training sequences"), (x_val, "Validation sequences")):
    print(len(_split), _caption)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
25000 Training sequences
25000 Validation sequences


In [6]:
# Pad/truncate every sequence of both splits to exactly `maxlen` tokens.
x_train, x_val = (
    tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=maxlen)
    for sequences in (x_train, x_val)
)

In [8]:
# Load the spam e-mail dataset
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv('spam_email.csv')

# The CSV stores its columns as "Text" and "CB_Label", while the following
# cells address them as "text" and "label". Normalise the names right after
# loading so the notebook also works from a fresh kernel. Columns that are
# already named correctly are left untouched by rename().
df = df.rename(columns={'Text': 'text', 'CB_Label': 'label'})

# Fail early, with a readable message, if the file does not have the schema
# the rest of the notebook relies on.
missing_columns = {'text', 'label'} - set(df.columns)
if missing_columns:
    raise KeyError(f"spam_email.csv is missing expected columns: {sorted(missing_columns)}")

print(df.head())

                                                Text  CB_Label
0  Subject: stock promo mover : cwtd\n * * * urge...         1
1  Subject: are you listed in major search engine...         1
2  Subject: important information thu , 30 jun 20...         1
3  Subject: = ? utf - 8 ? q ? bask your life with...         1
4  Subject: " bidstogo " is places to go , things...         1


In [9]:
# Tokenize the e-mails, keeping only the `vocab_size` most frequent words.
# NOTE: the tokenizer is fitted on the whole dataset, i.e. before the
# train/validation split below, so validation texts influence the vocabulary.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(df['text'])

# Convert every e-mail to a fixed-length sequence of word indices
all_sequences = tokenizer.texts_to_sequences(df['text'])
all_padded = tf.keras.preprocessing.sequence.pad_sequences(all_sequences, maxlen=maxlen)

# Obtain the corresponding labels
all_labels = df['label'].values

# Split the data into training and validation sets. A fixed random_state keeps
# the split (and therefore the reported metrics) reproducible across runs.
x_train, x_val, y_train, y_val = train_test_split(
    all_padded, all_labels, test_size=0.2, random_state=42
)

print(len(x_train), "Training sequences")
print(len(x_val), "Validation sequences")


8000 Training sequences
2000 Validation sequences


In [10]:
# Model hyper-parameters
embed_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

# Encoder: token ids -> token + position embeddings -> one transformer block
inputs = Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)

# Classification head: pool over the sequence, then a small dense network
for head_layer in (
    GlobalAveragePooling1D(),
    Dropout(0.1),
    Dense(20, activation="relu"),
    Dropout(0.1),
):
    x = head_layer(x)

# Two-way softmax over the classes
outputs = Dense(2, activation="softmax")(x)

model = Model(inputs=inputs, outputs=outputs)

In [11]:
# Integer class labels + softmax output -> sparse categorical cross-entropy
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# Train for 10 epochs, evaluating on the validation split after each epoch
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=64,
    epochs=10,
    validation_data=(x_val, y_val),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [12]:
# Write the trained model's weights to "predict_class.h5". Only the weights are
# stored by this call, so the model-building cell has to be run again before
# they can be restored with model.load_weights().
model.save_weights("predict_class.h5")

In [13]:
# Score the trained model on the validation split
results = model.evaluate(x_val, y_val, verbose=2)

# Report each metric next to its name, rounded to three decimals
for metric_pair in zip(model.metrics_names, results):
    print("%s: %.3f" % metric_pair)

63/63 - 2s - loss: 0.0722 - accuracy: 0.9845 - 2s/epoch - 26ms/step
loss: 0.072
accuracy: 0.984


In [14]:
# Ask the user for a sentence to classify
user_input = input("Enter a sentence: ")

# Tokenize the user input with the tokenizer fitted on the training corpus
x_test = tokenizer.texts_to_sequences([user_input])

# Words outside the tokenizer's vocabulary are dropped. If nothing is left the
# model only sees padding, so tell the user that the result is not meaningful.
if not x_test[0]:
    print("Warning: no known words in the input; the prediction is based on padding only.")

# Pad the user input to the length the model expects
x_test = tf.keras.preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen)

# Predict the class probabilities; `prediction` has one row per input sentence
prediction = model.predict(x_test)

# Print the raw probabilities
print(prediction)

# Pick the most likely class of the (single) input row. Taking the argmax
# along the class axis stays correct even if more rows are ever predicted at
# once, unlike an argmax over the flattened array.
predicted_class = int(np.argmax(prediction, axis=-1)[0])

# Print out the prediction in human readable form
print("The sentence is", "spam" if predicted_class == 1 else "not spam")


[[7.875095e-04 9.992125e-01]]
The sentence is spam


In [15]:
# Get the word counts for each word in the corpus
word_counts = tokenizer.word_counts

# Sort the words by frequency, most frequent first
sorted_word_counts = sorted(word_counts.items(), key=lambda item: item[1], reverse=True)

# Collect the 100 most frequent and the 100 least frequent words
top_words = [word for word, _ in sorted_word_counts[:100]]
least_words = [word for word, _ in sorted_word_counts[-100:]]

# Print both word lists
print(top_words)
print(least_words)

# Print how many distinct words the tokenizer recorded
print(len(tokenizer.word_counts))
print(len(tokenizer.word_index))

# Print the frequency of the 10 most common words
for word, count in sorted_word_counts[:10]:
    print(word, count)

# Print the frequency of the 10 words ranked just past position `vocab_size`,
# i.e. the first ones beyond the `num_words` limit given to the tokenizer
for word, count in sorted_word_counts[vocab_size:vocab_size + 10]:
    print(word, count)

# Tokenize a sample sentence to see which of its words are mapped to an index
text = "for simpiicity is a test sentence containing words that simpiicity  be in the tokenizer's vocabulary."
tokenized_text = tokenizer.texts_to_sequences([text])
print(tokenized_text)
 

['the', 'to', 'and', 'of', 'a', 'in', "'", 'for', 'you', 'is', 'enron', 'this', 'on', 'that', 'i', 's', 'with', 'subject', 'be', 'your', 'we', 'as', 'it', 'from', 'have', 'will', 'are', 'ect', 'or', 'at', 'by', 'not', 'com', 'our', 'company', '1', 'if', 'all', 'an', '2', 'has', 'please', '3', 'can', 'hou', 'was', '2001', 'any', 'e', 'me', 'would', 'its', 'new', 'more', 'no', '10', '2000', 'am', 'my', '5', 't', 'but', 'information', 'may', 'said', 're', 'which', '00', 'do', 'about', 'they', 'business', 'energy', 'time', 'been', 'up', 'one', 'gas', '4', 'out', 'us', 'here', 'http', 'get', '0', '01', 'he', '000', 'these', 'their', 'message', 'pm', 'email', 'know', 'cc', 'there', '11', 'price', 'now', 'also']
['ckh', 'ukh', '9890', 'milhalik', 'reapproach', 'dewatering', 'cloaking', 'guise', '412219', '454057', 'mousemillions', 'mousemaniacs', 'mydomain', '1434', '9277', '1221927', 'whitselk', 'perfoming', 'despatch', 'disconnects', 'illusions', 'formalize', 'mispelling', '60217', '286715'