In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load BERT Preprocessing & Encoder
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")

In [8]:
# Sample Data
df = pd.read_csv('spam.csv')

In [9]:
df['Category'].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [10]:
df_spam = df[df['Category']=='spam']
df_spam.shape

(747, 2)

In [11]:
df_ham = df[df['Category']=='ham']
df_ham.shape

(4825, 2)

In [12]:
df_ham_downsampled = df_ham.sample(df_spam.shape[0])
df_ham_downsampled.shape

(747, 2)

In [13]:
df_balanced = pd.concat([df_ham_downsampled, df_spam])
df_balanced.shape

(1494, 2)

In [14]:
df_balanced['Category'].value_counts()

Category
ham     747
spam    747
Name: count, dtype: int64

In [15]:
df_balanced['spam']=df_balanced['Category'].apply(lambda x: 1 if x=='spam' else 0)
df_balanced.sample(5)

Unnamed: 0,Category,Message,spam
4264,ham,Den only weekdays got special price... Haiz......,0
5326,ham,What makes you most happy?,0
650,spam,"You have won ?1,000 cash or a ?2,000 prize! To...",1
4026,ham,&lt;#&gt; in mca. But not conform.,0
2019,ham,Ya it came a while ago,0


In [17]:
# Split Data
X_train, X_test, y_train, y_test = train_test_split(df_balanced["Message"], df_balanced["spam"], 
                                                    stratify=df_balanced["spam"], train_size=0.2, random_state=42)


In [18]:
# Define BERT Model
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')  # Expecting string inputs
preprocessed_text = bert_preprocess(text_input)  # Preprocess inside the model
outputs = bert_encoder(preprocessed_text)  # Pass through BERT encoder

In [19]:
# Neural Network Layers
l = tf.keras.layers.Dropout(0.1, name="dropout")(outputs['pooled_output'])
l = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(l)


In [20]:
# Construct Model
model = tf.keras.Model(inputs=[text_input], outputs=[l])

In [21]:
# Compile Model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [22]:
# Train Model (DO NOT preprocess before training)
history = model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [23]:
# Evaluate Model
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy:.4f}")


Test Accuracy: 0.9005


In [25]:
y_predicted = model.predict(X_test)
y_predicted = y_predicted.flatten()



In [26]:
import numpy as np

y_predicted = np.where(y_predicted > 0.5, 1, 0)
y_predicted

array([0, 0, 0, ..., 1, 1, 0])

In [27]:
from sklearn.metrics import confusion_matrix, classification_report

cm = confusion_matrix(y_test, y_predicted)
cm 

array([[518,  80],
       [ 39, 559]], dtype=int64)

In [28]:
# Make Predictions
sample_text = ["This is a test message"]
prediction = model.predict(sample_text)
print("Prediction:", prediction)

Prediction: [[0.1502649]]
