In [16]:
import os
import pickle
import random
import re

import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from google.colab import files
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.regularizers import l2

In [17]:
!wget https://raw.githubusercontent.com/mohitgupta-omg/Kaggle-SMS-Spam-Collection-Dataset-/master/spam.csv
spam = pd.read_csv("spam.csv", encoding='ISO-8859-1')
spam.head()

--2026-02-11 03:29:14--  https://raw.githubusercontent.com/mohitgupta-omg/Kaggle-SMS-Spam-Collection-Dataset-/master/spam.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 503663 (492K) [application/octet-stream]
Saving to: ‘spam.csv.1’


2026-02-11 03:29:14 (10.2 MB/s) - ‘spam.csv.1’ saved [503663/503663]



Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [18]:
# Count missing values per column to see which columns carry real data.
spam.isna().sum()

Unnamed: 0,0
v1,0
v2,0
Unnamed: 2,5522
Unnamed: 3,5560
Unnamed: 4,5566


In [19]:
# Keep only the label (`v1`) and message text (`v2`) columns.
# `.copy()` gives `spam` its own data instead of a slice of the original frame,
# so the in-place column assignments in later cells act on an independent
# DataFrame and cannot trigger pandas' SettingWithCopy behaviour.
spam = spam[['v1', 'v2']].copy()
# NOTE: the column name 'lable' (sic) is kept as-is because later cells
# reference this exact spelling.
spam.columns = ['lable', 'msg']
spam.head()

Unnamed: 0,lable,msg
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [20]:
# Fetch the NLTK stop-word corpus (no re-download when it is already up to date).
nltk.download('stopwords')
# Stored as a set: `clean_msg` tests every token for membership, and a set
# lookup is O(1) whereas scanning the original list is O(len(list)) per token.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
# Single shared stemmer instance, reused for every message.
ps = PorterStemmer()

def clean_msg(text):
  """Normalise one raw message into a space-separated string of stemmed tokens.

  Steps, in order:
    1. every character that is not an ASCII letter is replaced by a space;
    2. the text is lower-cased and split on whitespace;
    3. tokens found in the module-level ``stop_words`` are dropped;
    4. each remaining token is stemmed with the module-level ``ps``.

  Args:
    text: the raw message string.

  Returns:
    The surviving stems joined by single spaces (an empty string when no
    token survives).
  """
  letters_only = re.sub('[^a-zA-Z]' , ' ' , text)
  stems = []
  for token in letters_only.lower().split():
    if token in stop_words:
      continue
    stems.append(ps.stem(token))
  return " ".join(stems)

In [22]:
# Run every message through `clean_msg` and write the result back into the
# `msg` column.
cleaned_msgs = spam['msg'].apply(clean_msg)
spam.loc[:, 'msg'] = cleaned_msgs
spam.head()

Unnamed: 0,lable,msg
0,ham,go jurong point crazi avail bugi n great world...
1,ham,ok lar joke wif u oni
2,spam,free entri wkli comp win fa cup final tkt st m...
3,ham,u dun say earli hor u c alreadi say
4,ham,nah think goe usf live around though


In [23]:
# Integer encoding of the two classes: ham -> 0, spam -> 1.
label_to_id = {'ham': 0, 'spam': 1}

# Only map while the column still holds the raw string labels. Re-running this
# cell after the column has been encoded would otherwise push the 0/1 values
# through the dict again and silently turn every label into NaN.
if spam['lable'].isin(list(label_to_id)).all():
    spam['lable'] = spam['lable'].map(label_to_id)

# Fail loudly if anything other than the two encoded classes is present
# (e.g. an unexpected label string, or NaN from an earlier bad run).
assert spam['lable'].isin(list(label_to_id.values())).all(), \
    "Column 'lable' contains values other than the encoded 0/1 classes"

# Plain NumPy arrays of features (cleaned text) and targets (0/1).
X = spam['msg'].values
y = spam['lable'].values

print("Labels check (First 5):", y[:5])


Labels check (First 5): [0 0 1 0 0]


In [24]:
# --- Text-vectorisation settings ---
vocab_size = 10000      # passed to Tokenizer(num_words=...)
embedding_dim = 16      # width of the embedding vectors used by the model cell
max_length = 100        # every sequence is padded / truncated to this length
oov_tok = "<OOV>"       # placeholder token for out-of-vocabulary words
padding_type = 'post'   # pad at the end of short sequences
trunc_type = 'post'     # cut from the end of long sequences

# 80/20 split, stratified on the label, with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# The vocabulary is fitted on the training texts only.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)

word_index = tokenizer.word_index
print(f"Total Unique Words found: {len(word_index)}")

# Texts -> lists of integer ids.
training_sequences = tokenizer.texts_to_sequences(X_train)
testing_sequences = tokenizer.texts_to_sequences(X_test)

# Integer lists -> fixed-length arrays; both splits share the same settings.
pad_kwargs = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)
training_padded = np.array(pad_sequences(training_sequences, **pad_kwargs))
testing_padded = np.array(pad_sequences(testing_sequences, **pad_kwargs))

training_labels = np.array(y_train)
testing_labels = np.array(y_test)

print("\n--- Data Ready for Model ---")
print(f"Training Data Shape: {training_padded.shape}")
print(f"Testing Data Shape: {testing_padded.shape}")

Total Unique Words found: 5582

--- Data Ready for Model ---
Training Data Shape: (4457, 100)
Testing Data Shape: (1115, 100)


In [25]:
def set_seed(seed=42):
    """Seed the random number generators used by this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and
    TensorFlow's global generator with the same value, and sets the
    ``TF_DETERMINISTIC_OPS`` environment variable to ``'1'``.

    Args:
        seed: integer seed applied to all three generators (default 42).
    """
    os.environ['TF_DETERMINISTIC_OPS'] = '1'
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

# Re-seed and drop any previous Keras graph/state before building the model.
set_seed(42)
tf.keras.backend.clear_session()

# Embedding -> bidirectional LSTM -> dropout -> single sigmoid unit
# (output is the predicted probability of label 1).
model = tf.keras.Sequential([
    # The input shape is declared with an explicit Input layer. The
    # `input_length=` argument of Embedding that was used here before is
    # deprecated in Keras 3 and only produces a warning there.
    tf.keras.Input(shape=(max_length,)),
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32, kernel_regularizer=tf.keras.regularizers.l2(0.01))),
    tf.keras.layers.Dropout(0.6),
    tf.keras.layers.Dense(1, activation='sigmoid', kernel_regularizer=tf.keras.regularizers.l2(0.01))
])

custom_optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
model.compile(loss='binary_crossentropy', optimizer=custom_optimizer, metrics=['accuracy'])

# NOTE(review): the test split doubles as validation data here, so the
# per-epoch val_* numbers and the final test metrics come from the same
# samples. Consider carving a separate validation split out of the training
# data if these curves are used to choose epochs or hyper-parameters.
history = model.fit(
    training_padded, training_labels,
    epochs=20,
    validation_data=(testing_padded, testing_labels),
    verbose=2
)

Epoch 1/20




140/140 - 15s - 108ms/step - accuracy: 0.8625 - loss: 1.1084 - val_accuracy: 0.8664 - val_loss: 0.8749
Epoch 2/20
140/140 - 11s - 78ms/step - accuracy: 0.8658 - loss: 0.8278 - val_accuracy: 0.8664 - val_loss: 0.7639
Epoch 3/20
140/140 - 9s - 67ms/step - accuracy: 0.8658 - loss: 0.7138 - val_accuracy: 0.8664 - val_loss: 0.6528
Epoch 4/20
140/140 - 11s - 76ms/step - accuracy: 0.8661 - loss: 0.6107 - val_accuracy: 0.8664 - val_loss: 0.5705
Epoch 5/20
140/140 - 11s - 76ms/step - accuracy: 0.8692 - loss: 0.5421 - val_accuracy: 0.8664 - val_loss: 0.5099
Epoch 6/20
140/140 - 11s - 76ms/step - accuracy: 0.8797 - loss: 0.4811 - val_accuracy: 0.8673 - val_loss: 0.4590
Epoch 7/20
140/140 - 11s - 76ms/step - accuracy: 0.8993 - loss: 0.4279 - val_accuracy: 0.8978 - val_loss: 0.4106
Epoch 8/20
140/140 - 10s - 72ms/step - accuracy: 0.9168 - loss: 0.3817 - val_accuracy: 0.9256 - val_loss: 0.3602
Epoch 9/20
140/140 - 10s - 70ms/step - accuracy: 0.9423 - loss: 0.3306 - val_accuracy: 0.9462 - val_loss: 0

In [26]:
# Loss and accuracy of the trained model on the test split.
loss, accuracy = model.evaluate(testing_padded, testing_labels)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Probability above which a message is classified as spam (label 1).
# Defined once so the prediction and the printed heading cannot drift apart.
SPAM_THRESHOLD = 0.32

y_pred_prob = model.predict(testing_padded)
y_pred_standard = (y_pred_prob > SPAM_THRESHOLD).astype("int32")

# `confusion_matrix` comes from the notebook's top-level import cell.
print(f"\n--- Confusion Matrix (Normal {SPAM_THRESHOLD}) ---")
cm = confusion_matrix(testing_labels, y_pred_standard)
print(cm)

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.9814 - loss: 0.1325
Test Accuracy: 98.21%
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 25ms/step

--- Confusion Matrix (Normal 0.32) ---
[[966   0]
 [ 19 130]]


In [27]:
# Persist the trained network in the native Keras format.
model.save('Final_Spam_Model.keras')
print("Model saved as 'Final_Spam_Model.keras' ✅")

# Persist the fitted tokenizer next to it so the same word -> id mapping can be
# reused later. Only unpickle this file from a source you trust.
with open('tokenizer.pickle', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)
print("Tokenizer saved as 'tokenizer.pickle' ✅")

# Hand both artifacts to the browser via the Colab download helper.
print("Downloading files...")
for artifact in ('Final_Spam_Model.keras', 'tokenizer.pickle'):
    files.download(artifact)

Model saved as 'Final_Spam_Model.keras' ✅
Tokenizer saved as 'tokenizer.pickle' ✅
Downloading files...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>