In [1]:
import os
import requests
import zipfile
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D
from sklearn.model_selection import train_test_split
import json

print(f"Good evening from Surat! Starting the Spam SMS model training process at {
      np.datetime64('now', 's')}.")
print(f"Using TensorFlow version: {tf.__version__}")

# --- 1. Download and Prepare Data ---
DATA_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip'
ZIP_FILE_PATH = 'smsspamcollection.zip'
EXTRACTED_FILE_PATH = 'SMSSpamCollection'

if not os.path.exists(EXTRACTED_FILE_PATH):
    print("Downloading dataset...")
    response = requests.get(DATA_URL)
    with open(ZIP_FILE_PATH, 'wb') as f:
        f.write(response.content)
    
    print("Extracting dataset...")
    with zipfile.ZipFile(ZIP_FILE_PATH, 'r') as zip_ref:
        zip_ref.extractall()
    
    os.remove(ZIP_FILE_PATH)
    print("Dataset downloaded and extracted.")
else:
    print("Dataset already exists.")

# Load data with pandas
df = pd.read_csv(EXTRACTED_FILE_PATH, sep='\t', header=None, names=['label', 'message'])
print("\nDataset Info:")
print(df.head())
print(df['label'].value_counts())

# --- 2. Preprocess Data ---
# Map labels to numerical values
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# Create tokenizer
# These parameters must be consistent between training and the app
VOCAB_SIZE = 5000  # Size of the vocabulary
MAX_LEN = 100      # Max length of a message
TRUNC_TYPE = 'post'
PADDING_TYPE = 'post'
OOV_TOKEN = "<OOV>"  # Out-of-vocabulary token for words not in the tokenizer

tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token=OOV_TOKEN)
tokenizer.fit_on_texts(df['message'])

# Convert text to sequences
sequences = tokenizer.texts_to_sequences(df['message'])
padded_sequences = pad_sequences(sequences, maxlen=MAX_LEN, padding=PADDING_TYPE, truncating=TRUNC_TYPE)

# --- 3. Split Data ---
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences, df['label'].values, test_size=0.2, random_state=42
)
print(f"\nTraining data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")

# --- 4. Build and Compile Model ---
EMBEDDING_DIM = 64
LSTM_UNITS = 64
EPOCHS = 10
BATCH_SIZE = 32

print("\nBuilding the LSTM model architecture...")
model = Sequential([
    Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM, input_length=MAX_LEN),
    SpatialDropout1D(0.2), # Dropout for the embedding layer
    LSTM(units=LSTM_UNITS, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# --- 5. Train the Model ---
print("\nStarting training...")
history = model.fit(
    X_train, y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=(X_test, y_test),
    verbose=2
)

# --- 6. Save Artifacts ---
# Save the trained model
model_path = 'spam_lstm_model.h5'
print(f"\nSaving model to {model_path}...")
model.save(model_path)

# Save the tokenizer's word_index
tokenizer_path = 'spam_tokenizer_word_index.json'
print(f"Saving tokenizer word index to {tokenizer_path}...")
with open(tokenizer_path, 'w', encoding='utf-8') as f:
    f.write(json.dumps(tokenizer.word_index, ensure_ascii=False))

print("\nModel and tokenizer have been saved successfully.")
print("You can now run the Streamlit app.")


Good evening from Surat! Starting the Spam SMS model training process at 2025-09-24T13:05:36.
Using TensorFlow version: 2.20.0
Downloading dataset...
Extracting dataset...
Dataset downloaded and extracted.

Dataset Info:
  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
label
ham     4825
spam     747
Name: count, dtype: int64

Training data shape: (4457, 100)
Test data shape: (1115, 100)

Building the LSTM model architecture...





Starting training...
Epoch 1/10
140/140 - 17s - 122ms/step - accuracy: 0.8627 - loss: 0.4138 - val_accuracy: 0.8664 - val_loss: 0.3935
Epoch 2/10
140/140 - 9s - 65ms/step - accuracy: 0.8658 - loss: 0.3963 - val_accuracy: 0.8664 - val_loss: 0.3934
Epoch 3/10
140/140 - 9s - 61ms/step - accuracy: 0.8658 - loss: 0.3965 - val_accuracy: 0.8664 - val_loss: 0.3942
Epoch 4/10
140/140 - 9s - 61ms/step - accuracy: 0.8658 - loss: 0.3962 - val_accuracy: 0.8664 - val_loss: 0.3935
Epoch 5/10
140/140 - 9s - 61ms/step - accuracy: 0.8658 - loss: 0.3948 - val_accuracy: 0.8664 - val_loss: 0.3935
Epoch 6/10
140/140 - 9s - 61ms/step - accuracy: 0.8658 - loss: 0.3950 - val_accuracy: 0.8664 - val_loss: 0.3935
Epoch 7/10
140/140 - 9s - 61ms/step - accuracy: 0.8658 - loss: 0.3946 - val_accuracy: 0.8664 - val_loss: 0.3940
Epoch 8/10
140/140 - 10s - 70ms/step - accuracy: 0.8658 - loss: 0.3955 - val_accuracy: 0.8664 - val_loss: 0.3933
Epoch 9/10
140/140 - 15s - 110ms/step - accuracy: 0.8658 - loss: 0.3954 - val_a




Saving model to spam_lstm_model.h5...
Saving tokenizer word index to spam_tokenizer_word_index.json...

Model and tokenizer have been saved successfully.
You can now run the Streamlit app.
