In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Step 1: Load and Preprocess Data
# The CSV is expected to provide a 'text' column (message body) and a 'label'
# column holding either integer class ids (0 = ham, 1 = spam) or the strings
# 'ham' / 'spam'.
data = pd.read_csv('spam_ham_dataset.csv')  # Replace with your dataset path

# Missing messages become empty strings so the tokenizer only ever receives
# text (a NaN cell would otherwise arrive as a float).
text_data = data['text'].fillna('').astype(str)
labels = data['label']

# to_categorical() needs integer class ids. Textual labels are mapped
# explicitly so that index 1 is always "spam", which is how the prediction is
# interpreted at the end of this script.
if not pd.api.types.is_numeric_dtype(labels):
    label_to_id = {'ham': 0, 'spam': 1}
    normalized = labels.astype(str).str.strip().str.lower()
    unknown = sorted(set(normalized) - set(label_to_id))
    if unknown:
        raise ValueError(f"Unexpected values in 'label' column: {unknown}")
    labels = normalized.map(label_to_id)

# Text preprocessing
max_words = 10000  # Maximum number of unique words to consider
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(text_data)
sequences = tokenizer.texts_to_sequences(text_data)
# No maxlen is given, so every row is padded to the length of the longest one.
X = pad_sequences(sequences)
y = to_categorical(labels, num_classes=2)

# Step 2: Handle Class Imbalance (if needed)
# If your dataset is imbalanced, consider oversampling, undersampling, or using weighted classes.

# Step 3: Split Data
# Stratify on the class id so the 80/20 split keeps the same spam/ham ratio in
# both partitions; a plain random split can under-represent the minority class
# in the test set. y is one-hot encoded, hence the argmax to recover class ids.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y.argmax(axis=1)
)

# Step 4: Build the RNN Model
# Stack: token embedding -> single LSTM layer -> 2-way softmax classifier.
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=128, input_length=X.shape[1]))
model.add(LSTM(64))
model.add(Dense(2, activation='softmax'))

# Targets are one-hot vectors, so categorical cross-entropy is the loss.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Step 5: Hyperparameter Tuning (optional)
# Experiment with different hyperparameters such as the number of LSTM units, batch size, and learning rate.

# Step 6: Train the Model
# validation_split reserves a fraction of the training rows for the
# per-epoch validation metrics recorded in `history`.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=10,
    validation_split=0.2,
)

# Step 7: Plot Training and Validation Accuracy
# Curves are drawn in the same order as the legend entries below.
for metric_name in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[metric_name])
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

# Step 8: Evaluate the Model
# The result is unpacked as (loss, accuracy), matching the single metric
# configured in compile().
test_metrics = model.evaluate(X_test, y_test)
loss, accuracy = test_metrics
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

# Step 9: Make Predictions
# The sample goes through the same tokenizer and is padded to the training
# sequence length before being passed to the model.
sample_text = ["Get a free iPhone now!"]
sample_sequence = tokenizer.texts_to_sequences(sample_text)
sample_X = pad_sequences(sample_sequence, maxlen=X.shape[1])
prediction = model.predict(sample_X)

# Step 10: Display the Prediction
# prediction[0] is the softmax output for the single sample; index 1 is
# treated as the spam probability and compared against the threshold.
threshold = 0.5
spam_probability = prediction[0][1]
print("Predicted: Spam" if spam_probability > threshold else "Predicted: Ham")
