# In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.utils import shuffle

# Step 1: Load and Preprocess Data
# Reads a CSV that must provide a 'text' column (message body) and a 'label'
# column (either numeric 0/1 or the class names 'ham'/'spam').
data = pd.read_csv('spam_ham_dataset.csv')  # Replace with your dataset path
# Seed the shuffle so runs are reproducible, in line with the random_state
# that train_test_split already uses further down.
data = shuffle(data, random_state=42)  # Shuffle the data to randomize class distribution
# A missing message would reach the tokenizer as a float NaN and crash it;
# treat such rows as empty text instead.
text_data = data['text'].fillna('').astype(str)
labels = data['label']
if not pd.api.types.is_numeric_dtype(labels):
    # binary_crossentropy needs numeric targets, so textual class names are
    # mapped to 0 (ham) / 1 (spam) before training.
    labels = labels.astype(str).str.strip().str.lower().map({'ham': 0, 'spam': 1})
    if labels.isna().any():
        raise ValueError(
            "Unrecognised values in the 'label' column; expected 'ham'/'spam' or 0/1."
        )

# Vocabulary cap: passed to the Tokenizer here and reused as the Embedding
# input_dim when the model is built.
max_words = 10000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(text_data)
sequences = tokenizer.texts_to_sequences(text_data)
# No maxlen is given, so every row is padded to the longest message;
# X.shape[1] is therefore the sequence length used at prediction time.
X = pad_sequences(sequences)
y = np.array(labels)

# Step 2: Handling Class Imbalance (Optional)
# You can apply techniques to address class imbalance, such as oversampling, undersampling, or using class weights.

# Step 3: Split the Data into Training and Testing Sets
# Stratify on the labels so the train and test partitions keep the same
# class ratio; a plain random split can skew the minority class share when
# the dataset is imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 4: Build the RNN Model
# Embedding -> SimpleRNN -> single sigmoid unit, i.e. a binary classifier
# whose output is compared against 0.5 at prediction time.
model = Sequential([
    # mask_zero=True: index 0 is the filler value written by pad_sequences,
    # so masking it keeps the RNN from processing padding as real tokens.
    Embedding(input_dim=max_words, output_dim=128, mask_zero=True),
    SimpleRNN(64),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Step 5: Train the Model
# The last 20% of the training partition is held out by Keras for validation;
# the returned History object feeds the curves plotted below.
fit_options = {'batch_size': 64, 'epochs': 10, 'validation_split': 0.2}
history = model.fit(X_train, y_train, **fit_options)

# Step 6: Evaluate the Model
# evaluate() yields the compiled loss followed by the compiled metric (accuracy).
test_metrics = model.evaluate(X_test, y_test)
loss, accuracy = test_metrics
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

# Step 7: Plot Accuracy and Loss Curves
# Draws two side-by-side panels (accuracy on the left, loss on the right),
# each showing the training and validation series recorded per epoch.
plt.figure(figsize=(12, 4))

# One entry per panel: (train key, validation key, train legend,
# validation legend, y-axis label).
curve_panels = (
    ('accuracy', 'val_accuracy', 'Training Accuracy', 'Validation Accuracy', 'Accuracy'),
    ('loss', 'val_loss', 'Training Loss', 'Validation Loss', 'Loss'),
)
for panel_index, panel in enumerate(curve_panels, start=1):
    train_key, val_key, train_legend, val_legend, axis_label = panel
    plt.subplot(1, 2, panel_index)
    plt.plot(history.history[train_key], label=train_legend)
    plt.plot(history.history[val_key], label=val_legend)
    plt.xlabel('Epoch')
    plt.ylabel(axis_label)
    plt.legend()

plt.show()

# Step 8: Predict for Sample Text
# Classifies each entry of sample_text as 'ham' or 'spam' with the trained model.
sample_text = ["Your Amazon order is out for delivery."]
sample_sequence = tokenizer.texts_to_sequences(sample_text)
sample_sequence = pad_sequences(sample_sequence, maxlen=X.shape[1])  # Match sequence length with training data
prediction = model.predict(sample_sequence)
# predict() returns one row per sample; flatten to one sigmoid score per text
# and compare scalars, rather than relying on the truth value of a 2-D array
# (which raises as soon as sample_text holds more than one message).
for text, spam_probability in zip(sample_text, np.ravel(prediction)):
    class_label = 'spam' if spam_probability > 0.5 else 'ham'
    print(f"Sample Text: {text}\nPredicted Label: {class_label}")

# Step 9: Hyperparameter Tuning (Optional)
# You can fine-tune hyperparameters, such as the number of epochs, batch size, and model architecture, to optimize performance.
