In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout

# Load the dataset
# NOTE(review): absolute, machine-specific path -- consider a configurable DATA_DIR.
data = pd.read_csv("C:/Users/ankuv/Desktop/DAB/Semester 3/DAB 322/spam email data.csv", encoding='latin1')

# Check the columns
print(data.columns)

# Assume the columns are 'v1' for labels and 'v2' for text.
# .copy() detaches the two-column frame from the raw frame so the column
# assignments below operate on an independent object instead of a slice
# (the pattern that triggers pandas' SettingWithCopyWarning).
data = data[['v1', 'v2']].copy()
data.columns = ['Label', 'Text']

# Coerce every message to str so a stray non-string cell (e.g. NaN parsed as
# float) cannot break tokenization later; predict_new_data does the same.
data['Text'] = data['Text'].astype(str)

# Encode the labels as integers; the fitted encoder is reused later to
# transform the labels of other datasets.
encoder = LabelEncoder()
data['Label'] = encoder.fit_transform(data['Label'])

# Tokenize the text
max_words = 5000            # vocabulary cap passed to the Tokenizer / Embedding
max_sequence_length = 100   # every message is padded/truncated to this many tokens

# Labels
y = data['Label'].values

# Split the data.
# Row positions are split *before* the tokenizer is fitted so that the
# vocabulary is learned from training rows only; fitting on the full corpus
# would leak test-set words into the model's input space. Splitting an index
# array with the same test_size/random_state selects the same rows as
# splitting X and y directly.
train_idx, test_idx = train_test_split(
    np.arange(len(data)), test_size=0.2, random_state=42
)

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(data['Text'].iloc[train_idx])
sequences = tokenizer.texts_to_sequences(data['Text'])

# Pad sequences to ensure uniform input length
X = pad_sequences(sequences, maxlen=max_sequence_length)

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# Build the RNN model: embedding -> single recurrent layer -> dropout -> sigmoid unit
model = Sequential([
    Embedding(input_dim=max_words, output_dim=64, input_length=max_sequence_length),
    SimpleRNN(64, return_sequences=False),  # only the final hidden state is passed on
    Dropout(0.5),
    Dense(1, activation='sigmoid'),         # one probability per message
])

# Binary target, so binary cross-entropy with accuracy as the reported metric
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

model.summary()

# Train the model (20% of the training rows are held out for validation)
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model on the held-out test split
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy:.4f}')

# Make predictions: sigmoid outputs are thresholded at 0.5 into 0/1 labels
predictions = model.predict(X_test)
predictions = (predictions > 0.5).astype(int)

# Print some example predictions.
# Bounded by the test-set size so a small dataset cannot raise IndexError.
for i in range(min(10, len(y_test))):
    print(f'Actual: {y_test[i]}, Predicted: {predictions[i][0]}')


Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')




Epoch 1/10
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 28ms/step - accuracy: 0.8807 - loss: 0.3476 - val_accuracy: 0.9518 - val_loss: 0.1699
Epoch 2/10
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 27ms/step - accuracy: 0.9838 - loss: 0.0909 - val_accuracy: 0.9854 - val_loss: 0.0513
Epoch 3/10
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 30ms/step - accuracy: 0.9908 - loss: 0.0353 - val_accuracy: 0.9821 - val_loss: 0.0590
Epoch 4/10
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 29ms/step - accuracy: 0.9955 - loss: 0.0174 - val_accuracy: 0.9821 - val_loss: 0.0542
Epoch 5/10
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 30ms/step - accuracy: 0.9977 - loss: 0.0101 - val_accuracy: 0.9877 - val_loss: 0.0552
Epoch 6/10
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step - accuracy: 0.9988 - loss: 0.0049 - val_accuracy: 0.9854 - val_loss: 0.0623
Epoch 7/10
[1m112/112

In [3]:
# Function to predict on new data
def predict_new_data(new_data, model, tokenizer, max_sequence_length, threshold=0.5):
    """Predict 0/1 labels for the messages in ``new_data['Text']``.

    Parameters
    ----------
    new_data : pandas.DataFrame
        Frame with a 'Text' column. It is NOT modified: the text is
        stringified into a separate Series.
    model : object
        Fitted model exposing ``predict(X)`` that returns one score per row.
    tokenizer : object
        Fitted tokenizer exposing ``texts_to_sequences(texts)``.
    max_sequence_length : int
        Length every token sequence is padded/truncated to.
    threshold : float, optional
        Scores strictly greater than this are labelled 1 (default 0.5).

    Returns
    -------
    numpy.ndarray
        Integer array of 0/1 labels, one row per input row; shape ``(0, 1)``
        for an empty frame.

    Raises
    ------
    KeyError
        If ``new_data`` has no 'Text' column.
    """
    # Work on a converted copy of the column rather than assigning back into
    # the caller's frame (avoids a hidden side effect and chained-assignment
    # warnings when the caller passes a slice).
    texts = new_data['Text'].astype(str)

    # Nothing to score: skip the model call, which may reject empty input.
    if len(texts) == 0:
        return np.zeros((0, 1), dtype=int)

    # Tokenize and pad the new data
    new_sequences = tokenizer.texts_to_sequences(texts)
    new_X = pad_sequences(new_sequences, maxlen=max_sequence_length)

    # Make predictions and binarize the scores
    new_predictions = model.predict(new_X)
    return (new_predictions > threshold).astype(int)

# Function to calculate accuracy on new data
def calculate_accuracy(true_labels, predicted_labels):
    """Return the fraction of positions where the two label sequences agree.

    Parameters
    ----------
    true_labels, predicted_labels : array-like
        Lists, tuples, pandas Series or NumPy arrays. Both are flattened to
        1-D first, so a column vector of shape ``(n, 1)`` is compared
        element-wise with ``(n,)`` instead of being broadcast to ``(n, n)``.

    Returns
    -------
    float
        Accuracy in ``[0, 1]``; ``nan`` when both inputs are empty, because
        accuracy is undefined without samples.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of labels.
    """
    # np.asarray also makes plain Python lists compare element-wise; with
    # lists, `a == b` is a single bool and cannot be summed.
    actual = np.asarray(true_labels).ravel()
    predicted = np.asarray(predicted_labels).ravel()

    if actual.shape != predicted.shape:
        raise ValueError(
            f"label count mismatch: {actual.size} true vs {predicted.size} predicted"
        )
    if actual.size == 0:
        return float("nan")

    return float(np.mean(actual == predicted))

# Evaluate the trained model on a second dataset.
# The file is expected to use the same 'v1' (label) / 'v2' (text) layout as
# the training data.
new_data = (
    pd.read_csv("C:/Users/ankuv/Downloads/spam_or_not_spam.csv", encoding='latin1')
    .loc[:, ['v1', 'v2']]
    .rename(columns={'v1': 'Label', 'v2': 'Text'})
    .dropna()  # discard rows with a missing label or message
    # reuse the encoder fitted on the training labels
    .assign(Label=lambda frame: encoder.transform(frame['Label']))
)

# Score every message with the trained model
new_predictions = predict_new_data(new_data, model, tokenizer, max_sequence_length)

# Compare against the encoded ground truth
new_accuracy = calculate_accuracy(new_data['Label'].values, new_predictions.flatten())
print(f'New Dataset Accuracy: {new_accuracy:.4f}')

# Show the first (up to) ten actual/predicted pairs
for actual, predicted in zip(new_data['Label'].values[:10], new_predictions[:10]):
    print(f'Actual: {actual}, Predicted: {predicted[0]}')

[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step
New Dataset Accuracy: 0.6808
Actual: 0, Predicted: 0
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 0, Predicted: 0
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 0, Predicted: 0


In [4]:
# Persist the trained artefacts so predict_spam can reload them from disk:
# the network goes to an HDF5 file, the fitted tokenizer to a pickle.
import pickle

model.save('spam_model.h5')

with open('tokenizer.pickle', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

# Function to preprocess and predict on new data
def predict_spam(new_texts, maxlen=None):
    """Classify raw messages with the model and tokenizer saved on disk.

    Loads 'spam_model.h5' and 'tokenizer.pickle' from the working directory
    on every call, tokenizes and pads the messages, and thresholds the model
    scores at 0.5.

    Parameters
    ----------
    new_texts : str or iterable of str
        One message or a collection of messages. A single string is wrapped
        in a list so it is scored as one message rather than being iterated
        character by character.
    maxlen : int, optional
        Padding length. Defaults to the notebook-level ``max_sequence_length``
        used during training.

    Returns
    -------
    numpy.ndarray
        Integer array of 0/1 labels, one row per message (1 means the score
        exceeded 0.5); shape ``(0, 1)`` when no messages are given.
    """
    if isinstance(new_texts, str):
        new_texts = [new_texts]
    new_texts = list(new_texts)

    # Nothing to classify: return early without touching the saved files or
    # calling the model, which may reject empty input.
    if not new_texts:
        return np.zeros((0, 1), dtype=int)

    if maxlen is None:
        maxlen = max_sequence_length

    # Load the saved model and tokenizer
    loaded_model = load_model('spam_model.h5')
    # SECURITY: pickle.load can execute arbitrary code; only load a
    # tokenizer.pickle that this notebook itself produced.
    with open('tokenizer.pickle', 'rb') as handle:
        loaded_tokenizer = pickle.load(handle)

    # Tokenize and pad the new texts
    new_sequences = loaded_tokenizer.texts_to_sequences(new_texts)
    new_X = pad_sequences(new_sequences, maxlen=maxlen)

    # Make predictions
    new_predictions = loaded_model.predict(new_X)
    new_predictions = (new_predictions > 0.5).astype(int)

    return new_predictions

# Demo: classify three hand-written messages with the saved model
new_texts = [
    "Congratulations! You've won a free iPhone! Click here to claim your prize.",
    "Hi Mom, can you pick me up after school today?",
    "URGENT: Your bank account has been compromised. Reply with your details immediately."
]

results = predict_spam(new_texts)

# One "Text / Prediction" pair per message, followed by a blank line
for message, label in zip(new_texts, results):
    verdict = 'Spam' if label[0] == 1 else 'Not Spam'
    print(f"Text: {message}", f"Prediction: {verdict}", "", sep="\n")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 274ms/step
Text: Congratulations! You've won a free iPhone! Click here to claim your prize.
Prediction: Spam

Text: Hi Mom, can you pick me up after school today?
Prediction: Not Spam

Text: URGENT: Your bank account has been compromised. Reply with your details immediately.
Prediction: Spam

