<a href="https://colab.research.google.com/github/Velmani06/Calculator/blob/main/Email%20Spam%20Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.utils import class_weight
import tensorflow as tf
from tensorflow.keras import layers

# Load dataset
df = pd.read_csv('/content/drive/MyDrive/combined_data.csv')  # Replace with the actual path

# Preprocessing the dataset
# 'text' holds the email content and 'label' the target.
# Rows missing either field are dropped up front: TfidfVectorizer rejects NaN
# documents and the integer cast of the labels below cannot represent NaN.
# The raw frame `df` is left untouched so this cell stays re-runnable.
df_clean = df.dropna(subset=['text', 'label'])
X = df_clean['text'].astype(str)  # guard against cells pandas parsed as numbers
y = df_clean['label']

# Splitting data into training and test sets (80/20, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert text data to numerical form using TfidfVectorizer.
# The vocabulary is learned from the training split only and then reused for
# the test split. float32 is requested because the dense copies made below are
# large, and float64 would double their memory footprint.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000, dtype=np.float32)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Convert to dense arrays for TensorFlow input
X_train_tfidf = X_train_tfidf.toarray()
X_test_tfidf = X_test_tfidf.toarray()

# Convert target labels to integer numpy arrays
y_train = np.array(y_train).astype('int')
y_test = np.array(y_test).astype('int')

# Calculate class weights to handle imbalance.
# Each weight is keyed by the actual class label it belongs to, not by its
# position in the array, so the mapping stays correct even when the labels are
# not exactly 0..k-1.
classes = np.unique(y_train)
balanced_weights = class_weight.compute_class_weight('balanced', classes=classes, y=y_train)
class_weights = {int(label): float(weight) for label, weight in zip(classes, balanced_weights)}

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation, dropout
# masks and batch shuffling are repeatable across Restart & Run All.
tf.keras.utils.set_random_seed(42)

# Build the TensorFlow model with an Input layer.
# The input width is read from the feature matrix instead of being hard-coded:
# max_features is only an upper bound, so a corpus with a smaller vocabulary
# yields fewer columns and a fixed 5000 would fail with a shape mismatch.
n_features = X_train_tfidf.shape[1]
model = tf.keras.Sequential([
    layers.Input(shape=(n_features,)),  # Explicit input layer
    layers.Dense(512, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(1, activation='sigmoid')  # single probability-like score in [0, 1]
])

# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
              loss='binary_crossentropy', metrics=['accuracy'])

# Train the model with class weights.
# NOTE: validation_data is the held-out test split, so the val_* curves are
# test-set metrics. They are fine for monitoring, but do not use them for
# tuning or early stopping without carving out a separate validation split.
history = model.fit(X_train_tfidf, y_train, epochs=10, batch_size=64,
                    validation_data=(X_test_tfidf, y_test), class_weight=class_weights)

# Score the trained network on the held-out split and report its accuracy
loss, accuracy = model.evaluate(X_test_tfidf, y_test)
print(f'Test Accuracy: {accuracy:.4f}')

# Binarise the model outputs: scores above 0.5 become class 1, the rest class 0
y_pred = (model.predict(X_test_tfidf) > 0.5).astype("int32")

# Text summaries: raw confusion counts followed by the per-class report
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:\n', cm)
print('Classification Report:\n', classification_report(y_test, y_pred))

# Render the confusion matrix as an annotated heatmap on an explicit Axes.
# The tick labels assume label 0 is ham and label 1 is spam — confirm against the dataset.
class_names = ['Ham', 'Spam']
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
plt.show()

# Learning curves recorded by model.fit: accuracy on the left, loss on the right
fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 4))

# Accuracy per epoch for the training and validation data
acc_ax.plot(history.history['accuracy'], label='Train Accuracy')
acc_ax.plot(history.history['val_accuracy'], label='Validation Accuracy')
acc_ax.set(title='Model Accuracy', xlabel='Epochs', ylabel='Accuracy')
acc_ax.legend()

# Loss per epoch for the training and validation data
loss_ax.plot(history.history['loss'], label='Train Loss')
loss_ax.plot(history.history['val_loss'], label='Validation Loss')
loss_ax.set(title='Model Loss', xlabel='Epochs', ylabel='Loss')
loss_ax.legend()

plt.show()



Epoch 1/10
[1m1044/1044[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 54ms/step - accuracy: 0.8718 - loss: 0.3643 - val_accuracy: 0.9852 - val_loss: 0.0480
Epoch 2/10


In [None]:
# Adjusted Prediction Function with Threshold Control
def classify_email(threshold=0.5):
    """Interactively label emails typed at the prompt as 'Spam' or 'Ham'.

    Prompts repeatedly for an email. Each entry is converted to TF-IDF
    features with the module-level fitted ``vectorizer`` and scored by the
    module-level trained ``model``; the label is printed. The loop ends when
    the user types 'exit' (any case, surrounding whitespace ignored), when
    input is exhausted, or when the prompt is interrupted.

    Args:
        threshold: Score cut-off in [0, 1]. An email is labelled 'Spam' when
            the model's score is strictly greater than this value, otherwise
            'Ham'. Lower values flag more emails as spam.

    Raises:
        ValueError: If ``threshold`` is outside the range [0, 1].
    """
    if not 0.0 <= threshold <= 1.0:
        raise ValueError(f"threshold must be between 0 and 1, got {threshold!r}")

    while True:
        try:
            email_input = input("\nEnter an email to classify (or type 'exit' to stop):\n")
        except (EOFError, KeyboardInterrupt):
            # No more input (non-interactive run) or the prompt was interrupted:
            # leave the loop cleanly instead of surfacing a traceback.
            print("Exiting...")
            break

        email_text = email_input.strip()
        if email_text.lower() == 'exit':
            print("Exiting...")
            break
        if not email_text:
            # A blank entry would be scored as an all-zero feature vector,
            # which says nothing about any real email, so ask again.
            print("No text entered; please type an email or 'exit'.")
            continue

        # Convert the input email to TF-IDF features
        email_tfidf = vectorizer.transform([email_text]).toarray()

        # predict() returns one score per input row; pull out the single scalar
        # so the comparison does not rely on the truthiness of an array.
        # verbose=0 keeps the Keras progress bar out of the prompt loop.
        spam_score = float(model.predict(email_tfidf, verbose=0).ravel()[0])
        result = 'Spam' if spam_score > threshold else 'Ham'

        print(f"\nPrediction: {result}")

# Call the classify_email function for live input.
# This blocks on input() until 'exit' is typed. Passing threshold=0.4 overrides
# the 0.5 default, so any score above 0.4 is labelled 'Spam'.
classify_email(threshold=0.4)  # Example of using a custom threshold of 0.4
