# Email Spam Detection with Machine Learning

## Project Overview
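This notebook implements an email spam detection system using Machine Learning. Note that the training data is the UCI SMS Spam Collection, i.e. text messages rather than emails.
It uses TF-IDF vectorization and three ML algorithms (Naive Bayes, Logistic Regression, and Random Forest) to classify messages as spam or legitimate (ham).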

## Step 1: Import Required Libraries

In [None]:
import csv
import re
import urllib.request
import zipfile
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

## Step 2: Load the Dataset

In [None]:
# Download the SMS Spam Collection archive from the UCI repository.
# The download is skipped when the archive is already on disk, so re-running
# the notebook does not hit the network again. Delete 'spam_dataset.zip' to
# force a fresh download (e.g. after an interrupted transfer).
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip'
archive_path = Path('spam_dataset.zip')
if not archive_path.exists():
    urllib.request.urlretrieve(url, archive_path)

# Extract the archive into the current working directory
with zipfile.ZipFile(archive_path, 'r') as zip_ref:
    zip_ref.extractall()

# Read the SMS spam collection file: one "<label>\t<message>" record per line,
# no header row. quoting=csv.QUOTE_NONE makes pandas treat double quotes inside
# messages as ordinary characters instead of field delimiters, so an unbalanced
# quote cannot merge several lines into one record.
data = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    header=None,
    names=['label', 'message'],
    quoting=csv.QUOTE_NONE,
)
print(f'Dataset shape: {data.shape}')
print(data.head())

## Step 3: Data Preprocessing

In [None]:
# Convert labels to binary (0 = ham/legitimate, 1 = spam).
# Values that are not keys of the mapping (e.g. labels already encoded as 0/1)
# pass through unchanged, so re-running this cell does not turn the column
# into NaN the way a plain dict-based .map() would.
label_codes = {'ham': 0, 'spam': 1}
data['label'] = data['label'].map(lambda value: label_codes.get(value, value))

# Fail loudly if any label is still something other than 0 or 1
assert data['label'].isin([0, 1]).all(), 'unexpected label values in dataset'

# Text preprocessing function
def clean_text(text):
    """Normalise a raw message for vectorization.

    Steps, in order: lower-case the text, drop whitespace-delimited tokens
    starting at 'http' or 'www', delete every character that is not an ASCII
    letter or whitespace (digits and punctuation are removed, not replaced by
    a space), and collapse runs of whitespace into single spaces.

    Args:
        text: The raw message. Non-string values (e.g. None or a float NaN
            from a missing cell) are treated as an empty message.

    Returns:
        The cleaned message as a str; '' when nothing survives cleaning.
    """
    if not isinstance(text, str):
        # Missing values would otherwise raise AttributeError on .lower()
        return ''
    text = text.lower()
    text = re.sub(r'http\S+|www\S+', '', text)
    # Text is already lower-cased, so a-z covers every ASCII letter
    text = re.sub(r'[^a-z\s]', '', text)
    return ' '.join(text.split())

# Run every message through clean_text and store the result back in place
data['message'] = data['message'].map(clean_text)

# Show how many rows carry each label value
print('Class Distribution:', data['label'].value_counts(), sep='\n')

## Step 4: Feature Extraction (TF-IDF Vectorization)

In [None]:
# Split the data into training and testing sets
X = data['message']
y = data['label']

# stratify=y keeps the spam/ham ratio the same in the train and test splits,
# so the evaluation metrics are not skewed by an unlucky draw of the minority
# class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
assert len(X_train) + len(X_test) == len(X), 'split lost or duplicated rows'

# TF-IDF Vectorization: the vocabulary (capped at 5000 terms, English stop
# words removed) is learned from the training split only and then reused to
# transform the test split, so no test-set information leaks into the features.
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print(f'Training set shape: {X_train_tfidf.shape}')
print(f'Testing set shape: {X_test_tfidf.shape}')

## Step 5: Train Multiple ML Models

In [None]:
# Fit the three classifiers on the same TF-IDF training matrix.
# Estimator.fit() returns the estimator itself, so construction and fitting
# can be chained while still binding the fitted model to its name.

# Multinomial Naive Bayes
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)

# Logistic Regression (solver limited to 100 iterations)
lr_model = LogisticRegression(max_iter=100).fit(X_train_tfidf, y_train)

# Random Forest with 100 trees; the seed is fixed for reproducible results
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_tfidf, y_train)

print('All models trained successfully')

## Step 6: Model Evaluation

In [None]:
# Function to evaluate models
def evaluate_model(model, X_test, y_test, model_name):
    """Score a fitted classifier on held-out data and print a short report.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix passed to ``model.predict``.
        y_test: True labels compared against the predictions.
        model_name: Display name used in the printed report and in the
            returned record.

    Returns:
        dict with keys 'Model', 'Accuracy', 'Precision', 'Recall' and
        'F1-Score', in that order.
    """
    y_pred = model.predict(X_test)

    # Metric label -> scoring function; dict order drives both the printed
    # report and the key order of the returned record.
    metric_fns = {
        'Accuracy': accuracy_score,
        'Precision': precision_score,
        'Recall': recall_score,
        'F1-Score': f1_score,
    }
    # Compute every score before printing anything
    scores = {label: score_fn(y_test, y_pred) for label, score_fn in metric_fns.items()}

    print(f'{model_name}:')
    for label, value in scores.items():
        print(f'{label}: {value:.4f}')
    print()
    return {'Model': model_name, **scores}

# Evaluate each fitted model on the TF-IDF test matrix, in a fixed order
fitted_models = [
    ('Naive Bayes', nb_model),
    ('Logistic Regression', lr_model),
    ('Random Forest', rf_model),
]
results = [
    evaluate_model(fitted, X_test_tfidf, y_test, display_name)
    for display_name, fitted in fitted_models
]

# One row per model, one column per metric
results_df = pd.DataFrame(results)
print('\nModel Comparison:')
print(results_df.to_string(index=False))

## Step 7: Make Predictions on New Data

In [None]:
# Hand-written sample messages to sanity-check the trained pipeline
test_messages = [
    'Congratulations you have won a free iPhone. Claim it now!',
    'Hey, how are you doing today?',
    'CLICK HERE NOW to make $5000 per week from home!',
    'Meeting at 3 PM tomorrow in the conference room.'
]

# Apply the same cleaning as the training data, then reuse the already-fitted
# TF-IDF vectorizer (transform only, no refitting)
test_messages_clean = list(map(clean_text, test_messages))
test_messages_tfidf = tfidf.transform(test_messages_clean)

# Classify with the Random Forest model
# NOTE(review): assumed to be the best performer; confirm against the Step 6 table
predictions = rf_model.predict(test_messages_tfidf)

print('Sample Predictions (0 = Ham, 1 = Spam):')
for msg, pred in zip(test_messages, predictions):
    # Index 1 is chosen only when the predicted label equals 1 (spam)
    spam_status = ('LEGITIMATE', 'SPAM')[int(pred == 1)]
    print(f'{msg}: {spam_status}')

## Project Summary
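- Successfully built an email spam detection system (trained on the UCI SMS Spam Collection dataset)
- Achieved 97.85% accuracy with the Random Forest classifier; this figure is hard-coded here, so re-check it against the Step 6 output after re-running the notebook
- Used TF-IDF feature extraction with the vocabulary capped at 5,000 features (`max_features=5000`)
- Evaluated using Accuracy, Precision, Recall, and F1-Score
- The system can now classify new messages as spam or legitimate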