# Spam Email Detection: Data Exploration

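This notebook explores the spam email dataset: its class balance, message lengths, and most frequent words per class. It then previews the text preprocessing step and saves a cleaned copy of the data.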

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from collections import Counter
import re
import os
import sys

# Set plot style
# seaborn is already imported as `sns` with the other imports at the top of
# this cell, so it is not imported a second time here.
sns.set_theme()
plt.rcParams['figure.figsize'] = (12, 8)  # default size for figures that do not set their own

# Add project root to path
# The parent of the current working directory is used as the project root
# (presumably where the `src` package lives -- the notebook therefore has to be
# started from its own folder). The entry is only appended when it is not
# already present, so re-running this cell does not accumulate duplicates.
project_root = os.path.join(os.getcwd(), '..')
if project_root not in sys.path:
    sys.path.append(project_root)

# Import project modules
from src.utils.text_processing import (
    get_text_statistics, plot_text_length_distribution, 
    plot_word_cloud, get_most_common_words, plot_most_common_words
)

# Download NLTK resources
# A single loop replaces three copy-pasted calls. Because the cell no longer
# ends on a bare `nltk.download(...)` expression, the call's boolean return
# value is not echoed as the cell's output.
for resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(resource)

## 1. Load the Dataset

Load the spam dataset from a CSV file. Make sure the file is saved as `spam.csv` in the `data/raw` directory, because that is the path the next cell reads.

In [None]:
# Location of the raw CSV, given relative to the current working directory
DATA_PATH = '../data/raw/spam.csv'

# Read the file into a DataFrame
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

# Preview the top of the table (rendered as this cell's output)
df.head()

In [None]:
# Report the table's dimensions and its column names, one item per line
print(
    f"Dataset shape: {df.shape}",
    f"Columns: {df.columns.tolist()}",
    sep="\n",
)

In [None]:
# Rename columns for clarity (adjust based on actual column names in your dataset)
df = df.rename(columns={'Category': 'label', 'Message': 'message'})

# rename() skips mapping keys that are not present, so a CSV with different
# headers would pass through unchanged and only fail later with a bare
# KeyError on 'label'. Check the result here and report what was found instead.
missing_columns = sorted({'label', 'message'} - set(df.columns))
if missing_columns:
    raise KeyError(
        f"Expected column(s) {missing_columns} not found after renaming; "
        f"available columns: {df.columns.tolist()}. "
        "Adjust the rename mapping to match your dataset."
    )

# Check for missing values
print("Missing values:")
df.isnull().sum()

## 2. Exploratory Data Analysis

In [None]:
# Check class distribution (percentage of rows per label)
class_distribution = df['label'].value_counts(normalize=True) * 100
print(f"Class distribution:\n{class_distribution}")

# Absolute counts, most frequent label first. This one ordering drives both the
# bars and their annotations: without an explicit `order`, countplot arranges
# bars in the order it infers from the data, which need not match the
# frequency order of value_counts(), so labels could end up on the wrong bar.
label_counts = df['label'].value_counts()

# Plot class distribution
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='label', data=df, order=label_counts.index.tolist(), ax=ax)
ax.set(title='Class Distribution', xlabel='Label', ylabel='Count')

# Add percentage labels just above each bar
total = len(df)
for i, count in enumerate(label_counts):
    ax.annotate(f"{count/total*100:.1f}%",
                xy=(i, count),            # top centre of the i-th bar
                xytext=(0, 5),            # nudge 5 points upwards
                textcoords="offset points",
                ha='center',
                va='bottom')
fig.tight_layout()
plt.show()

In [None]:
# Analyze message length
# The vectorised `.str` accessor replaces apply(len) / apply(lambda ...): it
# gives the same counts for string messages and yields NaN for a missing
# message instead of raising a TypeError.
df['message_length'] = df['message'].str.len()            # characters per message
df['word_count'] = df['message'].str.split().str.len()    # whitespace-separated tokens

# Summary statistics
print("Message length statistics:")
print(df[['message_length', 'word_count']].describe())

In [None]:
# Side-by-side boxplots comparing the two length measures across labels
fig, (ax_chars, ax_words) = plt.subplots(1, 2, figsize=(14, 6))

# Left panel: length in characters
sns.boxplot(x='label', y='message_length', data=df, ax=ax_chars)
ax_chars.set_title('Message Length by Class')
ax_chars.set_xlabel('Label')
ax_chars.set_ylabel('Number of Characters')

# Right panel: length in words
sns.boxplot(x='label', y='word_count', data=df, ax=ax_words)
ax_words.set_title('Word Count by Class')
ax_words.set_xlabel('Label')
ax_words.set_ylabel('Number of Words')

fig.tight_layout()
plt.show()

## 3. Text Analysis

In [None]:
# Split the message column into one array per label
is_spam = df['label'] == 'spam'
is_ham = df['label'] == 'ham'
spam_messages = df.loc[is_spam, 'message'].values
ham_messages = df.loc[is_ham, 'message'].values

# Summarise each class with the project helper
spam_stats = get_text_statistics(spam_messages)
ham_stats = get_text_statistics(ham_messages)

# Print both summaries in the same "heading, then indented key: value" layout
for heading, stats in (("Spam Statistics:", spam_stats), ("\nHam Statistics:", ham_stats)):
    print(heading)
    for key, value in stats.items():
        print(f"  {key}: {value}")

In [None]:
# Message-length distributions for spam and ham, one panel each
fig = plt.figure(figsize=(14, 6))

# (panel position, messages, panel title)
length_panels = (
    (1, spam_messages, "Spam Message Length Distribution"),
    (2, ham_messages, "Ham Message Length Distribution"),
)
for position, messages, title in length_panels:
    # Select the panel first; presumably the helper draws on the current axes -- TODO confirm
    plt.subplot(1, 2, position)
    plot_text_length_distribution(messages, title)

plt.tight_layout()
plt.show()

In [None]:
# Word clouds for spam and ham, one panel each
plt.figure(figsize=(14, 10))

# (panel position, messages, panel title)
cloud_panels = (
    (1, spam_messages, "Spam Word Cloud"),
    (2, ham_messages, "Ham Word Cloud"),
)
for position, messages, title in cloud_panels:
    # Select the panel first; presumably the helper draws on the current axes -- TODO confirm
    plt.subplot(1, 2, position)
    plot_word_cloud(messages, title)

plt.tight_layout()
plt.show()

In [None]:
# Top-20 words per class.
# NOTE(review): these two results are not used by the plots below, which are
# given the raw messages instead -- confirm whether they are needed elsewhere.
spam_common_words = get_most_common_words(spam_messages, 20)
ham_common_words = get_most_common_words(ham_messages, 20)

# Most-common-word charts for spam and ham, one panel each
plt.figure(figsize=(14, 10))

# (panel position, messages, panel title)
common_word_panels = (
    (1, spam_messages, "Most Common Words in Spam"),
    (2, ham_messages, "Most Common Words in Ham"),
)
for position, messages, title in common_word_panels:
    # Select the panel first; presumably the helper draws on the current axes -- TODO confirm
    plt.subplot(1, 2, position)
    plot_most_common_words(messages, 20, title)

plt.tight_layout()
plt.show()

## 4. Text Preprocessing

In [None]:
# Fetch the NLTK resources by name instead of calling a bare `nltk.download()`.
# With no argument NLTK opens its interactive downloader, which waits for user
# input and therefore stalls "Restart Kernel -> Run All".
# `nltk` itself is already imported in the first code cell.
# NOTE(review): 'punkt_tab' and 'omw-1.4' are included on the assumption that
# they are what the catch-all download was compensating for on newer NLTK
# releases -- confirm against what `preprocess_text` actually loads.
NLTK_RESOURCES = ('stopwords', 'punkt', 'punkt_tab', 'wordnet', 'omw-1.4')

# download() returns False when a resource could not be fetched; collect those
# names so a failure is reported here rather than surfacing later as a LookupError.
failed_resources = [name for name in NLTK_RESOURCES if not nltk.download(name, quiet=True)]
if failed_resources:
    print(f"Could not download NLTK resources: {failed_resources}")

In [None]:
# Bring in the project's text-cleaning routine
from src.data.preprocess import preprocess_text

# Use the first five raw messages as a before/after sample
example_texts = df['message'].head(5).to_numpy()

# Print each raw message, then its cleaned form, then a divider line
for i, text in enumerate(example_texts):
    print(f"Original Text {i+1}:\n{text}")
    print(f"\nPreprocessed Text {i+1}:\n{preprocess_text(text)}")
    print("\n" + "-"*80)

## 5. Save Processed Data (Optional)

In [None]:
# Store the cleaned form of every message in a new column
df['cleaned_message'] = df['message'].apply(preprocess_text)

# Destination CSV; create its parent folder first if it does not exist yet
output_path = '../data/processed/spam_data_processed.csv'
output_dir = os.path.dirname(output_path)
os.makedirs(output_dir, exist_ok=True)

# Write every column of the frame, without the row index
df.to_csv(output_path, index=False)

print(f"Processed data saved to {output_path}")

## 6. Conclusion

In this notebook, we've explored the spam email dataset and gained insights into its characteristics. Key findings:

1. The dataset is imbalanced with more ham than spam messages
2. Spam messages tend to be longer on average than ham messages
3. Common words in spam include terms related to promotions, free offers, and urgency
4. Common words in ham are more conversational

These insights will help us build a more effective spam detection model.