In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re

# Set style
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*', while
# older releases only ship the un-prefixed name. Pick whichever is installed so
# the very first cell does not raise on an older environment; if neither is
# present the default Matplotlib style is kept.
for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break
sns.set_palette('husl')

print("Libraries imported successfully!")

## 1. Load Data

In [None]:
# Load the dataset
# Update this path to your dataset location
DATA_PATH = '../data/raw/emails.csv'

# Columns the rest of this notebook reads. Checking them here makes a wrong or
# differently-named CSV fail immediately with a clear message instead of a bare
# KeyError several cells later.
REQUIRED_COLUMNS = {'label', 'email_text'}

df = pd.read_csv(DATA_PATH)

missing_columns = REQUIRED_COLUMNS - set(df.columns)
if missing_columns:
    raise ValueError(
        f"{DATA_PATH} is missing required columns: {sorted(missing_columns)}"
    )

print(f"Dataset shape: {df.shape}")
print(f"\nColumn names: {list(df.columns)}")
df.head()

## 2. Basic Statistics

In [None]:
# Check for missing values
# Per-column null counts, computed once and reused for the grand total.
null_counts = df.isnull().sum()

print("Missing values:")
print(null_counts)

print(f"\nTotal missing: {null_counts.sum()}")

In [None]:
# Label distribution
# Absolute counts first, then the same breakdown as percentages.
label_col = df['label']

print("Label distribution:")
print(label_col.value_counts())
print("\nPercentages:")
print(100 * label_col.value_counts(normalize=True))

In [None]:
# Visualize label distribution
# Labels are encoded 0 = legitimate, 1 = phishing (the same encoding the
# `df['label'] == 0` / `== 1` filters elsewhere in this notebook rely on).
class_names = {0: 'Legitimate', 1: 'Phishing'}
class_colors = ['green', 'red']
labels = list(class_names.values())

# Pin the index to the known classes in a fixed order. Without this, a dataset
# that contains only one class yields a length-1 Series and the bar/pie calls
# no longer line up with the two hard-coded names.
counts = df['label'].value_counts().reindex(list(class_names), fill_value=0)

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Count plot
bars = axes[0].bar(labels, counts.to_numpy(), color=class_colors)
axes[0].set_title('Email Distribution by Class')
axes[0].set_ylabel('Count')
# bar_label offsets the text in points relative to each bar, so the annotation
# stays just above the bar whatever the dataset size (a fixed data-unit offset
# overlaps the bar on large datasets and floats far above it on small ones).
axes[0].bar_label(bars, labels=[str(v) for v in counts], padding=3)

# Pie chart
axes[1].pie(counts, labels=labels, autopct='%1.1f%%', colors=class_colors)
axes[1].set_title('Email Distribution (Percentage)')

plt.tight_layout()
plt.show()

## 3. Text Analysis

In [None]:
# Calculate text lengths
# Character count and whitespace-delimited word count for every email body.
email_text = df['email_text']
df['text_length'] = email_text.str.len()
df['word_count'] = email_text.str.split().str.len()

# Statistics by class
for heading, column in [("Text length statistics:", 'text_length'),
                        ("\nWord count statistics:", 'word_count')]:
    print(heading)
    print(df.groupby('label')[column].describe())

In [None]:
# Visualize text length distribution
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (label value, legend name, colour) for each class overlaid on a panel.
class_styles = [(0, 'Legitimate', 'green'), (1, 'Phishing', 'red')]

# One entry per panel: axis, column to plot, x-axis label, title.
panels = [
    (axes[0], 'text_length', 'Text Length (characters)', 'Text Length Distribution'),
    (axes[1], 'word_count', 'Word Count', 'Word Count Distribution'),
]

for ax, column, xlabel, title in panels:
    # Overlay one semi-transparent histogram per class on the same axis.
    for label, name, color in class_styles:
        subset = df.loc[df['label'] == label, column]
        ax.hist(subset, bins=50, alpha=0.6, label=name, color=color)

    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')
    ax.set_title(title)
    ax.legend()

plt.tight_layout()
plt.show()

## 4. URL and Email Analysis

In [None]:
# Count URLs in emails
def count_urls(text):
    """Count URL-like tokens in ``text``.

    A token counts as a URL when it starts with ``http://`` / ``https://`` or
    with ``www.`` (matched case-insensitively) and runs to the next whitespace.

    Args:
        text: Email body. Anything that is not a ``str`` (NaN, None, numbers)
            is treated as containing no URLs.

    Returns:
        int: Number of non-overlapping matches found in ``text``.
    """
    if not isinstance(text, str):
        return 0
    # Require '://' so plain words such as "https" or "httpd" are not counted,
    # and ignore case so "HTTP://..." / "WWW...." links are not missed.
    return len(re.findall(r'https?://\S+|www\.\S+', text, flags=re.IGNORECASE))

def count_emails(text):
    """Count email-address-like tokens in ``text``.

    A match is a run of non-whitespace characters containing an ``@`` with at
    least one non-whitespace character on each side (pattern ``\\S+@\\S+``).

    Args:
        text: Email body; a missing value (as judged by ``pd.isna``) counts
            as zero matches.

    Returns:
        int: Number of non-overlapping matches.
    """
    if not pd.isna(text):
        return sum(1 for _ in re.finditer(r'\S+@\S+', text))
    return 0

# Derive one count column per helper from the raw email text.
for column, counter in (('url_count', count_urls), ('email_count', count_emails)):
    df[column] = df['email_text'].apply(counter)

by_label = df.groupby('label')

print("URL count by class:")
print(by_label['url_count'].describe())

print("\nEmail address count by class:")
print(by_label['email_count'].describe())

In [None]:
# Visualize URL presence
# Labels are encoded 0 = legitimate, 1 = phishing (the same encoding the
# `df['label'] == 0` / `== 1` filters elsewhere in this notebook rely on).
class_names = {0: 'Legitimate', 1: 'Phishing'}
class_colors = ['green', 'red']
class_order = list(class_names)
class_labels = list(class_names.values())

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# URL count by class
# reindex pins the row order to the bar labels; a class with no rows becomes
# NaN here and is drawn as a zero-height bar below.
df_url_summary = df.groupby('label')['url_count'].mean().reindex(class_order)
axes[0].bar(class_labels, df_url_summary.fillna(0), color=class_colors)
axes[0].set_title('Average URL Count by Class')
axes[0].set_ylabel('Average URLs per Email')

# Emails with URLs
# Group the boolean "has at least one URL" flag directly rather than calling
# DataFrame.groupby(...).apply(lambda ...): the numbers are the same, the work
# is vectorised, and it avoids the warning recent pandas releases emit when
# apply operates on the grouping column.
has_url = df['url_count'] > 0
url_presence = has_url.groupby(df['label']).mean().reindex(class_order) * 100
axes[1].bar(class_labels, url_presence.fillna(0), color=class_colors)
axes[1].set_title('Percentage of Emails with URLs')
axes[1].set_ylabel('Percentage (%)')

plt.tight_layout()
plt.show()

## 5. Common Words Analysis

In [None]:
from collections import Counter
import string

def get_top_words(texts, n=20):
    """Return the ``n`` most frequent non-stopword tokens across ``texts``.

    Each text is lower-cased, every character that is neither alphabetic nor
    whitespace is replaced by a space, and the result is split on whitespace.
    Tokens that are stopwords or have two or fewer characters are discarded.

    Args:
        texts: Iterable of strings; entries for which ``pd.isna`` is true are
            skipped.
        n: Maximum number of ``(word, count)`` pairs to return.

    Returns:
        list[tuple[str, int]]: Pairs ordered from most to least frequent.
    """
    # Small hand-written English stopword list.
    ignored = frozenset((
        'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been',
        'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
        'would', 'could', 'should', 'may', 'might', 'must', 'shall',
        'to', 'of', 'in', 'for', 'on', 'with', 'at', 'by', 'from',
        'as', 'into', 'through', 'during', 'before', 'after', 'above',
        'below', 'and', 'but', 'or', 'nor', 'so', 'yet', 'both',
        'either', 'neither', 'not', 'only', 'own', 'same', 'than',
        'too', 'very', 's', 't', 'can', 'just', 'don', 'now', 'i',
        'you', 'your', 'we', 'our', 'they', 'their', 'this', 'that',
        'it', 'its', 'if', 'then', 'else', 'when', 'there', 'here',
    ))

    def _tokens(raw):
        # Simple tokenization: keep letters and whitespace, blank out the rest.
        lowered = raw.lower()
        cleaned = ''.join(
            ch if (ch.isalpha() or ch.isspace()) else ' ' for ch in lowered
        )
        return cleaned.split()

    tally = Counter()
    for entry in texts:
        if pd.isna(entry):
            continue
        tally.update(
            tok for tok in _tokens(entry)
            if len(tok) > 2 and tok not in ignored
        )

    return tally.most_common(n)

# Get top words for each class
phishing_words = get_top_words(df.loc[df['label'] == 1, 'email_text'])
legitimate_words = get_top_words(df.loc[df['label'] == 0, 'email_text'])

# Print both rankings with the same "  word: count" layout.
for heading, ranked in (("Top words in PHISHING emails:", phishing_words),
                        ("\nTop words in LEGITIMATE emails:", legitimate_words)):
    print(heading)
    for word, count in ranked:
        print(f"  {word}: {count}")

In [None]:
# Visualize top words
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# One entry per panel: axis, ranked (word, count) pairs, bar colour, title.
panels = [
    (axes[0], phishing_words, 'red', 'Top Words in Phishing Emails'),
    (axes[1], legitimate_words, 'green', 'Top Words in Legitimate Emails'),
]

for ax, ranked, color, title in panels:
    ax.set_xlabel('Frequency')
    ax.set_title(title)

    top = ranked[:15]
    if not top:
        # zip(*[]) produces nothing to unpack and would raise ValueError, so a
        # class with no words gets a placeholder instead of aborting the cell.
        ax.text(0.5, 0.5, 'No words to display',
                ha='center', va='center', transform=ax.transAxes)
        continue

    # Distinct local names so the `counts` Series from the label-distribution
    # cell is not overwritten by a tuple here.
    top_terms, term_counts = zip(*top)
    ax.barh(top_terms, term_counts, color=color, alpha=0.7)
    # Most frequent word at the top of the chart.
    ax.invert_yaxis()

plt.tight_layout()
plt.show()

## 6. Sample Emails

In [None]:
# Display sample phishing emails
print("=" * 50)
print("SAMPLE PHISHING EMAILS")
print("=" * 50)

# Drop rows without text: len() on a NaN body raises TypeError.
phishing_texts = df.loc[df['label'] == 1, 'email_text'].dropna()
# Never request more rows than exist: sample(3) raises ValueError on fewer.
phishing_sample_size = min(3, len(phishing_texts))

if phishing_sample_size == 0:
    print("\n(no phishing emails with text to display)")
else:
    # Fixed seed so Restart & Run All shows the same examples every time.
    phishing_sample = phishing_texts.sample(n=phishing_sample_size, random_state=42)
    for i, text in phishing_sample.items():
        print(f"\n--- Email #{i} ---")
        # Truncate long bodies to the first 500 characters.
        preview = text[:500] + "..." if len(text) > 500 else text
        print(preview)
        print()

In [None]:
# Display sample legitimate emails
print("=" * 50)
print("SAMPLE LEGITIMATE EMAILS")
print("=" * 50)

# Drop rows without text: len() on a NaN body raises TypeError.
legitimate_texts = df.loc[df['label'] == 0, 'email_text'].dropna()
# Never request more rows than exist: sample(3) raises ValueError on fewer.
legitimate_sample_size = min(3, len(legitimate_texts))

if legitimate_sample_size == 0:
    print("\n(no legitimate emails with text to display)")
else:
    # Fixed seed so Restart & Run All shows the same examples every time.
    legitimate_sample = legitimate_texts.sample(n=legitimate_sample_size, random_state=42)
    for i, text in legitimate_sample.items():
        print(f"\n--- Email #{i} ---")
        # Truncate long bodies to the first 500 characters.
        preview = text[:500] + "..." if len(text) > 500 else text
        print(preview)
        print()

## 7. Summary

### Key Findings

Document your key findings here after running the analysis:

1. **Class Balance**: [Balanced/Imbalanced?]
2. **Text Length**: [How do phishing vs legitimate compare?]
3. **URL Presence**: [More URLs in phishing emails?]
4. **Key Words**: [What distinguishes phishing from legitimate?]