In [None]:
# Cell 1: Setup
import sys

# Make the project's local modules importable from this notebook.
# The local imports in this cell use the `src.` package prefix, which Python resolves
# from the directory that CONTAINS `src/` (one level up), not from inside `src/` itself,
# so '..' has to be on the path. '../src' is kept so anything relying on it still works.
# NOTE(review): assumes the kernel's working directory sits next to `src/` - confirm.
for _path in ('..', '../src'):
    if _path not in sys.path:  # re-running the cell must not stack duplicate entries
        sys.path.append(_path)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from src.utils.data_loader import create_sample_email_data, load_dataset
from src.utils.visualization import plot_risk_distribution

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's 'whitegrid' style to the plots drawn in the cells below.
sns.set_style('whitegrid')

# Confirmation banner: only reached if every import above succeeded.
print("‚úÖ Libraries loaded successfully!")

In [None]:
# Cell 2: Load Data
# Build the sample email frame, report its size and columns, then preview it.
emails_df = create_sample_email_data()

email_count = emails_df.shape[0]
column_names = emails_df.columns.tolist()

print(f"üìä Loaded {email_count} emails")
print("\nüìã Columns:", column_names)
print("\nüîç First few rows:")
# Keep this as the last expression so the notebook renders the preview table.
emails_df.head()

In [None]:
# Cell 3: Basic Statistics
# Report the class balance of `emails_df` and plot it.
# The comparisons below treat label 1 as phishing and label 0 as safe.
labels = emails_df['label']
total_emails = len(emails_df)
phishing_count = int((labels == 1).sum())
safe_count = int((labels == 0).sum())
# An empty frame would otherwise raise ZeroDivisionError here.
phishing_rate = phishing_count / total_emails * 100 if total_emails else 0.0

print("üìà Dataset Statistics:")
print(f"Total emails: {total_emails}")
print(f"Phishing emails: {phishing_count}")
print(f"Safe emails: {safe_count}")
print(f"\nPhishing rate: {phishing_rate:.1f}%")

# Visualize distribution
# value_counts() orders by frequency, so without a fixed order the first (green) bar
# would be whichever class is larger. Reindexing pins the order to 0 (safe) then
# 1 (phishing) so the colours always match the axis label, and keeps a zero-height
# bar for a class that happens to be absent.
label_counts = labels.value_counts().reindex([0, 1], fill_value=0)
label_counts.plot(kind='bar', color=['green', 'red'])
plt.title('Email Distribution')
plt.xlabel('Label (0=Safe, 1=Phishing)')
plt.ylabel('Count')
plt.xticks(rotation=0)
plt.show()

In [None]:
# Cell 4: Text Analysis
# Compare body and subject lengths (in characters) between the two label groups.
print("üìù Email Text Analysis:")

# Average email length
# .str.len() is the vectorised equivalent of .apply(len) and yields NaN for a missing
# body/subject instead of raising TypeError; the group means below skip those NaNs.
emails_df['body_length'] = emails_df['body'].str.len()
emails_df['subject_length'] = emails_df['subject'].str.len()

print("\nAverage body length:")
print(emails_df.groupby('label')['body_length'].mean())

print("\nAverage subject length:")
print(emails_df.groupby('label')['subject_length'].mean())

# Visualize
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# One boxplot panel per length column; the loop replaces two copy-pasted blocks.
panels = (
    ('body_length', 'Body Length by Email Type'),
    ('subject_length', 'Subject Length by Email Type'),
)
for ax, (column, title) in zip(axes, panels):
    emails_df.boxplot(column=column, by='label', ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Label')
    ax.set_ylabel('Length')

# pandas adds an automatic "Boxplot grouped by label" figure title when `by` is used;
# clear it so only the per-panel titles remain.
fig.suptitle('')
plt.tight_layout()
plt.show()

In [None]:
# Cell 5: Word Frequency Analysis
from collections import Counter
import re

def get_word_freq(texts):
    """Count how often each lower-cased word occurs across ``texts``.

    A word is any run of ``\\w`` characters (letters, digits, underscore)
    delimited by word boundaries; matching is done on the lower-cased text.

    Args:
        texts: Iterable of text entries. Entries that are not strings (for
            example ``None`` or the ``NaN`` pandas uses for missing values)
            are skipped instead of raising ``AttributeError``.

    Returns:
        collections.Counter mapping each word to its number of occurrences.
    """
    counts = Counter()
    for text in texts:
        if not isinstance(text, str):
            continue  # missing / non-text entry: nothing to tokenize
        # Update incrementally rather than building one large list of every word first.
        counts.update(re.findall(r'\b\w+\b', text.lower()))
    return counts

# Word counts per class, taken from the email bodies (label 1 = phishing, 0 = safe).
phishing_bodies = emails_df[emails_df['label'] == 1]['body']
safe_bodies = emails_df[emails_df['label'] == 0]['body']
phishing_words = get_word_freq(phishing_bodies)
safe_words = get_word_freq(safe_bodies)

# Print the ten most frequent words for each class under its own heading.
reports = (
    ("üî¥ Top 10 words in Phishing emails:", phishing_words),
    ("\nüü¢ Top 10 words in Safe emails:", safe_words),
)
for heading, frequencies in reports:
    print(heading)
    for word, count in frequencies.most_common(10):
        print(f"  {word}: {count}")


In [None]:
# Cell 6: Summary
# Closing recap, printed one line at a time.
summary_lines = (
    "‚úÖ Data Exploration Complete!",
    "\nüìä Key Findings:",
    "1. Dataset contains both phishing and safe emails",
    "2. Phishing emails tend to have specific characteristics",
    "3. Text length and word patterns differ between classes",
    "\n‚û°Ô∏è Next: Model training in 02_email_analysis.ipynb",
)
for line in summary_lines:
    print(line)