In [None]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display

In [None]:
# Print the current working directory to ensure relative file paths resolve
# as expected. `os` is imported together with the other dependencies in the
# notebook's import cell rather than here.
print(os.getcwd())

In [None]:
# Location of the dataset. The default is the original local path; set the
# EMAILS_CSV_PATH environment variable to run the notebook on another machine
# without editing this cell.
DATA_PATH = Path(
    os.environ.get("EMAILS_CSV_PATH", "/Users/zhicha/Project_Folder/Data/emails.csv")
)

df = pd.read_csv(DATA_PATH)

# Show the first 10 rows of the dataframe.
df.head(10)

In [None]:
# Dimensions of the frame together with its full column index.
(df.shape, df.columns)

In [None]:
# Structural overview: dimensions plus a peek at both ends of the column list.
print(f"Shape: {df.shape}")
print(f"\nFirst 10 columns: {df.columns[:10].tolist()}")
print(f"\nLast 5 columns: {df.columns[-5:].tolist()}")

# Number of missing cells across the whole frame.
missing = df.isna().values.sum()
print(f"\nTotal missing values: {missing}")

# Rows that are exact repeats of an earlier row (the first occurrence is not counted).
rep_rows = df.duplicated(keep="first").sum()
print(f"Duplicated rows: {rep_rows}")

# Per-column dtypes and memory footprint.
df.info()

In [None]:
# Column roles are assigned by position: the first column is taken as the
# identifier, the last as the class label, and everything in between as features.
id_col, label_col = df.columns[0], df.columns[-1]

print(f"ID column: {id_col}")
print(f"Label column: {label_col}")

feature_cols = df.columns[1:-1]
print(f"Number of feature columns: {feature_cols.size}")

In [None]:
# Count emails per class. The bar labels below are positional, so the counts
# are pinned to the order [0, 1]; a class with no rows is reported as 0 rather
# than leaving the labels and the counts with mismatched lengths.
raw_counts = df[label_col].value_counts()

# Fail loudly on any label other than 0/1 instead of silently dropping it.
unexpected = raw_counts.index.difference([0, 1])
if len(unexpected) > 0:
    raise ValueError(
        f"Unexpected label values in {label_col!r}: {unexpected.tolist()}"
    )

class_counts = raw_counts.reindex([0, 1], fill_value=0)
labels = ["Not spam(0)", "Spam (1)"]

plt.figure()
plt.bar(labels, class_counts.values)
plt.title("Class Distribution")
plt.ylabel("Count")
plt.xticks(rotation = 0)
plt.show()

print(class_counts)
# Share of emails labelled 1 among all counted emails.
print("\nSpam rate:", class_counts.get(1, 0) / class_counts.sum())

In [None]:
# Per-email total: the sum of all feature columns in each row.
# NOTE(review): this writes a new column onto `df`. The column-role cell picks
# the label/features by position, so re-running that cell after this one would
# pick up `total_word_count` as the last column - confirm run order.
df["total_word_count"] = df[feature_cols].sum(axis="columns")

# Split the totals by class label (0 -> ham, 1 -> spam).
ham_lengths = df["total_word_count"][df[label_col] == 0]
spam_lengths = df["total_word_count"][df[label_col] == 1]

# Raw totals, both classes overlaid on one histogram.
fig, ax = plt.subplots()
ax.hist(ham_lengths, bins=50, alpha=0.7, label="Not spam (0)")
ax.hist(spam_lengths, bins=50, alpha=0.7, label="Spam (1)")
ax.set_title("Total Word Count Distribution by Class")
ax.set_xlabel("Total Word Count (sum of 3000 word features)")
ax.set_ylabel("Number of Emails")
ax.legend()
plt.show()

# Same totals after a log(1 + x) transform.
fig, ax = plt.subplots()
ax.hist(np.log1p(ham_lengths), bins=50, alpha=0.7, label="Not spam (0)")
ax.hist(np.log1p(spam_lengths), bins=50, alpha=0.7, label="Spam (1)")
ax.set_title("Distribution of log(1 + total word counts)")
ax.set_xlabel("log(1 + total word count)")
ax.set_ylabel("Number of emails")
ax.legend()
plt.show()

# Median total per class.
print(f"Median total words(ham): {ham_lengths.median()}")
print(f"Median total words(spam): {spam_lengths.median()}")

In [None]:
# Column-wise mean of every feature, computed separately for each class.
spam_mean = df[feature_cols][df[label_col] == 1].mean()
ham_mean = df[feature_cols][df[label_col] == 0].mean()

# Positive values: higher mean in spam; negative values: higher mean in ham.
# Sorted so the most spam-leaning features come first.
mean_diff = spam_mean.sub(ham_mean).sort_values(ascending=False)

# Number of features to keep from each end of the ranking.
top_n = 20
top_spam_words = mean_diff.iloc[:top_n]
top_ham_words = mean_diff.iloc[-top_n:]

display(top_spam_words, top_ham_words)

In [None]:
# Spam-leaning features. The series is reversed before plotting because barh
# draws the first entry at the bottom, so the largest difference ends up on top.
fig, ax = plt.subplots()
spam_bars = top_spam_words.iloc[::-1]
ax.barh(spam_bars.index, spam_bars.values)
ax.set(
    title=f"Top {top_n} words frequent in spam (mean difference)",
    xlabel="Mean Frequency Difference (Spam - Not spam)",
    ylabel="Word Feature",
)
plt.show()

# Ham-leaning features, plotted in their existing order (most negative last,
# which barh places at the top).
fig, ax = plt.subplots()
ax.barh(top_ham_words.index, top_ham_words.values)
ax.set(
    title=f"Top {top_n} words frequent in not spam (mean difference)",
    xlabel="Mean Frequency Difference (Spam - Not spam)",
    ylabel="Word Feature",
)
plt.show()

In [None]:
X = df[feature_cols]

# Row totals; zeros are swapped for NaN so the division below never divides by zero.
row_sums = X.sum(axis="columns").replace(0, np.nan)

# Each row divided by its own total; any NaN in the result is replaced with 0.
X_rel = X.div(row_sums, axis="index").fillna(0)

# Per-class column means of the relative frequencies.
spam_mean_rel = X_rel[df[label_col] == 1].mean()
ham_mean_rel = X_rel[df[label_col] == 0].mean()

# Sorted so the most spam-leaning features come first.
mean_diff_rel = spam_mean_rel.sub(ham_mean_rel).sort_values(ascending=False)

top_spam_words_rel = mean_diff_rel.iloc[:top_n]
top_ham_words_rel = mean_diff_rel.iloc[-top_n:]

# Reverse before plotting so the largest difference is drawn at the top.
fig, ax = plt.subplots()
spam_bars_rel = top_spam_words_rel.iloc[::-1]
ax.barh(spam_bars_rel.index, spam_bars_rel.values)
ax.set(
    title=f"Top {top_n} spam-heavy words (relative frequency)",
    xlabel="Mean relative frequency difference (spam - ham)",
    ylabel="Word",
)
plt.show()