# 02 — Modeling: Spam Emails 📧🚨

<p align="left">
  <img alt="ID3 Algorithm" src="https://img.shields.io/badge/ID3-Decision%20Tree-0A81D1">
  <img alt="Status" src="https://img.shields.io/badge/Notebook-Modeling-1e90ff">
</p>

**Purpose**: Validate the ID3 implementation on a toy spam/ham dataset (`emails.csv`), inspect the tree, and visualize results.  
**Author**: <span style="color:#FF6B6B"><b>Noëlla Buti</b></span>

---

### 🛠️ Workflow
- 📥 Load dataset (`emails.csv`)  
- 🔢 Entropy & information gain  
- 🌳 Build ID3 (from scratch)  
- 👀 Pretty-print & Graphviz  
- ✅ Training accuracy  

<details>
  <summary><b>📁 Artifacts (click to expand)</b></summary>

- Tree PNG (Drive):  
  <code>/content/drive/MyDrive/id3-census-income/reports/assets/emails_tree.png</code>  

- Notebook (Drive):  
  <code>/content/drive/MyDrive/id3-census-income/notebooks/02_emails.ipynb</code>  

</details>

---

### 🚦 Results Snapshot
- **Likely best split**: `SUSPICIOUS WORDS`  
- **Tree**: shallow & interpretable  
- **Accuracy (train)**: 100% (1.0) on this tiny toy set — measured on the same rows the tree was built from, so it is not an estimate of generalization  

> 💡 Binary indicators often make ID3 trees very compact.

In [1]:
# Mount Google Drive so the notebook can read and write files stored there.
from google.colab import drive
drive.mount('/content/drive')
# Switch the working directory to the Drive root; relative paths below resolve against it.
%cd /content/drive/MyDrive/

import os
# Output directory (relative to the working directory) for the rendered tree image.
OUT_DIR = "id3-census-income/reports/assets"
# exist_ok=True keeps re-running this cell from failing when the directory already exists.
os.makedirs(OUT_DIR, exist_ok=True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive


## 1. Setup and Data

In [2]:
import pandas as pd
import numpy as np

# Load the toy spam/ham dataset (path is relative to the current working directory).
emails = pd.read_csv("data/raw/emails.csv")
# Drop the "ID" column when present; errors="ignore" makes this a no-op otherwise.
emails = emails.drop(columns=["ID"], errors="ignore")
# Preview the first rows.
emails.head()

Unnamed: 0,SUSPICIOUS WORDS,UNKNOWN SENDER,CONTAINS IMAGES,CLASS
0,True,False,True,spam
1,True,True,False,spam
2,True,True,False,spam
3,False,True,True,ham
4,False,False,False,ham


## 2. Entropy and Information Gain

In [3]:
from collections import Counter
import math

def entropy(series: pd.Series) -> float:
    """Return the Shannon entropy, in bits, of the values in *series*.

    Each distinct value is treated as one class. An empty series yields
    ``0.0``.
    """
    tally = Counter(series)
    total = sum(tally.values())
    if total == 0:
        return 0.0

    # Accumulate sum(p * log2(p)) over the classes, then flip the sign.
    weighted_logs = 0
    for count in tally.values():
        if count:
            weighted_logs += (count / total) * math.log2(count / total)
    return -weighted_logs

def information_gain(df: pd.DataFrame, feature: str, target: str) -> float:
    """Return the information gain of splitting *df* on *feature*.

    The gain is the entropy of the *target* column minus the weighted
    average entropy of *target* within each group of rows sharing one
    *feature* value. Weights are the relative frequencies reported by
    ``value_counts(normalize=True)``.
    """
    parent_entropy = entropy(df[target])
    shares = df[feature].value_counts(normalize=True)

    remainder = 0
    for value, share in shares.items():
        labels_in_group = df.loc[df[feature] == value, target]
        remainder += share * entropy(labels_in_group)

    return parent_entropy - remainder

## 3. ID3

In [4]:
def majority_label(series: pd.Series):
    """Return the most frequent value in *series*.

    When several values tie, the first entry of ``Series.mode()`` is used.
    """
    modes = series.mode()
    return modes.iloc[0]

def best_feature_by_ig(df: pd.DataFrame, target: str) -> str:
    """Return the non-target column of *df* with the highest information gain.

    Ties go to the column that appears first in ``df.columns``.
    """
    candidates = [col for col in df.columns if col != target]

    def _gain(col):
        return information_gain(df, col, target)

    # max() keeps the first candidate among equal gains.
    return max(candidates, key=_gain)

def build_id3(df: pd.DataFrame, target: str, max_depth=None, depth=0):
    """Recursively build an ID3 decision tree from *df*.

    Args:
        df: Training rows; every column other than *target* is a feature.
        target: Name of the label column.
        max_depth: Optional cap on tree depth; ``None`` means unlimited.
        depth: Depth of the node being built (used by the recursion).

    Returns:
        A bare label for a leaf, or a nested dict of the form
        ``{feature: {value: subtree}}`` for an internal node.
    """
    labels = df[target]

    # Pure node: every row carries the same label.
    if len(labels.unique()) == 1:
        return labels.iloc[0]

    # Stop on the depth limit or when no feature columns remain.
    depth_exhausted = max_depth is not None and depth >= max_depth
    has_features = any(col != target for col in df.columns)
    if depth_exhausted or not has_features:
        return majority_label(labels)

    split_on = best_feature_by_ig(df, target)
    children = {}
    # Rows whose split value is missing are not routed to any branch.
    for value in sorted(df[split_on].dropna().unique()):
        subset = df[df[split_on] == value]
        if subset.empty:
            children[value] = majority_label(labels)
        else:
            # A feature is used at most once along any root-to-leaf path.
            children[value] = build_id3(
                subset.drop(columns=[split_on]), target, max_depth, depth + 1
            )
    return {split_on: children}

## 4. Pretty-print and Predict

In [5]:
def print_tree(tree, indent=""):
    """Print *tree* as an indented text outline.

    Internal nodes are shown as ``[feature]`` followed by one entry per
    branch value; leaves are shown as ``→ label``. *indent* is the prefix
    prepended to every line printed for this subtree.
    """
    if isinstance(tree, dict):
        # An internal node holds exactly one feature -> branches mapping.
        [(feat, branches)] = tree.items()
        print(indent + f"[{feat}]")
        child_indent = indent + " │   "
        for val, subtree in branches.items():
            print(indent + f" ├─ {val}")
            print_tree(subtree, child_indent)
    else:
        print(indent + f"→ {tree}")

def _collect_leaves(node):
    leaves = []
    stack = [node]
    while stack:
        cur = stack.pop()
        if isinstance(cur, dict):
            (feat, branches), = cur.items()
            stack.extend(branches.values())
        else:
            leaves.append(cur)
    return leaves

def predict_one(tree, row: dict):
    node = tree
    while isinstance(node, dict):
        (feat, branches), = node.items()
        val = row.get(feat)
        if val in branches:
            node = branches[val]
        else:
            leaves = _collect_leaves(branches)
            return pd.Series(leaves).mode().iloc[0]
    return node

def predict(tree, X: pd.DataFrame):
    """Return a list with one predicted label per row of *X*, in row order."""
    labels = []
    for _, record in X.iterrows():
        labels.append(predict_one(tree, record.to_dict()))
    return labels

## 5. Train, Print, and Accuracy

In [6]:
# Name of the label column.
target = "CLASS"
# Build the tree from a copy of the full dataset and print it.
tree_emails = build_id3(emails.copy(), target)
print_tree(tree_emails)

# Predict on the same rows the tree was built from, so this is training accuracy.
y_true = emails[target].tolist()
y_pred = predict(tree_emails, emails.drop(columns=[target]))
# Fraction of rows where the predicted label equals the true label.
acc = (pd.Series(y_true) == pd.Series(y_pred)).mean()
print("Training accuracy:", acc)

[SUSPICIOUS WORDS]
 ├─ False
 │   → ham
 ├─ True
 │   → spam
Training accuracy: 1.0


## 6. Graphviz Export

In [7]:
from graphviz import Digraph

def draw_dict_tree(tree):
    """Convert a nested-dict decision tree into a Graphviz ``Digraph``.

    Args:
        tree: A bare leaf label or a nested ``{feature: {value: subtree}}``
            dict.

    Returns:
        A ``Digraph`` in which internal nodes are ellipses labelled with the
        feature name, leaves are boxes labelled with the class, and each
        edge is labelled with the branch value.
    """
    dot = Digraph()
    serial = 0

    def _new_id(prefix):
        # Every graph node gets its own id. Deriving the id from the feature
        # name would merge separate occurrences of the same feature into a
        # single node, so the drawing would no longer be a tree.
        nonlocal serial
        nid = f"{prefix}_{serial}"
        serial += 1
        return nid

    def _add(node, parent=None, edge_label=None):
        if isinstance(node, dict):
            (feat, branches), = node.items()
            nid = _new_id("feat")
            dot.node(nid, str(feat), shape="ellipse")
        else:
            branches = {}
            nid = _new_id("leaf")
            dot.node(nid, str(node), shape="box")
        if parent is not None:
            dot.edge(parent, nid, label=str(edge_label))
        for val, sub in branches.items():
            _add(sub, parent=nid, edge_label=val)

    _add(tree)
    return dot

# Build the Graphviz graph for the fitted tree and write it as a PNG under OUT_DIR.
dot = draw_dict_tree(tree_emails)
dot.format = "png"
# render() appends the format extension to the given path.
# NOTE(review): cleanup=True should delete the intermediate DOT source after rendering — confirm against the graphviz docs.
dot.render(f"{OUT_DIR}/emails_tree", cleanup=True)
print(f"Saved tree visualization to {OUT_DIR}/emails_tree.png")

Saved tree visualization to id3-census-income/reports/assets/emails_tree.png
