# Problem 2: EM for Bayesian Networks
### House Votes Dataset Analysis

## Data Preprocessing

In [3]:
import pandas as pd

In [4]:
# Load, preprocess and encode data
def preprocess_data(df):
    df.columns = ['Party', 'Vote1', 'Vote2']
    df['Party'] = df['Party'].map({'democrat': 0, 'republican': 1})
    for col in ['Vote1', 'Vote2']:
        df[col] = df[col].map({'y': 1, 'n': 0, '?': 2})  # 2 = missing
    return df

In [5]:
train_data = preprocess_data(pd.read_csv('house-votes-84.data', header=None).iloc[:300, :3])
test_data = preprocess_data(pd.read_csv('house-votes-84.data', header=None).iloc[300:, :3])

In [6]:
print("Training Data:")
print(train_data.to_markdown())

Training Data:
|     |   Party |   Vote1 |   Vote2 |
|----:|--------:|--------:|--------:|
|   0 |       1 |       0 |       1 |
|   1 |       1 |       0 |       1 |
|   2 |       0 |       2 |       1 |
|   3 |       0 |       0 |       1 |
|   4 |       0 |       1 |       1 |
|   5 |       0 |       0 |       1 |
|   6 |       0 |       0 |       1 |
|   7 |       1 |       0 |       1 |
|   8 |       1 |       0 |       1 |
|   9 |       0 |       1 |       1 |
|  10 |       1 |       0 |       1 |
|  11 |       1 |       0 |       1 |
|  12 |       0 |       0 |       1 |
|  13 |       0 |       1 |       1 |
|  14 |       1 |       0 |       1 |
|  15 |       1 |       0 |       1 |
|  16 |       0 |       1 |       0 |
|  17 |       0 |       1 |       2 |
|  18 |       1 |       0 |       1 |
|  19 |       0 |       1 |       1 |
|  20 |       0 |       1 |       1 |
|  21 |       0 |       1 |       1 |
|  22 |       0 |       1 |       2 |
|  23 |       0 |       1 |       1

In [7]:
print("Testing Data:")
print(test_data.to_markdown())

Testing Data:
|     |   Party |   Vote1 |   Vote2 |
|----:|--------:|--------:|--------:|
| 300 |       1 |       0 |       0 |
| 301 |       0 |       0 |       0 |
| 302 |       1 |       0 |       0 |
| 303 |       1 |       0 |       0 |
| 304 |       1 |       0 |       1 |
| 305 |       1 |       0 |       0 |
| 306 |       1 |       0 |       0 |
| 307 |       0 |       1 |       0 |
| 308 |       1 |       0 |       0 |
| 309 |       0 |       1 |       0 |
| 310 |       1 |       0 |       0 |
| 311 |       0 |       0 |       0 |
| 312 |       0 |       1 |       1 |
| 313 |       1 |       0 |       1 |
| 314 |       1 |       0 |       1 |
| 315 |       1 |       0 |       1 |
| 316 |       0 |       0 |       0 |
| 317 |       0 |       1 |       0 |
| 318 |       0 |       0 |       0 |
| 319 |       0 |       1 |       0 |
| 320 |       0 |       0 |       1 |
| 321 |       0 |       1 |       1 |
| 322 |       0 |       1 |       1 |
| 323 |       0 |       1 |       1 

In [8]:
print("Train Data Length:", len(train_data))
print("Test Data Length:", len(test_data))

Train Data Length: 300
Test Data Length: 135


In [9]:
train_data = train_data.dropna(subset=['Party'])  # Remove rows with missing Party
test_data = test_data.dropna(subset=['Party'])  # Remove rows with missing Party

In [10]:
print("Train Data Length after dropping missing party:", len(train_data))
print("Test Data Length after dropping missing party:", len(test_data))

Train Data Length after dropping missing party: 300
Test Data Length after dropping missing party: 135


## Define DAG Structures

In [11]:
# Eight possible DAGs for 3 variables
dags = [
    {0: [], 1: [0], 2: [1]},  # Party → Vote1 → Vote2
    {0: [], 2: [0], 1: [2]},  # Party → Vote2 → Vote1
    {1: [], 0: [1], 2: [0]},  # Vote1 → Party → Vote2
    {2: [], 0: [2], 1: [0]},  # Vote2 → Party → Vote1
    {0: [], 1: [0], 2: [0]},  # Party → Vote1, Party → Vote2
    {1: [], 2: [], 0: [1, 2]},  # Vote1 → Party, Vote2 → Party
    {1: [], 0: [1], 2: [1]},  # Vote1 → Party, Vote1 → Vote2
    {2: [], 0: [2], 1: [2]},  # Vote2 → Party, Vote2 → Vote1
]

## EM Algorithm Implementation

In [12]:
import numpy as np
from itertools import product

The `learn_bn` function implements the **Expectation-Maximization (EM)** algorithm to learn the **Conditional Probability Tables (CPTs)** of a **Bayesian Network (BN)** from partially observed data.

#### **Inputs**:
- `dag`:  The **Directed Acyclic Graph (DAG)** structure defining parent-child relationships in the BN.
- `data`:  A dataset with **potential missing values** (encoded as `2`).
- `max_iter`:  The **maximum number of EM iterations** to perform.
- `eps`:  The **convergence threshold** used to determine when to stop EM iterations.


In [13]:
def learn_bn(dag, data, max_iter=100, eps=1e-3):
    nodes = [0, 1, 2]
    cpts = {}

    # Initialize CPTs
    for node in nodes:
        parents = dag[node]
        if not parents:
            cpts[node] = np.random.dirichlet([1, 1])
        else:
            num_parent_states = 2 ** len(parents)
            cpts[node] = np.random.dirichlet([1, 1], size=num_parent_states)

    print(cpts)
    for _ in range(max_iter):
        counts = {n: {} for n in nodes}
        for _, row in data.iterrows():
            observed = {}
            missing = []
            for n in nodes:
                val = row[n]
                if val in {0, 1}:
                    observed[n] = int(val)
                else:
                    missing.append(n)

            assignments = list(product([0, 1], repeat=len(missing)))
            weights = []
            for assign in assignments:
                full = observed.copy()
                full.update(zip(missing, assign))
                prob = 1.0
                for node in nodes:
                    parents = dag[node]
                    if parents:
                        parent_vals = tuple(full[p] for p in parents)
                        parent_idx = sum(v * (2 ** i) for i, v in enumerate(parent_vals))
                        prob *= cpts[node][parent_idx, full[node]]  # Now scalar!
                    else:
                        prob *= cpts[node][full[node]]  # Scalar
                weights.append(prob)
            total = sum(weights)
            if total == 0:  # Now scalar comparison
                continue

            for i, assign in enumerate(assignments):
                full = observed.copy()
                full.update(zip(missing, assign))
                weight = weights[i] / total
                for node in nodes:
                    parents = dag[node]
                    val = full[node]
                    if parents:
                        parent_vals = tuple(full[p] for p in parents)
                        parent_idx = sum(v * (2 ** i) for i, v in enumerate(parent_vals))
                        key = (parent_idx, val)
                    else:
                        key = val
                    counts[node][key] = counts[node].get(key, 0.0) + weight

        new_cpts = {}
        for node in nodes:
            parents = dag[node]
            if not parents:
                total = sum(counts[node].values())
                new_cpts[node] = np.array([
                    counts[node].get(0, 0) / total,
                    counts[node].get(1, 0) / total
                ])
            else:
                num_parent_states = 2 ** len(parents)
                new_cpt = np.zeros((num_parent_states, 2))
                for config in range(num_parent_states):
                    total = sum(counts[node].get((config, v), 0) for v in [0, 1])
                    if total == 0:
                        new_cpt[config] = cpts[node][config]
                    else:
                        new_cpt[config, 0] = counts[node].get((config, 0), 0) / total
                        new_cpt[config, 1] = counts[node].get((config, 1), 0) / total
                new_cpts[node] = new_cpt

        converged = all(np.allclose(new_cpts[n], cpts[n], atol=eps) for n in nodes)
        if converged:
            break
        cpts = new_cpts

    return cpts

#### Part 1: Learn models for each DAG

In [14]:
learned_models = []
for dag in dags:
    print(f"Training DAG: {dag}")
    model = learn_bn(dag, train_data, max_iter=100)
    learned_models.append(model)

Training DAG: {0: [], 1: [0], 2: [1]}
{0: array([0.26716497, 0.73283503]), 1: array([[0.08235062, 0.91764938],
       [0.82893966, 0.17106034]]), 2: array([[0.5502585 , 0.4497415 ],
       [0.01381604, 0.98618396]])}
Training DAG: {0: [], 2: [0], 1: [2]}
{0: array([0.50704094, 0.49295906]), 1: array([[0.61913882, 0.38086118],
       [0.41197283, 0.58802717]]), 2: array([[0.73029033, 0.26970967],
       [0.3978362 , 0.6021638 ]])}
Training DAG: {1: [], 0: [1], 2: [0]}
{0: array([[0.99141639, 0.00858361],
       [0.27315027, 0.72684973]]), 1: array([0.16859718, 0.83140282]), 2: array([[0.39434043, 0.60565957],
       [0.93644254, 0.06355746]])}
Training DAG: {2: [], 0: [2], 1: [0]}
{0: array([[0.45084828, 0.54915172],
       [0.6323296 , 0.3676704 ]]), 1: array([[0.8522372, 0.1477628],
       [0.6578368, 0.3421632]]), 2: array([0.98321889, 0.01678111])}


  val = row[n]


Training DAG: {0: [], 1: [0], 2: [0]}
{0: array([0.62489026, 0.37510974]), 1: array([[0.31595659, 0.68404341],
       [0.0185073 , 0.9814927 ]]), 2: array([[0.7489081 , 0.2510919 ],
       [0.34479172, 0.65520828]])}
Training DAG: {1: [], 2: [], 0: [1, 2]}
{0: array([[0.31405889, 0.68594111],
       [0.71469361, 0.28530639],
       [0.10171686, 0.89828314],
       [0.83165487, 0.16834513]]), 1: array([0.31018976, 0.68981024]), 2: array([0.61238591, 0.38761409])}
Training DAG: {1: [], 0: [1], 2: [1]}
{0: array([[0.98630038, 0.01369962],
       [0.33367306, 0.66632694]]), 1: array([0.8131177, 0.1868823]), 2: array([[0.95988612, 0.04011388],
       [0.32649347, 0.67350653]])}
Training DAG: {2: [], 0: [2], 1: [2]}
{0: array([[0.44584024, 0.55415976],
       [0.13727337, 0.86272663]]), 1: array([[0.45015073, 0.54984927],
       [0.91913409, 0.08086591]]), 2: array([0.41120567, 0.58879433])}


#### Part 2: Check initialization sensitivity

In [61]:
np.random.seed(42)
cpts1 = learn_bn(dags[0], train_data)
np.random.seed(123)
cpts2 = learn_bn(dags[0], train_data)
print("\nDifferent runs produce different models:", not np.allclose(cpts1[0], cpts2[0]))

  val = row[n]



Different runs produce different models: False


## Model Evaluation

In [62]:
def predict_party(cpts, dag, vote1, vote2):
    # Handle missing votes by marginalization
    prob_p0, prob_p1 = 0.0, 0.0
    missing = []
    votes = {}

    # Identify missing votes and store observed votes
    if vote1 == 2:
        missing.append(1)
    else:
        votes[1] = vote1
    if vote2 == 2:
        missing.append(2)
    else:
        votes[2] = vote2

    # Iterate over all possible assignments for missing votes
    for assign in product([0, 1], repeat=len(missing)):
        full = votes.copy()
        full.update(zip(missing, assign))  # Impute missing votes

        # === Compute P(Votes, Party=0) ===
        # Include Party=0 in the full assignment
        full_p0 = full.copy()
        full_p0[0] = 0  # Explicitly set Party=0

        # Compute P(Party=0, Votes)
        p0 = 1.0
        for node in [0, 1, 2]:  # Include Party in the computation
            parents = dag[node]
            if parents:
                # Get parent values from the full assignment
                parent_vals = tuple(full_p0[p] for p in parents)
                parent_idx = sum(v * (2 ** i) for i, v in enumerate(parent_vals))
                p0 *= cpts[node][parent_idx, full_p0[node]]
            else:
                # Root node: use prior probability
                p0 *= cpts[node][full_p0[node]]

        # === Compute P(Votes, Party=1) ===
        # Include Party=1 in the full assignment
        full_p1 = full.copy()
        full_p1[0] = 1  # Explicitly set Party=1

        # Compute P(Party=1, Votes)
        p1 = 1.0
        for node in [0, 1, 2]:  # Include Party in the computation
            parents = dag[node]
            if parents:
                parent_vals = tuple(full_p1[p] for p in parents)
                parent_idx = sum(v * (2 ** i) for i, v in enumerate(parent_vals))
                p1 *= cpts[node][parent_idx, full_p1[node]]
            else:
                p1 *= cpts[node][full_p1[node]]

        # Accumulate probabilities
        prob_p0 += p0
        prob_p1 += p1

    # Predict the party with higher probability
    return 0 if prob_p0 > prob_p1 else 1

#### Part 3: Evaluate models

In [63]:
accuracies = []
for i, (dag, cpts) in enumerate(zip(dags, learned_models)):
    correct = 0
    total = 0
    for _, row in test_data.iterrows():
        if row['Party'] not in {0, 1}:
            continue  # Skip invalid party
        vote1 = row['Vote1'] if row['Vote1'] in {0, 1} else 2
        vote2 = row['Vote2'] if row['Vote2'] in {0, 1} else 2
        pred = predict_party(cpts, dag, vote1, vote2)
        correct += (pred == row['Party'])
        total += 1
    acc = correct / total
    accuracies.append(acc)
    print(f"DAG {i + 1} Accuracy: {acc:.4f}")

DAG 1 Accuracy: 0.6815
DAG 2 Accuracy: 0.5926
DAG 3 Accuracy: 0.6815
DAG 4 Accuracy: 0.6815
DAG 5 Accuracy: 0.6815
DAG 6 Accuracy: 0.6815
DAG 7 Accuracy: 0.6815
DAG 8 Accuracy: 0.5926


## Results

In [48]:
print("\nFinal Accuracies:")
for i, acc in enumerate(accuracies):
    print(f"DAG {i + 1}: {acc:.4f}")


Final Accuracies:
DAG 1: 0.6815
DAG 2: 0.5926
DAG 3: 0.6815
DAG 4: 0.6815
DAG 5: 0.6815
DAG 6: 0.6815
DAG 7: 0.6815
DAG 8: 0.5926
