# Problem 2: EM for Bayesian Networks
### House Votes Dataset Analysis

In [213]:
from collections import defaultdict
from itertools import permutations, product

import numpy as np
import pandas as pd

## Data Preprocessing

In [214]:
data = pd.read_csv('house-votes-84.data', header=None)
data = data.iloc[:, :3]
data.columns = ['party', 'vote1', 'vote2']

data['party'] = data['party'].map({'democrat': 0, 'republican': 1})
for col in ['vote1', 'vote2']:
    data[col] = data[col].map({'y': 1, 'n': 0, '?': np.nan})

train_data = data.iloc[:300]
test_data = data.iloc[300:]

In [215]:
print(train_data.to_markdown())

|     |   party |   vote1 |   vote2 |
|----:|--------:|--------:|--------:|
|   0 |       1 |       0 |       1 |
|   1 |       1 |       0 |       1 |
|   2 |       0 |     nan |       1 |
|   3 |       0 |       0 |       1 |
|   4 |       0 |       1 |       1 |
|   5 |       0 |       0 |       1 |
|   6 |       0 |       0 |       1 |
|   7 |       1 |       0 |       1 |
|   8 |       1 |       0 |       1 |
|   9 |       0 |       1 |       1 |
|  10 |       1 |       0 |       1 |
|  11 |       1 |       0 |       1 |
|  12 |       0 |       0 |       1 |
|  13 |       0 |       1 |       1 |
|  14 |       1 |       0 |       1 |
|  15 |       1 |       0 |       1 |
|  16 |       0 |       1 |       0 |
|  17 |       0 |       1 |     nan |
|  18 |       1 |       0 |       1 |
|  19 |       0 |       1 |       1 |
|  20 |       0 |       1 |       1 |
|  21 |       0 |       1 |       1 |
|  22 |       0 |       1 |     nan |
|  23 |       0 |       1 |       1 |
|  24 |     

In [216]:
print(test_data.to_markdown())

|     |   party |   vote1 |   vote2 |
|----:|--------:|--------:|--------:|
| 300 |       1 |       0 |       0 |
| 301 |       0 |       0 |       0 |
| 302 |       1 |       0 |       0 |
| 303 |       1 |       0 |       0 |
| 304 |       1 |       0 |       1 |
| 305 |       1 |       0 |       0 |
| 306 |       1 |       0 |       0 |
| 307 |       0 |       1 |       0 |
| 308 |       1 |       0 |       0 |
| 309 |       0 |       1 |       0 |
| 310 |       1 |       0 |       0 |
| 311 |       0 |       0 |       0 |
| 312 |       0 |       1 |       1 |
| 313 |       1 |       0 |       1 |
| 314 |       1 |       0 |       1 |
| 315 |       1 |       0 |       1 |
| 316 |       0 |       0 |       0 |
| 317 |       0 |       1 |       0 |
| 318 |       0 |       0 |       0 |
| 319 |       0 |       1 |       0 |
| 320 |       0 |       0 |       1 |
| 321 |       0 |       1 |       1 |
| 322 |       0 |       1 |       1 |
| 323 |       0 |       1 |       1 |
| 324 |     

In [217]:
dags = []
for perm in permutations([0, 1, 2]):
    parents = {perm[0]: [], perm[1]: [perm[0]], perm[2]: [perm[1]]}
    dags.append(parents)

In [218]:
print(dags)

[{0: [], 1: [0], 2: [1]}, {0: [], 2: [0], 1: [2]}, {1: [], 0: [1], 2: [0]}, {1: [], 2: [1], 0: [2]}, {2: [], 0: [2], 1: [0]}, {2: [], 1: [2], 0: [1]}]


In [219]:
def initialize_cpts(dag, seed):
    """Randomly initializes CPTs as arrays for both values (0 and 1)."""
    np.random.seed(seed)
    cpts = {}
    for node in range(3):
        parents = dag[node]
        if not parents:
            # Root node: [P(0), P(1)]
            cpts[node] = np.random.dirichlet([1, 1])
        else:
            # Child node: CPT for P(node | parents)
            num_parent_configs = 2 ** len(parents)
            cpts[node] = np.random.dirichlet([1, 1], size=num_parent_configs)
    return cpts

In [220]:
def initialize_expected_counts(dag):
    """Initializes the expected counts structure for all nodes and parent configurations."""
    expected_counts = defaultdict(float)
    for node in range(3):
        parents = dag[node]
        if parents:
            for p_combo in product([0, 1], repeat=len(parents)):
                expected_counts[(node, p_combo, 1)] = 0.0
                expected_counts[(node, p_combo, 0)] = 0.0
        else:
            expected_counts[(node, (), 1)] = 0.0
            expected_counts[(node, (), 0)] = 0.0
    return expected_counts

In [221]:
def process_complete_sample(sample_vals, dag, cpts):
    """Processes a complete sample with array-based CPTs."""
    delta = defaultdict(float)
    log_likelihood = 0.0
    for node in range(3):
        parents = dag[node]
        val = int(sample_vals[node])

        # Get probability distribution
        if parents:
            p_vals = tuple(sample_vals[parents].astype(int))
            parent_config_idx = sum(p * (2 ** i) for i, p in enumerate(p_vals))
            prob_dist = cpts[node][parent_config_idx]
        else:
            prob_dist = cpts[node]

        # Access probability using val as index
        p = prob_dist[val]
        log_likelihood += np.log(p)

        # Update expected counts
        key = (node, p_vals if parents else (), val)
        delta[key] += 1
    return log_likelihood, delta

In [222]:
def process_incomplete_sample(sample_vals, dag, cpts, missing_vars):
    """Processes a sample with missing values using probabilistic imputation."""
    delta = defaultdict(float)
    log_likelihood = 0.0
    possible_combos = list(product([0, 1], repeat=len(missing_vars)))
    combo_probs, total_prob = [], 0.0

    # Compute joint probability for each imputation
    for combo in possible_combos:
        imputed = sample_vals.copy()
        for i, var in enumerate(missing_vars):
            imputed[var] = combo[i]

        joint_prob = 1.0
        for node in range(3):
            parents = dag[node]
            val = int(imputed[node])

            if parents:
                # Convert parent values to index
                p_vals = tuple(imputed[parents].astype(int))
                parent_config_index = sum([p * (2 ** i) for i, p in enumerate(p_vals)])
                prob_dist = cpts[node][parent_config_index]
            else:
                prob_dist = cpts[node]

            # Get probability for observed value
            p = prob_dist[val]
            joint_prob *= p

        combo_probs.append(joint_prob)
        total_prob += joint_prob

    if total_prob == 0:
        return 0.0, delta

    # Update expected counts based on imputation probabilities
    for combo, joint_prob in zip(possible_combos, combo_probs):
        prob = joint_prob / total_prob
        imputed = sample_vals.copy()
        for i, var in enumerate(missing_vars):
            imputed[var] = combo[i]

        for node in range(3):
            parents = dag[node]
            val = int(imputed[node])

            if parents:
                p_vals = tuple(imputed[parents].astype(int))
                parent_config_index = sum([p * (2 ** i) for i, p in enumerate(p_vals)])
            else:
                p_vals = ()

            key = (node, p_vals, val)
            delta[key] += prob

        log_likelihood += prob * np.log(joint_prob)

    return log_likelihood, delta

In [223]:
def perform_e_step(dag, train_data, cpts):
    expected_counts = initialize_expected_counts(dag)
    log_likelihood = 0.0
    for _, sample in train_data.iterrows():
        sample_vals = sample.to_numpy()
        missing = [i for i, val in enumerate(sample_vals) if np.isnan(val)]
        if not missing:
            ll, delta = process_complete_sample(sample_vals, dag, cpts)
        else:
            ll, delta = process_incomplete_sample(sample_vals, dag, cpts, missing)
        log_likelihood += ll
        for key, value in delta.items():
            expected_counts[key] += value
    return expected_counts, log_likelihood

In [224]:
def perform_m_step(expected_counts, dag):
    new_cpts = {}
    for node in range(3):
        parents = dag[node]
        if not parents:
            # Root node: [P(0), P(1)]
            count_0 = expected_counts.get((node, (), 0), 0)
            count_1 = expected_counts.get((node, (), 1), 0)
            total = count_0 + count_1
            if total == 0:
                new_cpt = np.array([0.5, 0.5])  # Laplace smoothing
            else:
                new_cpt = np.array([count_0 / total, count_1 / total])
            new_cpts[node] = new_cpt
        else:
            # Child node: [P(0 | parents), P(1 | parents)] for each parent config
            num_parent_configs = 2 ** len(parents)
            new_cpt = np.zeros((num_parent_configs, 2))
            for parent_config in product([0, 1], repeat=len(parents)):
                # Convert parent_config to an integer index
                parent_config_idx = sum(p * (2 ** i) for i, p in enumerate(parent_config))
                count_0 = expected_counts.get((node, parent_config, 0), 0)
                count_1 = expected_counts.get((node, parent_config, 1), 0)
                total = count_0 + count_1
                if total == 0:
                    new_cpt[parent_config_idx] = [0.5, 0.5]
                else:
                    new_cpt[parent_config_idx] = [count_0 / total, count_1 / total]
            new_cpts[node] = new_cpt
    return new_cpts

In [225]:
def learn_em(dag, train_data, max_iters=100, epsilon=1e-3, seed=None):
    cpts = initialize_cpts(dag, seed)
    prev_log_likelihood = -np.inf
    for _ in range(max_iters):
        expected_counts, log_likelihood = perform_e_step(dag, train_data, cpts)
        new_cpts = perform_m_step(expected_counts, dag)
        if np.abs(log_likelihood - prev_log_likelihood) < epsilon:
            break
        prev_log_likelihood = log_likelihood
        cpts = new_cpts
    return cpts

In [226]:
# Train models for each DAG
models = []
for dag in dags:
    cpts = learn_em(dag, train_data)
    models.append((dag, cpts))

In [227]:
def compare_cpts(cpts1, cpts2, tol=1e-2):
    for node in cpts1:
        if isinstance(cpts1[node], np.ndarray):
            if not np.allclose(cpts1[node], cpts2[node], atol=tol):
                return False
        else:
            if abs(cpts1[node] - cpts2[node]) > tol:
                return False
    return True

In [228]:
runs = []
for seed in [42, 99, 123, 7, 2023]:
    np.random.seed(seed)
    mask = np.random.rand(*train_data.shape) < 0.20
    train_data_masked = train_data.mask(mask)
    cpts = learn_em(dags[0], train_data_masked, seed=seed)
    runs.append(cpts)
    print(f"\n--- Run {seed} CPTs ---")
    for node in cpts:
        print(f"Node {node}: {cpts[node]}")

# Compare CPTs
print("\nDo different EM runs produce different models?")
all_same = True
for i in range(len(runs)):
    for j in range(i + 1, len(runs)):
        if not compare_cpts(runs[i], runs[j]):
            all_same = False
            print(f"Run {i + 1} vs Run {j + 1}: Different CPTs")
if all_same:
    print("All runs produced identical models")
else:
    print("Conclusion: Yes, different runs yield different models")


--- Run 42 CPTs ---
Node 0: [0.6203254 0.3796746]
Node 1: [[0.39483997 0.60516003]
 [0.77103091 0.22896909]]
Node 2: [[0.47574632 0.52425368]
 [0.57109615 0.42890385]]

--- Run 99 CPTs ---
Node 0: [0.61243097 0.38756903]
Node 1: [[0.42562893 0.57437107]
 [0.80302404 0.19697596]]
Node 2: [[0.48625452 0.51374548]
 [0.5460241  0.4539759 ]]

--- Run 123 CPTs ---
Node 0: [0.612683 0.387317]
Node 1: [[0.39929339 0.60070661]
 [0.80105197 0.19894803]]
Node 2: [[0.49691766 0.50308234]
 [0.53823283 0.46176717]]

--- Run 7 CPTs ---
Node 0: [0.64145229 0.35854771]
Node 1: [[0.38859526 0.61140474]
 [0.8915102  0.1084898 ]]
Node 2: [[0.51614112 0.48385888]
 [0.55911314 0.44088686]]

--- Run 2023 CPTs ---
Node 0: [0.60494777 0.39505223]
Node 1: [[0.3941908  0.6058092 ]
 [0.82648506 0.17351494]]
Node 2: [[0.44218956 0.55781044]
 [0.55308773 0.44691227]]

Do different EM runs produce different models?
Run 1 vs Run 2: Different CPTs
Run 1 vs Run 3: Different CPTs
Run 1 vs Run 4: Different CPTs
Run 1 vs

In [229]:
results = []
for dag, cpts in models:
    correct = 0
    total = 0
    for _, test_sample in test_data.iterrows():
        party_true = test_sample['party']
        if np.isnan(party_true):
            continue

        vote1 = test_sample['vote1']
        vote2 = test_sample['vote2']
        observed = {}
        if not np.isnan(vote1):
            observed[1] = int(vote1)
        if not np.isnan(vote2):
            observed[2] = int(vote2)

        prob_0 = 1.0
        prob_1 = 1.0

        node = 0
        parents = dag[node]
        if parents:
            p_vals = tuple([observed.get(p, 0) for p in parents])
            parent_config_idx = sum(p * (2 ** i) for i, p in enumerate(p_vals))
            prob_1 *= cpts[node][parent_config_idx][1]
            prob_0 *= cpts[node][parent_config_idx][0]
        else:
            prob_1 *= cpts[node][1]
            prob_0 *= cpts[node][0]

        for node in [1, 2]:
            if node not in observed:
                continue
            val = observed[node]
            parents = dag[node]
            if parents:
                p_vals = tuple([observed.get(p, 0) for p in parents])
                parent_config_idx = sum(p * (2 ** i) for i, p in enumerate(p_vals))
                p = cpts[node][parent_config_idx][val]
            else:
                p = cpts[node][val]

            prob_1 *= p
            prob_0 *= p

        total_prob = prob_0 + prob_1
        if total_prob == 0:
            predicted = 0
        else:
            predicted = 1 if (prob_1 / total_prob) > 0.5 else 0
        correct += (predicted == party_true)
        total += 1

    accuracy = correct / total if total else 0
    results.append((dag, accuracy))

In [230]:
for dag, acc in results:
    print(f"DAG: {dag}, Accuracy: {acc:.4f}")

DAG: {0: [], 1: [0], 2: [1]}, Accuracy: 0.5926
DAG: {0: [], 2: [0], 1: [2]}, Accuracy: 0.5926
DAG: {1: [], 0: [1], 2: [0]}, Accuracy: 0.6815
DAG: {1: [], 2: [1], 0: [2]}, Accuracy: 0.5926
DAG: {2: [], 0: [2], 1: [0]}, Accuracy: 0.5926
DAG: {2: [], 1: [2], 0: [1]}, Accuracy: 0.6815
