<a href="https://colab.research.google.com/github/OlajideFemi/Carbon-Footprint/blob/main/Mixture_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# Inputs: V (N x D with values {0,1} and np.nan for missing), H=2
# Outputs: pi (H,), theta (H x D), responsibilities R (N x H)

init pi = np.ones(H)/H
init theta = rng.uniform(0.25, 0.75, size=(H, D))

for iter in range(max_iters):
    # E-step
    logR = np.log(pi)[None, :]  # (1 x H) broadcast to (N x H)
    for k in range(H):
        # add sum over observed dims: v*log(theta) + (1-v)*log(1-theta)
        term = 0
        for d in range(D):
            obs = ~np.isnan(V[:, d])
            v = V[obs, d]
            loglik = v*np.log(theta[k, d]) + (1 - v)*np.log(1 - theta[k, d])
            logR[obs, k] += loglik
    # normalize
    R = softmax(logR, axis=1)

    # M-step
    Nk = R.sum(axis=0)              # (H,)
    pi = Nk / N
    for k in range(H):
        for d in range(D):
            obs = ~np.isnan(V[:, d])
            num = (R[obs, k] * V[obs, d]).sum()
            den = (R[obs, k]).sum()
            if den > 0:
                theta[k, d] = num / den
            # else: keep theta[k,d] or apply prior


SyntaxError: invalid syntax (ipython-input-3667588727.py, line 4)

In [1]:
import numpy as np
import pandas as pd

# Set a seed for reproducibility
np.random.seed(42)

# Parameters
num_respondents = 150
num_questions = 5
missing_percentage = 0.30

# Generate a base dataset with 'yes' (1) and 'no' (0) responses
# We use a random binomial distribution for this example
data = np.random.randint(2, size=(num_respondents, num_questions))
df = pd.DataFrame(data, columns=[f'Q{i+1}' for i in range(num_questions)])

# Convert numerical responses to 'Yes' and 'No' for clarity
df = df.replace({1: 'Yes', 0: 'No'})

# Introduce missing data
missing_mask = np.random.rand(num_respondents, num_questions) < missing_percentage
df_missing = df.mask(missing_mask, pd.NA)

# Add a 'Respondent_ID' column
df_missing.insert(0, 'Respondent_ID', range(1, num_respondents + 1))

# Save the DataFrame to a CSV file
output_filename = 'questionnaire_data.csv'
df_missing.to_csv(output_filename, index=False)

print(f"Dataset generated and saved as '{output_filename}'")
print("\nFirst 5 rows of the generated dataset:")
print(df_missing.head())
print(f"\nTotal missing values: {df_missing.isnull().sum().sum()}")
print(f"Total possible values: {num_respondents * num_questions}")

Dataset generated and saved as 'questionnaire_data.csv'

First 5 rows of the generated dataset:
   Respondent_ID    Q1    Q2    Q3   Q4    Q5
0              1    No   Yes  <NA>   No  <NA>
1              2  <NA>    No    No   No   Yes
2              3    No  <NA>  <NA>   No   Yes
3              4    No   Yes   Yes  Yes    No
4              5   Yes    No   Yes  Yes   Yes

Total missing values: 237
Total possible values: 750


In [3]:
df_missing.shape

(150, 6)

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Set a seed for reproducibility
np.random.seed(42)

# --- 1. Load and preprocess the data ---
file_path = 'questionnaire_data.csv'
df = pd.read_csv(file_path, index_col='Respondent_ID')

# Convert 'Yes'/'No' to 1/0 and keep missing values as np.nan
V = df.replace({'Yes': 1, 'No': 0}).values.astype(float)
N, D = V.shape
H = 2  # Number of components/clusters

print("Data loaded and preprocessed.")
print(f"Dataset shape: {V.shape}")
print(f"Number of respondents (N): {N}")
print(f"Number of questions (D): {D}")
print("-" * 50)

# --- 2. EM Algorithm Implementation ---

def run_em(V, H, max_iters=100, tol=1e-6):
    """
    Runs the EM algorithm for a product-of-Bernoulli mixture.
    V: data matrix (N x D) with 1/0 and np.nan.
    H: number of components.
    """
    # Initialization
    pi = np.ones(H) / H
    theta = np.random.uniform(0.25, 0.75, size=(H, D))
    log_likelihoods = []

    for i in range(max_iters):
        # E-step: Compute responsibilities (rnk)
        log_R = np.log(pi)[None, :]  # (1, H) broadcast to (N, H)
        for k in range(H):
            for d in range(D):
                # Identify observed data for question d
                observed_mask = ~np.isnan(V[:, d])
                v_obs = V[observed_mask, d]

                # Calculate log-likelihood term for component k and question d
                log_theta_k_d = np.log(np.clip(theta[k, d], 1e-6, 1 - 1e-6))
                log_1_minus_theta_k_d = np.log(np.clip(1 - theta[k, d], 1e-6, 1 - 1e-6))

                log_lik_term = v_obs * log_theta_k_d + (1 - v_obs) * log_1_minus_theta_k_d

                # Add term to the log-responsibility for observed data points
                log_R[observed_mask, k] += log_lik_term

        # Normalize log responsibilities using log-sum-exp trick for stability
        log_R -= np.logaddexp.reduce(log_R, axis=1)[:, np.newaxis]
        R = np.exp(log_R)

        # M-step: Update parameters (pi, theta)
        sum_R = R.sum(axis=0)  # sum over N (H,)
        pi = sum_R / N

        for k in range(H):
            for d in range(D):
                observed_mask = ~np.isnan(V[:, d])

                # Numerator: sum(r_nk * v_nd) for observed data
                numerator = (R[observed_mask, k] * V[observed_mask, d]).sum()

                # Denominator: sum(r_nk) for observed data
                denominator = R[observed_mask, k].sum()

                if denominator > 0:
                    theta[k, d] = numerator / denominator
                # else: keep previous theta or apply prior (not implemented here)

        # Calculate incomplete log-likelihood for convergence check
        log_likelihood = np.sum(np.log(np.sum(np.exp(log_R), axis=1)))
        log_likelihoods.append(log_likelihood)

        # Check for convergence
        if i > 1 and np.abs(log_likelihoods[-1] - log_likelihoods[-2]) < tol:
            print(f"EM converged at iteration {i+1}.")
            break

    return pi, theta, R, log_likelihoods

# Run the EM algorithm
pi_final, theta_final, R_final, log_likelihoods = run_em(V, H)

# --- 3. Analysis and Visualization ---

print("\n--- EM Results ---")
print("Final Mixture Weights (π):", np.round(pi_final, 4))
print("Final Bernoulli Probabilities (θ):\n", np.round(theta_final, 4))
print(f"Total iterations: {len(log_likelihoods)}")

# Plotting the posterior probabilities (like Fig 20.4/20.5)
# Sort respondents based on their posterior probability for one cluster (e.g., cluster 2)
p_h2_given_v = R_final[:, 1]
sorted_indices = np.argsort(p_h2_given_v)
sorted_posteriors = p_h2_given_v[sorted_indices]
sorted_respondent_ids = df.index[sorted_indices]

plt.figure(figsize=(10, 6))
plt.plot(sorted_posteriors, marker='.', linestyle='none', color='skyblue', label='Posterior P(h=2 | v)')
plt.title('Posterior Probability of Cluster 2 Membership for Each Respondent')
plt.xlabel('Respondent (sorted by posterior probability)')
plt.ylabel('Posterior Probability')
plt.grid(True, linestyle='--')
plt.legend()
plt.tight_layout()
plt.show()

# Assign hard labels
hard_labels = R_final.argmax(axis=1)
print(f"\nExample of hard cluster assignments (first 10 respondents):\n{hard_labels[:10]}")

# You can now use these hard labels for further analysis or classification.
# For example, to find the number of respondents in each cluster:
cluster_counts = pd.Series(hard_labels).value_counts().sort_index()
print("\nNumber of respondents in each cluster:")
print(cluster_counts)

Data loaded and preprocessed.
Dataset shape: (150, 5)
Number of respondents (N): 150
Number of questions (D): 5
--------------------------------------------------


  V = df.replace({'Yes': 1, 'No': 0}).values.astype(float)


IndexError: boolean index did not match indexed array along axis 0; size of axis is 1 but size of corresponding boolean axis is 150

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# --- 1. Load and preprocess the data ---
file_path = 'questionnaire_data.csv'
df = pd.read_csv(file_path, index_col='Respondent_ID')

# Convert 'Yes'/'No' to 1/0 and keep missing values as np.nan
V = df.replace({'Yes': 1, 'No': 0}).values.astype(float)
N, D = V.shape
H = 2  # Number of components/clusters

print("Data loaded and preprocessed.")
print(f"Dataset shape: {V.shape}")
print(f"Number of respondents (N): {N}")
print(f"Number of questions (D): {D}")
print("-" * 50)

# --- 2. Corrected EM Algorithm Implementation ---

def run_em(V, H, max_iters=100, tol=1e-6):
    """
    Runs the EM algorithm for a product-of-Bernoulli mixture.
    V: data matrix (N x D) with 1/0 and np.nan.
    H: number of components.
    """
    # Initialization
    pi = np.ones(H) / H
    theta = np.random.uniform(0.25, 0.75, size=(H, D))
    log_likelihoods = []

    for i in range(max_iters):
        # E-step: Compute responsibilities (rnk)
        log_R = np.log(pi)[None, :]  # (1, H) broadcast to (N, H)
        for k in range(H):
            for d in range(D):
                # Identify observed data for question d
                observed_mask = ~np.isnan(V[:, d])
                v_obs = V[observed_mask, d]

                # Calculate log-likelihood term for component k and question d
                log_theta_k_d = np.log(np.clip(theta[k, d], 1e-6, 1 - 1e-6))
                log_1_minus_theta_k_d = np.log(np.clip(1 - theta[k, d], 1e-6, 1 - 1e-6))

                log_lik_term = v_obs * log_theta_k_d + (1 - v_obs) * log_1_minus_theta_k_d

                # CORRECTED LINE: Add term to the log-responsibility only for observed data points
                log_R[observed_mask, k] += log_lik_term

        # Normalize log responsibilities using log-sum-exp trick for stability
        log_R -= np.logaddexp.reduce(log_R, axis=1)[:, np.newaxis]
        R = np.exp(log_R)

        # M-step: Update parameters (pi, theta)
        sum_R = R.sum(axis=0)  # sum over N (H,)
        pi = sum_R / N

        for k in range(H):
            for d in range(D):
                observed_mask = ~np.isnan(V[:, d])

                # Numerator: sum(r_nk * v_nd) for observed data
                numerator = (R[observed_mask, k] * V[observed_mask, d]).sum()

                # Denominator: sum(r_nk) for observed data
                denominator = R[observed_mask, k].sum()

                if denominator > 0:
                    theta[k, d] = numerator / denominator
                # else: keep previous theta or apply prior (not implemented here)

        # Calculate incomplete log-likelihood for convergence check
        log_likelihood = np.sum(np.log(np.sum(np.exp(log_R), axis=1)))
        log_likelihoods.append(log_likelihood)

        # Check for convergence
        if i > 1 and np.abs(log_likelihoods[-1] - log_likelihoods[-2]) < tol:
            print(f"EM converged at iteration {i+1}.")
            break

    return pi, theta, R, log_likelihoods

# Run the EM algorithm
pi_final, theta_final, R_final, log_likelihoods = run_em(V, H)

# --- 3. Analysis and Visualization ---

print("\n--- EM Results ---")
print("Final Mixture Weights (π):", np.round(pi_final, 4))
print("Final Bernoulli Probabilities (θ):\n", np.round(theta_final, 4))
print(f"Total iterations: {len(log_likelihoods)}")

# Plotting the posterior probabilities (like Fig 20.4/20.5)
p_h2_given_v = R_final[:, 1]
sorted_indices = np.argsort(p_h2_given_v)
sorted_posteriors = p_h2_given_v[sorted_indices]

plt.figure(figsize=(10, 6))
plt.plot(sorted_posteriors, marker='.', linestyle='none', color='skyblue', label='Posterior P(h=2 | v)')
plt.title('Posterior Probability of Cluster 2 Membership for Each Respondent')
plt.xlabel('Respondent (sorted by posterior probability)')
plt.ylabel('Posterior Probability')
plt.grid(True, linestyle='--')
plt.legend()
plt.tight_layout()
plt.show()

# Assign hard labels
hard_labels = R_final.argmax(axis=1)
print(f"\nExample of hard cluster assignments (first 10 respondents):\n{hard_labels[:10]}")

# You can now use these hard labels for further analysis or classification.
cluster_counts = pd.Series(hard_labels).value_counts().sort_index()
print("\nNumber of respondents in each cluster:")
print(cluster_counts)

Data loaded and preprocessed.
Dataset shape: (150, 5)
Number of respondents (N): 150
Number of questions (D): 5
--------------------------------------------------


  V = df.replace({'Yes': 1, 'No': 0}).values.astype(float)


IndexError: boolean index did not match indexed array along axis 0; size of axis is 1 but size of corresponding boolean axis is 150

In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# --- 1. Load and preprocess the data ---
file_path = 'questionnaire_data.csv'
df = pd.read_csv(file_path, index_col='Respondent_ID')

# Convert 'Yes'/'No' to 1/0 and keep missing values as np.nan
V = df.replace({'Yes': 1, 'No': 0}).values.astype(float)
N, D = V.shape
H = 2  # Number of components/clusters

print("Data loaded and preprocessed.")
print(f"Dataset shape: {V.shape}")
print(f"Number of respondents (N): {N}")
print(f"Number of questions (D): {D}")
print("-" * 50)

# --- 2. Corrected EM Algorithm Implementation ---

def run_em(V, H, max_iters=100, tol=1e-6):
    """
    Runs the EM algorithm for a product-of-Bernoulli mixture.
    V: data matrix (N x D) with 1/0 and np.nan.
    H: number of components.
    """
    # Initialization
    pi = np.ones(H) / H
    theta = np.random.uniform(0.25, 0.75, size=(H, D))
    log_likelihoods = []

    for i in range(max_iters):
        # E-step: Compute responsibilities (rnk)
        # CORRECTED: Initialize log_R with full N x H shape here.
        log_R = np.log(pi)[None, :]  # (1, H) broadcast to (N, H)

        for k in range(H):
            for d in range(D):
                # Identify observed data for question d
                observed_mask = ~np.isnan(V[:, d])
                v_obs = V[observed_mask, d]

                # Calculate log-likelihood term for component k and question d
                log_theta_k_d = np.log(np.clip(theta[k, d], 1e-6, 1 - 1e-6))
                log_1_minus_theta_k_d = np.log(np.clip(1 - theta[k, d], 1e-6, 1 - 1e-6))
                log_lik_term = v_obs * log_theta_k_d + (1 - v_obs) * log_1_minus_theta_k_d

                # Add term to the log-responsibility only for observed data points
                log_R[observed_mask, k] += log_lik_term

        # Normalize log responsibilities using log-sum-exp trick for stability
        log_R -= np.logaddexp.reduce(log_R, axis=1)[:, np.newaxis]
        R = np.exp(log_R)

        # M-step: Update parameters (pi, theta)
        sum_R = R.sum(axis=0)  # sum over N (H,)
        pi = sum_R / N

        for k in range(H):
            for d in range(D):
                observed_mask = ~np.isnan(V[:, d])

                # Numerator: sum(r_nk * v_nd) for observed data
                numerator = (R[observed_mask, k] * V[observed_mask, d]).sum()

                # Denominator: sum(r_nk) for observed data
                denominator = R[observed_mask, k].sum()

                if denominator > 0:
                    theta[k, d] = numerator / denominator
                # else: keep previous theta or apply prior (not implemented here)

        # Calculate incomplete log-likelihood for convergence check
        log_likelihood = np.sum(np.log(np.sum(np.exp(log_R), axis=1)))
        log_likelihoods.append(log_likelihood)

        # Check for convergence
        if i > 1 and np.abs(log_likelihoods[-1] - log_likelihoods[-2]) < tol:
            print(f"EM converged at iteration {i+1}.")
            break

    return pi, theta, R, log_likelihoods

# Run the EM algorithm
pi_final, theta_final, R_final, log_likelihoods = run_em(V, H)

# --- 3. Analysis and Visualization ---

print("\n--- EM Results ---")
print("Final Mixture Weights (π):", np.round(pi_final, 4))
print("Final Bernoulli Probabilities (θ):\n", np.round(theta_final, 4))
print(f"Total iterations: {len(log_likelihoods)}")

# Plotting the posterior probabilities (like Fig 20.4/20.5)
p_h2_given_v = R_final[:, 1]
sorted_indices = np.argsort(p_h2_given_v)
sorted_posteriors = p_h2_given_v[sorted_indices]

plt.figure(figsize=(10, 6))
plt.plot(sorted_posteriors, marker='.', linestyle='none', color='skyblue', label='Posterior P(h=2 | v)')
plt.title('Posterior Probability of Cluster 2 Membership for Each Respondent')
plt.xlabel('Respondent (sorted by posterior probability)')
plt.ylabel('Posterior Probability')
plt.grid(True, linestyle='--')
plt.legend()
plt.tight_layout()
plt.show()

# Assign hard labels
hard_labels = R_final.argmax(axis=1)
print(f"\nExample of hard cluster assignments (first 10 respondents):\n{hard_labels[:10]}")

# You can now use these hard labels for further analysis or classification.
cluster_counts = pd.Series(hard_labels).value_counts().sort_index()
print("\nNumber of respondents in each cluster:")
print(cluster_counts)

Data loaded and preprocessed.
Dataset shape: (150, 5)
Number of respondents (N): 150
Number of questions (D): 5
--------------------------------------------------


  V = df.replace({'Yes': 1, 'No': 0}).values.astype(float)


IndexError: boolean index did not match indexed array along axis 0; size of axis is 1 but size of corresponding boolean axis is 150