In [1]:
"""
Experiment 14:
Expectation Maximization (EM) using Gaussian Mixture Model (GMM)
Extended version using CSV dataset: exp14_em.csv
"""

# ---------------------------------------------------
# STEP 1: IMPORT LIBRARIES
# ---------------------------------------------------
import pandas as pd
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score

# ---------------------------------------------------
# STEP 2: LOAD DATASET
# ---------------------------------------------------
df = pd.read_csv("exp14_em.csv")
print("\n=== ORIGINAL DATASET ===")
print(df)

X = df[['Income', 'Spend']]   # Features

# ---------------------------------------------------
# STEP 3: CREATE EM MODEL
# ---------------------------------------------------
gmm = GaussianMixture(
    n_components=2,          # Number of clusters
    covariance_type='full',  # Most flexible
    random_state=0
)

# Fit the model
gmm.fit(X)

# ---------------------------------------------------
# STEP 4: GET OUTPUT VALUES
# ---------------------------------------------------
labels = gmm.predict(X)            # HARD CLUSTER LABELS
probs = gmm.predict_proba(X)       # SOFT PROBABILITIES
means = gmm.means_                 # CLUSTER CENTERS
log_likelihood = gmm.score(X)      # Model quality
aic = gmm.aic(X)                   # Compare models
bic = gmm.bic(X)                   # Compare models
sil_score = silhouette_score(X, labels)

# ---------------------------------------------------
# STEP 5: PRINT RESULTS
# ---------------------------------------------------
print("\n=== HARD CLUSTER LABELS (like K-Means) ===")
print(labels)

print("\n=== SOFT PROBABILITY SCORES (EM Output) ===")
print(probs)

print("\n=== CLUSTER MEANS (Centers of Gaussians) ===")
print(means)

print("\n=== PERFORMANCE METRICS ===")
print("Log Likelihood:", log_likelihood)
print("AIC:", aic)
print("BIC:", bic)
print("Silhouette Score:", sil_score)

# ---------------------------------------------------
# STEP 6: MERGE LABELS INTO DATASET
# ---------------------------------------------------
df["Cluster"] = labels
print("\n=== FINAL DATA WITH CLUSTER ASSIGNMENTS ===")
print(df)

# ---------------------------------------------------
# STEP 7: INTERPRETATION (Human Friendly)
# ---------------------------------------------------
print("\nInterpretation:")
print("• Cluster 0 represents low income & low spending customers.")
print("• Cluster 1 represents high income & high spending customers.")
print("• Probability values show soft assignments—how much a point belongs to each cluster.")
print("• Silhouette score shows clustering quality (closer to 1 = better).")



=== ORIGINAL DATASET ===
   Income  Spend
0      20     30
1      25     35
2      30     40
3      35     45
4      80     70
5      85     75
6      90     80
7      95     85

=== HARD CLUSTER LABELS (like K-Means) ===
[1 1 1 1 0 0 0 0]

=== SOFT PROBABILITY SCORES (EM Output) ===
[[0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]]

=== CLUSTER MEANS (Centers of Gaussians) ===
[[87.5 77.5]
 [27.5 37.5]]

=== PERFORMANCE METRICS ===
Log Likelihood: 2.0591477451212965
AIC: -10.946363921940744
BIC: -10.072506963462551
Silhouette Score: 0.8344314786323223

=== FINAL DATA WITH CLUSTER ASSIGNMENTS ===
   Income  Spend  Cluster
0      20     30        1
1      25     35        1
2      30     40        1
3      35     45        1
4      80     70        0
5      85     75        0
6      90     80        0
7      95     85        0

Interpretation:
• Cluster 0 represents low income & low spending customers.
• Cluster 1 represents high income & high spending customers.