<a href="https://colab.research.google.com/github/Rohil72/ML_LAB/blob/main/MLLAB12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from ucimlrepo import fetch_ucirepo


# --- Load Hepatitis C dataset directly from UCI ---
# UCI dataset ID for "Hepatitis C Virus (HCV) for Egyptian patients" = 571
# Reference: https://archive.ics.uci.edu/ml/datasets/HCV+data
hcv = fetch_ucirepo(id=571)


# Extract features (X) and target (y)
df = hcv.data.original.copy()


# --- Data Preprocessing ---
df = df.replace('?', np.nan)


# Convert numeric columns to float
numeric_cols = ['ALB', 'ALP', 'ALT', 'AST', 'BIL', 'CHE', 'CHOL', 'CREA', 'CGT', 'PROT']
for col in numeric_cols:
   df[col] = pd.to_numeric(df[col])


# Drop missing rows
df = df.dropna()


# Encode 'Sex'
df['Sex'] = df['Sex'].map({'m': 1, 'f': 0})


# Fix the 'Category' column format — same as before
df['Category'] = df['Category'].str.replace('0s=', '0=').str.split('=').str[0].astype(int)


# Split features and labels
X = df.drop('Category', axis=1)
y_true = df['Category']


# --- Standardization ---
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


# --- EM Algorithm via Gaussian Mixture Model ---
n_components = 5  # There are 5 known classes (0–4)
gmm = GaussianMixture(n_components=n_components, random_state=42, max_iter=100)
gmm.fit(X_scaled)


# Predict cluster assignments
gmm_clusters = gmm.predict(X_scaled)


# Get responsibilities (posterior probabilities)
responsibilities = gmm.predict_proba(X_scaled)


# --- Display Results ---
print("--- Expectation-Maximization (Gaussian Mixture Model) Results ---")
print(f"Number of Iterations to Converge: {gmm.n_iter_}")
print(f"Final Log-Likelihood: {gmm.lower_bound_:.4f}")


print("\nModel Parameters (Estimated by EM):")
print(f"Means of the {n_components} Gaussian Components (First 5 features):\n{gmm.means_[:, :5].round(2)}")
print(f"Weights (Priors) of the Components: {gmm.weights_.round(2)}")


print("\nCluster Assignment Comparison (True Categories vs GMM Clusters):")
print("Note: Cluster labels are arbitrary and may not map directly to true categories.")
print(pd.crosstab(y_true, gmm_clusters, rownames=['True Category'], colnames=['GMM Cluster']))


print(f"\nSample Responsibilities (First 3 samples): \n{responsibilities[:3].round(4)}")

--- Expectation-Maximization (Gaussian Mixture Model) Results ---
Number of Iterations to Converge: 17
Final Log-Likelihood: -3.9794

Model Parameters (Estimated by EM):
Means of the 5 Gaussian Components (First 5 features):
[[ 0.26 -0.14  0.79  0.3  -0.51]
 [ 0.74 -0.02 -1.27 -0.13 -0.04]
 [-0.79 -0.05  0.79  0.24  0.05]
 [ 1.61  0.7  -0.15 -1.63  0.58]
 [ 1.69 -0.14  0.79 -1.15  1.6 ]]
Weights (Priors) of the Components: [0.09 0.36 0.49 0.06 0.  ]

Cluster Assignment Comparison (True Categories vs GMM Clusters):
Note: Cluster labels are arbitrary and may not map directly to true categories.
GMM Cluster     0    1    2   3  4
True Category                     
0              29  208  289   7  0
1              14    0    0   6  0
2               8    1    0   3  0
3               2    1    1  19  1

Sample Responsibilities (First 3 samples): 
[[9.000e-04 0.000e+00 9.991e-01 0.000e+00 0.000e+00]
 [1.000e-04 0.000e+00 9.999e-01 0.000e+00 0.000e+00]
 [7.474e-01 0.000e+00 2.526e-01 0.000e+