DEMO TO Joyanthan Nanduri

In [1]:
import requests

# Download the two-component sample data set and store it next to the notebook.
url = 'https://www.ccs.neu.edu/home/vip/teach/DMcourse/2_cluster_EM_mixt/HW2/2gaussian.txt'
# A timeout keeps this cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx reply so an HTML error page is never written out
# as if it were the data file (which would only surface later as a parse error).
response.raise_for_status()

with open('2gaussian.txt', 'w') as file:
    file.write(response.text)

In [2]:
import pandas as pd
from sklearn.mixture import GaussianMixture

# Load the data: whitespace-separated numeric columns, no header row.
# sep=r'\s+' is the supported replacement for the deprecated
# delim_whitespace=True keyword and parses the file the same way.
data = pd.read_csv('2gaussian.txt', sep=r'\s+', header=None)
X = data.values

# Fit Gaussian Mixture Model (random_state fixed so the fit is repeatable)
gmm = GaussianMixture(n_components=2, random_state=0)
gmm.fit(X)

# Print the parameters
print("Means:", gmm.means_)
print("Covariances:", gmm.covariances_)
print("Weights:", gmm.weights_)

Means: [[3.02812092 3.04761164]
 [7.03006483 3.9934325 ]]
Covariances: [[[1.06804592 0.01976191]
  [0.01976191 2.91181392]]

 [[0.94625701 0.48018796]
  [0.48018796 0.98610351]]]
Weights: [0.3404511 0.6595489]


  data = pd.read_csv('2gaussian.txt', delim_whitespace=True, header=None)


In [3]:
import requests

# Download the three-component sample data set and store it next to the notebook.
url = 'https://www.ccs.neu.edu/home/vip/teach/DMcourse/2_cluster_EM_mixt/HW2/3gaussian.txt'
# A timeout keeps this cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx reply so an HTML error page is never written out
# as if it were the data file (which would only surface later as a parse error).
response.raise_for_status()

with open('3gaussian.txt', 'w') as file:
    file.write(response.text)

In [4]:
import pandas as pd
from sklearn.mixture import GaussianMixture

# Load the data: whitespace-separated numeric columns, no header row.
# sep=r'\s+' is the supported replacement for the deprecated
# delim_whitespace=True keyword and parses the file the same way.
data = pd.read_csv('3gaussian.txt', sep=r'\s+', header=None)
X = data.values

# Fit Gaussian Mixture Model (random_state fixed so the fit is repeatable)
gmm = GaussianMixture(n_components=3, random_state=0)
gmm.fit(X)

# Print the parameters
print("Means:", gmm.means_)
print("Covariances:", gmm.covariances_)
print("Weights:", gmm.weights_)

Means: [[3.03154145 2.89101341]
 [7.02667135 4.02590275]
 [4.97663138 6.98270082]]
Covariances: [[[ 1.05349568 -0.03031604]
  [-0.03031604  3.02227499]]

 [[ 0.97768457  0.48539081]
  [ 0.48539081  0.98967531]]

 [[ 1.02561306  0.21625819]
  [ 0.21625819  0.99652383]]]
Weights: [0.19633294 0.29821127 0.50545579]


  data = pd.read_csv('3gaussian.txt', delim_whitespace=True, header=None)


In [5]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist
from tqdm import tqdm
import pandas as pd

In [6]:
def gaussian_pdf(x, mean, cov): #multi-gaussian
    """Evaluate a multivariate normal density at every row of ``x``.

    Parameters
    ----------
    x : 2-D array whose rows are the points to evaluate (the quadratic form
        is reduced along axis 1).
    mean : 1-D array; its length is used as the dimensionality.
    cov : square covariance matrix; it is inverted, so it must be non-singular.

    Returns
    -------
    1-D array with one density value per row of ``x``.
    """
    dim = mean.shape[0]
    determinant = np.linalg.det(cov)
    precision = np.linalg.inv(cov)
    centered = x - mean
    # Row-wise quadratic form (x - mean)^T cov^{-1} (x - mean).
    quad_form = (centered @ precision * centered).sum(axis=1)
    scale = np.power((2 * np.pi), dim / 2) * np.sqrt(determinant)
    return (1.0 / scale) * np.exp(-0.5 * quad_form)

In [7]:
def load_data(filename):
    """Read a numeric text file into a NumPy array.

    Thin wrapper around ``np.loadtxt`` called with its default settings;
    ``filename`` is passed straight through to it.
    """
    samples = np.loadtxt(filename)
    return samples

def initialize_parameters(data, K):
    """Build starting parameters for a K-component Gaussian mixture.

    Parameters
    ----------
    data : 2-D array of shape (n_samples, n_features).
    K : number of mixture components.

    Returns
    -------
    means : (K, n_features) array drawn uniformly inside the per-feature
        [min, max] bounding box of ``data``.
    covariances : (K, n_features, n_features) array of identity matrices.
    weights : length-K array of equal mixing weights (1 / K each).

    Notes
    -----
    The means are drawn from NumPy's global random state, so call
    ``np.random.seed`` beforehand for repeatable results.
    """
    _, d = data.shape
    lower = np.min(data, axis=0)
    upper = np.max(data, axis=0)
    # Scale the uniform draws into [lower, upper]. Scaling by the maximum
    # alone would place means in [0, max], which falls outside the data when
    # features are negative or sit far from the origin.
    means = lower + np.random.rand(K, d) * (upper - lower)
    covariances = np.array([np.eye(d) for _ in range(K)])  # Initialize with identity matrices
    weights = np.ones(K) / K
    return means, covariances, weights

# E-step: Compute responsibilities
def e_step(data, means, covariances, weights):
    """E-step: posterior probability of each component for each sample.

    Parameters
    ----------
    data : 2-D array of samples, one per row.
    means, covariances, weights : current mixture parameters, indexed by
        component along their first axis.

    Returns
    -------
    (N, K) array of responsibilities; each row sums to 1.
    """
    N, K = len(data), len(weights)
    responsibilities = np.zeros((N, K))
    for k in range(K):
        responsibilities[:, k] = weights[k] * gaussian_pdf(data, means[k], covariances[k])

    totals = responsibilities.sum(axis=1, keepdims=True)
    # A sample far from every component underflows to zero density under all
    # of them; normalising would then compute 0 / 0 = NaN and poison every
    # later M-step. Fall back to the mixing weights for those rows.
    underflowed = totals[:, 0] == 0
    if underflowed.any():
        responsibilities[underflowed] = np.asarray(weights, dtype=float)
        totals = responsibilities.sum(axis=1, keepdims=True)

    responsibilities /= totals
    return responsibilities

# M-step: Update parameters
def m_step(data, responsibilities):
    """M-step: re-estimate mixture parameters from the responsibilities.

    Parameters
    ----------
    data : (N, d) array of samples.
    responsibilities : (N, K) array; column k holds each sample's weight
        for component k.

    Returns
    -------
    means : (K, d) responsibility-weighted sample means.
    covariances : (K, d, d) responsibility-weighted covariances.
    weights : length-K mixing weights (column sum divided by N).
    """
    num_samples, num_features = data.shape
    num_components = responsibilities.shape[1]

    weights = np.zeros(num_components)
    means = np.zeros((num_components, num_features))
    covariances = np.zeros((num_components, num_features, num_features))

    for k, resp_k in enumerate(responsibilities.T):
        # Sum of responsibilities assigned to component k.
        effective_count = resp_k.sum()
        weights[k] = effective_count / num_samples
        means[k] = (resp_k @ data) / effective_count

        centered = data - means[k]
        # One d x d outer product per sample, then a weighted sum over samples.
        outer_products = centered[:, :, None] @ centered[:, None, :]
        weighted = resp_k[:, None, None] * outer_products
        covariances[k] = weighted.sum(axis=0) / effective_count

    return means, covariances, weights

# Log-likelihood computation
def log_likelihood(data, means, covariances, weights):
    """Total log-likelihood of ``data`` under the Gaussian mixture.

    For every sample the weighted component densities are summed, and the
    logs of those per-sample mixture densities are added together.
    """
    mixture_density = np.zeros(len(data))
    for idx in range(len(weights)):
        component_density = gaussian_pdf(data, mean=means[idx], cov=covariances[idx])
        mixture_density += weights[idx] * component_density
    return np.log(mixture_density).sum()

# EM algorithm
def em_algorithm(data, K, max_iter=250, tol=1e-6):
    """Fit a K-component Gaussian mixture to ``data`` with EM.

    Alternates E- and M-steps for at most ``max_iter`` iterations, stopping
    early once the log-likelihood changes by less than ``tol`` between two
    consecutive iterations.

    Returns
    -------
    (means, covariances, weights) from the last completed M-step.
    """
    means, covariances, weights = initialize_parameters(data, K)
    previous_ll = None  # no reference value exists on the first pass

    for _ in range(max_iter):
        posterior = e_step(data, means, covariances, weights)
        means, covariances, weights = m_step(data, posterior)
        current_ll = log_likelihood(data, means, covariances, weights)

        converged = previous_ll is not None and abs(current_ll - previous_ll) < tol
        if converged:
            break
        previous_ll = current_ll

    return means, covariances, weights


In [8]:
file_2gaussian = "2gaussian.txt"
file_3gaussian = "3gaussian.txt"

# initialize_parameters draws the starting means from NumPy's global random
# state, so seed it here: without a seed the fitted parameters (and the
# component order) change on every run. The seed value itself is arbitrary.
np.random.seed(0)

# Load data
data_2gaussian = load_data(file_2gaussian)
data_3gaussian = load_data(file_3gaussian)

# Run EM for 2-Gaussian mixture
means_2, covs_2, weights_2= em_algorithm(data_2gaussian, K=2)
print("2-Gaussian Results:")
print("Means:", means_2)
print("Covariances:", covs_2)
print("Weights:", weights_2)

# Run EM for 3-Gaussian mixture
means_3, covs_3, weights_3= em_algorithm(data_3gaussian, K=3)
print("3-Gaussian Results:")
print("Means:", means_3)
print("Covariances:", covs_3)
print("Weights:", weights_3)


2-Gaussian Results:
Means: [[7.01313621 3.98312727]
 [2.99410852 3.05209937]]
Covariances: [[[0.97478003 0.49748229]
  [0.49748229 1.00115271]]

 [[1.01019574 0.02719559]
  [0.02719559 2.93784107]]]
Weights: [0.66520818 0.33479182]
3-Gaussian Results:
Means: [[3.03973478 3.04860601]
 [5.01174455 7.00148652]
 [7.02157133 4.01546601]]
Covariances: [[[1.02853502 0.02690254]
  [0.02690254 3.3849301 ]]

 [[0.97969377 0.18514551]
  [0.18514551 0.97452511]]

 [[0.99039647 0.50095308]
  [0.50095308 0.99564799]]]
Weights: [0.20560582 0.49595885 0.29843532]
