<a href="https://colab.research.google.com/github/Qu1nnD/CS290/blob/main/GaussianDistribution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Follow the examples in Chapter 9 of Hands-on Machine Learning to fit Gaussian Mixture Models to both the iris and penguins datasets. After fitting GMMs to these datasets,
Plot the centers of the Gaussian distributions in your GMM along with the centroids from your 𝑘-means modeling results. How different are they?
Calculate the overall accuracy of each GMM, and compare with the overall accuracy of your 𝑘-means models.

In [526]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
from sklearn.datasets import load_iris

In [527]:
penguins = pd.read_csv("https://github.com/mbrudd/csci290/raw/refs/heads/main/data/penguins.csv")
iris = load_iris()
iris_data = pd.DataFrame( iris.data, columns = iris.feature_names)

In [528]:
def ini_Centroids(k, dataset, centroid_type, features):
  if(centroid_type == "random"): # Makes the centroid random
    indices = np.random.choice(len(features), size=k, replace=False)
    return features.to_numpy()[indices]
  elif(centroid_type == 'arthur'): #arthur vestile method
    centroids = []
    first_centroid = features.sample(n=1)
    centroids.append(first_centroid.values[0])
    for x in range(1, k):
      distances = np.array([min(np.linalg.norm(x - np.array(centroid), axis=0) ** 2 for centroid in centroids) for x in features.to_numpy()])
      total_distance = distances.sum()
      probabilities = distances / total_distance  # Probability: D(x_i)^2 / sum(D(x_j)^2)
      chosen_idx = np.random.choice(len(features), p=probabilities)
      centroids.append(features.iloc[chosen_idx].values)
    return np.array(centroids)
  else: # Makes the centroid farthest away from everything
    centroids = [features.sample(n=1).values[0]]  # Choose the first centroid randomly
    for x in range(1, k):
        distances = np.array([min(np.linalg.norm(point - np.array(c)) for c in centroids)for point in features.to_numpy()])
        farthest_idx = np.argmax(distances)
        centroids.append(features.iloc[farthest_idx].values)
    return np.array(centroids)

In [529]:
def accuracy(true_labels, predicted_labels):
    correct_predictions = np.sum(true_labels == predicted_labels)
    return correct_predictions / len(true_labels)

In [530]:
def K_Means(k, dataset, centroid_type, feature_x, feature_y, target):
    converged = False
    features = dataset[[feature_x, feature_y]].dropna()
    centroids=ini_Centroids(k, dataset, centroid_type, features) # initializes the first centroids
    while not converged: # Loops until there isn't massive changes between centroids and new_centroids
        #Calculations
        distances = np.linalg.norm(features.to_numpy()[:, np.newaxis] - centroids, axis=2) # calcuates the eudclidean distance between data points and centroids
        labels = np.argmin(distances, axis=1) # assigns data points to the closest centroid
        new_centroids = np.array([features[labels == i].mean(axis=0) if np.any(labels == i) else centroids[i] for i in range(k)]) # calculates updated centroids given the mean of all data points from each cluster
        #Convergence test and updating centroids
        converged = np.all(np.abs(centroids - new_centroids) <= 0.001) # Tests to see if there is major differences between the current and new centroids to see if it needs to continue the loop again
        centroids = new_centroids
    label_mapping = {}
    for i in range(k):
        cluster_indices = np.where(labels == i)[0]
        if len(cluster_indices) > 0:
            majority_label = dataset.iloc[cluster_indices][target].mode().iloc[0]
            label_mapping[i] = majority_label
    # Replace numerical labels with string labels
    string_labels = [label_mapping[label] for label in labels]
    #Add making the predicted labels have a string value
    return string_labels, labels, centroids

In [531]:
def plot_GMM(gmm, dataset, feature_x=0, feature_y=1):
  dataset = dataset[[feature_x, feature_y]].to_numpy()
  x_min, x_max = dataset[:, 0].min() - 0.1, dataset[:, 0].max() + 0.1
  y_min, y_max = dataset[:, 1].min() - 0.1, dataset[:, 1].max() + 0.1
  xx, yy = np.meshgrid(np.linspace(x_min, x_max, 200), np.linspace(y_min, y_max, 200))
  grid = np.c_[xx.ravel(), yy.ravel()]
  Z = -gmm.score_samples(grid).reshape(xx.shape)
  plt.scatter(dataset[:, 0], dataset[:, 1], c='k', s=10, alpha=0.5, label="Data Points")
  plt.scatter(kmeans_centroids[:, 0], kmeans_centroids[:, 1], c='red', marker='x', s=150, label="KMeans Centroids")
  plt.contourf(xx, yy, Z, levels=15, cmap="viridis", alpha=0.8)
  plt.contour(xx, yy, Z, levels=15, colors="k", linewidths=0.5)
  plt.scatter(gmm.means_[:, 0], gmm.means_[:, 1], c='yellow', edgecolor='black', s=200, label="GMM Centers")
  plt.title("Gaussian Mixture Model")
  plt.xlabel(f"Feature {feature_x}" if isinstance(feature_x, int) else feature_x)
  plt.ylabel(f"Feature {feature_y}" if isinstance(feature_y, int) else feature_y)
  plt.legend()
  plt.colorbar(label="Negative Log-Likelihood")
  plt.show()

Iris Dataset

In [532]:
gm1 = GaussianMixture(n_components=3, n_init=10)
gm1.fit(iris_data)

In [533]:
gm1.weights_

array([0.30118609, 0.33333333, 0.36548058])

In [534]:
gm1.means_

array([[5.91697517, 2.77803998, 4.20523542, 1.29841561],
       [5.006     , 3.428     , 1.462     , 0.246     ],
       [6.54632887, 2.94943079, 5.4834877 , 1.98716063]])

In [535]:
gm1.covariances_

array([[[0.27550587, 0.09663458, 0.18542939, 0.05476915],
        [0.09663458, 0.09255531, 0.09103836, 0.04299877],
        [0.18542939, 0.09103836, 0.20227635, 0.0616792 ],
        [0.05476915, 0.04299877, 0.0616792 , 0.03232217]],

       [[0.121765  , 0.097232  , 0.016028  , 0.010124  ],
        [0.097232  , 0.140817  , 0.011464  , 0.009112  ],
        [0.016028  , 0.011464  , 0.029557  , 0.005948  ],
        [0.010124  , 0.009112  , 0.005948  , 0.010885  ]],

       [[0.38741443, 0.09223101, 0.30244612, 0.06089936],
        [0.09223101, 0.11040631, 0.08386768, 0.0557538 ],
        [0.30244612, 0.08386768, 0.32595958, 0.07283247],
        [0.06089936, 0.0557538 , 0.07283247, 0.08488025]]])

In [536]:
gm1.converged_

True

In [537]:
gm1.n_iter_

17

In [538]:
gm1.predict(iris_data)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 2, 0, 2, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [539]:
gm1.predict_proba(iris_data).round(3)

array([[0.   , 1.   , 0.   ],
       [0.   , 1.   , 0.   ],
       [0.   , 1.   , 0.   ],
       [0.   , 1.   , 0.   ],
       [0.   , 1.   , 0.   ],
       [0.   , 1.   , 0.   ],
       [0.   , 1.   , 0.   ],
       [0.   , 1.   , 0.   ],
       [0.   , 1.   , 0.   ],
       [0.   , 1.   , 0.   ],
       [0.   , 1.   , 0.   ],
       [0.   , 1.   , 0.   ],
       [0.   , 1.   , 0.   ],
       [0.   , 1.   , 0.   ],
       [0.   , 1.   , 0.   ],
       [0.   , 1.   , 0.   ],
       [0.   , 1.   , 0.   ],
       [0.   , 1.   , 0.   ],
       [0.   , 1.   , 0.   ],
       [0.   , 1.   , 0.   ],
       [0.   , 1.   , 0.   ],
       [0.   , 1.   , 0.   ],
       [0.   , 1.   , 0.   ],
       [0.   , 1.   , 0.   ],
       [0.   , 1.   , 0.   ],
       [0.   , 1.   , 0.   ],
       [0.   , 1.   , 0.   ],
       [0.   , 1.   , 0.   ],
       [0.   , 1.   , 0.   ],
       [0.   , 1.   , 0.   ],
       [0.   , 1.   , 0.   ],
       [0.   , 1.   , 0.   ],
       [0.   , 1.   , 0.   ],
       [0.

In [540]:
x_new,y_new=gm1.sample(6)

In [541]:
x_new

array([[5.8739881 , 3.04281697, 4.50842516, 1.29981542],
       [4.91261748, 3.91494563, 1.46571531, 0.29002893],
       [4.62975557, 3.56891022, 1.2372444 , 0.04761629],
       [5.60658082, 4.25528489, 1.49092053, 0.44029177],
       [4.86012948, 3.50599667, 1.24721354, 0.2922667 ],
       [6.04169669, 2.83379513, 5.22662923, 1.8946802 ]])

In [542]:
y_new

array([0, 1, 1, 1, 1, 2])

In [543]:
gm1.score_samples(iris_data).round(2)

array([ 1.57,  0.74,  1.14,  0.93,  1.41, -0.09,  0.05,  1.62,  0.27,
        0.17,  0.83,  0.77,  0.3 , -1.79, -3.42, -2.11, -1.13,  1.48,
       -0.85,  0.98, -0.93,  0.41, -3.84, -1.89, -3.17, -0.12,  0.51,
        1.38,  1.12,  0.69,  0.78, -0.69, -2.13, -0.88,  1.15,  0.12,
       -1.12,  0.23,  0.13,  1.5 ,  0.94, -4.49, -0.34, -4.48, -2.59,
        0.68,  0.39,  1.04,  1.16,  1.55, -2.04, -0.27, -0.85, -2.33,
       -1.16, -0.79, -0.82, -1.4 , -0.45, -1.64, -2.59, -0.6 , -2.52,
       -0.11, -1.93, -1.16, -1.27, -2.94, -5.17,  0.26, -2.49, -0.02,
       -2.19, -2.58,  0.09, -0.48, -1.33, -2.27, -0.06, -1.86,  0.13,
       -0.49,  0.54, -1.43, -2.52, -2.22, -0.32, -3.75, -0.44, -0.35,
       -1.84,  0.24,  0.7 , -1.18,  0.44, -1.74,  0.35,  0.59, -4.2 ,
        0.75, -4.19, -1.3 , -0.77, -1.39, -1.12, -2.55, -4.41, -2.35,
       -2.07, -2.53, -1.52, -0.62, -0.7 , -2.27, -3.41, -1.35, -0.8 ,
       -5.03, -7.06, -2.99, -1.04, -1.69, -3.85, -1.35, -0.95, -1.97,
       -1.41, -1.23,

In [544]:
plot_GMM(gm1, iris_data, "petal length (cm)", "petal width (cm)")



ValueError: X has 2 features, but GaussianMixture is expecting 4 features as input.

Penguins dataset

In [None]:
gm2 = GaussianMixture(n_components=3, n_init=10)
penguins = penguins[["flipper_length_mm","bill_length_mm"]]
penguins = penguins.dropna()
gm2.fit(penguins)

In [None]:
gm2.weights_

In [None]:
gm2.means_

In [None]:
gm2.covariances_

In [None]:
gm2.converged_

In [None]:
gm2.n_iter_

In [None]:
gm2.predict(penguins)

In [None]:
gm2.predict_proba(penguins).round(3)

In [None]:
x_new2,y_new2=gm1.sample(6)

In [None]:
x_new2

In [None]:
y_new2

In [None]:
gm2.score_samples(penguins).round(2)

In [None]:
plot_GMM(3, penguins, "flipper_length_mm","bill_length_mm")

In [None]:
predicted_labels_gmm2 = gm2.predict(penguins)
len(penguins)

In [None]:
predicted_labels_gmm2 = gm2.predict(penguins)
penguins = pd.read_csv("https://github.com/mbrudd/csci290/raw/refs/heads/main/data/penguins.csv")
penguins_cleaned = penguins.drop([3])
penguins = penguins_cleaned.drop([271])
label_mapping = {}
for i in range(3):
  cluster_indices = np.where(predicted_labels_gmm2 == i)[0]
  if len(cluster_indices) > 0:
    majority_label = penguins.iloc[cluster_indices]['species'].mode().iloc[0]
    label_mapping[i] = majority_label
string_labels = [label_mapping[label] for label in predicted_labels_gmm2]
true_labels = penguins["species"].dropna()
accurate = accuracy(true_labels, string_labels)
print(f'Accuracy: {accurate:.2f}')