In [2]:
# Author: Hassan Ali
# Importing libraries 
import matplotlib.pyplot as plt  # Used for plotting graphs
import pandas as pd  # Used for data manipulation and analysis

# Import machine learning algorithms and dataset utilities
from sklearn.mixture import GaussianMixture  # Imports the Gaussian Mixture Model for clustering
# The `GaussianMixture` class from `sklearn.mixture` is a powerful tool for implementing
# the Expectation-Maximization (EM) algorithm for clustering.
# GaussianMixture is used to implement EM because it efficiently estimates the parameters of multiple Gaussian distributions
# in a dataset, allowing for soft clustering by probabilistically assigning data points to clusters based on these distributions.

In [94]:
# Load the dataset from a CSV file
df = pd.read_csv('User_knowledge.csv')  # Load data into a pandas DataFrame

# Make a copy of the DataFrame to avoid chained assignment issues
df = df.copy()

# Replace string values in the 'UNS' column with numerical values and cast to int
df['UNS'] = df['UNS'].replace({'very_low': 0, 'Low': 1, 'Middle': 2, 'High': 3}).astype(int)

# Initialize the Gaussian Mixture Model for clustering with 4 components
# 'init_params' is set to 'random' to start with randomly chosen means,
# and 'covariance_type' is set to 'full' to allow each component its own general covariance matrix.
em_gaussian = GaussianMixture(n_components=4, init_params='random', covariance_type='full')

# Fit the model on the dataset and predict the cluster for each instance
cluster_preds = em_gaussian.fit_predict(df)

# Evaluate the model using Akaike Information Criterion (AIC) and Bayesian Information Criterion (BIC)
aic_score = em_gaussian.aic(df)
bic_score = em_gaussian.bic(df)

print("AIC Score:", aic_score)
print("BIC Score:", bic_score)

AIC Score: -276.6015621815873
BIC Score: 117.77695174471222


  df['UNS'] = df['UNS'].replace({'very_low': 0, 'Low': 1, 'Middle': 2, 'High': 3}).astype(int)
