In [2]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
from sqlalchemy import create_engine
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from sklearn import datasets, metrics

### 1) Apply GMM to the heart disease data by setting n_components=2. 
Get the ARI and silhouette scores for your solution and compare them with those of the k-means and hierarchical clustering solutions that you implemented in the assignments of the previous checkpoints. Which algorithm performs better?

In [6]:
# Load the heart disease table from PostgreSQL into a DataFrame.
#
# Connection settings are read from environment variables so credentials do
# not have to live in the notebook. The original values are kept as fallbacks
# so that Restart & Run All still works on an unconfigured machine.
# NOTE(review): remove the fallbacks if these credentials are not meant to be
# shared publicly.
postgres_user = os.environ.get('POSTGRES_USER', 'dsbc_student')
postgres_pw = os.environ.get('POSTGRES_PW', '7*.8G9QH21')
postgres_host = os.environ.get('POSTGRES_HOST', '142.93.121.174')
postgres_port = os.environ.get('POSTGRES_PORT', '5432')
postgres_db = os.environ.get('POSTGRES_DB', 'heartdisease')

# The values are interpolated into the URL as-is; a password containing
# characters such as '@' or '/' would have to be URL-encoded first.
engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))

try:
    df = pd.read_sql_query('SELECT * FROM heartdisease', con=engine)
finally:
    # Release the engine's connections even when the query fails.
    engine.dispose()

In [7]:
# Define the features and the outcome:
# the first 13 columns are the features, the 14th column is the outcome.
X = df.iloc[:, :13]
y = df.iloc[:, 13]

# Replace missing values (marked by ?) with a 0
X = X.replace(to_replace='?', value=0)

# Binarize y so that 1 means heart disease diagnosis and 0 means no diagnosis.
# Any outcome value above 0 counts as a diagnosis. The previous version had the
# two branches swapped, which produced the opposite encoding to the one
# described here.
y = np.where(y > 0, 1, 0)

In [9]:
# Standardize the features, then cluster them with a two-component GMM using
# the default covariance type.
scale = StandardScaler()
scaled = scale.fit_transform(X)

# A fixed random_state makes the fit, and therefore the printed scores,
# reproducible across kernel restarts.
gmm = GaussianMixture(n_components=2, random_state=123)
prediction = gmm.fit_predict(scaled)

# ARI compares the clusters with the known labels; silhouette only looks at
# how well separated the clusters are in the scaled feature space.
print('ARI: {}'.format(metrics.adjusted_rand_score(y, prediction)))
print('Silhouette: {}'.format(metrics.silhouette_score(scaled, prediction, metric='euclidean')))

ARI: 0.4207322145049338
Silhouette: 0.16118591340148433


This is an improvement over the hierarchical clustering scores, but just a hair worse than the scores produced by KMeans.

### 2) The GMM implementation in scikit-learn has a parameter called `covariance_type`. This parameter determines the type of covariance parameters to use. Specifically, there are four types you can specify:

#### a) full: This is the default. Each component has its own general covariance matrix.

In [10]:
# 'full' is the default covariance type, so its scores are the ones printed by the first GMM cell above.

#### b) tied: All components share the same general covariance matrix.

In [11]:
# Two-component GMM where all components share one covariance matrix ('tied').
# A fixed random_state makes the fit, and therefore the printed scores,
# reproducible across kernel restarts.
gmm = GaussianMixture(n_components=2, covariance_type='tied', random_state=123)
prediction = gmm.fit_predict(scaled)

print('ARI: {}'.format(metrics.adjusted_rand_score(y, prediction)))
print('Silhouette: {}'.format(metrics.silhouette_score(scaled, prediction, metric='euclidean')))

ARI: 0.18389186035089963
Silhouette: 0.13628813153331445


#### c) diag: Each component has its own diagonal covariance matrix.

In [12]:
# Two-component GMM where each component has its own diagonal covariance
# matrix ('diag').
# A fixed random_state makes the fit, and therefore the printed scores,
# reproducible across kernel restarts.
gmm = GaussianMixture(n_components=2, covariance_type='diag', random_state=123)
prediction = gmm.fit_predict(scaled)

print('ARI: {}'.format(metrics.adjusted_rand_score(y, prediction)))
print('Silhouette: {}'.format(metrics.silhouette_score(scaled, prediction, metric='euclidean')))

ARI: 0.37878842718089933
Silhouette: 0.15836933745078682


#### d) spherical: Each component has its own single variance.

In [13]:
# Two-component GMM where each component has its own single variance
# ('spherical').
# A fixed random_state makes the fit, and therefore the printed scores,
# reproducible across kernel restarts.
gmm = GaussianMixture(n_components=2, covariance_type='spherical', random_state=123)
prediction = gmm.fit_predict(scaled)

print('ARI: {}'.format(metrics.adjusted_rand_score(y, prediction)))
print('Silhouette: {}'.format(metrics.silhouette_score(scaled, prediction, metric='euclidean')))

ARI: 0.20765243525722465
Silhouette: 0.12468753110276873


Of these four covariance types, 'full' produced the highest ARI and silhouette scores.