In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.cluster import SpectralClustering
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

The following cell opens the normalized images and runs PCA on them. This takes several minutes, and since we are only interested in the dimensionality-reduced data, the cell should only be run once to write the reduced data to a new file, which enables faster loading afterwards.

In [2]:
# Cache guard: the PCA projection is computed only when the reduced file is
# missing, so a fresh "Restart & Run All" regenerates it instead of relying
# on a manually uncommented cell, and later runs skip the expensive step.
flattened_image_dir = "processed_galaxy_data/flattened_normalized_images.csv"
pca_output_path = Path("processed_galaxy_data/pca_normalized.csv")

if not pca_output_path.exists():
    # load flattened images (the CSV has no header row)
    flattened_data = pd.read_csv(flattened_image_dir, header=None)
    # Fixed seed so the projection is reproducible when a randomized SVD
    # solver is selected.
    pca = PCA(n_components=35, random_state=42)
    pca_projection = pca.fit_transform(flattened_data)
    np.savetxt(pca_output_path, pca_projection, delimiter=',')

In [3]:
# Read the label table and the PCA-reduced features from disk.
# The PCA file is written without a header row, hence header=None.
labels = pd.read_csv("processed_galaxy_data/labels_mappings_filtered.csv")
fitted_pca = pd.read_csv(
    "processed_galaxy_data/pca_normalized.csv",
    header=None,
)

We want to look into how a Gaussian Mixture Model (GMM) clusters our data and how effective it is. First, we take a random 80% subset of the data for training.

In [4]:
# Target column used both for the split and for the class-balance summary.
shape_labels = labels["shape_label"]

# Random 80/20 split of the PCA features; the seed keeps it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    fitted_pca,
    shape_labels,
    train_size=0.8,
    random_state=42,
)

# Show how many samples fall into each shape class.
shape_labels.value_counts()

shape_label
E     97643
S     95818
SB    45568
A       544
Name: count, dtype: int64

Next, we declare the GMM and its parameters. Since we know that our data has 4 labels, we can explicitly set `n_components` to 4. GMMs have a parameter that sets the type of covariance matrix the components use. We set up a grid search over the available covariance types and cross-validate over our randomly chosen data.

In [5]:
# One mixture component per known shape label. random_state pins the
# stochastic initialisation so the reported scores are reproducible
# across kernel restarts.
gmm = GaussianMixture(n_components=4, random_state=42)

# Single-step pipeline; the step name gives the "gmm__" parameter prefix
# used in the grid below.
pipe_gmm = Pipeline([
    ("gmm", gmm)
])

# Covariance structures to compare.
param_grid_gmm = {
    "gmm__covariance_type": ["full", "tied", "spherical", "diag"]
}

# Score each candidate with the Adjusted Rand Index; n_jobs=-1 runs the
# candidate fits in parallel.
grid_gmm = GridSearchCV(pipe_gmm, param_grid_gmm, scoring="adjusted_rand_score", n_jobs=-1)

In [None]:
# Run the cross-validated grid search on the training split.
grid_gmm.fit(X_train, y_train)
# fit() returns the search object itself, so this is the same fitted instance.
fitted_gmm = grid_gmm

We used the Adjusted Rand Index to score our models.

In [None]:
# One row per parameter combination, with its mean cross-validated score
# appended as the last column.
results_gmm = pd.DataFrame(grid_gmm.cv_results_['params']).assign(
    **{'Mean Adjusted Rand Index': grid_gmm.cv_results_['mean_test_score']}
)
results_gmm.head()

Our scores are fairly poor -- close to 0, which is roughly what a random assignment of clusters would score. The reason for this deserves further exploration. Let's try another clustering method to see whether the results are any better.

Spectral clustering has the ability to fit non-convex data. However, unlike some other clustering methods, it cannot predict labels for unseen samples -- thus, a cross-validation method can't be used.

To get around this, we randomly subset our data multiple times and train a Spectral Clustering model on each subset with varying `n_neighbors`. Since Spectral Clustering makes heavy use of adjacency and Laplacian matrices, its time and space complexity is high, so each subset is limited to 2,000 samples. We assume a sample of a few thousand is reasonably representative of our data.

In [None]:
# Sweep n_neighbors for spectral clustering over several random subsets and
# record the Adjusted Rand Index of each run against the true shape labels.
nn_list = [1, 5, 10, 20]
n_repeats = 4        # number of independent random subsets
subset_size = 2000   # samples per subset; keeps the affinity matrix small

# Rows: subset index; columns: n_neighbors value. Float dtype so the table
# holds numbers rather than generic Python objects.
spectral_ari = pd.DataFrame(index=range(n_repeats), columns=nn_list, dtype=float)

for i in range(n_repeats):
    # Separate names so the 80% X_train / y_train split created for the GMM
    # search is not overwritten by these small subsets.
    X_sub, _, y_sub, _ = train_test_split(
        fitted_pca, labels["shape_label"], train_size=subset_size, random_state=i
    )
    for nn in nn_list:
        # n_neighbors is only used with the nearest-neighbors affinity; with
        # the default "rbf" affinity it is ignored and every column would be
        # the same experiment.
        # NOTE(review): n_neighbors=1 presumably links each point only to
        # itself, giving a disconnected graph -- confirm whether 1 should
        # stay in the grid.
        spectral = SpectralClustering(
            n_clusters=4,
            affinity="nearest_neighbors",
            n_neighbors=nn,
            random_state=i,  # reproducible eigen-solver / k-means initialisation
        )
        y_labels = spectral.fit(X_sub).labels_
        # ARI is symmetric; arguments are passed as (true, predicted).
        spectral_ari.loc[i, nn] = metrics.adjusted_rand_score(y_sub, y_labels)

In [None]:
# Display the ARI table: one row per random subset, one column per n_neighbors value.
spectral_ari