# Understanding Unsupervised Learning
A wide variety of clustering algorithms are available in scikit-learn. The hard part is interpreting the results.

In [None]:
import json

# Load the purchase records. JSON text is UTF-8 by specification, so decode it
# explicitly rather than relying on the platform's default locale encoding.
with open("./genres_data.json", "r", encoding="utf-8") as infile:
    genres_data = json.load(infile)
# Quick sanity check: number of records and the first record.
print(len(genres_data))
print(genres_data[0])

In [None]:
import numpy as np

# Preprocess data: turn every record into a numeric feature vector.
# The order of `fields` fixes the column order of `data_matrix`; it is defined
# once here instead of being rebuilt for every record.
fields = [
    "total_books_bought",
    "literary_fiction_fraction",
    "mystery_novels_fraction",
    "programming_references_fraction",
    "popular_science_fraction",
    "science_fiction_fraction",
    "fantasy_fraction",
]

# One row per record, one column per field. Each record is indexed by field
# name, so a record missing any of these keys raises KeyError.
data_matrix = np.array([[datum[field] for field in fields] for datum in genres_data])

print(data_matrix[0])

In [None]:
from sklearn.cluster import KMeans
from matplotlib import pyplot as plt

In [None]:
# Fit k-means with two clusters (fit() returns the estimator itself) and
# print the learned centroids, one row per cluster.
model = KMeans(n_clusters=2).fit(data_matrix)
print(model.cluster_centers_)

In [None]:
# Sweep k = 2..10, fitting a fresh k-means model for each k and recording its
# score on the training data.
cluster_nums = list(range(2, 11))
scores = []
for num in cluster_nums:
    model = KMeans(n_clusters=num).fit(data_matrix)
    # KMeans.score is the negated k-means objective, so values nearer 0 are better.
    scores.append(model.score(data_matrix))

# Plot score against k to look for the point of diminishing returns.
plt.plot(cluster_nums, scores, 'b*-')
plt.title("Scores versus n_clusters")
plt.show()

In [None]:
# Refit with five clusters and print the centroids, one row per cluster.
model = KMeans(n_clusters=5).fit(data_matrix)
print(model.cluster_centers_)

# Expectation Maximization - Mixture of Gaussians

In [None]:
from sklearn.mixture import GaussianMixture
from matplotlib import pyplot as plt

def best_model(scores, models, func=min):
    """Return the best score and its model, as chosen by *func*.

    Args:
        scores: Sequence of mutually comparable scores, one per model.
        models: Sequence of models parallel to ``scores``.
        func: Reducer applied to ``scores`` to pick the winning score.
            Defaults to ``min`` (lower is better); pass ``max`` when a
            higher score is better.

    Returns:
        A ``(best_score, best_model)`` tuple. When several entries share the
        winning score, the earliest one is returned.

    Raises:
        ValueError: If ``scores`` is empty.
    """
    candidates = list(scores)
    if not candidates:
        raise ValueError("best_model() needs at least one score")
    # list.index returns the first match, which keeps the earliest-wins tie rule.
    winner = candidates.index(func(candidates))
    return candidates[winner], models[winner]

def try_models(n_clusters, n_trials, data):
    """Fit several Gaussian mixtures to *data* and score each one with BIC.

    Args:
        n_clusters: Number of mixture components for every fitted model.
        n_trials: How many independent models to fit.
        data: Samples used both to fit each model and to compute its BIC.

    Returns:
        A ``(scores, models)`` tuple of parallel lists: ``scores[i]`` is the
        BIC of ``models[i]`` on ``data``.
    """
    scores = []
    models = []
    for _ in range(n_trials):
        model = GaussianMixture(n_components=n_clusters)
        model.fit(data)
        models.append(model)
        # Bayesian Information Criterion (lower is better), evaluated on the
        # data passed in, i.e. the same data the model was fitted to.
        scores.append(model.bic(data))
    return scores, models

# For each component count 2..20, fit five mixtures and keep the one that
# best_model selects together with its score.
cluster_nums = list(range(2, 21))
best_scores, best_models = [], []
for num in cluster_nums:
    min_score, model = best_model(*try_models(num, 5, data_matrix))
    best_scores.append(min_score)
    best_models.append(model)

# Plot the retained score against the number of components.
plt.plot(cluster_nums, best_scores, 'b*-')
plt.title("Scores versus n_clusters")
plt.show()

In [None]:
def float_formatter(x):
    """Render a float with two decimal places, which is enough for reading the means."""
    return "%.2f" % x


np.set_printoptions(formatter={'float_kind': float_formatter})
# best_models is filled in the same order as cluster_nums (2, 3, ...), so
# index 9 is the model fitted with 11 components.
print(best_models[9].means_)

## Note:
The `n_components` of a Gaussian Mixture Model is the number of Gaussians. This has a different interpretation than the number of clusters in, e.g., k-means. In particular, multiple Gaussians may represent very similar or overlapping regions of the data in a Gaussian Mixture Model, whereas in other clustering algorithms the clusters are often well-separated. Gaussian Mixture Models are considered a "generative model" of the data, meaning that they provide a model from which the data could plausibly have been generated.

# Embeddings with t-SNE
("t-distributed Stochastic Neighbor Embedding")

In [None]:
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA

# Standardize every feature with sklearn's scale(), then run a PCA that keeps
# all components so the full variance spectrum can be inspected.
whitened_data = scale(data_matrix)

pca = PCA()
pca_data = pca.fit_transform(whitened_data)


def float_formatter(x):
    """Render a float with seven decimal places so very small ratios stay visible."""
    return "%.7f" % x


np.set_printoptions(formatter={'float_kind': float_formatter})
print(pca.explained_variance_ratio_)

In [None]:
# The explained-variance ratios printed by the full PCA suggest the last
# component is redundant, so refit keeping 6 of the 7 dimensions.
# `pca_data` is overwritten with the reduced projection.
pca = PCA(n_components=6)
pca_data = pca.fit_transform(whitened_data)
print(pca.explained_variance_ratio_)

In [None]:
from sklearn.manifold import TSNE

# For t-SNE, n_components is the dimensionality of the output embedding,
# not a number of clusters.
embedding = TSNE(n_components=2)
# Project the PCA-reduced data down to one 2-d point per sample.
embedded_data = embedding.fit_transform(pca_data)
print("Data Shape", embedded_data.shape)

# Scatter the embedded points: transposing gives the x and y columns.
plt.scatter(*embedded_data.T)
plt.title("t-SNE embeddings of our data")
plt.show()

In [None]:
# Run k-means directly on the 2-d t-SNE coordinates and colour each point by
# the cluster it was assigned to.
model = KMeans(n_clusters=6)
labels = model.fit_predict(embedded_data)

plt.scatter(*embedded_data.T, c=labels)
plt.title("Embedding Space Clusters")
plt.show()

# Hierarchical Clustering

In [None]:
# Note that this is from scipy, not sklearn!
from scipy.cluster.hierarchy import dendrogram, linkage
# Ward linkage: clusters are merged so that the variance within the merged
# group stays as small as possible.
connections = linkage(data_matrix, method='ward')

# Draw the full dendrogram on a wide figure.
plt.figure(figsize=(25, 10))
plt.xlabel('sample index')
plt.ylabel('distance')
plt.title('Hierarchical Clustering Dendrogram')
# Optional: dendrogram() also accepts leaf_rotation and leaf_font_size to
# adjust the x-axis labels.
dendrogram(connections)
plt.show()

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Cluster the original feature matrix into two groups, then show those groups
# on the 2-d t-SNE coordinates computed earlier.
model = AgglomerativeClustering(n_clusters=2)
clusters = model.fit_predict(data_matrix)

# Colour every embedded point by its cluster label.
plt.scatter(*embedded_data.T, c=clusters, cmap='prism')
plt.show()