# Unsupervised Learning Model Evaluation Lab

Complete the exercises below to solidify your knowledge and understanding of unsupervised learning model evaluation.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
from sklearn import datasets

# Fetch the wine data set that ships with scikit-learn.
data = datasets.load_wine()

# Feature matrix with named columns, and the target values as a Series.
X = pd.DataFrame(data=data.data, columns=data.feature_names)
y = pd.Series(data.target)

## 1. Train a KMeans clustering model on the data set using 8 clusters and compute the silhouette score for the model.

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [None]:
# Fit KMeans with 8 clusters. The seed and the number of initialisations are
# pinned so that the cluster assignments, and therefore the reported
# silhouette score, are identical on every Restart & Run All.
kmeans8 = KMeans(n_clusters=8, n_init=10, random_state=42)
preds8 = kmeans8.fit_predict(X)

print("Model 1 Silhouette Score: {}".format(silhouette_score(X, preds8)))

## 2. Train a KMeans clustering model on the data set using 5 clusters and compute the silhouette score for the model.

In [None]:
# Fit KMeans with 5 clusters. The seed and the number of initialisations are
# pinned so that the cluster assignments, and therefore the reported
# silhouette score, are identical on every Restart & Run All.
kmeans_5 = KMeans(n_clusters=5, n_init=10, random_state=42)
preds_5 = kmeans_5.fit_predict(X)

print("Model 2 Silhouette Score: {}".format(silhouette_score(X, preds_5)))

## 3. Train a KMeans clustering model on the data set using 3 clusters and compute the silhouette score for the model.

In [None]:
# Fit KMeans with 3 clusters. The seed and the number of initialisations are
# pinned so that the cluster assignments, and therefore the reported
# silhouette score, are identical on every Restart & Run All.
kmeans_3 = KMeans(n_clusters=3, n_init=10, random_state=42)
preds_3 = kmeans_3.fit_predict(X)

print("Model 3 Silhouette Score: {}".format(silhouette_score(X, preds_3)))

## 4. Use elbow curve visualizations to see if you can determine the best number of clusters to use.

The Yellowbrick library has 3 metrics that you can plot using the `metric` parameter:

- **distortion**: mean sum of squared distances to centers
- **silhouette**: mean ratio of intra-cluster and nearest-cluster distance
- **calinski_harabaz**: ratio of between-cluster to within-cluster dispersion (newer Yellowbrick releases spell this metric `calinski_harabasz`)

In [None]:
from yellowbrick.cluster import KElbowVisualizer

# Elbow curve over the k range (4, 12). No metric is passed, so the
# visualizer's default scoring is used. The KMeans estimator is seeded so the
# curve is the same on every run.
# NOTE(review): the range starts at 4, so k=3 (tried in exercise 3) can never
# be selected here -- confirm that is intended.
model = KMeans(n_init=10, random_state=42)
visualizer = KElbowVisualizer(model, k=(4,12))
visualizer.fit(X)
# NOTE(review): newer Yellowbrick releases rename poof() to show() -- confirm
# against the installed version before switching.
visualizer.poof();

In [None]:
# Elbow curve over the same k range, scored with the silhouette metric. The
# KMeans estimator is seeded so the curve is the same on every run.
model = KMeans(n_init=10, random_state=42)
visualizer = KElbowVisualizer(model, k=(4,12), metric = 'silhouette')
visualizer.fit(X)
# NOTE(review): newer Yellowbrick releases rename poof() to show() -- confirm
# against the installed version before switching.
visualizer.poof();

## 5. Try performing the same elbow tests with an AgglomerativeClustering model and compare the results you get to the KMeans results.

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Repeat the elbow sweep, this time driving the visualizer with an
# agglomerative clusterer instead of KMeans, over the same k range.
model_2 = AgglomerativeClustering()
visualizer = KElbowVisualizer(
    model_2,
    k=(4, 12),
)
visualizer.fit(X)
visualizer.poof();


## 6. Create and plot a scatter matrix showing how the clusters are grouped across all the different combinations of variables in the data.

Use the model and number of clusters that returned the best result above.

In [None]:
import seaborn as sns

In [None]:
# Fit the 6-cluster KMeans model used by the plots that follow. The seed and
# number of initialisations are pinned so the labels are reproducible.
model_6 = KMeans(n_clusters=6, n_init=10, random_state=42)
model_6.fit(X)

# Display the cluster label assigned to each sample.
model_6.labels_


In [None]:
# Attach the cluster labels to a copy of the features rather than to X itself.
# Writing the column into X would make every later use of X (or a re-run of
# the clustering cells) treat the labels as an extra feature.
X_clustered = X.assign(clusters=model_6.labels_)

# One panel per pair of variables, points coloured by cluster label.
sns.pairplot(data=X_clustered, hue="clusters", palette="tab10");

## 7. Apply a PCA transform and plot the first two principal components with the plot point colors determined by cluster.

In [None]:
from sklearn.decomposition import PCA

In [None]:
# X may carry a "clusters" column added for the pair plot. Exclude it so the
# projection is computed from the original features only and the cluster
# labels do not leak into the components.
features = X.drop(columns=["clusters"], errors="ignore")

pca = PCA()
# Columns of pca_X are the integer component indices 0, 1, 2, ...
pca_X = pd.DataFrame(pca.fit_transform(features))

# Preview the first rows instead of dumping the whole frame.
pca_X.head()

In [None]:
# Plot the first two principal components, coloured by KMeans cluster label.
# An explicit Axes is used so the figure can be given readable labels and a
# title instead of the bare column names "0" and "1".
fig, ax = plt.subplots()
sns.scatterplot(
    data=pca_X,
    x=pca_X[0],
    y=pca_X[1],
    hue=model_6.labels_,
    palette="tab10",
    ax=ax,
)
ax.set(
    xlabel="Principal component 1",
    ylabel="Principal component 2",
    title="KMeans clusters in PCA space",
);

## 8. Generate a series of t-SNE plots showing the clusters at a variety of perplexities.

In [None]:
from sklearn.manifold import TSNE

In [None]:
# One t-SNE embedding per perplexity value, drawn side by side. The number of
# panels follows the list, so adding a perplexity adds a panel.
perplexities = [5, 30, 50, 100]

fig, ax = plt.subplots(1, len(perplexities), figsize=(15, 7))
for panel, perp in zip(ax, perplexities):
    # Seed the embedding so the panels look the same on every run.
    tsne = TSNE(perplexity=perp, random_state=42)
    # The embedding is computed from the first two PCA components.
    x_embedded = tsne.fit_transform(pca_X[[0, 1]])
    panel.scatter(x_embedded[:, 0], x_embedded[:, 1], c=model_6.labels_)
    panel.set_title("Perplexity = {}".format(perp))

plt.show();