In [None]:
#Importing libraries
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
%matplotlib inline


plt.rcParams['figure.figsize'] = (10,6)

# Clustering example

In [None]:
# Read the iris measurements into a DataFrame and preview the first five rows.
data = pd.read_csv(filepath_or_buffer="iris.csv")
data.head(n=5)

In [None]:
# Features: every column except the species label.
X = data.drop(columns=["species"])
# Labels: an independent copy of the species column.
y = data.loc[:, "species"].copy()

# Peek at the first two feature rows.
X.head(n=2)

In [None]:
from sklearn.cluster import KMeans

# Fit K-Means with three clusters (one per iris species). The seed is fixed to
# the same value the elbow-method cell uses, so a fresh Restart & Run All
# yields the same cluster ids, colours and metric values every time.
model = KMeans(n_clusters=3, random_state=0).fit(X)

# cluster_centers_ is a bare array; wrap it so centroids can be indexed by
# feature name when plotting.
centroids = pd.DataFrame(model.cluster_centers_, columns=X.columns)

# Cluster id assigned to each row of X.
clusters = model.labels_

fig, ax = plt.subplots(1, 2, sharey=True, figsize=(12, 6))

# Left panel: points coloured by predicted cluster, centroids overlaid.
ax[0].scatter(X['petal_length'], X['petal_width'], c=clusters, s=50, alpha=0.6)
ax[0].scatter(centroids.loc[:, 'petal_length'], centroids.loc[:, 'petal_width'], c='red', s=100, marker="+", label="centroids")
ax[0].set_xlabel("petal length")
ax[0].set_ylabel("petal width")
ax[0].legend()
ax[0].set_title("Predicted clusters")

# Right panel: the same points coloured by true species. Cluster ids are
# arbitrary, so the colours need not match the left panel; compare the groups.
ax[1].scatter(X['petal_length'], X['petal_width'], c=y.map({"setosa": 0, "versicolor": 1, "virginica": 2}), s=50, alpha=0.8)
ax[1].set_xlabel("petal length")  # y axis is shared, x axis is not
ax[1].set_title("Real species")

plt.show()

In [None]:
from sklearn.metrics import homogeneity_score, normalized_mutual_info_score, silhouette_score
from sklearn.metrics.cluster import contingency_matrix

# scikit-learn's supervised clustering metrics take (labels_true, labels_pred)
# in that order. Homogeneity is NOT symmetric: swapping the arguments silently
# computes completeness instead.
# Normalized mutual information is bounded in [0, 1], so formatting it as a
# percentage is meaningful (raw mutual information is measured in nats and can
# exceed 1).
print("Agreement with true labels = {:.1%}".format(normalized_mutual_info_score(y, clusters)))
print("Homogeneity with true labels = {:.1%}".format(homogeneity_score(y, clusters))) # [0,1] each cluster contains only members of a single class
print("Silhouette coefficient = {:.2}".format(silhouette_score(X, clusters, metric='euclidean'))) # [-1, 1] the higher, the more compact distinct clusters
# Rows are the true species, columns are the predicted clusters.
display(contingency_matrix(y, clusters))

In [None]:
# Elbow method: fit K-Means for a range of cluster counts and record the
# inertia of each fit. The "elbow" of the resulting curve suggests a
# reasonable number of clusters.

# Single source of truth for the candidate cluster counts, shared by the
# fitting loop and the x axis of the plot so the two cannot drift apart.
cluster_counts = range(1, 15)

intertias = []
for k in cluster_counts:
    # Fixed seed so the curve is identical on every run.
    km = KMeans(n_clusters=k, random_state=0)
    km.fit(X)
    intertias.append(km.inertia_)

# plot
plt.figure(figsize=(10,6))
plt.plot(cluster_counts, intertias, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia (within-cluster sum of squared distances)')
plt.title('Elbow method')
plt.show()

# Exercise

For this exercise you will use another dataset called HTRU2.

You can find it here: https://archive.ics.uci.edu/ml/datasets/HTRU2

This dataset contains data about pulsar candidates.

    Pulsars are a rare type of Neutron star that produce radio emission detectable here on Earth. They are of considerable scientific interest as probes of space-time, the inter-stellar medium, and states of matter (see [2] for more uses).

    As pulsars rotate, their emission beam sweeps across the sky, and when this crosses our line of sight, produces a detectable pattern of broadband radio emission. As pulsars rotate rapidly, this pattern repeats periodically. Thus pulsar search involves looking for periodic radio signals with large radio telescopes.

    Each pulsar produces a slightly different emission pattern, which varies slightly with each rotation. Thus a potential signal detection known as a 'candidate', is averaged over many rotations of the pulsar, as determined by the length of an observation. In the absence of additional info, each candidate could potentially describe a real pulsar. However in practice almost all detections are caused by radio frequency interference (RFI) and noise, making legitimate signals hard to find.

The data set shared here contains 16,259 spurious examples caused by RFI/noise, and 1,639 real pulsar examples. The class labels used are 0 (negative) and 1 (positive).

NB: Do not change the existing code in the cells below; only add the code needed to make it run properly.

## Preparing data

Import the `HTRU_2.csv` dataset and have a look at it using `head`.

In [None]:
# TODO(exercise): replace the `...` placeholder with code that loads
# HTRU_2.csv into a DataFrame. Until then `df` is the Ellipsis object and the
# `head()` call below fails.
df = ...
df.head()

Check whether there are missing values in the dataset (you can use the `isna` and `sum` methods).

In [None]:
# TODO(exercise): replace the placeholder with a check for missing values in df.
...

Extract the target values into one variable and keep all the other columns in another variable.

In [None]:
# TODO(exercise): y should hold the target column, X every remaining column.
# Later cells call y.value_counts() and X.columns, so both need to be pandas
# objects.
y = ...
X = ...

In [None]:
# Remember the feature names: the scaling cell further down rebuilds X from
# the scaler's output and re-attaches these labels.
cols = X.columns

## Modeling

* Create a KMeans model.
* Fit it to your dataset.
* Select 2 columns for the plots.
* Create a scatter plot showing these 2 columns on the two axes, with each point colored by the cluster it belongs to.
* On the same plot, add the centroid points.
* Create a second plot of the same data, colored by the real target values.
* Finally, try different columns to find the ones that show the clusters best.

In [None]:
# Show how many rows carry each target value (assumes y is the pandas Series
# extracted in the earlier exercise cell).
y.value_counts()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature (zero mean, unit variance). K-Means relies on
# Euclidean distances, so without scaling the features with the largest
# numeric range would dominate the clustering.
sc = StandardScaler()

# The scaler's output carries no labels: re-attach the saved column names and
# the original row index in one step so X stays aligned with y.
X = pd.DataFrame(sc.fit_transform(X), columns=cols, index=X.index)

In [None]:
from sklearn.cluster import KMeans

# TODO(exercise): fill in the number of clusters and the data to fit.
# The seed is fixed so the cluster ids (and therefore the colours and the
# metric values in the next cell) are the same on every run.
model = KMeans(n_clusters=..., random_state=0).fit(...)

# cluster_centers_ is a bare array; wrap it so centroids can be indexed by
# feature name when plotting.
centroids = model.cluster_centers_
centroids = pd.DataFrame(centroids, columns=X.columns)

# Cluster id assigned to each fitted row.
clusters = model.labels_

# choose columns for visualization
# TODO(exercise): set these to two column names of X.
column1 = ...
column2 = ...

fig, ax = plt.subplots(1,2,sharey=True,figsize=(12,6) )

# Left panel: points coloured by predicted cluster, centroids overlaid.
ax[0].scatter(X[column1], X[column2], c= clusters, s=50, alpha=0.6)
ax[0].scatter(centroids.loc[:, column1], centroids.loc[:, column2], c='red', s=100, marker="+", label="centroids")
ax[0].set_xlabel(column1)
ax[0].set_ylabel(column2)
ax[0].legend()
ax[0].set_title("Predicted clusters")

# Right panel: the same points coloured by the true class label. Cluster ids
# are arbitrary, so colours need not match the left panel.
ax[1].scatter(X[column1], X[column2], c= y, s=50, alpha=0.8)
ax[1].set_xlabel(column1)  # y axis is shared, x axis is not
ax[1].set_title("Real classes (pulsar vs. non-pulsar)")

plt.show()

In [None]:
from sklearn.metrics import homogeneity_score, normalized_mutual_info_score, silhouette_score
from sklearn.metrics.cluster import contingency_matrix

# TODO(exercise): fill in the placeholders.
# The label-based metrics and contingency_matrix take (labels_true, labels_pred)
# in that order; homogeneity is not symmetric, so the order matters.
# silhouette_score takes (features, cluster_labels).
# Normalized mutual information is bounded in [0, 1], so it can be shown as a
# percentage (raw mutual information is in nats and is not bounded by 1).
print("Agreement with true labels = {:.1%}".format(normalized_mutual_info_score(..., ...)))
print("Homogeneity with true labels = {:.1%}".format(homogeneity_score(..., ...))) # [0,1] each cluster contains only members of a single class
print("Silhouette coefficient = {:.2}".format(silhouette_score(..., ..., metric='euclidean'))) # [-1, 1] the higher, the more compact distinct clusters
display(contingency_matrix(...,...))