In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import sklearn
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report

# Let wide or long frames render in full instead of being truncated:
# show up to MAX_DISPLAY columns and up to MAX_DISPLAY rows.
MAX_DISPLAY = 200
pd.options.display.max_columns = MAX_DISPLAY
pd.options.display.max_rows = MAX_DISPLAY



In [None]:
# For this exercise we do clustering on a multiclass 2D dataset.
# Source: https://cs.joensuu.fi/sipu/datasets/ Aggregation shape

ds = pd.read_csv('Aggregation.txt')

# Fail early, with a clear message, if the file does not provide the columns
# that every later cell relies on (coordinates X, Y and the Group label).
expected_columns = {'X', 'Y', 'Group'}
missing_columns = expected_columns - set(ds.columns)
assert not missing_columns, f'Aggregation.txt is missing columns: {sorted(missing_columns)}'

# Shuffle the rows. The seed is fixed so that Restart & Run All always yields the
# same row order; otherwise order-sensitive steps further down could give a
# different result on every run.
ds = ds.sample(frac=1, random_state=0).reset_index(drop=True)

ds.head()

In [None]:
# Summary statistics of the numeric columns of the shuffled dataset.
ds.describe()
# So we have 7 classes (the Group column) of points, each given by its
# X and Y coordinates on the plane.

In [None]:
%%time
# Let's visualize this!
plt.figure(figsize = (20,20))

sns.stripplot(data=ds, x='X', y='Y', hue='Group', size=15)

In [None]:
%%time
# Let's try a simple KMeans clustering to identify our 7 groups

from sklearn.cluster import KMeans

ds_clustered = ds[['X','Y']]
kmeans = KMeans(n_clusters=7, random_state=0)
kmeans.fit(ds_clustered)

ds_clustered['Group'] = kmeans.labels_


plt.figure(figsize = (20,20))
sns.stripplot(data=ds_clustered, x='X', y='Y', hue='Group', size=15)

In [None]:
%%time
# So KMeans worked quite good, except for identifying the boundaries of group 4,5,7
# Let's see another method, DBSCAN
from sklearn.cluster import DBSCAN


ds_clustered = ds[['X','Y']]
# The success of dbscan lies in choosing the right epsilon and min_samples values.
# Epsilon tells dscan what is the maximum euclidean distance we allow between neighbours,
# and min_samples tells us how many neighbours are required to accept a point to a class.
# These current settings are not perfect.
# The problems are: 
#     - Group number 7 is not seperated from number 4
#     - Group number 6 and 3 form one group
dbscan = DBSCAN(eps=1.5, min_samples=3)
dbscan.fit(ds_clustered)

ds_clustered['Group'] = dbscan.labels_

plt.figure(figsize = (20,20))
sns.stripplot(data=ds_clustered, x='X', y='Y', hue='Group', size=15)

In [None]:
%%time
# Let's take a look at AgglomerativeClustering

from sklearn.cluster import AgglomerativeClustering

ds_clustered = ds[['X','Y']]
agglomerative = AgglomerativeClustering(n_clusters=7)
agglomerative.fit(ds_clustered)

ds_clustered['Group'] = agglomerative.labels_


plt.figure(figsize = (20,20))
sns.stripplot(data=ds_clustered, x='X', y='Y', hue='Group', size=15)
# The problems are the same as with KMeans, but with more consistent results ie. Less random outliers.