In [39]:
from scipy.cluster.hierarchy import linkage,dendrogram

from sklearn.cluster import dbscan,DBSCAN, KMeans
from sklearn.metrics import silhouette_score


import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from sklearn.model_selection import train_test_split

<h1 style = 'color:orange'>DBSCAN</h1>

In [11]:
milk = pd.read_csv('../Datasets/milk.csv',index_col=0)
milk_scaled = StandardScaler().set_output(transform='pandas').fit_transform(milk)

clust = DBSCAN(eps=0.6,min_samples = 2)
clust.fit(milk_scaled)
print(clust.labels_) #labels


milk_copy = milk.copy()
milk_copy['label'] = clust.labels_
milk_copy

[ 0  0  0  0 -1  1  1  2 -1 -1  2  1  0 -1  1  2 -1 -1 -1 -1  3  3 -1 -1
 -1]


Unnamed: 0_level_0,water,protein,fat,lactose,ash,label
Animal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
HORSE,90.1,2.6,1.0,6.9,0.35,0
ORANGUTAN,88.5,1.4,3.5,6.0,0.24,0
MONKEY,88.4,2.2,2.7,6.4,0.18,0
DONKEY,90.3,1.7,1.4,6.2,0.4,0
HIPPO,90.4,0.6,4.5,4.4,0.1,-1
CAMEL,87.7,3.5,3.4,4.8,0.71,1
BISON,86.9,4.8,1.7,5.7,0.9,1
BUFFALO,82.1,5.9,7.9,4.7,0.78,2
GUINEA PIG,81.9,7.4,7.2,2.7,0.85,-1
CAT,81.6,10.1,6.3,4.4,0.75,-1


In [19]:
#exluding in the outliers (label = -1)
inliers = milk_scaled.copy()
inliers['label'] = clust.labels_
inliers = inliers[inliers['label'] != -1]  #removes the outliers
silhouette_score(inliers.iloc[:,:-1],inliers['label'])

0.5934459505692155

In [30]:
#for different values of eosilon and minimum points

epsilon = [0.2,0.4,0.6,0.8,1.2]
min_points = [2,3,4,5]
score = []

for e in epsilon:
    for m in min_points:
        clust = DBSCAN(eps = e,min_samples=m)
        clust.fit(milk_scaled)
        inliers = milk_scaled.copy()
        inliers['label'] = clust.labels_
        inliers = inliers[inliers['label']!=-1] 
        if len(inliers['label'].value_counts())>1:
            s =silhouette_score(inliers.drop('label',axis = 1),inliers['label'])
            score.append([e,m,s])
            
        
score = pd.DataFrame(score, columns=['Epsilon','Minimum Points','Score'])
score.sort_values('Score',ascending=False)

Unnamed: 0,Epsilon,Minimum Points,Score
0,0.4,2,0.651894
2,0.6,2,0.593446
8,1.2,2,0.552889
4,0.6,4,0.551975
1,0.4,3,0.538518
3,0.6,3,0.534443
6,0.8,3,0.533038
5,0.8,2,0.464674
7,0.8,4,0.457151


In [54]:
kyph = pd.read_csv('../Cases/Kyphosis/Kyphosis.csv')
scaler = StandardScaler().set_output(transform='pandas')


train, test = train_test_split(kyph, random_state=25, stratify=y, test_size=0.3)

X_train = train.drop('Kyphosis', axis=1)
X_train = scaler.fit_transform(X_train)
Y_train = train['Kyphosis']

X_test = test.drop('Kyphosis', axis=1)
X_test = scaler.fit_transform(X_test)
Y_test = test['Kyphosis']


scores = []
for i in range(2,11):
    clust = KMeans(random_state=25,n_clusters=i)
    clust.fit(X_train)
    scores.append([i,silhouette_score(X_train,clust.labels_)])
scores = pd.DataFrame(scores,columns=['clusters','score'])
scores.sort_values('score',ascending=False)

Unnamed: 0,clusters,score
2,4,0.396206
8,10,0.39334
7,9,0.366559
3,5,0.351667
0,2,0.348402
1,3,0.330521
6,8,0.323228
5,7,0.287655
4,6,0.285068


In [58]:
kmeans = KMeans(random_state=25, n_clusters=4)
kmeans.fit(X_train)
train['cluster'] = kmeans.labels_
print(silhouette_score(X_train, kmeans.labels_))
kmeans.predict(X_test)

0.39620630081028957


array([0, 2, 2, 3, 3, 0, 1, 3, 3, 3, 2, 2, 2, 2, 1, 0, 3, 0, 2, 3, 3, 3,
       2, 1, 3], dtype=int32)

In [70]:
print(train['cluster'].unique())

train[train['cluster'] == 0]

[3 1 2 0]


Unnamed: 0,Kyphosis,Age,Number,Start,cluster
21,present,105,6,5,0
58,absent,51,7,9,0
52,present,139,10,6,0
79,present,42,7,6,0
24,present,15,7,2,0
42,absent,143,9,3,0
9,present,59,6,12,0
77,absent,26,7,13,0
