In [1]:
import numpy as np
import pandas as pd
from scipy.spatial import distance
from sklearn.preprocessing import LabelEncoder

from sklearn.cluster import KMeans
from K_means import *

## Load E. coli Dataset

In [2]:
# Column labels to assign, in file order: the sequence identifier, the
# seven measured attributes, and finally the class label.
names = ["Sequence_Name"] + "mcg gvh lip chg aac alm1 alm2".split() + ["class"]
# header=None tells pandas to treat every row of the CSV as data and to take
# the column labels from `names` instead.
df = pd.read_csv("ecoli.csv", names=names, header=None)
df.head()

Unnamed: 0,Sequence_Name,mcg,gvh,lip,chg,aac,alm1,alm2,class
0,AAT_ECOLI,0.49,0.29,0.48,0.5,0.56,0.24,0.35,cp
1,ACEA_ECOLI,0.07,0.4,0.48,0.5,0.54,0.35,0.44,cp
2,ACEK_ECOLI,0.56,0.4,0.48,0.5,0.49,0.37,0.46,cp
3,ACKA_ECOLI,0.59,0.49,0.48,0.5,0.52,0.45,0.36,cp
4,ADI_ECOLI,0.23,0.32,0.48,0.5,0.55,0.25,0.35,cp


Drop the `Sequence_Name` column and encode the class labels as integers

In [3]:
# Encode the string class labels as integer codes; `le` keeps the fitted
# mapping so the original labels can be recovered later.
le = LabelEncoder()
le.fit(df["class"])
df["class"] = le.transform(df["class"])

# Remove the Sequence_Name column
df = df.drop("Sequence_Name", axis="columns")
df.head()

Unnamed: 0,mcg,gvh,lip,chg,aac,alm1,alm2,class
0,0.49,0.29,0.48,0.5,0.56,0.24,0.35,0
1,0.07,0.4,0.48,0.5,0.54,0.35,0.44,0
2,0.56,0.4,0.48,0.5,0.49,0.37,0.46,0
3,0.59,0.49,0.48,0.5,0.52,0.45,0.36,0
4,0.23,0.32,0.48,0.5,0.55,0.25,0.35,0


In [4]:
# X: every column except the last one, as a NumPy array.
# y: the last column (the encoded class), kept as a pandas Series.
X, y = df.iloc[:, :-1].values, df.iloc[:, -1]

## Implementation of k-means

In [6]:
# Fit the custom k-means implementation (presumably provided by the local
# K_means module imported above) to the feature matrix, with two clusters.
my_kmeans = K_means(n_clusters=2)
my_kmeans.fit(X)

# Read the fitted centroids and the per-sample cluster ids off the estimator,
# then show both.
my_centroids, my_labels = my_kmeans._centroids, my_kmeans._labels
print(my_centroids)
print(my_labels)

[[ 0.5871028   0.48700935  0.49943925  0.5         0.5611215   0.76102804
   0.77130841]
 [ 0.45938865  0.50606987  0.49362445  0.50218341  0.47148472  0.37829694
   0.37283843]]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 0 1 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1]


In [7]:
# Reference clustering: scikit-learn's KMeans with the same number of
# clusters; random_state is pinned so the run is repeatable.
skl_kmeans = KMeans(n_clusters=2, random_state=0)
skl_kmeans.fit(X)

In [10]:
# Centroids and per-sample cluster ids produced by the scikit-learn estimator
skl_centroids, skl_labels = skl_kmeans.cluster_centers_, skl_kmeans.labels_

## Comparison

In [13]:
# Difference between the two sets of centroids.
#
# A signed element-wise sum is not a usable measure here: positive and
# negative deviations cancel, so it can be ~0 for centroids that differ.
# Subtracting the arrays directly also pairs rows by cluster index, and the
# two implementations need not number their clusters the same way.
#
# Instead, compute the Euclidean distance between every scikit-learn centroid
# and every custom centroid, match each centroid to its nearest counterpart in
# the other set, and report the largest such distance. The result is 0 only
# when both sets contain the same centroids, in any order.
pairwise_dist = distance.cdist(skl_centroids, my_centroids)
centroids_diff = max(
    pairwise_dist.min(axis=1).max(),  # each sklearn centroid -> nearest custom centroid
    pairwise_dist.min(axis=0).max(),  # each custom centroid -> nearest sklearn centroid
)
print("diff in centroids:", centroids_diff)

diff in centroids: 5.55111512313e-17


In [15]:
# Difference between the two labelings.
#
# Summing signed label differences is misleading: a +1 and a -1 mismatch
# cancel, so the sum can be 0 for labelings that disagree, and the value
# depends on the arbitrary integer ids each implementation gives its clusters.
#
# Instead, cross-tabulate the labelings (rows: scikit-learn ids, columns:
# custom ids). Samples that fall outside the best-overlapping cell of their
# row (or column) are disagreements; taking the smaller agreement count of
# the two directions makes the measure symmetric. The result is 0 only when
# the two labelings are identical up to a renaming of the cluster ids.
contingency = pd.crosstab(skl_labels, my_labels).values
n_agree = min(contingency.max(axis=1).sum(), contingency.max(axis=0).sum())
label_diff = int(contingency.sum() - n_agree)
print("diff in labels", label_diff)

diff in labels 0
