In [131]:
import numpy as np
import pandas as pd
from scipy import stats

In [2]:
def standardize(labels):
    return (labels-labels.mean())/labels.std()

def modeVal(a, axis=0):
    scores = np.unique(np.ravel(a))      
    testshape = list(a.shape)
    testshape[axis] = 1
    oldmostfreq = np.zeros(testshape)
    oldcounts = np.zeros(testshape)

    for score in scores:
        template = (a == score)
        counts = np.expand_dims(np.sum(template, axis),axis)
        mostfrequent = np.where(counts > oldcounts, score, oldmostfreq)
        oldcounts = np.maximum(counts, oldcounts).astype(int)

    return oldcounts[0]

def mode2(a, axis=0):
    scores = np.unique(np.ravel(a))      
    testshape = list(a.shape)
    testshape[axis] = 1
    oldmostfreq = np.zeros(testshape)
    oldcounts = np.zeros(testshape)

    for score in scores:
        template = (a == score)
        counts = np.expand_dims(np.sum(template, axis),axis)
        mostfrequent = np.where(counts > oldcounts, score, oldmostfreq)
        oldcounts = np.maximum(counts, oldcounts)
        oldmostfreq = mostfrequent

    return mostfrequent[0]

In [3]:
a = pd.read_csv('data/avir-tzek.tsv', sep='\t')
a = a.round(1)
a2 = a.dropna().copy()

for col in a2:
    typ = str(a2[col].dtype)
    if typ in ['int', 'int32', 'int64','float', 'float32', 'float64']:
        a2[col] = standardize(a2[col])

In [241]:
class Feature:
    def __init__(self, featType, labels):
        self.vals = labels.as_matrix()
        self.type = featType
        vals, inds, counts = np.unique(self.vals, return_inverse=True, return_counts=True)
        self.valDict = {v:c for v,c in  zip(vals, counts)}
        self.valFreq = counts[inds]
    
    def getCenter(self, cntrMask):
        if self.type==float:
            return self.vals[cntrMask].mean()
        else:
            return mode2(self.vals[cntrMask])
      
    def getSimilarity(self, cntrMask, cntr, it=1):
        if self.type==float:
            return (self.vals - cntr)**2
        else:
            weights = (self.valFreq + self.valDict[cntr])/(self.valFreq * self.valDict[cntr])
            
            return weights * 1-(self.vals==cntr)*(modeVal(self.vals[cntrMask])/cntrMask.sum()) if it>0 else (self.vals!=cntr).astype(int)
            
        
#######################################################################################################################

class KModes:
    def __init__(self, data, k, it=10000):
        self.k, self.it = k, it
        self.n_data, self.n_features = data.shape[0], data.shape[1]
        self.n_chng = round(self.n_data * 0.1)
        #Create features
        self.gamma = 0.5
        self.features = np.array([Feature(data.dtypes[col], data[col]) for col in data])
        self.n_numFeats = np.sum([1 for feat in self.features if feat.type==float])
        self.n_catFeats = self.n_features - self.n_numFeats
        #Init the centers
        self.initCenters()
        return 
    
    def initCenters(self):
        cntrs = [0 for i in range(self.k)]
        dens = np.zeros(self.n_data)
        for feat in self.features:
            dens += (feat.valFreq/(self.n_data*self.n_features))
        cntrs[0] = [feat.vals[dens[0].argmax()] for feat in self.features]
        
        for ki in range(1, self.k):
            dens2 = np.zeros((ki, self.n_data))
            for kii in range(ki):
                sims = np.zeros((self.n_data, self.n_features))
                for f, feat in enumerate(self.features):
                    sims[:,f] = feat.getSimilarity(None, cntrs[kii][f], 0)
                dens2[kii] = dens * sims.sum(axis=1)
            cntrs[ki] = [feat.vals[np.argmax(np.min(dens2, axis=0))] for feat in self.features] 
        self.cntrs = cntrs
        self.clsses = self.assignClsses(0, None)
        self.fitData()
        return self
    
    def assignClsses(self, it, clsses):
    #Assign data to closest Mode
        dists = np.zeros((self.n_data, self.k))
        for i, cntr in enumerate(self.cntrs):
            num_feats = np.zeros((self.n_data, self.n_numFeats))
            cat_feats = np.zeros((self.n_data,self.n_catFeats))
            n, c = 0, 0
            for f, feat in enumerate(self.features):
                if feat.type == float:
                    num_feats[:,n] += feat.getSimilarity(clsses, cntr[f], it)
                    n += 1
                else:
                    cat_feats[:,c] += feat.getSimilarity(clsses, cntr[f], it)
                    c += 1
            dists[:,i] = num_feats.sum(axis=1) + self.gamma * cat_feats.sum(axis=1)
        clsses = dists.argmin(axis=1)
        if np.unique(clsses).shape[0] != self.k:
            self.newCenters(clsses)
            self.assignClsses(it, clsses)
        return clsses
    
    def fitData(self):
    #Iterate until Modes are set
        cnvrg = 0
        clsses = self.clsses
        for i in range(self.it):
            self.setCenters(clsses)
            new_clsses = self.assignClsses(i, clsses)
            chng = np.where(clsses != new_clsses)[0]
            if ((chng.shape[0] < self.n_chng)&(cnvrg==10))|(chng.shape[0]==0):
                print('I converged at iteration {}'.format(i))
                self.clsses = new_clsses
                self.matForm()
                return self
            elif((chng.shape[0] <= self.n_chng))&(cnvrg<10):
                print('Converging: {} iterations till convergence condition'.format(10-cnvrg))
                cnvrg += 1
            else:
                cnvrg = 0
            clsses = new_clsses
        self.clsses = clsses
        self.matForm()
        return self
        
    def setCenters(self, clsses):
    #Get centers for each feature for each mode
        cntrs = [0 for c in range(self.k)]
        for i in range(self.k):
            cntrs[i] = [feat.getCenter(clsses==i) for feat in self.features]
        self.cntrs= cntrs
        return self
    
    def printCenters(self):
        for cntr in self.cntrs:
            print()
            print(cntr)
            print()
            
    def newCenters(self, clsses):
        missing = np.setdiff1d(range(0,self.k), clsses)
        new_cntrs = np.random.choice(np.where(d == stats.mode(clsses)[0][0])[0], missing.shape[0])
        for m, nc in zip(missing, new_cntrs):
            self.cntrs[m] = [feat.vals[new_cntrs[nc]] for feat in self.features]
        return
    
    def matForm(self):
    #Class array to matrix form
        clssesMat = np.zeros((self.n_data, self.k))
        for r, cls in enumerate(self.clsses):
            clssesMat[r, cls] += 1
            
        self.clssesMat = clssesMat
        return self
            


In [242]:
aC2 = KModes(a2, 100, 1000)

Converging: 10 iterations till convergence condition
Converging: 9 iterations till convergence condition
Converging: 8 iterations till convergence condition
Converging: 7 iterations till convergence condition
I converged at iteration 4


In [225]:
soy = pd.read_csv('data/soybean.csv', header=None)
soy2 = soy.copy()
for col in soy2:
    typ = str(soy2[col].dtype)
    if typ in ['int', 'int32', 'int64','float', 'float32', 'float64']:
        soy2[col] = standardize(soy2[col])
soy2

soy2.dropna(inplace=True, axis=1)
soyClusters = KModes(soy2.drop([35], axis=1), 4, 1000)

Converging: 10 iterations till convergence condition
I converged at iteration 1


In [227]:
classtable = np.zeros((4, 4), dtype=int)
for ii, _ in enumerate(y):
    classtable[int(y[ii][-1]) - 1, soyClusters.clsses[ii]] += 1

print("\n")
print("    | Cl. 1 | Cl. 2 | Cl. 3 | Cl. 4 |")
print("----|-------|-------|-------|-------|")
for ii in range(4):
    prargs = tuple([ii + 1] + list(classtable[ii, :]))
    print(" D{0} |    {1:>2} |    {2:>2} |    {3:>2} |    {4:>2} |".format(*prargs))



    | Cl. 1 | Cl. 2 | Cl. 3 | Cl. 4 |
----|-------|-------|-------|-------|
 D1 |    10 |     0 |     0 |     0 |
 D2 |     0 |    10 |     0 |     0 |
 D3 |     0 |     0 |    10 |     0 |
 D4 |     0 |     0 |     0 |    17 |


In [228]:
stocks = pd.read_csv('data/stocks.csv', header=None)
stocks2 = stocks.copy()
for col in stocks2:
    typ = str(stocks2[col].dtype)
    if typ in ['int', 'int32', 'int64','float', 'float32', 'float64']:
        stocks2[col] = standardize(stocks2[col])
stocks2
stockClusters = KModes(stocks2.drop([0],axis=1), 4, 100)

I converged at iteration 0


In [231]:
stocks2

Unnamed: 0,0,1,2,3
0,AAPL,2.854111,tech,USA
1,XOM,0.327117,nrg,USA
2,GOOGL,0.318214,tech,USA
3,MSFT,0.170977,tech,USA
4,BRK-A,0.149063,fin,USA
5,WFC,-0.269363,fin,USA
6,CHL,-0.271418,tel,CN
7,JNJ,-0.287854,cons,USA
8,WMT,-0.441939,cons,USA
9,VZ,-0.798046,tel,USA


In [235]:
import numpy as np
from kmodes.kmodes import KModes

# reproduce results on small soybean data set
x = np.genfromtxt('data/soybean.csv', dtype=int, delimiter=',')[:, :-1]
y = np.genfromtxt('data/soybean.csv', dtype=str, delimiter=',', usecols=(35, ))

kmodes_huang = KModes(n_clusters=4, init='Huang', verbose=1)
kmodes_huang.fit(x)

# Print cluster centroids of the trained model.
print('k-modes (Huang) centroids:')
print(kmodes_huang.cluster_centroids_)
# Print training statistics
print('Final training cost: {}'.format(kmodes_huang.cost_))
print('Training iterations: {}'.format(kmodes_huang.n_iter_))

kmodes_cao = KModes(n_clusters=4, init='Cao', verbose=1)
kmodes_cao.fit(x)

# Print cluster centroids of the trained model.
print('k-modes (Cao) centroids:')
print(kmodes_cao.cluster_centroids_)
# Print training statistics
print('Final training cost: {}'.format(kmodes_cao.cost_))
print('Training iterations: {}'.format(kmodes_cao.n_iter_))

print('Results tables:')
for result in (kmodes_huang, kmodes_cao):
    classtable = np.zeros((4, 4), dtype=int)
    for ii, _ in enumerate(y):
        classtable[int(y[ii][-1]) - 1, result.labels_[ii]] += 1

    print("\n")
    print("    | Cl. 1 | Cl. 2 | Cl. 3 | Cl. 4 |")
    print("----|-------|-------|-------|-------|")
    for ii in range(4):
        prargs = tuple([ii + 1] + list(classtable[ii, :]))
        print(" D{0} |    {1:>2} |    {2:>2} |    {3:>2} |    {4:>2} |".format(*prargs))

Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 0, cost: 199.0
k-modes (Huang) centroids:
[[0 1 2 1 0 3 1 2 1 0 1 1 0 2 2 0 0 0 1 0 2 2 0 0 0 0 0 3 4 0 0 0 0 0 1]
 [0 1 2 0 0 0 1 1 0 1 1 0 0 2 2 0 0 0 1 0 1 1 0 1 0 0 0 3 4 0 0 0 0 0 0]
 [3 0 2 1 0 1 0 1 0 2 1 1 0 2 2 0 0 0 1 0 3 1 1 1 0 0 0 0 4 0 0 0 0 0 0]
 [5 0 0 2 1 1 2 1 0 0 1 1 0 2 2 0 0 0 1 0 0 3 0 0 0 2 1 0 4 0 0 0 0 0 0]]
Final training cost: 199.0
Training iterations: 1
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 2, cost: 204.0
Run 1, iteration: 2/100, moves: 2, cost: 199.0
Run 1, iteration: 3/100, moves: 0, cost: 199.0
k-modes (Cao) centroids:
[[0 1 2 1 0 3 1 2 1 0 1 1 0 2 2 0 0 0 1 0 2 2 0 0 0 0 0 3 4 0 0 0 0 0 1]
 [5 0 0 2 1 1 2 1 0 0 1 1 0 2 2 0 0 0 1 0 0 3 0 0 0 2 1 0 4 0 0 0 0 0 0]
 [3 0 2 1 0 1 0 1 0 2 1 1 0 2 2 0 0 0 1 0 3 1 1 1 0 0 0 0 4 0 0 0 0 0 0]
 [0 1 2 0 0 3 1 1 0 1 1 0 0 2 2 0 0 0 