In [35]:
import numpy as np
import pandas as pd
from collections import OrderedDict

In [439]:
def standardize(labels):
    return (labels-labels.mean())/labels.std()

def modeVal(a, axis=0):
    scores = np.unique(np.ravel(a))      
    testshape = list(a.shape)
    testshape[axis] = 1
    oldmostfreq = np.zeros(testshape)
    oldcounts = np.zeros(testshape)

    for score in scores:
        template = (a == score)
        counts = np.expand_dims(np.sum(template, axis),axis)
        mostfrequent = np.where(counts > oldcounts, score, oldmostfreq)
        oldcounts = np.maximum(counts, oldcounts).astype(int)

    return oldcounts[0]

In [2]:
a = pd.read_csv('data/avir-tzek.tsv', sep='\t')
a = a.round(1)
a2 = a.dropna().copy()

for col in a2:
    typ = str(a2[col].dtype)
    if typ in ['int', 'int32', 'int64','float', 'float32', 'float64']:
        a2[col] = standardize(a2[col])

In [560]:
class Feature:
    def __init__(self, featType, labels):
        self.vals = labels.as_matrix()
        self.type = featType
        _, inds, counts = np.unique(self.vals, return_inverse=True, return_counts=True)
        self.valFreq = counts[inds]
        return
    
    def getGamma(self, cntrMask, it=1):
        return self.vals[cntrMask].std() if it > 0 else 1
    
    def getCenter(self, cntrMask):
        if self.type==float:
            return self.vals[cntrMask].mean()
        else:
            return modeVal(self.vals[cntrMask])
      
    def getSimilarity(self, cntrMask, cntr, it=1):
        if self.type==float:
            return (self.vals - cntr)**2
        else:
            return 1-(self.vals==cntr)*(modeVal(self.vals[cntrMask])/cntrMask.sum()) if it>0 else (self.vals!=cntr).astype(int)
            
        
#######################################################################################################################

class KModes:
    def __init__(self, data, k, it=10000):
        self.k, self.it = k, it
        self.n_data, self.n_features = data.shape[0], data.shape[1]
        self.n_chng = round(self.n_data * 0.1)
        #Create features
        self.features = np.array([Feature(data.dtypes[col], data[col]) for col in data])
        self.n_numFeats = np.sum([1 for feat in self.features if feat.type==float])
        self.n_catFeats = self.n_features - self.n_numFeats
        #Init the centers
        self.initCenters()
        return
    
    def initCenters(self):
        cntrs = [0 for i in range(self.k)]
        dens = np.zeros(self.n_data)
        for feat in self.features:
            dens += (feat.valFreq/(self.n_data*self.n_features))
        cntrs[0] = [feat.vals[dens[0].argmax()] for feat in self.features]
        
        for ki in range(1, self.k):
            dens2 = np.zeros((ki, self.n_data))
            for kii in range(ki):
                sims = np.zeros((self.n_data, self.n_features))
                for f, feat in enumerate(self.features):
                    sims[:,f] = feat.getSimilarity(None, cntrs[kii][f], 0)
                dens2[kii] = dens * sims.sum(axis=1)
            cntrs[ki] = [feat.vals[np.argmax(np.min(dens2, axis=0))] for feat in self.features]
            
        self.cntrs = cntrs
        self.clsses = self.assignClsses(0, None)
        self.fitData()
        return 
    
    def assignClsses(self, it, clsses):
    #Assign data to closest Mode
        dists = np.zeros((self.n_data, self.k))
        for i, cntr in enumerate(self.cntrs):
            num_feats, gamma = np.zeros((self.n_data, self.n_numFeats)), np.zeros((self.n_data,self.n_numFeats))
            cat_feats = np.zeros((self.n_data,self.n_catFeats))
            n, c = 0, 0
            for f, feat in enumerate(self.features):
                if feat.type == float:
                    num_feats[:,n] += feat.getSimilarity(clsses, cntr[f], it)
                    gamma[:,n] += feat.getGamma(clsses, it)
                    n += 1
                else:
                    cat_feats[:,c] += feat.getSimilarity(clsses, cntr[f], it)
                    c += 1
            dists[:,i] = num_feats.sum(axis=1) + gamma.mean(axis=1) * cat_feats.sum(axis=1)
        clsses = dists.argmax(axis=1)
        if np.unique(clsses).shape[0] != self.k:
            clsses = self.removeCenters(clsses)
        return clsses
    
    def fitData(self):
    #Iterate until Modes are set
        cnvrg = 0
        clsses = self.clsses
        for i in range(self.it):
            self.setCenters(clsses)
            new_clsses = self.assignClsses(i, clsses)
            chng = np.where(clsses != new_clsses)[0]
            if ((chng.shape[0] < self.n_chng)&(cnvrg==10))|(chng.shape[0]==0):
                print('I converged at iteration {}'.format(i))
                self.clsses = new_clsses
                self.matForm()
                return
            elif((chng.shape[0] <= self.n_chng))&(cnvrg<10):
                print('Converging: {} iterations till convergence condition'.format(10-cnvrg))
                cnvrg += 1
            else:
                cnvrg = 0
            clsses = new_clsses
        self.clsses = clsses
        self.matForm()
        return
        
    def setCenters(self, clsses):
    #Get centers for each feature for each mode
        self.cntrs =  [[feat.getCenter(clsses==i) for feat in self.features] for i in range(self.k)]
        return
   
    
    def removeCenters(self, clsses):
    #If no pts are assigned to Center, remove Center
        missing = np.setdiff1d(range(0,self.k), clsses)
        cntrs = [0 for c in range(self.k-missing.shape[0])]
        new_k = 0 
        for m in range(self.k):
            if m not in missing:
                cntrs[new_k] = self.cntrs[m]
                new_k += 1
            else:
                clsses[clsses>m] -= 1
        self.k = new_k
        self.cntrs = cntrs
        return clsses
    
    def matForm(self):
    #Class array to matrix form
        clssesMat = np.zeros((self.n_data, self.k))
        for r, cls in enumerate(self.clsses):
            clssesMat[r, cls] += 1
            
        self.clssesMat = clssesMat
        return
            


In [None]:
%timeit aC2 = KModes(a2, 5, 10000)

In [584]:
aC2 = KModes(a2, 5, 10000)

In [585]:
aC2.clsses

array([0, 0, 0, 0, 2, 2, 2, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 0, 0, 0,
       0, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 0, 0, 0, 2, 2, 1, 1, 1, 1, 1, 2, 2, 0, 0, 0, 0, 2,
       2, 2, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 1, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,