In [2]:
import numpy as np
import pandas as pd
from scipy import stats

def standardize(labels):
    """Return the z-scores of ``labels``: ``(labels - mean) / std``.

    Parameters
    ----------
    labels : pandas.Series, pandas.DataFrame or numpy.ndarray
        Numeric values exposing ``.mean()`` and ``.std()``. The standard
        deviation is whatever ``labels.std()`` returns for that type.

    Returns
    -------
    Same type as ``labels - labels.mean()``.
        The centered and scaled values. When the standard deviation is a scalar
        equal to zero (every value identical) the centered values (all zeros)
        are returned instead of the NaNs produced by dividing 0 by 0.
    """
    centered = labels - labels.mean()
    spread = labels.std()
    # Only a scalar spread can be compared safely; a DataFrame yields a Series
    # of per-column spreads and keeps the plain element-wise division.
    if np.ndim(spread) == 0 and spread == 0:
        return centered
    return centered / spread

In [129]:
# Read the tab-separated dataset and round its values to one decimal place.
a = pd.read_csv('data/avir-tzek.tsv', sep='\t').round(1)
# Independent copy that keeps only the rows without any missing value.
a2 = a.dropna().copy()

# z-score every column whose dtype name is one of the listed int/float types;
# all other columns are left exactly as loaded.
for col in a2.columns:
    typ = str(a2[col].dtype)
    if typ not in ('int', 'int32', 'int64', 'float', 'float32', 'float64'):
        continue
    a2[col] = standardize(a2[col])

In [460]:
class Feature:
    """Per-column bookkeeping used by ``KModes``.

    A feature stores the distinct values of one column and a
    ``(n_unique, k)`` matrix ``featMat`` whose entry ``[v, c]`` is the number of
    rows that hold distinct value ``v`` and are currently assigned to cluster
    ``c``. The centre, spread and similarity helpers all read those counts.

    Attributes
    ----------
    k : int
        Number of clusters (columns of ``featMat``).
    featType : dtype-like
        Column dtype; the column is treated as numeric when ``featType == float``
        and as categorical otherwise.
    vals : numpy.ndarray
        Sorted distinct non-null values of the column.
    origInds : numpy.ndarray
        For every non-null row, the index of its value inside ``vals``.
    featDict : dict
        Maps each distinct value to its number of occurrences.
    n : int
        ``n_unique * k``, the number of cells in ``featMat``.
    """

    def __init__(self, k, featType, labels):
        self.k = k
        self.featType = featType
        self.initFeatMat(labels)

    def initFeatMat(self, labels):
        """Index the distinct values of ``labels`` and allocate an empty count matrix.

        ``labels`` is a pandas Series; null entries are ignored. The Series
        passed in is not modified.
        """
        # Non-mutating drop: the caller's column must not lose rows as a side effect.
        labels = labels.dropna()
        vals, orig_inds, val_counts = np.unique(
            labels, return_inverse=True, return_counts=True
        )
        self.n = vals.shape[0] * self.k
        self.featMat = np.zeros((vals.shape[0], self.k))
        self.vals, self.origInds = vals, orig_inds
        # Frequency of every distinct value over the whole column.
        self.featDict = {v: c for v, c in zip(vals, val_counts)}

    def fillFeatMat(self, clsses):
        """Rebuild ``featMat`` from scratch for the cluster assignment ``clsses``.

        ``clsses[i]`` is the cluster index of the i-th non-null row. Every row
        adds one to the cell (its value, its cluster), so the matrix holds
        counts rather than 0/1 flags.
        """
        counts = np.zeros((self.vals.shape[0], self.k))
        # ufunc.at accumulates repeated (value, cluster) pairs; plain fancy
        # assignment would record each pair only once.
        np.add.at(counts, (self.origInds, np.asarray(clsses)), 1)
        self.featMat = counts

    def updateFeatMat(self, clss_chng, old_clsses, clsses):
        """Move the rows listed in ``clss_chng`` from their old cluster to their new one.

        Parameters
        ----------
        clss_chng : array of int
            Positions of the rows whose cluster changed.
        old_clsses, clsses : array of int
            Previous and current cluster index of every row.
        """
        rows = self.origInds[clss_chng]
        np.subtract.at(self.featMat, (rows, np.asarray(old_clsses)[clss_chng]), 1)
        np.add.at(self.featMat, (rows, np.asarray(clsses)[clss_chng]), 1)

    def _clusterValues(self, mode):
        """Return the values of the rows in cluster ``mode``, one entry per row."""
        return np.repeat(self.vals, self.featMat[:, mode].astype(int))

    def getSD(self, mode, it=1):
        """Standard deviation of the values in cluster ``mode``.

        Returns the constant 1 when ``it == 0``, i.e. before any assignment has
        been recorded in ``featMat``.
        """
        if it == 0:
            return 1
        return np.std(self._clusterValues(mode))

    def getCenter(self, mode):
        """Representative value of cluster ``mode``.

        Numeric feature: the mean of the cluster's values.
        Categorical feature: the most frequent value in the cluster.
        """
        if self.featType == float:
            return np.mean(self._clusterValues(mode))
        return self.vals[self.featMat[:, mode].argmax(axis=0)]

    def getSimilarity(self, mode, cntr, it=1):
        """Per-row dissimilarity between the column and the centre ``cntr``.

        Numeric feature: squared difference to ``cntr``.
        Categorical feature with ``it == 0``: 0 for a match, 1 for a mismatch.
        Categorical feature otherwise: 1 for a mismatch, and for a match
        ``1 - max(count) / sum(count)`` taken over cluster ``mode``.

        ``mode`` is only read in the last case.
        """
        row_vals = self.vals[self.origInds]
        if self.featType == float:
            return (row_vals - cntr) ** 2
        if it == 0:
            return np.array(row_vals != cntr).astype(int)
        col = self.featMat[:, mode]
        total = col.sum()
        # An empty cluster has no dominant value: treat every row as a full mismatch
        # instead of dividing by zero.
        weight = col.max() / total if total else 0.0
        return 1 - (row_vals == cntr) * weight
        
    
                
        
#######################################################################################################################

class KModes:
    """Cluster the rows of a DataFrame that mixes numeric and categorical columns.

    Columns whose dtype compares equal to ``float`` contribute squared
    differences; every other column contributes a categorical mismatch score
    weighted by ``gamma``, the mean standard deviation of the numeric columns
    inside the cluster. Fitting runs inside the constructor.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to cluster. Must not contain missing values.
    k : int
        Requested number of clusters. Clusters that receive no rows are dropped,
        so ``self.k`` can be smaller after fitting.
    it : int, default 10000
        Maximum number of assignment iterations.

    Attributes
    ----------
    clsses : numpy.ndarray
        Cluster index of every row after fitting.
    modes : list of list
        One centre per cluster, holding one value per feature.
    n_chng : int
        Fitting stops once at most this many rows (10% of the data) change
        cluster between two consecutive iterations.

    Raises
    ------
    ValueError
        If ``data`` contains missing values.
    """

    def __init__(self, data, k, it=10000):
        # Each Feature drops its own nulls, which would silently shift row
        # positions and misalign the per-feature indices; reject them up front.
        if data.isnull().values.any():
            raise ValueError(
                "KModes requires data without missing values; drop or impute them first"
            )
        self.k, self.it = k, it
        self.n_data, self.n_features = data.shape[0], data.shape[1]
        self.n_chng = round(self.n_data * 0.1)
        # One Feature per column.
        self.features = np.array(
            [Feature(self.k, data.dtypes[col], data[col]) for col in data]
        )
        self.initModes(data)
        self.fitData()

    def _rowValues(self, row):
        """Return the values of data row ``row``, one per feature."""
        return [feat.vals[feat.origInds[row]] for feat in self.features]

    def _initDistance(self, mode):
        """Distance of every row to ``mode`` with 0/1 matching for categorical features."""
        sims = np.zeros((self.n_data, self.n_features))
        for f, feat in enumerate(self.features):
            sims[:, f] = feat.getSimilarity(None, mode[f], 0)
        return sims.sum(axis=1)

    def initModes(self, data):
        """Choose the ``k`` initial modes from the data rows (density based, following Cao).

        The density of a row is the average relative frequency of its values.
        The densest row becomes the first mode; every further mode is the row
        maximising ``density * distance`` to its closest already chosen mode.

        ``data`` is kept for interface compatibility; the values are read from
        ``self.features``.
        """
        scale = self.n_data * self.n_features
        dens = np.zeros(self.n_data)
        for feat in self.features:
            # Occurrences of each distinct value, looked up for every row at once.
            counts = np.bincount(feat.origInds, minlength=feat.vals.shape[0])
            dens += counts[feat.origInds] / scale

        modes = [self._rowValues(int(dens.argmax()))]
        # Density-weighted distance of every row to each chosen mode; a row per
        # mode is appended once instead of recomputing all of them every round.
        weighted = []
        for _ in range(1, self.k):
            weighted.append(dens * self._initDistance(modes[-1]))
            nxt = int(np.argmax(np.min(weighted, axis=0)))
            modes.append(self._rowValues(nxt))
        self.modes = modes

    def assignClsses(self, it):
        """Assign every row to its closest mode and return the cluster indices.

        ``it`` is the iteration number; it is forwarded to the features, which
        use simplified measures when it is 0. Modes that end up with no rows are
        removed through ``removeMode``.
        """
        dists = np.zeros((self.n_data, self.k))
        for m, mode in enumerate(self.modes):
            sds = [feat.getSD(m, it) for feat in self.features if feat.featType == float]
            # Without numeric features there is nothing to scale against.
            gamma = np.mean(sds) if sds else 1.0
            for f, feat in enumerate(self.features):
                sim = feat.getSimilarity(m, mode[f], it)
                dists[:, m] += sim if feat.featType == float else gamma * sim
        # Smallest accumulated distance = closest mode.
        clsses = dists.argmin(axis=1)
        if np.unique(clsses).shape[0] != self.k:
            clsses = self.removeMode(clsses)
        return clsses

    def fitData(self):
        """Alternate assignment and centre updates until the assignment stabilises.

        Stops after ``self.it`` iterations or as soon as at most ``self.n_chng``
        rows change cluster. The result is stored in ``self.clsses`` (``None``
        when ``self.it`` is 0).
        """
        old_clsses = None
        clsses = None
        for i in range(self.it):
            k_before = self.k
            clsses = self.assignClsses(i)
            if old_clsses is None or self.k != k_before:
                # First pass, or a mode was removed: the old cluster numbering
                # no longer matches, so rebuild the matrices instead of patching.
                for feat in self.features:
                    feat.featMat = np.zeros((feat.vals.shape[0], feat.k))
                    feat.fillFeatMat(clsses)
                self.setModes()
            else:
                clss_chng = np.where(old_clsses != clsses)[0]
                for feat in self.features:
                    feat.updateFeatMat(clss_chng, old_clsses, clsses)
                self.setModes()
                if clss_chng.shape[0] <= self.n_chng:
                    break
            old_clsses = clsses
        self.clsses = clsses

    def setModes(self):
        """Recompute the centre of every cluster from the features' current state."""
        self.modes = [
            [feat.getCenter(m) for feat in self.features] for m in range(self.k)
        ]

    def removeMode(self, clsses):
        """Drop every mode that has no rows and renumber ``clsses`` accordingly.

        ``clsses`` is modified in place and returned. ``self.k``, ``self.modes``
        and each feature's ``k`` / ``featMat`` shrink by one per removed mode.
        """
        missing = np.setdiff1d(np.arange(self.k), clsses)
        # Highest index first so the remaining indices stay valid after each removal.
        for m in missing[::-1]:
            for feat in self.features:
                feat.featMat = np.delete(feat.featMat, m, axis=1)
                feat.k -= 1
            # Remove the whole mode (one entry of the mode table), not a single cell.
            self.modes = [mode for idx, mode in enumerate(self.modes) if idx != m]
            self.k -= 1
            clsses[clsses > m] -= 1
        return clsses

    def toSparseMatrix(self):
        """Store the assignment as a dense one-hot ``(n_data, k)`` array in ``self.clssesMat``."""
        clssesMat = np.zeros((self.n_data, self.k))
        clssesMat[np.arange(len(self.clsses)), self.clsses] = 1
        self.clssesMat = clssesMat
            


In [465]:
# Cluster the cleaned, standardized frame into 5 groups (at most 100000 iterations).
aClusters = KModes(a2, k=5, it=100000)