In [1]:
import numpy as np
import pandas as pd
from difflib import SequenceMatcher

In [78]:
class Clustering():
    """Accumulate rows of mixed-type data and build pairwise dissimilarity matrices.

    Rows are appended one at a time with ``updateData``. ``schema`` describes
    the attributes as ``[[Name, Type]]`` entries, where ``Type`` is one of the
    tags understood by ``generateDistanceMatrix``: ``"INTERVAL"``, ``"RATIO"``,
    ``"NOMINAL"``, ``"BINARY SYMMETRIC"``, ``"BINARY ASYMMETRIC"`` or
    ``"ORDINAL"``.
    """

    def __init__(self, num_of_means, schema):
        """Store the configuration and start with an empty data set.

        Args:
            num_of_means: Stored as-is; not read by any method of this class.
            schema: Sequence of ``[Name, Type]`` entries, indexed by attribute.
        """
        self.num_of_means = num_of_means
        self.data = []
        self.schema = schema  # [[Name, Type]]

    def updateData(self, row):
        """Append one row (indexable by attribute) to the data set."""
        self.data.append(row)

    def generateDistanceMatrix(self, f, type):
        """Return the n x n dissimilarity matrix for a single attribute.

        Args:
            f: Index (or key) of the attribute inside each stored row.
            type: Attribute type tag selecting the dissimilarity measure.
                (The name shadows the builtin; it is kept so existing
                keyword callers keep working.)

        Returns:
            A float ``numpy.ndarray`` of shape ``(n, n)`` with ``n`` the number
            of stored rows. An unrecognised ``type`` yields an all-zero
            matrix, and so does an empty data set (shape ``(0, 0)``).

        Measures:
            INTERVAL: ``|x_i - x_j| / (max - min)``.
            RATIO: ``|log10(x_i) - log10(x_j)|``; values are expected to be
                positive, otherwise NumPy produces inf/nan entries.
            NOMINAL: ``1 - SequenceMatcher(None, x_i, x_j).ratio()``.
            BINARY SYMMETRIC / BINARY ASYMMETRIC: 1 when the values differ,
                otherwise 0.
            ORDINAL: values are treated as ranks ``1..Mf`` and mapped to
                ``z = (rank - 1) / (Mf - 1)``; the distance is ``|z_i - z_j|``.
        """
        n = len(self.data)
        D = np.zeros((n, n))
        if n == 0:
            # Nothing to compare; also avoids indexing into an empty data set.
            return D

        column = [row[f] for row in self.data]

        if type == "INTERVAL":
            values = np.asarray(column, dtype=float)
            spread = values.max() - values.min()
            # A constant attribute has zero range: every pair is identical,
            # so keep the zero matrix instead of computing 0/0 (NaN).
            if spread != 0:
                D = np.abs(values[:, None] - values[None, :]) / spread
        elif type == "RATIO":
            logs = np.log10(np.asarray(column, dtype=float))
            D = np.abs(logs[:, None] - logs[None, :])
        elif type == "NOMINAL":
            # SequenceMatcher.ratio() may depend on argument order, so every
            # ordered pair is evaluated rather than mirroring one triangle.
            for i, left in enumerate(column):
                for j, right in enumerate(column):
                    D[i, j] = 1 - SequenceMatcher(None, left, right).ratio()
        elif type in ("BINARY SYMMETRIC", "BINARY ASYMMETRIC"):
            # Both binary flavours share the per-attribute distance; the
            # asymmetric case is only treated differently when attributes
            # are combined in generateMixedMatrix.
            for i, left in enumerate(column):
                for j, right in enumerate(column):
                    if left != right:
                        D[i, j] = 1
        elif type == "ORDINAL":
            ranks = np.asarray(column, dtype=float)
            top = ranks.max() - 1
            # With a highest rank of 1 all rows share the same rank; keep the
            # zero matrix instead of dividing by zero.
            if top != 0:
                z = (ranks - 1) / top
                D = np.abs(z[:, None] - z[None, :])
        return D

    def generateMixedMatrix(self, attr_idxs):
        """Combine several attributes into one averaged dissimilarity matrix.

        For every pair of rows the per-attribute distances are averaged over
        the attributes that take part in the comparison. A
        ``"BINARY ASYMMETRIC"`` attribute is left out for a pair when both
        rows hold ``0`` for it. Pairs for which no attribute takes part keep
        a distance of 0.

        Args:
            attr_idxs: Attribute indices to combine; each is used both to
                index the rows and to look up its type in ``self.schema``.

        Returns:
            A float ``numpy.ndarray`` of shape ``(n, n)``.
        """
        n = len(self.data)
        num = np.zeros((n, n))  # sum of contributing distances per pair
        den = np.zeros((n, n))  # number of contributing attributes per pair

        for idx in attr_idxs:
            kind = self.schema[idx][1]
            dist = self.generateDistanceMatrix(idx, kind)
            if kind == "BINARY ASYMMETRIC":
                column = [row[idx] for row in self.data]
                # Weight 0 drops the attribute for pairs where both rows are 0.
                weight = np.array(
                    [[0.0 if a == b == 0 else 1.0 for b in column] for a in column]
                ).reshape(n, n)
            else:
                weight = np.ones((n, n))
            num += dist * weight
            den += weight

        # Divide only where at least one attribute contributed; the rest
        # stays at the initial 0.
        return np.divide(num, den, out=np.zeros((n, n)), where=den != 0)
