Implementation of the Unified Distance Metric (UDM)

*Zhang, Yiqun, and Yiu-Ming Cheung. "A New Distance Metric Exploiting Heterogeneous Interattribute Relationship for Ordinal-and-Nominal-Attribute Data Clustering." IEEE Transactions on Cybernetics (2020).*

In [1]:
import pandas as pd
import numpy as np

In [22]:
# Load the example table: one row per sample, one column per feature.
df = pd.read_csv("table3.csv")
X = df.values
N, d = X.shape  # Number of samples, number of features.
# A feature is flagged as ordinal when its column name contains "O";
# every other feature is treated as nominal.
ordinal = ["O" in column for column in df.columns]
# Number of distinct values observed in each feature.
n = list(df.nunique())
X, ordinal, n, d

(array([[0, 0, 0],
        [0, 1, 1],
        [1, 1, 2],
        [2, 2, 1],
        [2, 2, 0]]),
 [True, True, False],
 [3, 3, 3],
 3)

In [32]:
# Pre-compute the lookup tables used by the Unified Distance Metric.
#
# Reads the module-level names X, ordinal, n, N and d and produces:
#   R[r, s]          interdependence measure between features r and s.
#   psi[r][s, t, h]  entropy-based distance between values t and h of feature
#                    r, measured through its joint distribution with feature s.
#   phi[r][t, h]     overall distance between values t and h of feature r:
#                    the mean over s of R[r, s] * psi[r][s].
# Values in X are used directly as array indices, so column r is assumed to
# hold the integer codes 0 .. n[r]-1.


def _merged_entropy(p_a, p_b, norm):
    """Return sum(-(p_a + p_b) * log2(p_a + p_b)) / norm, skipping zero cells."""
    merged = p_a + p_b
    # A zero cell yields 0 * -inf = nan, which nansum drops; the log2(0)
    # warnings are therefore expected and silenced here.
    with np.errstate(divide="ignore", invalid="ignore"):
        terms = -merged * np.log2(merged)
    return np.nansum(terms) / norm


R = np.zeros((d, d))
psi = [np.zeros((d, n[r], n[r])) for r in range(d)]
phi = [np.zeros((n[r], n[r])) for r in range(d)]
Z = N * (N - 1) / 2  # Total number of unordered sample pairs.
for r in range(d):
    for s in range(d):
        # ------------------------------------
        # Calculate interdependence measure R.
        # C[t, g] counts the samples with value t on feature r and g on s.
        C = np.zeros((n[r], n[s]), dtype=int)
        for i in range(N):
            C[X[i, r], X[i, s]] += 1
        C_eq = 0
        C_diff = 0
        if ordinal[r] and ordinal[s]:
            # Both ordinal: compare every unordered pair of samples (j < i).
            for i in range(N):
                for j in range(i):
                    a_i, a_j = X[i, r], X[j, r]
                    b_i, b_j = X[i, s], X[j, s]
                    if a_i == a_j and b_i == b_j:
                        C_eq += 1  # i, j equal on both features.
                    elif (a_i > a_j and b_i > b_j) or (a_i < a_j and b_i < b_j):
                        C_diff += 1  # Direction of difference is the same.
                    elif (a_i > a_j and b_i < b_j) or (a_i < a_j and b_i > b_j):
                        C_diff -= 1  # Direction of difference is opposite.
            # Absolute value of the NET difference: taken once, after all
            # pairs are accumulated, so the result does not depend on the
            # order of the samples.
            C_diff = abs(C_diff)
        else:
            # At least one of r and s is nominal: work from the counts.
            # A cell holding c samples contributes c * (c - 1) / 2 pairs that
            # are equal on both features.
            C_eq = int((C * (C - 1) // 2).sum())
            # For every pair of rows (t > h) and pair of columns (g > u), add
            # the absolute difference of the two diagonal products of that
            # 2x2 sub-table.
            for t in range(n[r]):
                for h in range(t):
                    for g in range(n[s]):
                        for u in range(g):
                            C_diff += abs((C[t, g] * C[h, u]) - (C[t, u] * C[h, g]))
        R[r, s] = (C_eq + C_diff) / Z  # Final calculation to get R.
        # -------------------------------------
        # Calculate entropy-based distance psi.
        P = C / N  # Joint relative frequencies of (value on r, value on s).
        S_A_s = np.log2(n[s])  # Normaliser: log2 of the number of values of s.
        if ordinal[r]:
            # Ordinal r: compute adjacent values directly...
            for t in range(1, n[r]):
                psi[r][s, t, t - 1] = _merged_entropy(P[t], P[t - 1], S_A_s)
            # ...and every non-adjacent pair as the sum of the adjacent steps
            # lying between the two values.
            for t in range(1, n[r]):
                for h in range(t - 1):
                    for g in range(h, t):
                        psi[r][s, t, h] += psi[r][s, g + 1, g]
            # Only the lower triangle was filled; mirror it to make the table
            # symmetric (the diagonal is zero).
            psi[r][s] = psi[r][s] + psi[r][s].T
        else:
            # Nominal r: every pair of values is computed directly.
            for t in range(n[r]):
                for h in range(t):
                    psi[r][s, t, h] = psi[r][s, h, t] = _merged_entropy(P[t], P[h], S_A_s)
        # ------------------------------------------
        # Add to overall per-feature distance phi.
        phi[r] += R[r, s] * psi[r][s] / d

def dist(xi, xj):
    """Return the distance between two samples given as sequences of values.

    For each feature position r, the per-feature distance is looked up in the
    module-level table ``phi[r]`` at ``[xi[r], xj[r]]``; the result is the
    Euclidean norm of those per-feature distances.
    """
    per_feature = []
    for r, (value_i, value_j) in enumerate(zip(xi, xj)):
        per_feature.append(phi[r][value_i, value_j])
    return np.linalg.norm(per_feature)



In [37]:
# Example: distance between the value vectors [0, 0, 0] and [1, 1, 1],
# using the phi tables computed above from table3.csv.
dist([0,0,0],[1,1,1])

0.8904916804946172