# Normalize a dataset and find out the Dissimilarity matrix

## Numerical data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# creating an array of random integers
f1 = np.random.randint(low = 0, high = 100, size = 4)
f2 = np.random.randint(low = 30, high = 1000, size = 4)
f3 = np.random.randint(low = 50, high = 10000, size = 4)
f4 = np.random.randint(low = 100, high = 10000000, size = 4)

# creating a dataframe from the arrays
data = pd.DataFrame({"f1":f1, "f2":f2, "f3":f3, "f4":f4})

In [3]:
data

Unnamed: 0,f1,f2,f3,f4
0,46,196,3270,8709077
1,60,97,3350,5980393
2,45,960,1796,4341810
3,84,574,1127,4072911


We can consider *f1*, *f2*, *f3*, *f4* as features

In [4]:
# function to find the z-score
def z_score(data):
    data_z_score = pd.DataFrame()
    data_z_score = data.copy()
    for f in range(len(data)):
        sum = 0
        for i in range(len(data)):
            sum += data.iloc[i][f]
        mean = (sum / len(data))
        
        # mean absolute deviation
        mad = 0
        for i in range(len(data)):
            mad += abs(data.iloc[i][f] - mean)
        
        mad /= len(data)
        
        # z-score
        for i in range(len(data)):
            data_z_score.iloc[i][f] = (data.iloc[i][f] - mean) / mad
    
    return data_z_score
    
data_z_score = z_score(data)

In [5]:
data_z_score

Unnamed: 0,f1,f2,f3,f4
0,0,0,0,1
1,0,-1,1,0
2,-1,1,0,0
3,1,0,-1,-1


In [9]:
def euclidean_distance(x, y):
    distance = 0
    for i in range(len(x)):
        distance += (x[i] - y[i])**2
    
    return np.sqrt(distance)

We are required to find the **dissimilarity matrix** from the above matrix of *z-scores*

In [41]:
dissimilarity_matrix = np.zeros((4,4)) # creates a 2D numpy array of shape (4,4) having all values as 0
for f in range(len(data_z_score)):
    for i in range(len(data_z_score)):
        dissimilarity_matrix[i][f] = euclidean_distance(data_z_score.iloc[i].values, data_z_score.iloc[f].values)

print(dissimilarity_matrix)

[[0.         1.73205081 1.73205081 2.44948974]
 [1.73205081 0.         2.44948974 2.64575131]
 [1.73205081 2.44948974 0.         2.64575131]
 [2.44948974 2.64575131 2.64575131 0.        ]]


## Nominal Data

In [42]:
# nominal features
hair_color = ['Black', 'Gray', 'Gray', "Blue"]
emp_status = ['Employed', 'Unemployed', 'Employed', 'Retired']
dress_color = ['blue', 'black', 'black', 'gray']
gender = ['Male', 'Female', 'Female', 'Male']

nominal_df = pd.DataFrame({'hair_color':hair_color, 'emp_status':emp_status,
                           'dress_color':dress_color, 'gender':gender})

In [43]:
nominal_df

Unnamed: 0,hair_color,emp_status,dress_color,gender
0,Black,Employed,blue,Male
1,Gray,Employed,black,Female
2,Gray,Employed,black,Female
3,Blue,Retired,gray,Male


In [51]:
def num_of_matches(obj1, obj2):
    matches = 0
    for col in range(len(nominal_df.columns)):
        if obj1[col] == obj2[col]:
            matches += 1
    return matches

In [55]:
def dissimilarity_matrix(nominal_df):
    p = len(nominal_df.columns) # total number of features
    dissimilarity_matrix = [[0 for _ in range(len(nominal_df))] for _ in range(len(nominal_df))]
    for col in range(len(nominal_df.columns)):
        for row in range(len(nominal_df)):
            dissimilarity_matrix[row][col] = (p - num_of_matches(nominal_df.iloc[row], nominal_df.iloc[col])) / p
            
    return dissimilarity_matrix

In [56]:
dissimilarity_matrix(nominal_df)

[[0.0, 0.75, 0.75, 0.75],
 [0.75, 0.0, 0.0, 1.0],
 [0.75, 0.0, 0.0, 1.0],
 [0.75, 1.0, 1.0, 0.0]]

## Ordinal Data

In [2]:
# ordinal features
f1 = ["High", "Low", "Medium", "High"]
f2 = ["Excellent", "Fair", "Good", "Excellent"]
f3 = [3, 1, 2, 4]
f4 = ["Small", "Large", "Small", "Medium"]

ordinal_df = pd.DataFrame({'f1':f1, 'f2':f2, 'f3':f3, 'f4':f4})

In [3]:
ordinal_df

Unnamed: 0,f1,f2,f3,f4
0,High,Excellent,3,Small
1,Low,Fair,1,Large
2,Medium,Good,2,Small
3,High,Excellent,4,Medium


In [4]:
# function to assign ranks to feature values
def assign_ranks(feature):
    unique_states = sorted(set(feature))
    state_to_rank = {state : rank + 1 for rank, state in enumerate(unique_states)}
    return [state_to_rank[state] for state in feature]

# applying the function to each feature in ordinal_df
for feature in ordinal_df.columns.values:
    ordinal_df[feature] = assign_ranks(ordinal_df[feature])

In [5]:
ordinal_df

Unnamed: 0,f1,f2,f3,f4
0,1,1,3,3
1,2,2,1,1
2,3,3,2,3
3,1,1,4,2


In [11]:
# normalized ranking
normalized_rank = [[0 for _ in range(len(ordinal_df.columns))] for _ in range(len(ordinal_df))]
for col in range(len(ordinal_df.columns)):
    feature = ordinal_df.columns.values[col]
    mf = len(set(ordinal_df[feature])) # Total number of unique states in that feature
    for row in range(len(ordinal_df)):
        curr_rank = ordinal_df.iloc[row][col]
        normalized_rank[row][col] = round((curr_rank - 1) / (mf - 1), 2)

normalized_rank

[[0.0, 0.0, 0.67, 1.0],
 [0.5, 0.5, 0.0, 0.0],
 [1.0, 1.0, 0.33, 1.0],
 [0.0, 0.0, 1.0, 0.5]]

In [15]:
ordinal_df.iloc[0].values

array([1, 1, 3, 3], dtype=int64)

In [17]:
dissimilarity_matrix = [[0 for _ in range(len(ordinal_df.columns))] for _ in range(len(ordinal_df))]
for col in range(len(ordinal_df.columns)):
    for row in range(len(ordinal_df)):
        dissimilarity_matrix[row][col] = round(euclidean_distance(ordinal_df.iloc[row].values,
                                                            ordinal_df.iloc[col].values), 2)

dissimilarity_matrix

[[0.0, 3.16, 3.0, 1.41],
 [3.16, 0.0, 2.65, 3.46],
 [3.0, 2.65, 0.0, 3.61],
 [1.41, 3.46, 3.61, 0.0]]

## Ratio Scaled Data

In [3]:
f1 = [2*np.exp(3*t) for t in np.linspace(1,20, 4)]
f2 = [3*np.exp(2*t) for t in np.linspace(1,3,4)]
f3 = [4*np.exp(t) for t in range(4)]
f4 = [5*np.exp(1.5*t) for t in range(4,8)]

ratio_df = pd.DataFrame({'f1': f1, 'f2': f2, 'f3' : f3, 'f4' : f4})

In [4]:
ratio_df

Unnamed: 0,f1,f2,f3,f4
0,40.17107,22.167168,4.0,2017.143967
1,7169826000.0,84.094875,10.873127,9040.212072
2,1.279687e+18,319.028026,29.556224,40515.419638
3,2.284015e+26,1210.28638,80.342148,181577.513371


In [8]:
ratio_log = [[0 for _ in range(len(ratio_df.columns))] for _ in range(len(ratio_df))]
for col in range(len(ratio_df.columns)):
    for row in range(len(ratio_df)):
        ratio_log[row][col] = round(np.log(ratio_df.iloc[row][col]), 4)
ratio_log

[[3.6931, 3.0986, 1.3863, 7.6094],
 [22.6931, 4.4319, 2.3863, 9.1094],
 [41.6931, 5.7653, 3.3863, 10.6094],
 [60.6931, 7.0986, 4.3863, 12.1094]]

In [18]:
def dissimilarity_matrix(ratio_log):
    z_score = [[0 for _ in range(len(ratio_log[0]))] for _ in range(len(ratio_log))]
    
    for col in range(len(ratio_log[0])):
        # catulating mean of the feature
        sum_f = 0
        for row in range(len(ratio_log)):
            sum_f += ratio_log[row][col]
        mean_f = sum_f / len(ratio_log)
        
        # calculating mean standard deviation of the feature
        sum_f = 0
        for row in range(len(ratio_log)):
            sum_f += abs(ratio_log[row][col] - mean_f)
        mean_std_dev = sum_f / len(ratio_log)
        
        # calculating z-scores
        for row in range(len(ratio_log)):
            z_score[row][col] = (ratio_log[row][col] - mean_f) / mean_std_dev
            
            
    # calculating distance between objects and creating dissimilarity matrix
    diss_matrix = [[0 for _ in range(len(ratio_log[0]))] for _ in range(len(ratio_log))]
    for col in range(len(ratio_log[0])):
        for row in range(len(ratio_log)):
            diss_matrix[row][col] = euclidean_distance(z_score[row], z_score[col])
            
    return diss_matrix

In [21]:
diss_matrix = dissimilarity_matrix(ratio_log)
diss_matrix

[[0.0, 1.9999812504980399, 3.9999999999999996, 5.999981250322261],
 [1.9999812504980399, 0.0, 2.0000187500292905, 4.0],
 [3.9999999999999996, 2.0000187500292905, 0.0, 1.9999812504980399],
 [5.999981250322261, 4.0, 1.9999812504980399, 0.0]]

## Mixed type variables

In [4]:
f1 = ['Code-A', 'Code-B', 'Code-C', 'Code-A'] # nominal
f2 = ['Excellent', 'Fair', 'Good', 'Excellent'] # ordinal
f3 = [445, 22, 164, 1210] # ratio scaled
f4 = [45.6, 27.9, 56.7, 19] # numerical
mixed_df = pd.DataFrame({'f1' : f1, 'f2' : f2, 'f3': f3, 'f4': f4})

In [5]:
mixed_df

Unnamed: 0,f1,f2,f3,f4
0,Code-A,Excellent,445,45.6
1,Code-B,Fair,22,27.9
2,Code-C,Good,164,56.7
3,Code-A,Excellent,1210,19.0


In [None]:
# creating dissimilarity matrix for mixed type variables
def mixed_dissimilarity_matrix(mixed_df):
    # grouping features according to their type
    nominal_features = ['f1']
    ordinal_features = ['f2']
    ratio_scaled_features = ['f3']
    numerical_features = ['f4']
    
    for feature in mixed_df.columns.values:
        if feature in nominal_features:
            pass
        elif feature in ordinal_features:
            pass
        elif feature in ratio_scaled_features:
            pass
        elif feature in numerical_features:
            pass
    