# Normalize a dataset using z-score and find out the Dissimilarity matrix

## Numerical data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# creating an array of random integers
f1 = np.random.randint(low = 0, high = 100, size = 4)
f2 = np.random.randint(low = 30, high = 1000, size = 4)
f3 = np.random.randint(low = 50, high = 10000, size = 4)
f4 = np.random.randint(low = 100, high = 10000000, size = 4)

# creating a dataframe from the arrays
data = pd.DataFrame({"f1":f1, "f2":f2, "f3":f3, "f4":f4})

In [3]:
data

Unnamed: 0,f1,f2,f3,f4
0,46,196,3270,8709077
1,60,97,3350,5980393
2,45,960,1796,4341810
3,84,574,1127,4072911


We can consider *f1*, *f2*, *f3*, *f4* as features

In [4]:
# function to find the z-score
def z_score(data):
    data_z_score = pd.DataFrame()
    data_z_score = data.copy()
    for f in range(len(data)):
        sum = 0
        for i in range(len(data)):
            sum += data.iloc[i][f]
        mean = (sum / len(data))
        
        # mean absolute deviation
        mad = 0
        for i in range(len(data)):
            mad += abs(data.iloc[i][f] - mean)
        
        mad /= len(data)
        
        # z-score
        for i in range(len(data)):
            data_z_score.iloc[i][f] = (data.iloc[i][f] - mean) / mad
    
    return data_z_score
    
data_z_score = z_score(data)

In [5]:
data_z_score

Unnamed: 0,f1,f2,f3,f4
0,0,0,0,1
1,0,-1,1,0
2,-1,1,0,0
3,1,0,-1,-1


In [6]:
def euclidean_distance(x, y):
    distance = 0
    for i in range(len(x)):
        distance += (x[i] - y[i])**2
    
    return np.sqrt(distance)

We are required to find the **dissimilarity matrix** from the above matrix of *z-scores*

In [41]:
dissimilarity_matrix = np.zeros((4,4)) # creates a 2D numpy array of shape (4,4) having all values as 0
for f in range(len(data_z_score)):
    for i in range(len(data_z_score)):
        dissimilarity_matrix[i][f] = euclidean_distance(data_z_score.iloc[i].values, data_z_score.iloc[f].values)

print(dissimilarity_matrix)

[[0.         1.73205081 1.73205081 2.44948974]
 [1.73205081 0.         2.44948974 2.64575131]
 [1.73205081 2.44948974 0.         2.64575131]
 [2.44948974 2.64575131 2.64575131 0.        ]]


## Nominal Data

In [42]:
# nominal features
hair_color = ['Black', 'Gray', 'Gray', "Blue"]
emp_status = ['Employed', 'Unemployed', 'Employed', 'Retired']
dress_color = ['blue', 'black', 'black', 'gray']
gender = ['Male', 'Female', 'Female', 'Male']

nominal_df = pd.DataFrame({'hair_color':hair_color, 'emp_status':emp_status,
                           'dress_color':dress_color, 'gender':gender})

In [43]:
nominal_df

Unnamed: 0,hair_color,emp_status,dress_color,gender
0,Black,Employed,blue,Male
1,Gray,Employed,black,Female
2,Gray,Employed,black,Female
3,Blue,Retired,gray,Male


In [51]:
def num_of_matches(obj1, obj2):
    matches = 0
    for col in range(len(nominal_df.columns)):
        if obj1[col] == obj2[col]:
            matches += 1
    return matches

In [55]:
def dissimilarity_matrix(nominal_df):
    p = len(nominal_df.columns)
    dissimilarity_matrix = [[0 for _ in range(len(nominal_df))] for _ in range(len(nominal_df))]
    for col in range(len(nominal_df.columns)):
        for row in range(len(nominal_df)):
            dissimilarity_matrix[row][col] = (p - num_of_matches(nominal_df.iloc[row], nominal_df.iloc[col])) / p
            
    return dissimilarity_matrix

In [56]:
dissimilarity_matrix(nominal_df)

[[0.0, 0.75, 0.75, 0.75],
 [0.75, 0.0, 0.0, 1.0],
 [0.75, 0.0, 0.0, 1.0],
 [0.75, 1.0, 1.0, 0.0]]

## Ordinal Data

In [2]:
# ordinal features
f1 = ["High", "Low", "Medium", "High"]
f2 = ["Excellent", "Fair", "Good", "Excellent"]
f3 = [3, 1, 2, 4]
f4 = ["Small", "Large", "Small", "Medium"]

ordinal_df = pd.DataFrame({'f1':f1, 'f2':f2, 'f3':f3, 'f4':f4})

In [3]:
ordinal_df

Unnamed: 0,f1,f2,f3,f4
0,High,Excellent,3,Small
1,Low,Fair,1,Large
2,Medium,Good,2,Small
3,High,Excellent,4,Medium


In [5]:
x = sorted(set(ordinal_df['f4']))
x

['Large', 'Medium', 'Small']

In [9]:
ordinal_df.columns.values

array(['f1', 'f2', 'f3', 'f4'], dtype=object)

In [10]:
# function to assign ranks to feature values
def assign_ranks(feature):
    unique_states = sorted(set(feature))
    state_to_rank = {state : rank + 1 for rank, state in enumerate(unique_states)}
    return [state_to_rank[state] for state in feature]

# applying the function to each feature in ordinal_df
for feature in ordinal_df.columns.values:
    ordinal_df[feature] = assign_ranks(ordinal_df[feature])

In [11]:
ordinal_df

Unnamed: 0,f1,f2,f3,f4
0,1,1,3,3
1,2,2,1,1
2,3,3,2,3
3,1,1,4,2
