In [1]:
import pandas as pd
import math

In [2]:
df1 = pd.read_csv("../data/animals.csv", sep=',', header=0)

In [3]:
print(df1.shape)
print(df1)

(20, 7)
   animal  war  fly  ver  end  gro  hai
0     ant    1    1    1  1.0  2.0    1
1     bee    1    2    1  1.0  2.0    2
2     cat    2    1    2  1.0  1.0    2
3     cpl    1    1    1  1.0  1.0    2
4     chi    2    1    2  2.0  2.0    2
5     cow    2    1    2  1.0  2.0    2
6     duc    2    2    2  1.0  2.0    1
7     eag    2    2    2  2.0  1.0    1
8     ele    2    1    2  2.0  2.0    1
9     fly    1    2    1  1.0  1.0    1
10    fro    1    1    2  2.0  NaN    1
11    her    1    1    2  1.0  2.0    1
12    lio    2    1    2  NaN  2.0    2
13    liz    1    1    2  1.0  1.0    1
14    lob    1    1    1  1.0  NaN    1
15    man    2    1    2  2.0  2.0    2
16    rab    2    1    2  1.0  2.0    2
17    sal    1    1    2  1.0  NaN    1
18    spi    1    1    1  NaN  1.0    2
19    wha    2    1    2  2.0  2.0    1


In [4]:
# Clean up the data 
# replace NaN with 0
df1 = df1.fillna(0)

## will implement my own function to normalize in future
#normalization
# from scipy.stats import zscore
# # Z-score normalization
# normalized_df = df1.apply(zscore)


In [5]:
#compute euclidean distance
def euclideanDist(obj1, obj2):
    '''
    input of equal length list obj1 and obj2
    returns square of pairwise euclidean distance for speeding up the algorithm
    '''
    
    dist = 0
    for i in range(1, len(obj1)):
        d = (obj1[i] -obj2[i])
        dist += d*d

    return(dist)

#compute UPGMA distance
def upgma(obj1, obj2):
    '''
    input : list obj1 and obj2
    returns square of pairwise euclidean distance for speeding up the algorithm
    '''
    
    dist = 0
    for i in range(0, len(obj1)):
        for j in range(0,len(obj2)):
            d = math.sqrt(euclideanDist(df1.iloc[obj1[i]].tolist(), df1.iloc[obj2[j]].tolist()))
            dist += d

    return(dist/(len(obj1) * len(obj2)))

In [6]:
#testing the functions return above independently
print(euclideanDist(['point1', 2.0, 1],['point2', 3, 0]))
print(upgma([15,4], [16,5]))

2.0
1.0


In [7]:
# Agglomerative approach
result_cluster = []

In [8]:
# compute the distance between all points
def findSimilarityMatrix(obj3):
    '''
    input : list obj3 which is the list of list
    returns similarity matrix i.e list in the form lower traingular matrix
    '''
    similarity_distance = []
    obj1 = []
    obj2 = []

    for i_row in range(0, len(obj3)):
        distance = []
        obj1 = obj3[i_row]

        for j_row in range(0, i_row+1):        
            obj2 = obj3[j_row]
            if len(obj1) == 1 and len(obj2) == 1:
                distance.append(euclideanDist(df1.iloc[obj1[0]].tolist(), df1.iloc[obj2[0]].tolist()))
            else:
                distance.append(upgma(obj1, obj2))

        similarity_distance.append(distance)
    # print(similarity_distance)

    return(similarity_distance)


In [9]:
#testing functions above independently 
cluster_temp = [[15, 4], [16, 5], [19, 8], [0], [1], [2], [3], [6], [7], [9], [10], [11], [12], [13], [14], [17], [18]]
print(findSimilarityMatrix(cluster_temp))

[[0.0], [1.0, 0.0], [1.0, 1.4142135623730951, 0.0], [2.0, 1.7320508075688772, 1.7320508075688772, 0.0], [2.0, 1.7320508075688772, 2.23606797749979, 2.0, 0.0], [1.4142135623730951, 1.0, 1.7320508075688772, 4.0, 4.0, 0.0], [2.0, 1.7320508075688772, 2.23606797749979, 2.0, 2.0, 2.0, 0.0], [1.7320508075688772, 1.4142135623730951, 1.4142135623730951, 3.0, 3.0, 3.0, 5.0, 0.0], [1.7320508075688772, 2.0, 1.4142135623730951, 5.0, 5.0, 3.0, 5.0, 2.0, 0.0], [2.449489742783178, 2.23606797749979, 2.23606797749979, 2.0, 2.0, 4.0, 2.0, 3.0, 3.0, 0.0], [2.449489742783178, 2.6457513110645907, 2.23606797749979, 6.0, 8.0, 4.0, 4.0, 7.0, 3.0, 4.0, 0.0], [1.7320508075688772, 1.4142135623730951, 1.4142135623730951, 1.0, 3.0, 3.0, 3.0, 2.0, 4.0, 3.0, 5.0, 0.0], [2.0, 1.0, 2.23606797749979, 4.0, 4.0, 2.0, 4.0, 3.0, 7.0, 6.0, 10.0, 3.0, 0.0], [2.0, 1.7320508075688772, 1.7320508075688772, 2.0, 4.0, 2.0, 2.0, 3.0, 3.0, 2.0, 2.0, 1.0, 4.0, 0.0], [2.8284271247461903, 2.6457513110645907, 2.6457513110645907, 4.0, 6.0

In [10]:
#find minimum distance in the similarity distance and pairing
def findMinDistPair(similarity_distance, in_cluster): 
    '''
    input : cluster and lower traingular similarity matrix
    returns pairs of min distance values 
    '''
    min_dist = 9999999
    min_dist_pair = []
    for i in range(0, len(similarity_distance)):
        for j in range(0, len(similarity_distance[i])):
            if i != j:
                if similarity_distance[i][j] <= min_dist:
                    min_dist = similarity_distance[i][j]

    for i in range(0, len(similarity_distance)):
        for j in range(0, len(similarity_distance[i])):
            if i != j:
                if similarity_distance[i][j] <= min_dist:
                    pair_found = [i,j]
                    min_dist_pair.append(pair_found)

    print(min_dist)
    print(min_dist_pair)    
    #cleaning the min-dist_pair 
    ''' example1: pair [15,10],[13,15],[14,15] should belong to the one group [10,13,14,15]
        example2: pair [11,3],[13,11],[15,10],[15,13],[15,14] should belong to the one group [11,3,13,15,10,14]'''

    clustered_pair = []
    for count in range (0, 3):  #do this until new_min_dist_pair starts to repeat, this loop was implemented to accomodate example2   
        for i in range(0,len(min_dist_pair)):
            for j in range(0, len(min_dist_pair[i])):
                clustered_pair.append(min_dist_pair[i][j])

        new_min_dist_pair = []
        for i in range(0, len(min_dist_pair)):
            pair =[]
            case1 = False
            for j in range(0, len(min_dist_pair[i])):

                for k in range(0, len(new_min_dist_pair)):
                    if min_dist_pair[i][j] in new_min_dist_pair[k]:
                        for l in range(0, len(min_dist_pair[i])):
                            if l != j:
                                new_min_dist_pair[k].append(min_dist_pair[i][l])
                        case1 = True
                        break
                
                if case1:
                    break
                else:
                    pair.append(min_dist_pair[i][j])
            
            if not case1:
                new_min_dist_pair.append(pair)
        min_dist_pair =new_min_dist_pair
    print(new_min_dist_pair)

    #complete the new_min_dist_pair 
    for i in range(0, len(in_cluster)):
        case2 =False
        for j in range(0, len(new_min_dist_pair)):
            if i in new_min_dist_pair[j]:
                case2 = True
                break
        
        if not case2:
            new_min_dist_pair.append([i])
    print(new_min_dist_pair)

    #if any of the row is previously formed group break it open
    out_cluster =[] 
    for i in range(0, len(new_min_dist_pair)):
        cluster =[]
        for j in range(0, len(new_min_dist_pair[i])):
            if len(in_cluster[new_min_dist_pair[i][j]]) >1:
                for k in range(0, len(in_cluster[new_min_dist_pair[i][j]])):
                    cluster.append(in_cluster[new_min_dist_pair[i][j]][k])

            else:
                cluster.append(in_cluster[new_min_dist_pair[i][j]][0])

        out_cluster.append(cluster)

    print(out_cluster)
    return(out_cluster)

In [11]:
#testing functions above independently 
similarity_distance = [[0.0], [1.0, 0.0], [1.0, 1.4142135623730951, 0.0], [2.0, 1.7320508075688772, 1.7320508075688772, 0.0], [2.0, 1.7320508075688772, 2.23606797749979, 2.0, 0.0], [1.4142135623730951, 1.0, 1.7320508075688772, 4.0, 4.0, 0.0], [2.0, 1.7320508075688772, 2.23606797749979, 2.0, 2.0, 2.0, 0.0], [1.7320508075688772, 1.4142135623730951, 1.4142135623730951, 3.0, 3.0, 3.0, 5.0, 0.0], [1.7320508075688772, 2.0, 1.4142135623730951, 5.0, 5.0, 3.0, 5.0, 2.0, 0.0], [2.449489742783178, 2.23606797749979, 2.23606797749979, 2.0, 2.0, 4.0, 2.0, 3.0, 3.0, 0.0], [2.449489742783178, 2.6457513110645907, 2.23606797749979, 6.0, 8.0, 4.0, 4.0, 7.0, 3.0, 4.0, 0.0], [1.7320508075688772, 1.4142135623730951, 1.4142135623730951, 1.0, 3.0, 3.0, 3.0, 2.0, 4.0, 3.0, 5.0, 0.0], [2.0, 1.0, 2.23606797749979, 4.0, 4.0, 2.0, 4.0, 3.0, 7.0, 6.0, 10.0, 3.0, 0.0], [2.0, 1.7320508075688772, 1.7320508075688772, 2.0, 4.0, 2.0, 2.0, 3.0, 3.0, 2.0, 2.0, 1.0, 4.0, 0.0], [2.8284271247461903, 2.6457513110645907, 2.6457513110645907, 4.0, 6.0, 4.0, 2.0, 7.0, 5.0, 2.0, 2.0, 5.0, 8.0, 2.0, 0.0], [2.6457513110645907, 2.449489742783178, 2.449489742783178, 5.0, 7.0, 3.0, 3.0, 6.0, 4.0, 3.0, 1.0, 4.0, 7.0, 1.0, 1.0, 0.0], [2.6457513110645907, 2.0, 2.8284271247461903, 3.0, 3.0, 3.0, 1.0, 6.0, 8.0, 3.0, 7.0, 4.0, 3.0, 3.0, 3.0, 4.0, 0.0]]
print(findMinDistPair(similarity_distance, [[15, 4], [16, 5], [19, 8], [0], [1], [2], [3], [6], [7], [9], [10], [11], [12], [13], [14], [17], [18]]))

1.0
[[1, 0], [2, 0], [5, 1], [11, 3], [12, 1], [13, 11], [15, 10], [15, 13], [15, 14], [16, 6]]
[[1, 0, 2, 5, 12], [11, 3, 13, 15, 10, 14], [16, 6]]
[[1, 0, 2, 5, 12], [11, 3, 13, 15, 10, 14], [16, 6], [4], [7], [8], [9]]
[[16, 5, 15, 4, 19, 8, 2, 12], [11, 0, 13, 17, 10, 14], [18, 3], [1], [6], [7], [9]]
[[16, 5, 15, 4, 19, 8, 2, 12], [11, 0, 13, 17, 10, 14], [18, 3], [1], [6], [7], [9]]


In [12]:
#################Testing idea manually -sanity check #################################
#first pass 

# bin ing every data points into their own cluster
cluster1 = []
for i_row in range(0, df1.shape[0]):
    cluster1.append([i_row])

#find similarity matrix
similarity_distance = findSimilarityMatrix(cluster1)
result_cluster.append(findMinDistPair(similarity_distance, cluster1))


#second pass 
cluster2 = result_cluster[0]
similarity_distance = findSimilarityMatrix(cluster2)
result_cluster.append(findMinDistPair(similarity_distance, cluster2))
print(findMinDistPair(similarity_distance, cluster2))

#third pass 
cluster3 = result_cluster[1]
similarity_distance = findSimilarityMatrix(cluster3)
result_cluster.append(findMinDistPair(similarity_distance, cluster3))
print(findMinDistPair(similarity_distance, cluster3))


#fourth pass 
cluster4 = result_cluster[2]
similarity_distance = findSimilarityMatrix(cluster4)
result_cluster.append(findMinDistPair(similarity_distance, cluster4))
print(findMinDistPair(similarity_distance, cluster4))


#fifth pass 
cluster5 = result_cluster[3]
similarity_distance = findSimilarityMatrix(cluster5)
result_cluster.append(findMinDistPair(similarity_distance, cluster5))
print(findMinDistPair(similarity_distance, cluster5))



0.0
[[15, 4], [16, 5], [19, 8]]
[[15, 4], [16, 5], [19, 8]]
[[15, 4], [16, 5], [19, 8], [0], [1], [2], [3], [6], [7], [9], [10], [11], [12], [13], [14], [17], [18]]
[[15, 4], [16, 5], [19, 8], [0], [1], [2], [3], [6], [7], [9], [10], [11], [12], [13], [14], [17], [18]]
1.0
[[1, 0], [2, 0], [5, 1], [11, 3], [12, 1], [13, 11], [15, 10], [15, 13], [15, 14], [16, 6]]
[[1, 0, 2, 5, 12], [11, 3, 13, 15, 10, 14], [16, 6]]
[[1, 0, 2, 5, 12], [11, 3, 13, 15, 10, 14], [16, 6], [4], [7], [8], [9]]
[[16, 5, 15, 4, 19, 8, 2, 12], [11, 0, 13, 17, 10, 14], [18, 3], [1], [6], [7], [9]]
1.0
[[1, 0], [2, 0], [5, 1], [11, 3], [12, 1], [13, 11], [15, 10], [15, 13], [15, 14], [16, 6]]
[[1, 0, 2, 5, 12], [11, 3, 13, 15, 10, 14], [16, 6]]
[[1, 0, 2, 5, 12], [11, 3, 13, 15, 10, 14], [16, 6], [4], [7], [8], [9]]
[[16, 5, 15, 4, 19, 8, 2, 12], [11, 0, 13, 17, 10, 14], [18, 3], [1], [6], [7], [9]]
[[16, 5, 15, 4, 19, 8, 2, 12], [11, 0, 13, 17, 10, 14], [18, 3], [1], [6], [7], [9]]
1.573132184970986
[[4, 0]]
[[4,

In [13]:
#first pass 
# bin ing every data points into their own cluster
cluster1 = []
for i_row in range(0, df1.shape[0]):
    cluster1.append([i_row])

############### input for dendogram ###############################
result_cluster=[]
cluster_temp = []
for i in range(0, df1.shape[0]):
    if i==0:
        cluster_temp = cluster1
    else:
        cluster_temp = result_cluster[i-1]

    similarity_distance = findSimilarityMatrix(cluster_temp)
    result_cluster.append(findMinDistPair(similarity_distance, cluster_temp))
   
    # print(f"value of i is{i}")
    if len(result_cluster[-1][-1]) == df1.shape[0]:
        break

0.0
[[15, 4], [16, 5], [19, 8]]
[[15, 4], [16, 5], [19, 8]]
[[15, 4], [16, 5], [19, 8], [0], [1], [2], [3], [6], [7], [9], [10], [11], [12], [13], [14], [17], [18]]
[[15, 4], [16, 5], [19, 8], [0], [1], [2], [3], [6], [7], [9], [10], [11], [12], [13], [14], [17], [18]]
1.0
[[1, 0], [2, 0], [5, 1], [11, 3], [12, 1], [13, 11], [15, 10], [15, 13], [15, 14], [16, 6]]
[[1, 0, 2, 5, 12], [11, 3, 13, 15, 10, 14], [16, 6]]
[[1, 0, 2, 5, 12], [11, 3, 13, 15, 10, 14], [16, 6], [4], [7], [8], [9]]
[[16, 5, 15, 4, 19, 8, 2, 12], [11, 0, 13, 17, 10, 14], [18, 3], [1], [6], [7], [9]]
1.573132184970986
[[4, 0]]
[[4, 0]]
[[4, 0], [1], [2], [3], [5], [6]]
[[6, 16, 5, 15, 4, 19, 8, 2, 12], [11, 0, 13, 17, 10, 14], [18, 3], [1], [7], [9]]
1.5731321849709863
[[3, 2], [5, 2]]
[[3, 2, 5]]
[[3, 2, 5], [0], [1], [4]]
[[1, 18, 3, 9], [6, 16, 5, 15, 4, 19, 8, 2, 12], [11, 0, 13, 17, 10, 14], [7]]
1.7871716023211677
[[3, 1]]
[[3, 1]]
[[3, 1], [0], [2]]
[[7, 6, 16, 5, 15, 4, 19, 8, 2, 12], [1, 18, 3, 9], [11, 0, 