In [43]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [44]:
import tensorflow as tf
from tensorflow.keras.datasets import mnist, fashion_mnist

In [45]:
(X_train, y_train), (X_test, y_test) = mnist.load_data()

In [46]:
# Flatten every image into one row of pixel values (784 columns for MNIST).
# The row count is taken from the array itself and -1 lets NumPy infer the
# feature width, so the cell does not depend on hard-coded split sizes and
# can be re-run safely on already-flattened data.
X_train = X_train.reshape(X_train.shape[0], -1)
X_test = X_test.reshape(X_test.shape[0], -1)

In [5]:
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity
from statistics import mode

In [6]:
# %time
# euc_sq_dist = np.square(pairwise_distances(X_train, X_test))

In [7]:
#Check the distribution of numbers in y_test - Fairly Uniform
np.unique(y_test, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=uint8),
 array([ 980, 1135, 1032, 1010,  982,  892,  958, 1028,  974, 1009]))

In [65]:
#Randomly select 10 distinct row indices from X_test (replace = False prevents duplicates);
#these rows seed the initial cluster means.
#NOTE(review): no random seed is set in the visible cells, so the selection (and everything
#derived from it) presumably differs on every run - confirm whether that is intended.
random_indices = np.random.choice(X_test.shape[0], 10, replace = False)

In [66]:
#Initialising Means
init_means = X_test[random_indices]

In [67]:
#Squared Euclidean distance from every X_test row to every initial mean.
#pairwise_distances returns one row per X_test point and one column per mean; np.square turns
#the distances into squared distances.
#The printed value is the sum over ALL point/mean pairs, not only each point's nearest mean.
dist_matrix = np.square(pairwise_distances(X_test,init_means))
print(np.sum(dist_matrix))

700894305660.0


In [68]:
#Assign every point to exactly one cluster: in each row of pi_matrix the column of the nearest
#mean is 1 and every other column is 0.
#np.argmin is used instead of comparing each entry with the row minimum because it returns a
#single column even when two means are equally close, so a point is never counted in two clusters.
min_dist = np.min(dist_matrix, axis = 1)
nearest_mean = np.argmin(dist_matrix, axis = 1)
pi_matrix = (nearest_mean[:, np.newaxis] == np.arange(dist_matrix.shape[1])).astype(int)

In [69]:
#np.newaxis is used to make sure that the minimum values obtained for each row have the correct shape 
#to be compared with the entire row of the original array

In [70]:
#Values in each cluster
np.sum(pi_matrix, axis = 0)

array([1678, 2182,  343,  628,  626, 2049,  872,  474,  533,  615])

In [71]:
np.where(pi_matrix == 1)

(array([   0,    1,    2, ..., 9997, 9998, 9999]),
 array([0, 1, 1, ..., 0, 0, 8]))

In [72]:
pi_df = pd.DataFrame(pi_matrix)

In [75]:
# pi_df

In [73]:
# np.average(X_test[np.array(pi_df[pi_df[0]==1].index)], axis=0)

In [74]:
# M step for the current assignment: for every column of pi_df (one per cluster) collect the
# X_test rows flagged with a 1, store them as that cluster's members and average them
# feature-wise to obtain the updated mean.
new_init_means, clusters = [], []
for col in pi_df.columns:
    member_rows = pi_df.index[pi_df[col] == 1].to_numpy()
    members = X_test[member_rows]
    new_init_means.append(np.average(members, axis=0))
    clusters.append(members)
init_means = np.array(new_init_means)
# dtype=object because the clusters generally hold different numbers of rows.
clusters = np.array(clusters, dtype = object)

In [90]:
# import matplotlib.pyplot as plt
# for i, cluster in enumerate(clusters):
#     representative_element = cluster[0, :]  # Use the first element as a representative
#     plt.scatter(range(len(representative_element)), representative_element, label=f'Cluster {i + 1}', s=5)

# # Customize the plot
# plt.title('Dot Plot of Each Cluster')
# plt.xlabel('Feature Index')
# plt.ylabel('Value')
# plt.legend()
# plt.show()

In [43]:
# X_test[np.array(pi_df[pi_df[col]==1].index)]

In [19]:
# np.sum(dist_matrix)

In [58]:
# dist_matrix = np.square(pairwise_distances(X_test,temp_means))
# print(np.sum(dist_matrix))

In [30]:
# new_init_means[0].astype(np.uint8)

In [99]:
def k_means(k, X_test, labels=None, max_iter=100, tol=0.0001):
    """Run k-means on the rows of ``X_test`` and print a per-cluster label report.

    The means are initialised with ``k`` distinct randomly chosen rows. Each
    iteration assigns every row to its nearest mean (squared Euclidean
    distance), prints the cluster sizes, recomputes the means, and prints the
    objective (sum of squared distances to the assigned mean, measured before
    the means are updated) together with the iteration number.

    After the loop, one line is printed per cluster: the most common label,
    the purity (share of members carrying that label) and the Gini impurity
    ``1 - sum(p_c ** 2)`` over the label proportions ``p_c`` in the cluster.

    Parameters
    ----------
    k : int
        Number of clusters.
    X_test : array-like, one row per sample
        Dense data to cluster.
    labels : array-like, optional
        Ground-truth label of every row, used only for the final report.
        When omitted, the notebook-level ``y_test`` is used so existing
        ``k_means(k, X_test)`` calls behave as before.
    max_iter : int, optional
        Upper bound on the number of iterations.
    tol : float, optional
        Stop once the relative change of the objective drops below this value.

    Returns
    -------
    None
        All results are printed.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of rows, or (from
        ``np.random.choice``) if ``k`` exceeds the number of rows.
    """
    X = np.asarray(X_test)
    # Fall back to the global only when no labels were passed explicitly.
    y = np.asarray(y_test if labels is None else labels)
    n_samples = X.shape[0]
    if y.shape[0] != n_samples:
        raise ValueError(
            f"got {y.shape[0]} labels for {n_samples} samples; pass matching `labels`"
        )

    random_indices = np.random.choice(n_samples, k, replace = False)
    # Float copy: integer pixel data must not truncate the averaged means.
    init_means = X[random_indices].astype(float)
    assignments = np.zeros(n_samples, dtype=int)
    row_ids = np.arange(n_samples)
    prev_dist = 0
    for iteration in range(max_iter):
        ## E step
        dist_matrix = np.square(pairwise_distances(X, init_means))
        # argmin yields exactly one cluster per row, even when two means are
        # equally close (an equality test against the row minimum would put
        # such a row into several clusters).
        assignments = np.argmin(dist_matrix, axis=1)
        min_dist = dist_matrix[row_ids, assignments]

        #Number of values in each cluster
        print(np.bincount(assignments, minlength=k))

        ## M step
        for cluster_id in range(k):
            members = X[assignments == cluster_id]
            # An empty cluster keeps its previous mean; averaging zero rows
            # would produce NaNs and break the next distance computation.
            if members.shape[0] > 0:
                init_means[cluster_id] = members.mean(axis=0)
        obj_fun = min_dist.sum()
        print(obj_fun, iteration)

        #Termination Criteria
        # A zero objective is already optimal and must not be divided by.
        if obj_fun == 0 or abs(prev_dist - obj_fun)/obj_fun < tol:
            break
        prev_dist = obj_fun

    print("################# Execution Successful #########################")

    for i in range(k):
        cluster_labels = y[assignments == i]
        size = cluster_labels.shape[0]
        if size == 0:
            print(f"cluster {i} is empty")
            continue
        true_label = mode(cluster_labels)
        purity = np.count_nonzero(cluster_labels == true_label)/size
        # Gini impurity uses every label's share, not just the majority one.
        _, label_counts = np.unique(cluster_labels, return_counts=True)
        gini = float(1 - np.sum((label_counts / size) ** 2))
        print(true_label, round(purity,4), round(gini,4))

    print("################# Evaluation Successful #########################")

In [100]:
#Actual k = 10
k_means(10, X_test)

[ 196 2603 1129  643  357  856   32 1454 2024  706]
48096187840.0 0
[ 260 2231 1319  762  684  911  184 1429 1319  901]
27620040573.35609 1
[ 276 1969 1348  778  854  932  372 1411 1141  919]
26940215853.583874 2
[ 324 1699 1347  796  971  971  502 1366 1166  858]
26607822447.456367 3
[ 385 1525 1328  802 1088  983  595 1261 1229  804]
26377306944.080284 4
[ 510 1433 1258  817 1169  976  657 1152 1260  768]
26190244224.73166 5
[ 749 1408 1142  819 1222  959  670 1050 1228  753]
25974724644.352436 6
[ 924 1441 1119  820 1235  914  645  975 1175  752]
25744121662.001038 7
[1004 1493 1104  815 1226  882  654  931 1135  756]
25635591317.80125 8
[1058 1516 1086  802 1234  863  661  903 1111  766]
25577797385.488857 9
[1096 1537 1057  785 1241  847  668  885 1113  771]
25542662202.13613 10
[1123 1549 1031  780 1244  828  666  880 1124  775]
25523076878.64562 11
[1138 1542 1008  783 1239  824  669  879 1142  776]
25510949347.975903 12
[1147 1540 1003  785 1229  820  671  872 1156  777]
255014

In [101]:
#Half k = 5
k_means(5, X_test)

[ 400 2771 4925 1222  682]
51131674485.0 0
[ 678 2961 4181 1061 1119]
29847891056.79114 1
[ 788 2972 3803 1043 1394]
29149015244.59864 2
[ 869 2957 3529 1020 1625]
28887952062.611717 3
[ 956 2946 3362 1007 1729]
28737945533.9857 4
[1069 2948 3211 1003 1769]
28680162671.723347 5
[1172 2952 3081 1005 1790]
28655123368.47102 6
[1289 2948 2933 1014 1816]
28635154134.403732 7
[1415 2938 2800 1028 1819]
28611272003.198765 8
[1512 2934 2682 1051 1821]
28589073931.523346 9
[1609 2918 2584 1077 1812]
28571990246.984806 10
[1684 2895 2508 1111 1802]
28556005530.62479 11
[1771 2888 2427 1131 1783]
28541736597.16184 12
[1829 2877 2369 1162 1763]
28529454189.894585 13
[1884 2869 2330 1181 1736]
28520470684.34133 14
[1926 2861 2302 1201 1710]
28514318284.60338 15
[1935 2863 2283 1228 1691]
28509556850.16061 16
[1957 2865 2268 1248 1662]
28503918732.085957 17
[1992 2870 2251 1270 1617]
28497369744.17725 18
[2031 2872 2232 1306 1559]
28486142250.181713 19
[2087 2875 2209 1357 1472]
28462424802.12935 2

In [102]:
#Doubled k = 20
k_means(20, X_test)

[ 606  170  322  212  280  506  152  983  899  382  954  518  227  987
  135  247  334  543  351 1192]
38865534763.0 0
[553 266 461 313 412 536 380 675 784 417 672 574 314 887 248 349 425 407
 373 954]
24291665843.37201 1
[540 323 475 351 447 548 483 590 725 433 636 603 343 838 293 391 422 404
 367 788]
23535034509.36271 2
[549 370 484 384 457 533 504 560 684 439 631 641 353 813 294 406 414 423
 364 697]
23279496736.866978 3
[545 404 490 388 469 509 517 526 655 461 623 690 357 786 341 401 373 434
 373 658]
23109091410.32016 4
[550 438 487 392 478 489 529 504 634 470 621 711 354 766 369 390 342 445
 391 640]
22988684783.276646 5
[542 458 486 394 490 478 528 487 616 491 615 734 359 749 388 376 331 454
 412 612]
22917164865.836807 6
[537 465 488 394 503 462 527 470 602 507 611 745 371 730 404 375 330 454
 423 602]
22870741029.641342 7
[537 466 487 397 509 436 537 468 593 513 600 742 379 714 414 383 333 464
 429 599]
22832307999.99733 8
[525 468 487 402 513 414 539 469 587 528 600 728 383 

In [103]:
# ###Running the entire algorithm iteratively 5-7 times and printing the sum of euclidean square matrix, as our objective
# ###function is to minimize the sum of euclidean distances
# from sklearn.metrics import pairwise_distances

# k = 10
# random_indices = np.random.choice(X_test.shape[0], k, replace = False)
# init_means = X_test[random_indices]
# prev_dist = 0
# for iteration in range(100):
#     ## E step
#     dist_matrix = np.square(pairwise_distances(X_test,init_means))
#     # obj_fun = np.sum(dist_matrix)
#     # print(obj_fun, iteration)
    
#     min_dist = np.min(dist_matrix, axis = 1)
#     pi_matrix = np.where(dist_matrix == min_dist[:,np.newaxis], 1, 0)

#     #Number of values in each cluster
#     print(np.sum(pi_matrix, axis = 0))

#     ## M step
#     pi_df = pd.DataFrame(pi_matrix)
#     new_init_means = []
#     clusters = []
#     for col in pi_df.columns:
#         new_init_means.append(np.average(X_test[np.array(pi_df[pi_df[col]==1].index)], axis=0))
#         clusters.append(y_test[np.array(pi_df[pi_df[col]==1].index)])
#     init_means = np.array(new_init_means)
#     clusters = np.array(clusters, dtype = object)
#     obj_fun = sum(np.sum(np.where(pi_matrix, dist_matrix, 0),axis=1))
#     print(obj_fun, iteration)
    
#     #Termination Criteria
#     if abs(prev_dist - obj_fun)/obj_fun < 0.0001:
#         break
#     prev_dist = obj_fun

In [104]:
# np.dot(dist_matrix, pi_matrix.T).shape

In [105]:
# dist_matrix

In [106]:
# pi_matrix

In [107]:
# np.dot(dist_matrix, pi_matrix.T)

In [108]:
# sum(np.sum(np.where(pi_matrix, dist_matrix, 0),axis=1))

In [78]:
# from statistics import mode
# for i in range(k):
#     true_label = mode(clusters[i])
#     purity = np.count_nonzero(clusters[i]==true_label)/clusters[i].shape[0]
#     gini = 1 - purity**2
#     print(true_label, round(purity,4), round(gini,4))
#     # print(mode(clusters[i]))
#     # print(np.unique(clusters[i], return_counts = True))

In [25]:
# # k = 5
# from statistics import mode
# for i in range(k):
#     true_label = mode(clusters[i])
#     purity = np.count_nonzero(clusters[i]==true_label)/clusters[i].shape[0]
#     gini = 1 - purity**2
#     print(true_label, round(purity,4), round(gini,4))
#     # print(mode(clusters[i]))
#     # print(np.unique(clusters[i], return_counts = True))

In [26]:
# # k = 20
# from statistics import mode
# for i in range(k):
#     true_label = mode(clusters[i])
#     purity = np.count_nonzero(clusters[i]==true_label)/clusters[i].shape[0]
#     gini = 1 - purity**2
#     print(true_label, round(purity,4), round(gini,4))
#     # print(mode(clusters[i]))
#     # print(np.unique(clusters[i], return_counts = True))

In [185]:
np.unique(y_test, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=uint8),
 array([ 980, 1135, 1032, 1010,  982,  892,  958, 1028,  974, 1009]))

In [189]:
clusters[0].shape

(1451,)

In [96]:
obj_fun

203288461367.905

In [98]:
obj_fun

434426023968.73706

In [100]:
obj_fun

924888318113.3265

In [10]:
###### Fashion Dataset ######

In [109]:
import tensorflow as tf
from tensorflow.keras.datasets import fashion_mnist

In [110]:
(X_train, y_train), (X_test, y_test) = fashion_mnist.load_data()

In [111]:
# Flatten every image into one row of pixel values (784 columns here).
# The row count is taken from the array itself and -1 lets NumPy infer the
# feature width, so the cell does not depend on hard-coded split sizes and
# can be re-run safely on already-flattened data.
X_train = X_train.reshape(X_train.shape[0], -1)
X_test = X_test.reshape(X_test.shape[0], -1)

In [87]:
# ###Running the entire algorithm iteratively 5-7 times and printing the sum of euclidean square matrix, as our objective
# ###function is to minimize the sum of euclidean distances
# from sklearn.metrics import pairwise_distances

# k = 10
# random_indices = np.random.choice(X_test.shape[0], k, replace = False)
# init_means = X_test[random_indices]
# prev_dist = 0
# for iteration in range(100):
#     ## E step
#     dist_matrix = np.square(pairwise_distances(X_test,init_means))
#     obj_fun = np.sum(dist_matrix)
#     print(obj_fun, iteration)
    
#     min_dist = np.min(dist_matrix, axis = 1)
#     pi_matrix = np.where(dist_matrix == min_dist[:,np.newaxis], 1, 0)

#     #Number of values in each cluster
#     print(np.sum(pi_matrix, axis = 0))

#     ## M step
#     pi_df = pd.DataFrame(pi_matrix)
#     new_init_means = []
#     clusters = []
#     for col in pi_df.columns:
#         new_init_means.append(np.average(X_test[np.array(pi_df[pi_df[col]==1].index)], axis=0))
#         clusters.append(y_test[np.array(pi_df[pi_df[col]==1].index)])
#     init_means = np.array(new_init_means)
#     clusters = np.array(clusters, dtype = object)
    
#     #Termination Criteria
#     if abs(prev_dist - obj_fun)/obj_fun < 0.0001:
#         break
#     prev_dist = obj_fun

In [88]:
# from statistics import mode
# for i in range(k):
#     true_label = mode(clusters[i])
#     purity = np.count_nonzero(clusters[i]==true_label)/clusters[i].shape[0]
#     gini = 1 - purity**2
#     print(true_label, round(purity,4), round(gini,4))
#     # print(mode(clusters[i]))
#     # print(np.unique(clusters[i], return_counts = True))

In [112]:
#Actual k =10
k_means(10, X_test)

[ 883  172  535  313  247 2582  656 2371  538 1703]
43876843487.0 0
[ 988  668  654  417  254 2269  567 2134  589 1460]
23741759610.430344 1
[ 846  792 1055  493  295 1862  563 1943  612 1539]
22262996460.406143 2
[ 782  823 1154  582  336 1748  589 1847  621 1518]
21591720779.522667 3
[ 769  826 1187  655  362 1705  595 1796  581 1524]
21441751872.370064 4
[ 789  825 1213  700  379 1685  583 1756  534 1536]
21374436009.4633 5
[ 826  825 1252  719  391 1673  558 1709  501 1546]
21287032710.41455 6
[ 848  822 1282  728  404 1678  525 1659  493 1561]
21191446078.00852 7
[ 869  823 1313  727  413 1686  495 1609  486 1579]
21110195185.96981 8
[ 882  814 1330  726  424 1707  459 1567  471 1620]
21044242987.8295 9
[ 885  807 1341  730  428 1726  439 1550  446 1648]
20985158676.306183 10
[ 860  806 1316  773  430 1747  417 1538  434 1679]
20931616503.12079 11
[ 834  805 1292  814  432 1757  408 1529  424 1705]
20882916636.009857 12
[ 826  805 1257  839  435 1763  396 1522  422 1735]
208557233

In [113]:
#Half k =5
k_means(5, X_test)

[ 966 3277 2741 2277  739]
47032524352.0 0
[1319 2870 3030 1537 1244]
27760618251.849705 1
[1358 2637 3096 1497 1412]
27170612859.11854 2
[1446 2516 3086 1470 1482]
26891136693.62958 3
[1542 2429 3033 1475 1521]
26654522590.534298 4
[1599 2382 2925 1560 1534]
26469835244.08346 5
[1655 2380 2790 1643 1532]
26307801196.438477 6
[1708 2400 2669 1692 1531]
26136872313.32957 7
[1785 2444 2551 1693 1527]
25966687027.816586 8
[1863 2495 2457 1668 1517]
25768174838.529507 9
[1918 2527 2402 1645 1508]
25638924934.50211 10
[1944 2528 2363 1662 1503]
25587516407.7581 11
[1973 2512 2341 1672 1502]
25565036622.523422 12
[2003 2498 2334 1676 1489]
25547239792.062843 13
[2022 2472 2348 1676 1482]
25537232442.58633 14
[2038 2438 2367 1685 1472]
25530564346.787315 15
[2057 2395 2407 1676 1465]
25523248808.391838 16
[2066 2362 2447 1672 1453]
25514569339.82289 17
[2082 2305 2511 1657 1445]
25505757309.86446 18
[2104 2247 2569 1643 1437]
25494682438.63093 19
[2125 2183 2633 1631 1428]
25481006958.981884 

In [114]:
#Doubled k =20
k_means(20, X_test)

[ 340  412  954  188  268  785  172  539  166  859  148  867  225  285
    4  868 1120  250  277 1273]
33859193341.0 0
[ 552  412  703  155  322  645  220  580  466  759  221  777  270  357
   84  732 1069  493  407  776]
19003091133.958866 1
[633 423 678 152 362 598 276 554 741 668 218 756 296 367 119 735 920 541
 434 529]
18112929098.34457 2
[618 429 670 153 368 595 334 516 786 613 218 769 308 371 126 728 924 545
 423 506]
17851021213.605785 3
[598 419 660 155 368 595 376 477 802 582 216 772 313 377 133 734 949 552
 417 505]
17777943624.248432 4
[587 413 654 156 363 599 407 438 808 567 214 765 316 379 137 739 986 556
 417 499]
17721040185.34898 5
[574 415 649 158 363 602 432 408 805 556 212 758 317 379 141 749 994 557
 418 513]
17667207882.40024 6
[575 412 655 158 363 609 456 378 795 548 211 750 319 381 143 753 979 562
 419 534]
17628778911.241043 7
[574 410 648 159 359 615 474 365 794 542 211 743 321 386 144 769 957 565
 418 546]
17606789879.849056 8
[571 405 639 159 359 622 498 356

In [22]:
######## 20 NG #########

In [11]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [116]:
# Load the train and test subsets of 20 Newsgroups, restricted to the same six categories in both calls.
train_data = fetch_20newsgroups(subset = "train", categories = ["alt.atheism", "sci.med", "sci.electronics", "comp.graphics", "talk.politics.guns", "sci.crypt"])
test_data = fetch_20newsgroups(subset = "test", categories = ["alt.atheism", "sci.med", "sci.electronics", "comp.graphics", "talk.politics.guns", "sci.crypt"])

In [117]:
train_labels = train_data.target
test_labels = test_data.target

In [118]:
# Fit one TF-IDF vectorizer on the train and test documents concatenated (train first), so both
# subsets share a single vocabulary and the rows of all_vectors follow that same order.
vectorizer = TfidfVectorizer()
all_data = train_data.data + test_data.data
all_vectors = vectorizer.fit_transform(all_data)

In [119]:
# all_vectors was built from train_data.data followed by test_data.data, so the first
# len(train_data.data) rows belong to the train subset and the remainder to the test subset.
# Deriving the boundary from the data keeps the split correct if the category list changes.
train_vectors = all_vectors[:len(train_data.data)]
test_vectors = all_vectors[len(train_data.data):]

In [120]:
test_vectors.shape

(2257, 62583)

In [121]:
X_test = test_vectors
y_test = test_labels

In [122]:
def k_means_20ng(k, X_test, labels=None, max_iter=100, tol=0.0001):
    """Run cosine-similarity k-means on the rows of ``X_test`` and print a label report.

    The means are initialised with ``k`` distinct randomly chosen rows. Each
    iteration assigns every row to the mean it is most similar to (cosine
    similarity), prints the cluster sizes, recomputes the means, and prints
    the objective (sum of similarities to the assigned mean, measured before
    the means are updated) together with the iteration number.

    After the loop, one line is printed per cluster: the most common label,
    the purity (share of members carrying that label) and the Gini impurity
    ``1 - sum(p_c ** 2)`` over the label proportions ``p_c`` in the cluster.

    Parameters
    ----------
    k : int
        Number of clusters.
    X_test : sparse matrix or dense array, one row per sample
        Data to cluster. Anything exposing ``toarray`` is treated as sparse.
    labels : array-like, optional
        Ground-truth label of every row, used only for the final report.
        When omitted, the notebook-level ``y_test`` is used so existing
        ``k_means_20ng(k, X_test)`` calls behave as before.
    max_iter : int, optional
        Upper bound on the number of iterations.
    tol : float, optional
        Stop once the relative change of the objective drops below this value.

    Returns
    -------
    None
        All results are printed.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of rows, or (from
        ``np.random.choice``) if ``k`` exceeds the number of rows.
    """
    # Fall back to the global only when no labels were passed explicitly.
    y = np.asarray(y_test if labels is None else labels)
    n_samples = X_test.shape[0]
    if y.shape[0] != n_samples:
        raise ValueError(
            f"got {y.shape[0]} labels for {n_samples} samples; pass matching `labels`"
        )

    random_indices = np.random.choice(n_samples, k, replace = False)
    seeds = X_test[random_indices]
    if hasattr(seeds, "toarray"):
        seeds = seeds.toarray()
    # Only the k means are held densely; the data itself is never densified.
    init_means = np.array(seeds, dtype=float)
    assignments = np.zeros(n_samples, dtype=int)
    row_ids = np.arange(n_samples)
    prev_dist = 0
    for iteration in range(max_iter):
        ## E step
        sim_matrix = cosine_similarity(X_test, init_means)
        # argmax yields exactly one cluster per row, even when several means
        # are equally similar (e.g. an all-zero row has similarity 0 to every
        # mean and would otherwise be placed in every cluster).
        assignments = np.argmax(sim_matrix, axis=1)
        max_sim = sim_matrix[row_ids, assignments]

        #Number of values in each cluster
        print(np.bincount(assignments, minlength=k))

        ## M step
        for cluster_id in range(k):
            member_rows = np.flatnonzero(assignments == cluster_id)
            # An empty cluster keeps its previous mean instead of becoming NaN.
            if member_rows.size > 0:
                # .mean(axis=0) works on sparse and dense input alike; asarray
                # + ravel normalises the result to a flat vector.
                init_means[cluster_id] = np.asarray(
                    X_test[member_rows].mean(axis=0)
                ).ravel()
        obj_fun = max_sim.sum()
        print(obj_fun, iteration)

        #Termination Criteria
        # Guard the zero objective and use |objective| so a negative
        # similarity sum cannot make the ratio negative and stop early.
        if obj_fun == 0 or abs(prev_dist - obj_fun)/abs(obj_fun) < tol:
            break
        prev_dist = obj_fun

    print("################# Execution Successful #########################")

    for i in range(k):
        cluster_labels = y[assignments == i]
        size = cluster_labels.shape[0]
        if size == 0:
            print(f"cluster {i} is empty")
            continue
        true_label = mode(cluster_labels)
        purity = np.count_nonzero(cluster_labels == true_label)/size
        # Gini impurity uses every label's share, not just the majority one.
        _, label_counts = np.unique(cluster_labels, return_counts=True)
        gini = float(1 - np.sum((label_counts / size) ** 2))
        print(true_label, round(purity,4), round(gini,4))

    print("################# Evaluation Successful #########################")

In [123]:
#Actual k = 20
k_means_20ng(20, X_test)

[170   2  33  18  10 139  95  94  20   5 442 963   9  19  23  36  21  34
  45  79]
304.0573332471885 0
[168   3  40  24  12 178 112 109  32   7 400 861  16  27  26  47  23  34
  53  85]
631.043846115655 1
[157   5  39  30  13 197 125 126  37   7 381 819  17  33  24  52  23  34
  51  87]
647.9873478276916 2
[145   5  40  35  14 210 137 135  38   7 362 808  17  37  23  53  23  34
  49  85]
656.1314722826577 3
[125   5  41  42  15 210 158 138  38   7 353 802  17  36  23  54  23  34
  49  87]
659.5951822513435 4
[104   5  42  47  15 210 174 140  38   7 350 800  17  36  23  54  23  34
  49  89]
662.5908317796295 5
[ 89   5  42  49  15 210 192 140  38   7 344 800  17  36  23  54  23  34
  49  90]
664.5185887712269 6
[ 75   5  42  49  15 209 204 140  38   7 343 801  17  36  23  56  23  34
  49  91]
665.7953735830437 7
[ 73   5  42  49  15 209 202 140  38   7 342 805  17  36  23  56  23  34
  49  92]
666.552908463977 8
[ 72   5  42  49  15 209 201 140  38   7 343 804  17  36  23  56  23  34
  

In [124]:
#Half k = 10
k_means_20ng(10, X_test)

[  40 1527   12  134   63  135   18   44  210   74]
272.63449881548144 0
[  58 1304   14  178   87  178   25   50  275   88]
593.0613653736582 1
[  56 1238   16  211  100  176   34   52  285   89]
605.7832410617493 2
[  56 1198   19  233  105  175   37   55  282   97]
611.8963688642792 3
[  56 1173   19  248  114  176   37   58  277   99]
614.8506631560429 4
[  57 1158   20  259  115  180   37   57  273  101]
616.3003527325902 5
[  57 1150   21  263  118  181   37   55  275  100]
616.9554407693678 6
[  57 1144   22  265  119  181   37   54  278  100]
617.4812456593104 7
[  57 1139   22  267  120  181   37   55  279  100]
617.7119931144277 8
[  57 1139   22  266  122  181   37   55  278  100]
617.8938594699569 9
[  57 1137   22  267  123  181   37   55  278  100]
618.011678905589 10
[  57 1137   22  267  123  181   37   55  278  100]
618.0851380775289 11
[  57 1137   22  267  123  181   37   55  278  100]
618.0851380775289 12
################# Execution Successful ######################

In [125]:
#Doubled k = 40
k_means_20ng(40, X_test)

[ 10  16  39  23  29 204  11  24  19  62  14  51  13  26  34  28 161  44
 288  15 130  16  45  11  14  16  14  16  18  73   7   8 112  36  18   9
  20  43 444  96]
369.9879523040123 0
[ 13  16  40  25  32 214  14  30  22  79  16  61  13  26  36  32 168  48
 261  18 137  17  48  16  18  18  15  19  23  78   9  10 120  36  25  10
  24  42 342  86]
710.1257677297797 1
[ 19  17  42  25  34 248  14  32  25  66  16  53  14  33  34  36 168  49
 247  18 130  18  47  20  18  18  16  21  32  78  11  12 114  39  29  12
  26  41 309  76]
733.2182175398782 2
[ 20  17  40  27  35 279  14  31  24  52  16  52  14  38  33  36 164  47
 237  18 128  18  46  21  20  19  16  21  34  82  12  13 115  40  29  12
  26  40 299  72]
745.5027838955607 3
[ 19  17  40  27  36 293  14  31  24  50  16  52  14  41  34  36 161  47
 229  18 127  19  44  20  21  19  16  21  34  85  13  13 119  42  29  11
  26  40 291  68]
751.364376071883 4
[ 19  17  40  26  36 307  14  31  24  49  16  52  14  38  34  36 160  47
 229  18

In [92]:
# ###Running the entire algorithm iteratively 5-7 times and printing the sum of euclidean square matrix, as our objective
# ###function is to minimize the sum of euclidean distances
# from sklearn.metrics import pairwise_distances

# k = 20
# random_indices = np.random.choice(X_test.shape[0], k, replace = False)
# init_means = X_test[random_indices]
# prev_dist = 0
# for iteration in range(100):
#     ## E step
#     # dist_matrix = np.square(pairwise_distances(X_test,init_means))
#     dist_matrix = cosine_similarity(X_test, init_means)
#     obj_fun = np.sum(dist_matrix)
#     print(obj_fun, iteration)
    
#     min_dist = np.max(dist_matrix, axis = 1)
#     pi_matrix = np.where(dist_matrix == min_dist[:,np.newaxis], 1, 0)

#     #Number of values in each cluster
#     print(np.sum(pi_matrix, axis = 0))

#     ## M step
#     pi_df = pd.DataFrame(pi_matrix)
#     new_init_means = []
#     clusters = []
#     for col in pi_df.columns:
#         new_init_means.append(np.average(X_test[np.array(pi_df[pi_df[col]==1].index)].toarray(), axis=0))
#         clusters.append(y_test[np.array(pi_df[pi_df[col]==1].index)])
#     init_means = np.array(new_init_means)
#     clusters = np.array(clusters, dtype = object)
    
#     #Termination Criteria
#     if abs(prev_dist - obj_fun)/obj_fun < 0.0001:
#         break
#     prev_dist = obj_fun

In [93]:
# from statistics import mode
# for i in range(k):
#     true_label = mode(clusters[i])
#     purity = np.count_nonzero(clusters[i]==true_label)/clusters[i].shape[0]
#     gini = 1 - purity**2
#     print(true_label, round(purity,4), round(gini,4))
#     # print(mode(clusters[i]))
#     # print(np.unique(clusters[i], return_counts = True))