In [None]:
# Mount Google Drive at /content/drive so the dataset path used later in this
# notebook resolves. `google.colab` is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
import pandas as pd 
from pandas import DataFrame
import math
import numpy as np
from numpy import genfromtxt
import sklearn
import os
import matplotlib
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from nltk.corpus import stopwords
from sklearn import linear_model
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import random
from sklearn.metrics.cluster import homogeneity_score

## Training data
In this section, we read the contents of the data files and store them in a dictionary whose keys are the file paths and whose values are the files' contents.

In [105]:
# Walk the dataset directory and load every text file into `files`,
# keyed by full path, with all whitespace runs collapsed to single spaces.
path = '/content/drive/My Drive/Assignment-2_Dataset/Datasets/Question-6/dataset'
file_name = ""
files = {}
for dirpath, _dirnames, filenames in os.walk(path):
  for file in filenames:
    if ".txt" in file:
      file_name = os.path.join(dirpath, file)
      # Read as bytes and decode leniently so a stray non-UTF-8 byte does not
      # abort the load; the context manager guarantees the handle is closed.
      with open(file_name, 'rb') as f1:
        file_data = f1.read().decode(errors="replace")
      file_data = " ".join(file_data.split())
      files[file_name] = file_data

# files

{'/content/drive/My Drive/Assignment-2_Dataset/Datasets/Question-6/dataset/96_2.txt': 'Pop band Busted to \'take a break\' Chart-topping pop band Busted have confirmed that they plan to "take a break", following rumours that they were on the verge of splitting. A statement from the band\'s record company Universal said frontman Charlie Simpson planned to spend some time working with his other band, Fightstar. However they said that Busted would "reconvene in due course". The band have had eight top three hits, including four number ones, since they first hit the charts in 2002. Their singles include What I Go To School For, Year 3000, Crashed The Wedding, You Said No, and Who\'s David? The band, which also includes members Matt Jay and James Bourne, made the top ten with their self-titled debut album, as well as the follow-up, A Present For Everyone, in 2003. They won best pop act and best breakthrough act at the 2004 Brit Awards and were nominated for best British group. Most recently

### We convert the data to a NumPy array by vectorizing it with TfidfVectorizer.

In [106]:
# Build the TF-IDF representation of the documents: one row per file (in the
# iteration order of `files`), one column per vocabulary term.
vectorizer = TfidfVectorizer(stop_words="english")
vectors = vectorizer.fit_transform(files.values())

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on older releases that do not provide it.
if hasattr(vectorizer, "get_feature_names_out"):
  feature_names = vectorizer.get_feature_names_out()
else:
  feature_names = vectorizer.get_feature_names()

# toarray() yields the dense ndarray directly, avoiding a round trip through
# a nested Python list (very memory-hungry for a documents x vocabulary matrix).
X = vectors.toarray()
df = pd.DataFrame(X, columns=feature_names)
# print(X)

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.08099135 0.         ... 0.         0.         0.        ]
 [0.         0.01985943 0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


### Here we are defining the number of clusters. In this case we take it as 5.

In [0]:
# Number of clusters (k): sizes the centroid matrix and the per-cluster
# containers built by the K-Means cells.
no_of_clusters=5

### We are assigning random centroids.

In [108]:
# Draw one uniform-random vector per cluster (same width as the rows of X),
# then rescale each of them to unit Euclidean length.
centroids = np.random.uniform(size=(no_of_clusters, X.shape[1]))
for k, raw_centroid in enumerate(centroids):
  centroids[k] = raw_centroid / np.linalg.norm(raw_centroid)

# print(centroids)

[[0.00169304 0.00773207 0.00847356 ... 0.00898844 0.0029558  0.00413361]
 [0.00513484 0.0084621  0.00703065 ... 0.00759797 0.00742591 0.00688314]
 [0.00666591 0.00193041 0.00431396 ... 0.00566386 0.00087926 0.01016608]
 [0.00330407 0.00122848 0.00431997 ... 0.00501819 0.00947051 0.00837613]
 [0.00389918 0.00077451 0.00526159 ... 0.01034552 0.00572892 0.00451064]]


In [0]:
def euclidean_distance(x):
  """Return the Euclidean (L2) length of `x`.

  `x` is typically a difference vector (point minus centroid); every element
  is squared, the squares are summed over the whole array, and the square
  root of that total is returned.
  """
  squared_total = np.sum(np.square(x))
  return np.sqrt(squared_total)

# K-Means Algorithm
We implement the K-Means algorithm in the following section. We compute the distance from every data point to each centroid and assign the point to the cluster whose centroid is closest. The indices of the data points belonging to each cluster are stored in a dictionary keyed by cluster number.

In [110]:
# K-Means (Lloyd iterations) over the rows of X.
#
# After the loop:
#   indexes[c] -> list of row positions of X assigned to cluster c
#   classes[c] -> list of the corresponding row vectors
#   centroids  -> updated in place to the mean of each cluster's members
max_iterations = 15
i = 0
loop = 0
prev_centroids = centroids.copy()
for loop in range(max_iterations):
  # Start every pass with empty membership lists for all clusters.
  indexes = {c: [] for c in range(no_of_clusters)}
  classes = {c: [] for c in range(no_of_clusters)}

  # Assignment step: each row joins the cluster of its nearest centroid
  # (ties go to the lowest cluster number, as argmin returns the first minimum).
  for begin, row in enumerate(X):
    distances = np.linalg.norm(centroids - row, axis=1)
    m_index = int(np.argmin(distances))
    indexes[m_index].append(begin)
    classes[m_index].append(row)

  # Update step. Snapshot first so convergence can be detected afterwards.
  prev_centroids = centroids.copy()
  for i in range(len(centroids)):
    # An empty cluster has no mean (np.mean of nothing is NaN, which would
    # poison every later distance); keep its previous centroid instead.
    if classes[i]:
      centroids[i] = np.mean(classes[i], axis=0)

  # If no centroid moved at all, further passes would reproduce exactly the
  # same assignment, so stopping here does not change the result.
  if np.array_equal(prev_centroids, centroids):
    break
  
# print(indexes)

{0: [3, 8, 9, 29, 30, 45, 54, 67, 70, 71, 75, 77, 79, 82, 83, 96, 97, 104, 108, 111, 117, 123, 124, 126, 131, 138, 141, 146, 162, 164, 173, 198, 205, 210, 211, 218, 220, 221, 223, 224, 246, 257, 264, 265, 269, 278, 294, 298, 303, 311, 316, 317, 321, 339, 344, 345, 347, 349, 350, 352, 356, 361, 366, 368, 378, 384, 385, 388, 403, 412, 420, 422, 425, 428, 431, 435, 437, 442, 453, 458, 473, 482, 486, 501, 502, 505, 508, 514, 563, 579, 615, 616, 640, 642, 649, 659, 660, 662, 668, 672, 686, 689, 690, 700, 701, 711, 719, 722, 726, 731, 734, 736, 756, 758, 763, 765, 767, 771, 772, 777, 782, 788, 791, 802, 806, 813, 819, 822, 834, 839, 842, 846, 852, 861, 863, 866, 869, 872, 873, 874, 879, 889, 895, 903, 906, 916, 920, 923, 933, 940, 942, 948, 954, 963, 971, 992, 998, 999, 1002, 1006, 1053, 1056, 1057, 1059, 1060, 1063, 1071, 1077, 1078, 1087, 1089, 1094, 1097, 1103, 1108, 1117, 1118, 1136, 1138, 1145, 1148, 1161, 1171, 1179, 1181, 1183, 1195, 1197, 1198, 1204, 1208, 1212, 1213, 1216, 1218, 121

### In the following two sections, we store the original labels of the files in a dictionary.

In [111]:
# Map each document's position (its order in `files`, which is also its row
# in X) to the ground-truth label encoded in the file name, e.g. the "2" in
# ".../96_2.txt". The label is parsed from the base name only, so underscores
# elsewhere in the directory path cannot shift which piece is read, and labels
# with more than one digit are read in full.
file_labels = {}
for i, k in enumerate(files.keys()):
  stem = os.path.splitext(os.path.basename(k))[0]
  file_labels[i] = int(stem.rsplit("_", 1)[1])

# print(file_no)
# print(file_labels)
# print(len(file_labels))

{0: 2, 1: 1, 2: 2, 3: 5, 4: 2, 5: 3, 6: 3, 7: 3, 8: 1, 9: 5, 10: 5, 11: 3, 12: 2, 13: 1, 14: 2, 15: 1, 16: 2, 17: 4, 18: 3, 19: 4, 20: 2, 21: 3, 22: 3, 23: 4, 24: 1, 25: 3, 26: 2, 27: 1, 28: 2, 29: 5, 30: 5, 31: 2, 32: 1, 33: 4, 34: 3, 35: 1, 36: 1, 37: 1, 38: 4, 39: 3, 40: 4, 41: 1, 42: 4, 43: 4, 44: 4, 45: 5, 46: 1, 47: 1, 48: 1, 49: 2, 50: 4, 51: 4, 52: 3, 53: 4, 54: 5, 55: 1, 56: 3, 57: 1, 58: 2, 59: 3, 60: 4, 61: 2, 62: 3, 63: 2, 64: 4, 65: 1, 66: 4, 67: 5, 68: 4, 69: 3, 70: 5, 71: 5, 72: 4, 73: 3, 74: 1, 75: 5, 76: 4, 77: 5, 78: 1, 79: 5, 80: 4, 81: 4, 82: 5, 83: 5, 84: 2, 85: 1, 86: 1, 87: 2, 88: 4, 89: 3, 90: 4, 91: 1, 92: 2, 93: 4, 94: 1, 95: 3, 96: 5, 97: 5, 98: 3, 99: 3, 100: 3, 101: 3, 102: 2, 103: 1, 104: 5, 105: 2, 106: 2, 107: 3, 108: 5, 109: 4, 110: 2, 111: 5, 112: 2, 113: 1, 114: 1, 115: 1, 116: 2, 117: 5, 118: 1, 119: 2, 120: 3, 121: 4, 122: 3, 123: 5, 124: 5, 125: 3, 126: 5, 127: 1, 128: 3, 129: 3, 130: 3, 131: 5, 132: 1, 133: 1, 134: 3, 135: 4, 136: 2, 137: 4, 138: 

In [0]:
# Collect the ground-truth label of every clustered document, visiting the
# documents cluster by cluster (the same order used for the predicted labels).
orig_labels = {}
for cluster_id, members in indexes.items():
  for doc_idx in members:
    orig_labels[doc_idx] = file_labels[doc_idx]

orig_labels

### To predict our output, we use majority voting of the cluster points.

In [0]:
def find_majority_label(label):
    """Return the value that occurs most often in the list `label`.

    Each distinct value is scored with `label.count`. When several values
    share the top count, the one met first while iterating the set of
    distinct values is returned. An empty list raises ValueError (from `max`).
    """
    distinct_values = set(label)
    return max(distinct_values, key=lambda value: label.count(value))

In [114]:
# Give every cluster the most common ground-truth label among its members.
# Iterating over `indexes` keeps this in step with however many clusters the
# K-Means cell produced, instead of assuming a fixed count.
pred_label_of_cluster = {}
for cluster_id, members in indexes.items():
  li = [file_labels[doc_idx] for doc_idx in members]
  if not li:
    # A cluster without members casts no votes; it also contributes no
    # documents downstream, so it simply gets no entry.
    continue
  pred_label_of_cluster[cluster_id] = find_majority_label(li)

5
1
2
3
4


In [0]:
# Every document inherits the majority label of the cluster it was assigned
# to; documents are visited cluster by cluster.
pred_label = {}
for cluster_id, members in indexes.items():
  for doc_idx in members:
    pred_label[doc_idx] = pred_label_of_cluster[cluster_id]


## Result
We measure the accuracy of our model by comparing our predicted labels with the original labels of the files.

The accuracy score obtained is **84.34%**

Note: The accuracy score may vary between executions because the initial centroids are assigned randomly.

In [116]:
# Flatten both label dictionaries into parallel lists. The pairing relies on
# the two dictionaries having been filled in the same document order.
y_true = list(orig_labels.values())
y_pred = list(pred_label.values())
accuracy_score(y_true, y_pred)

0.8434782608695652

### The homogeneity score is calculated and found out to be **0.69**

In [119]:
# Homogeneity of the predicted labels with respect to the ground-truth labels
# (arguments are passed as: true labels first, predicted labels second).
homogeneity_score(y_true, y_pred)

0.6989383929356456