In [None]:
# Mount Google Drive at /content/drive; the dataset path used later in the
# notebook lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
import pandas as pd 
from pandas import DataFrame
import math
import numpy as np
from numpy import genfromtxt
import sklearn
import os
import matplotlib
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from nltk.corpus import stopwords
from sklearn import linear_model
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import random
from sklearn.metrics.cluster import homogeneity_score

## Training data
In this section, we read the contents of the data files and store them in a dictionary whose keys are the file paths and whose values are the corresponding file contents.

In [None]:
# Walk the dataset directory and load every text file into `files`,
# a dict mapping each file's full path to its whitespace-normalised contents.
path = '/content/drive/My Drive/Assignment-2_Dataset/Datasets/Question-6/dataset'
file_name = ""
files = {}
for root, _dirs, names in os.walk(path):
  for file in names:
    if ".txt" in file:
      file_name = os.path.join(root, file)
      # The context manager closes the handle even if reading or decoding
      # fails; the bare open() used previously leaked one handle per file.
      with open(file_name, 'rb') as f1:
        # Read raw bytes and decode leniently so undecodable bytes are
        # replaced instead of raising.
        file_data = f1.read().decode(errors="replace")
      # Collapse every run of whitespace (including newlines) to one space.
      file_data = " ".join(file_data.split())
      files[file_name] = file_data

# files

### We convert the data to a NumPy array by vectorizing it with TfidfVectorizer.

In [None]:
# Turn the documents into TF-IDF feature vectors (English stop words removed).
vectorizer = TfidfVectorizer(stop_words="english")
vectors = vectorizer.fit_transform(files.values())
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
  feature_names = list(vectorizer.get_feature_names_out())
else:
  feature_names = vectorizer.get_feature_names()
# Densify the sparse matrix and wrap it in a DataFrame (one column per term).
dense = vectors.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)
# X is the dense array consumed by the K-Means cells: one row per document.
X=df.to_numpy()
# print(X)

### Here we are defining the number of clusters. In this case we take it as 5.

In [0]:
# Number of clusters (k); used when creating the initial centroids and when
# setting up the per-cluster containers in the K-Means loop.
no_of_clusters=5

### We are assigning random centroids.

In [None]:
# Draw one uniformly random point per cluster in feature space ...
centroids = np.random.uniform(size=(no_of_clusters, X.shape[1]))
# ... and rescale each one in place to unit Euclidean length
# (iterating a 2-D array yields row views, so `/=` writes into `centroids`).
for centroid in centroids:
  centroid /= np.linalg.norm(centroid)

# print(centroids)

In [0]:
def euclidean_distance(x):
  """Return the Euclidean (L2) length of ``x``.

  Callers pass the difference between a data row and a centroid, so the
  result is the straight-line distance between the two. All elements of
  ``x`` are squared, summed, and the square root of the total is returned.
  """
  sum_of_squares = np.sum(x * x)
  return np.sqrt(sum_of_squares)

# K-Means Algorithm
We implement the K-Means algorithm in the following section. In each iteration we compute the distance from every data point to each centroid, assign the point to the cluster whose centroid is closest, and then move each centroid to the mean of the points assigned to it. The indices of the data points that belong to each cluster are stored in a dictionary.

In [None]:
# K-Means (Lloyd's algorithm): alternate between assigning every row of X to
# its nearest centroid and moving each centroid to the mean of its rows.
max_iterations = 15
# Keep a real copy: a plain `prev_centroids = centroids` would alias the same
# array, so the two could never differ and convergence could not be detected.
prev_centroids = centroids.copy()
for loop in range(max_iterations):
  # indexes[c] -> row numbers of X currently assigned to cluster c
  # classes[c] -> the corresponding rows of X
  indexes = {c: [] for c in range(no_of_clusters)}
  classes = {c: [] for c in range(no_of_clusters)}

  # Assignment step: each row joins the cluster of its closest centroid
  # (np.argmin returns the first minimum, so ties go to the lowest cluster id).
  for begin, row in enumerate(X):
    distances = [euclidean_distance(row - centroid) for centroid in centroids]
    m_index = int(np.argmin(distances))
    indexes[m_index].append(begin)
    classes[m_index].append(row)

  # Update step: move each centroid to the mean of its members.
  for i in range(len(centroids)):
    # An empty cluster keeps its previous centroid; np.mean([]) would yield
    # NaN and permanently poison that centroid (and every later distance).
    if classes[i]:
      centroids[i] = np.mean(classes[i], axis=0)

  # Stop early once no centroid moved: further iterations would reproduce
  # exactly the same assignment.
  if np.array_equal(centroids, prev_centroids):
    break
  prev_centroids = centroids.copy()
  
# print(indexes)

### In the following two sections, we store the original labels of the files in a dictionary.

In [None]:
# file_labels[i] is the true class of the i-th document, in the same order in
# which `files` (and therefore the rows of X) was built.
file_labels={}
for i, k in enumerate(files.keys()):
  # Parse the label from the file name only. Splitting the full path on "_"
  # silently depended on the directory names containing exactly one
  # underscore, so moving the dataset would have broken the labels.
  # NOTE(review): assumes names look like "<id>_<label>...txt" - confirm
  # against the dataset.
  x = os.path.basename(k).split("_")
  # The label is the first character after the first underscore.
  file_labels[i] = int(x[1][0])

# print(file_no)
# print(file_labels)
# print(len(file_labels))

In [0]:
# True label of every clustered document, keyed by its row number. Entries are
# inserted cluster by cluster, the same traversal used to build pred_label, so
# the two dicts line up value-for-value.
orig_labels = {
  j: file_labels[j]
  for members in indexes.values()
  for j in members
}

orig_labels

### To predict the label of each cluster, we take a majority vote over the original labels of the points in that cluster.

In [0]:
def find_majority_label(label):
    """Return the most frequent value in ``label``.

    Parameters
    ----------
    label : iterable of hashable values
        The labels to vote over (e.g. the true labels of one cluster).

    Returns
    -------
    The value that occurs most often, or ``None`` when ``label`` is empty
    (an empty cluster has no votes; previously this raised ``ValueError``).
    Ties are resolved by the iteration order of ``set(label)``, as before.
    """
    from collections import Counter

    label = list(label)
    if not label:
        return None
    # Count once up front instead of calling label.count() per distinct value.
    counts = Counter(label)
    return max(set(label), key=counts.__getitem__)

In [None]:
# Label each cluster with the majority true label of its members.
pred_label_of_cluster={}
# Iterate over the clusters that actually exist instead of a hard-coded
# range(5), so changing no_of_clusters cannot cause a KeyError or skip clusters.
for loop, members in indexes.items():
  li = [file_labels[i] for i in members]
  # A cluster with no members has no votes; record None rather than crashing.
  pred_label_of_cluster[loop] = find_majority_label(li) if li else None

In [0]:
# Predicted label of every document: the majority label of the cluster it was
# assigned to, keyed by the document's row number.
pred_label = {
  j: pred_label_of_cluster[cluster_id]
  for cluster_id, members in indexes.items()
  for j in members
}


## Result
We measure the accuracy of our model by comparing our predicted labels with the original labels of the files.

The accuracy score obtained is **84.34%**

Note: This value of accuracy score may vary for every execution as initially the centroids are assigned randomly.

In [116]:
# Both dicts were filled in the same cluster-by-cluster order, so their value
# sequences are aligned document-for-document.
y_true = list(orig_labels.values())
y_pred = list(pred_label.values())
accuracy_score(y_true, y_pred)

0.8434782608695652

### The homogeneity score is calculated and found to be approximately **0.70**

In [119]:
# Homogeneity of the clustering, using the true file labels (y_true) and the
# per-cluster majority labels (y_pred) computed in the accuracy cell.
homogeneity_score(y_true, y_pred)

0.6989383929356456