In [1]:
import pandas as pd
from random import sample
from math import sqrt
from numpy import mean
import numpy as np
import copy
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from scipy.spatial import distance
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [2]:
def dist(v1, v2):
    return np.sqrt(np.sum((np.array(v1) - np.array(v2)) ** 2))  

In [3]:
def initializeCenters(df, k):
    random_indices = sample(range(len(df)), k)
    centers = [list(df.iloc[idx]) for idx in random_indices]
    return centers

In [4]:
def computeCenter(df, k, cluster_labels):
    cluster_centers = list()
    data_points = list()
    for i in range(k):
        for idx, val in enumerate(cluster_labels):
            if val == i:
                data_points.append(list(df.iloc[idx]))
        cluster_centers.append(list(map(mean, zip(*data_points))))
    return cluster_centers

In [5]:
def assignCluster(df, k, cluster_centers):
    cluster_assigned = list()
    for i in range(len(df)):
        distances = [dist(list(df.iloc[i]),center) for center in cluster_centers]
        min_dist, idx = min((val, idx) for (idx, val) in enumerate(distances))
        cluster_assigned.append(idx)
    return cluster_assigned

In [6]:
def kmeans(df, k, class_labels):
    cluster_centers = initializeCenters(df, k)
    curr = 1

    while curr <= MAX_ITER:
        cluster_labels = assignCluster(df, k, cluster_centers)
        cluster_centers = computeCenter(df, k, cluster_labels)
        curr += 1

    return cluster_labels, cluster_centers

In [7]:
def featurize_text(data):
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(data.ravel())
    
    svd = TruncatedSVD(n_components=2)
    Y = svd.fit_transform(X)
    Z = StandardScaler().fit_transform(Y)

    return Z

In [8]:
k = 5
MAX_ITER = 500

In [9]:
files = os.listdir('Datasets/Question-6/dataset/')

x = []
y_labels = []
for file in files:
    y_labels.append(int(file.split("_",1)[1][0]))
    f = open('Datasets/Question-6/dataset/'+file, 'r', errors='replace')
    data = f.read()
    x.append(data)
    f.close()

In [10]:
z = featurize_text(np.array(x))

df = pd.DataFrame(z) 
class_labels = ['business', 'entertainment', 'politics', 'sport', 'tech']

In [11]:
labels, centers = kmeans(df, k, class_labels)

In [12]:
accuracy_score(y_labels, labels)

0.15768115942028985