In [1]:
import csv
import numpy as np 
from collections import Counter
import glob
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import stop_words
import itertools
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics.cluster import homogeneity_score
from sklearn.decomposition import PCA



In [2]:
class Cluster:
    K = 5
    m = 0
    n = 0
    centroids = []
    itNum = 100

    def stat(self, labels_test, predictions):
        print("RAND score ", adjusted_rand_score(labels_test, predictions))
        print("Homogeneity score ", homogeneity_score(labels_test, predictions))
  
    # calculate the Euclidean distance between two vectors
    def euclidean_distance(self, trainData, test_row):
        a = trainData - test_row
        b =  a**2
        distances = np.sum(b, axis = 1)
        return distances.reshape(-1,1)

    def predict(self, testData):
        distances = np.zeros([testData.shape[0],self.K])
        for k in range(self.K):
            d= self.euclidean_distance(testData, self.centroids[k])
            distances[:,k] = d.reshape(testData.shape[0])
        predictions = np.argmin(distances, axis = 1)
        predictions = predictions+1   
        return predictions       

    def clustering(self, trainData):
        for i in range(self.itNum):
            distances = np.zeros([self.m,self.K])
            for k in range(self.K):
                d= self.euclidean_distance(trainData, self.centroids[k])
                distances[:,k] = d.reshape(self.m)
            predictions = np.argmin(distances, axis = 1)
            predictions = predictions+1
            labels = np.unique(predictions)

            # print("should be 5 check", labels)
            for l in range(len(labels)):
                pool = np.where(predictions == labels[l])[0]
                if len(pool) is not 0:
                    self.centroids[l] = np.mean(trainData[pool], axis = 0)

        
    def prepData(self, fileList):
        labels = []
        i = 0
        for f in fileList:
            i = i+1
            a = str(f).split('.')
            b = str(a[0]).split('_')
            labels.append(int(b[1]))
        labels = np.array(labels)
#         print("labels shape", labels.shape)
        vectorizer = TfidfVectorizer(input='filename', decode_error='ignore', 
                                    lowercase=True, token_pattern=r'\b[^\d\W]+\b', 
                                    stop_words=stop_words.ENGLISH_STOP_WORDS)
        x = vectorizer.fit_transform(fileList).toarray()
        x = np.array(x)
        x = x.astype(np.float)
        pca = PCA(n_components=1000)
        xPCA = pca.fit_transform(x)
        # x_normed = (x - x.min(0)) / x.ptp(0)
#         print("X shape", xPCA.shape)
#         print("no of files is ", i)
        return labels, xPCA


    def cluster(self, TestFile):
        fileList = []
        for dirpath,_,filenames in os.walk(TestFile):
            for f in filenames:
                fileList.append(os.path.abspath(os.path.join(dirpath, f)))
#         print(fileList[0])
        labels, x = self.prepData(fileList)

        x_test = x[0:100]
        x_train = x[100:]
        labels_test = labels[0:100]
        labels_train = labels[100:]
        self.m, self.n = x_train.shape
        i = np.random.randint(0,self.m-1, size=(1, self.K))
        index = list(itertools.chain.from_iterable(i))
#         print("index = ", index)
        self.centroids = x[index]
        self.clustering(x_train)
        predictions = self.predict(x_test)
        self.stat(labels_test, predictions)
        return predictions

In [3]:
cluster_algo = Cluster()
predictions = cluster_algo.cluster('./Question-6/dataset/') 

/home/subbu/SMAI/Assn2/Question-6/dataset/55_3.txt
labels shape (1725,)
X shape (1725, 1000)
no of files is  1725
index =  [275, 1139, 720, 1127, 1402]
RAND score  0.33226801740686024
Homogeneity score  0.5087377760331913
