In [1]:
import numpy as np

In [2]:
def evaluate_clustering_algorithm(algorithm, X, y, eval_algo):
    import time
    import resource
    t1 = time.time()
    predicted_clusters = algorithm(X)
    exec_time = time.time() - t1
    import resource
    mem_usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    predicted_clusters = map_their_labels_to_ours(predicted_clusters, y)
    performance = eval_algo(predicted_clusters, y)
    return exec_time, mem_usage, performance
   
    
    

In [3]:
def eval_algo(pred, act):
    # f1
    import sklearn
    return sklearn.metrics.f1_score(act, pred, average="micro")
  

In [4]:
#Importing all the neccesary libraries
import numpy as np 
import pandas as pd 

import os


import re
import string
import nltk 
nltk.download('stopwords')
nltk.download("punkt")
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
data={'bio':pd.read_csv('./biology.csv',index_col=0),
      'robo':pd.read_csv('./robotics.csv',index_col=0),
      'cooking':pd.read_csv('./cooking.csv',index_col=0)}

stops = set(stopwords.words("english"))
def clean_content(table):
    content = table.content
    #Converting text to lowercase characters
    content = content.apply(lambda x: x.lower())
    #Removing HTML tags
    content = content.apply(lambda x: re.sub(r'\<[^<>]*\>','',x))
    #Removing any character which does not match to letter,digit or underscore
    content = content.apply(lambda x: re.sub(r'^\W+|\W+$',' ',x))
    #Removing space,newline,tab
    content = content.apply(lambda x: re.sub(r'\s',' ',x))
    #Removing punctuation
    content = content.apply(lambda x: re.sub(r'[^a-zA-Z0-9]',' ',x))
    #Tokenizing data
    content = content.apply(lambda x: word_tokenize(x))
    #Removing stopwords
    content = content.apply(lambda x: [i for i in x if i not in stops])
    return(content)
   
def clean_title(table):
    title = table.title
    title = title.apply(lambda x: x.lower())
    title = title.apply(lambda x: re.sub(r'^\W+|\W+$',' ',x))
    title = title.apply(lambda x: re.sub(r'\s',' ',x))
    title = title.apply(lambda x: re.sub(r'[^a-zA-Z0-9]',' ',x))
    title = title.apply(lambda x: word_tokenize(x))
    title = title.apply(lambda x: [i for i in x if i not in stops])
    return(title)
   

for df in data:
    data[df].content = clean_content(data[df])
  
for df in data:
    data[df].title = clean_title(data[df])


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/thomasjarosz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/thomasjarosz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
def identity_tokenizer(text):
    return text

In [6]:
def create_X_and_y(data_dict, percentage_of_total = .1):
    vect = TfidfVectorizer(tokenizer=identity_tokenizer,lowercase=False)
    ret = []
    y = np.array([0]* int(len(data_dict["bio"].content) * percentage_of_total))
    y = np.append(y, np.array([1]*int(len(data_dict["robo"].content) * percentage_of_total)), axis=0)
    y = np.append(y, np.array([2]*int(len(data_dict["cooking"].content) * percentage_of_total)), axis=0)
    pre_vectorized_X = data['bio'].content.values[0:int(data['bio'].content.values.shape[0] * percentage_of_total)]
    pre_vectorized_X = np.append(pre_vectorized_X, data['robo'].content.values[0:int(data['robo'].content.values.shape[0] * percentage_of_total)])
    pre_vectorized_X = np.append(pre_vectorized_X, data['cooking'].content.values[0:int(data['cooking'].content.values.shape[0] * percentage_of_total)])
    X = vect.fit_transform(pre_vectorized_X)
    return X, y


In [7]:
def find_map_their_labels_to_ours(their_labels, our_labels):
    from collections import Counter
    their_unique_labels = np.unique(their_labels)
    their_labels_used = set()
    ret = {}
    for our_label in np.unique(our_labels):
        idxs = np.where(our_labels == our_label)
        predicted_for_label_l = their_labels[idxs]
        most_common = Counter(predicted_for_label_l).most_common()
        for label_count_pair in most_common:
            their_label = label_count_pair[0]
            if their_label not in their_labels_used:
                break
        their_labels_used.add(their_label)
        ret[their_label] = our_label
    return ret

def map_their_labels_to_ours(their_labels, our_labels):
    label_map = find_map_their_labels_to_ours(their_labels, our_labels) 
    map_preds = lambda x: label_map[x]
    return np.vectorize(map_preds)(their_labels)
        
def kmeans_algo(X):
    from sklearn.cluster import KMeans
    model = KMeans(n_clusters = 3)
    return model.fit_predict(X)
   
    

In [8]:
X, y = create_X_and_y(data)

In [16]:
kmeans_res = evaluate_clustering_algorithm(kmeans_algo, X,  y, eval_algo)
kmeans_res

(0.5684669017791748, 397291520, 0.30165816326530615)

In [10]:
def AgglomerativeClustering_algo(X):
    from sklearn.cluster import AgglomerativeClustering
    model = AgglomerativeClustering(n_clusters = 3)
    return model.fit_predict(X.toarray())
AgglomerativeClustering_algo_res = evaluate_clustering_algorithm(AgglomerativeClustering_algo, X,  y, eval_algo)
AgglomerativeClustering_algo_res

(99.60228490829468, 882409472, 0.8737244897959183)

In [9]:
def birch_algo(X):
    from sklearn.cluster import Birch
    model = Birch(n_clusters = 3)
    return model.fit_predict(X)
birch_res = evaluate_clustering_algorithm(birch_algo, X,  y, eval_algo)
birch_res

(106.34011816978455, 3196514304, 0.4209183673469387)

In [9]:
def MiniBatchKMeans_algo(X):
    from sklearn.cluster import MiniBatchKMeans
    model = MiniBatchKMeans(n_clusters = 3)
    return model.fit_predict(X)
MiniBatchKMeans_algo_res = evaluate_clustering_algorithm(MiniBatchKMeans_algo, X,  y, eval_algo)
MiniBatchKMeans_algo_res


(0.11622405052185059, 381128704, 0.3565051020408163)

In [13]:
def SpectralClustering_algo(X):
    from sklearn.cluster import SpectralClustering
    model = SpectralClustering(n_clusters = 3)
    return model.fit_predict(X)
SpectralClustering_algo_res = evaluate_clustering_algorithm(SpectralClustering_algo, X,  y, eval_algo)
SpectralClustering_algo_res


(4.283025741577148, 777080832, 0.7088647959183673)