In [19]:
import pandas as pd
import numpy as np
import sklearn 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.cluster import SpectralClustering
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.cluster import normalized_mutual_info_score
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem import LancasterStemmer
lanc_stemmer = LancasterStemmer()

[nltk_data] Downloading package punkt to /home/rantanj15/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/rantanj15/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
df = pd.read_csv('data.csv', sep='\t', header=None)

# Flatten the frame row by row into one flat list of raw records.
# NOTE(review): assumes every cell is a string shaped like "<id>#<class>#<text>"
# -- confirm against data.csv.
raw_documents = [cell for row in df.values.tolist() for cell in row]


def _split_label_and_text(raw_document):
    """Split one raw record into its zero-based label and its text.

    The record is cut at the first two '#' characters: the first field (the id)
    is discarded, the second is parsed as an integer class and shifted down by
    one, and everything after the second '#' is returned unchanged (further '#'
    characters stay in the text).

    Raises:
        ValueError: if the record has fewer than two '#' separators or the
            class field is not an integer.
    """
    _doc_id, class_field, text = raw_document.split("#", 2)
    # Parse the whole class field, not only its first character, so class ids
    # with more than one digit are read correctly.
    return int(class_field) - 1, text


#Remove id and class values from the beginning and add labels to a list
labels = []
corpus = []
for raw_document in raw_documents:
    label, text = _split_label_and_text(raw_document)
    labels.append(label)
    corpus.append(text)
    
def lanc_stemming_tokenizer(str_input):
    """Tokenize ``str_input`` and stem every token with ``lanc_stemmer``.

    Every character that is not an ASCII letter, a digit or a hyphen is
    replaced by a space; the result is lower-cased and split on whitespace,
    and each resulting token is passed through ``lanc_stemmer.stem``.

    Returns:
        list: the stemmed tokens, in their original order.
    """
    cleaned = re.sub(r"[^A-Za-z0-9\-]", " ", str_input)
    return [lanc_stemmer.stem(token) for token in cleaned.lower().split()]

In [21]:
# TF-IDF features for KMeans: NLTK English stop words, Lancaster-stemmed
# tokens, idf weighting and L2-normalised rows.
# NOTE(review): the stop words are passed unstemmed while the tokenizer emits
# stems -- confirm the stop-word filter still matches as intended.
english_stop_words = stopwords.words('english')
vec = TfidfVectorizer(
    tokenizer=lanc_stemming_tokenizer,
    stop_words=english_stop_words,
    norm='l2',
    use_idf=True,
)

matrix = vec.fit_transform(corpus)



In [22]:
# One KMeans fit with a fixed seed (and a single initialisation) so the
# clustering can be reproduced.
kmeans_model = KMeans(
    n_clusters=5,
    n_init=1,
    random_state=420,
)
kmeans_predictions = kmeans_model.fit_predict(matrix)

# Cell output: NMI between the reference labels and the KMeans assignment.
normalized_mutual_info_score(
    labels,
    kmeans_predictions,
    average_method='geometric',
)

0.7208134122527954

In [14]:
#Keywords for the KMeans clustering
N_TOP_TERMS = 3  # number of highest-weighted terms shown per cluster

print("Top terms per kmeans cluster:")
# For each centroid, feature indices sorted by descending centroid weight.
order_centroids = kmeans_model.cluster_centers_.argsort()[:, ::-1]
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it, and fall back otherwise.
if hasattr(vec, "get_feature_names_out"):
    terms = vec.get_feature_names_out()
else:
    terms = vec.get_feature_names()
# Iterate over the centroids themselves instead of a hard-coded cluster count,
# and print each cluster on one line (the Python 2 style `print(...),` and bare
# `print` do not control line breaks in Python 3).
for i, centroid_order in enumerate(order_centroids):
    top_terms = " ".join(terms[ind] for ind in centroid_order[:N_TOP_TERMS])
    print("Cluster %d: %s" % (i, top_terms))

Top terms per kmeans cluster:
Cluster 0:
 compil
 program
 cod
Cluster 1:
 robot
 control
 system
Cluster 2:
 sec
 encrypt
 cryptograph
Cluster 3:
 im
 detect
 vis
Cluster 4:
 databas
 dat
 rel


Mean score

In [15]:
#Mean score for 100 KMeans clusterings with random seeds
N_KMEANS_RUNS = 100
scores = []

for _ in range(N_KMEANS_RUNS):
    # Loop-local names on purpose: rebinding kmeans_model / kmeans_predictions
    # here would replace the seeded model with an unseeded one for any cell
    # that reads those names afterwards.
    run_predictions = KMeans(n_clusters=5).fit_predict(matrix)
    scores.append(
        normalized_mutual_info_score(labels, run_predictions, average_method='geometric')
    )

# Cell output: mean NMI over all unseeded runs.
np.mean(scores)


0.7018217077564124

In [16]:
#Different stop word list for spectral clustering and also different tfidf calculation and LSA (truncatedSVD)
stop_words_long = ["able","about","above","abroad","according","accordingly","across","actually","adj","after","afterwards","again","against","ago","ahead","ain't","all","allow","allows","almost","alone","along","alongside","already","also","although","always","am","amid","amidst","among","amongst","an","and","another","any","anybody","anyhow","anyone","anything","anyway","anyways","anywhere","apart","appear","appreciate","appropriate","are","aren't","around","as","a's","aside","ask","asking","associated","at","available","away","awfully","back","backward","backwards","be","became","because","become","becomes","becoming","been","before","beforehand","begin","behind","being","believe","below","beside","besides","best","better","between","beyond","both","brief","but","by","came","can","cannot","cant","can't","caption","cause","causes","certain","certainly","changes","clearly","c'mon","co","co.","com","come","comes","concerning","consequently","consider","considering","contain","containing","contains","corresponding","could","couldn't","course","c's","currently","dare","daren't","definitely","described","despite","did","didn't","different","directly","do","does","doesn't","doing","done","don't","down","downwards","during","each","edu","eg","eight","eighty","either","else","elsewhere","end","ending","enough","entirely","especially","et","etc","even","ever","evermore","every","everybody","everyone","everything","everywhere","ex","exactly","example","except","fairly","far","farther","few","fewer","fifth","first","five","followed","following","follows","for","forever","former","formerly","forth","forward","found","four","from","further","furthermore","get","gets","getting","given","gives","go","goes","going","gone","got","gotten","greetings","had","hadn't","half","happens","hardly","has","hasn't","have","haven't","having","he","he'd","he'll","hello","help","hence","her","here","hereafter","hereby","herein","here's","hereupon","hers","herself","he's","hi","him","himself","h
is","hither","hopefully","how","howbeit","however","hundred","i'd","ie","if","ignored","i'll","i'm","immediate","in","inasmuch","inc","inc.","indeed","indicate","indicated","indicates","inner","inside","insofar","instead","into","inward","is","isn't","it","it'd","it'll","its","it's","itself","i've","just","k","keep","keeps","kept","know","known","knows","last","lately","later","latter","latterly","least","less","lest","let","let's","like","liked","likely","likewise","little","look","looking","looks","low","lower","ltd","made","mainly","make","makes","many","may","maybe","mayn't","me","mean","meantime","meanwhile","merely","might","mightn't","mine","minus","miss","more","moreover","most","mostly","mr","mrs","much","must","mustn't","my","myself","name","namely","nd","near","nearly","necessary","need","needn't","needs","neither","never","neverf","neverless","nevertheless","new","next","nine","ninety","no","nobody","non","none","nonetheless","noone","no-one","nor","normally","not","nothing","notwithstanding","novel","now","nowhere","obviously","of","off","often","oh","ok","okay","old","on","once","one","ones","one's","only","onto","opposite","or","other","others","otherwise","ought","oughtn't","our","ours","ourselves","out","outside","over","overall","own","particular","particularly","past","per","perhaps","placed","please","plus","possible","presumably","probably","provided","provides","que","quite","qv","rather","rd","re","really","reasonably","recent","recently","regarding","regardless","regards","relatively","respectively","right","round","said","same","saw","say","saying","says","second","secondly","see","seeing","seem","seemed","seeming","seems","seen","self","selves","sensible","sent","serious","seriously","seven","several","shall","shan't","she","she'd","she'll","she's","should","shouldn't","since","six","so","some","somebody","someday","somehow","someone","something","sometime","sometimes","somewhat","somewhere","soon","sorry","specified","specify","specifying"
,"still","sub","such","sup","sure","take","taken","taking","tell","tends","th","than","thank","thanks","thanx","that","that'll","thats","that's","that've","the","their","theirs","them","themselves","then","thence","there","thereafter","thereby","there'd","therefore","therein","there'll","there're","theres","there's","thereupon","there've","these","they","they'd","they'll","they're","they've","thing","things","think","third","thirty","this","thorough","thoroughly","those","though","three","through","throughout","thru","thus","till","to","together","too","took","toward","towards","tried","tries","truly","try","trying","t's","twice","two","un","under","underneath","undoing","unfortunately","unless","unlike","unlikely","until","unto","up","upon","upwards","us","use","used","useful","uses","using","usually","v","value","various","versus","very","via","viz","vs","want","wants","was","wasn't","way","we","we'd","welcome","well","we'll","went","were","we're","weren't","we've","what","whatever","what'll","what's","what've","when","whence","whenever","where","whereafter","whereas","whereby","wherein","where's","whereupon","wherever","whether","which","whichever","while","whilst","whither","who","who'd","whoever","whole","who'll","whom","whomever","who's","whose","why","will","willing","wish","with","within","without","wonder","won't","would","wouldn't","yes","yet","you","you'd","you'll","your","you're","yours","yourself","yourselves","you've","zero","a","how's","i","when's","why's","b","c","d","e","f","g","h","j","l","m","n","o","p","q","r","s","t","u","uucp","w","x","y","z","I","www","amount","bill","bottom","call","computer","con","couldnt","cry","de","describe","detail","due","eleven","empty","fifteen","fifty","fill","find","fire","forty","front","full","give","hasnt","herse","himse","interest","itse”","mill","move","myse”","part","put","show","side","sincere","sixty","system","ten","thick","thin","top","twelve","twenty","abst","accordance","act","added","adopted","affected
","affecting","affects","ah","announce","anymore","apparently","approximately","aren","arent","arise","auth","beginning","beginnings","begins","biol","briefly","ca","date","ed","effect","et-al","ff","fix","gave","giving","heres","hes","hid","home","id","im","immediately","importance","important","index","information","invention","itd","keys","kg","km","largely","lets","line","'ll","means","mg","million","ml","mug","na","nay","necessarily","nos","noted","obtain","obtained","omitted","ord","owing","page","pages","poorly","possibly","potentially","pp","predominantly","present","previously","primarily","promptly","proud","quickly","ran","readily","ref","refs","related","research","resulted","resulting","results","run","sec","section","shed","shes","showed","shown","showns","shows","significant","significantly","similar","similarly","slightly","somethan","specifically","state","states","stop","strongly","substantially","successfully","sufficiently","suggest","thered","thereof","therere","thereto","theyd","theyre","thou","thoughh","thousand","throug","til","tip","ts","ups","usefully","usefulness","'ve","vol","vols","wed","whats","wheres","whim","whod","whos","widely","words","world","youd","youre"]

# TF-IDF over uni-, bi- and tri-grams of Lancaster-stemmed tokens, filtered
# with the long stop word list.
# NOTE: this rebinds `vec` and `matrix`, so cells that read those names after
# this point see the spectral-clustering features.
vec = TfidfVectorizer(stop_words=stop_words_long, 
                      tokenizer=lanc_stemming_tokenizer,
                      use_idf=True,
                      norm='l2',
                      ngram_range=(1,3)) 

tfidf_matrix = vec.fit_transform(corpus)

# LSA: project the TF-IDF matrix onto 100 components. The SVD is seeded so the
# features (and every clustering fitted on them) can be reproduced.
svd = TruncatedSVD(n_components=100, random_state=420)
matrix = svd.fit_transform(tfidf_matrix)



In [17]:
# One spectral clustering fit with a fixed seed so the assignment can be
# reproduced.
spectral_model = SpectralClustering(
    n_clusters=5,
    assign_labels='discretize',
    random_state=420,
)
spectral_predictions = spectral_model.fit_predict(matrix)

# Cell output: NMI between the reference labels and the spectral assignment.
normalized_mutual_info_score(
    labels,
    spectral_predictions,
    average_method='geometric',
)

0.8135081307404697

In [18]:
#Mean score for 100 Spectral clusterings with random seeds
N_SPECTRAL_RUNS = 100
scores = []

for _ in range(N_SPECTRAL_RUNS):
    # Loop-local names on purpose: rebinding spectral_model /
    # spectral_predictions here would replace the seeded model with an unseeded
    # one for any cell that reads those names afterwards.
    run_predictions = SpectralClustering(
        n_clusters=5, assign_labels='discretize'
    ).fit_predict(matrix)
    scores.append(
        normalized_mutual_info_score(labels, run_predictions, average_method='geometric')
    )

# Cell output: mean NMI over all unseeded runs.
np.mean(scores)


0.8135081307404698