# Full Workflow

Feature extraction + Word embedding (only pretrained ones) + Clustering + Evaluation

# 1. Feature Extraction

Already done, use the csv output from that instead of running the code again.

In [2]:
import pandas as pd
import numpy as np
import nltk
import regex
import re
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer 
from sklearn.feature_extraction.text import TfidfVectorizer
import stanza
stanza.download('en') # download English model
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.2.0.json: 128kB [00:00, 1.22MB/s]
2021-03-26 01:21:26 INFO: Downloading default packages for language: en (English)...
2021-03-26 01:21:27 INFO: File exists: C:\Users\TzeMin\stanza_resources\en\default.zip.
2021-03-26 01:21:35 INFO: Finished downloading models and saved to C:\Users\TzeMin\stanza_resources.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\TzeMin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\TzeMin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\TzeMin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [21]:
import collections
from sklearn.cluster import KMeans
from tqdm import tqdm
from gensim.models import Word2Vec 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib
from matplotlib import pyplot as plt
%matplotlib inline
from sklearn.cluster import AffinityPropagation
from sklearn.manifold import TSNE
from sklearn.preprocessing import normalize
import spacy
from nltk.stem import PorterStemmer
import sys

In [6]:
refined = pd.read_csv("../../output/corpus-refined-features.csv", usecols = ['index'])
refined

Unnamed: 0,index
0,ippt
1,ipt
2,sessions
3,still
4,rt
...,...
1766,vary
1767,mailbox
1768,forsee
1769,save


In [8]:
uncleaned_words = refined['index']
uncleaned_words

0           ippt
1            ipt
2       sessions
3          still
4             rt
          ...   
1766        vary
1767     mailbox
1768      forsee
1769        save
1770         win
Name: index, Length: 1771, dtype: object

In [9]:
nlp = spacy.load("en_core_web_sm") # to run on command prompt: python -m spacy download en_core_web_sm
words = [item for item in uncleaned_words if item not in nlp.Defaults.stop_words]

print("Words removed were: ", set(uncleaned_words).difference(set(words)))
print("From", len(uncleaned_words), "to", len(words))

words_df = pd.DataFrame(words)
words_df.columns = ['word']
words_df

Words removed were:  {'serious', 'everything', 'everyone', 'move', 'wherein', 'used', 'others', 'amount', 'still', 'somewhere', 'nothing', 'full', 'much', 'see', 'mine', 'give', 'call', 'next', 'part', 'anyone', 'becomes', 'put', 'yet', 'anything', 'go', 'latter', 'say', 'first', 'seem', 'nobody', 'alone', 'something', 'side', 'show', 'whole', 'get', 'enough', 'take', 'never', 'name', 'last', 'many', 'one', 'someone', 'become', 'keep', 'make', 'back', 'ten', 'none', 'top', 'less'}
From 1771 to 1719


Unnamed: 0,word
0,ippt
1,ipt
2,sessions
3,rt
4,window
...,...
1714,vary
1715,mailbox
1716,forsee
1717,save


# 2. Word Embedding + Clustering + Evaluation

- a higher Silhouette Coefficient score relates to a model with better defined clusters; the score is bounded between -1 for incorrect clustering and +1 for highly dense clustering. Scores around zero indicate overlapping clusters.
- a higher Calinski-Harabasz score relates to a model with better defined clusters

## 2.1 spaCy's Pretained Vectors + Affinity Propagation

In [10]:
def vectorize(text):
    """Get the SpaCy vector corresponding to a text"""
    return nlp(text).vector

X = np.stack(vectorize(word) for word in words)
X_normalised = normalize(np.stack(vectorize(word) for word in words))

affprop = AffinityPropagation()
affprop.fit(X)

  if (await self.run_code(code, result,  async_=asy)):


In [27]:
word_array = np.array(words)
print("Total no. of clusters: ", len(affprop_glove.cluster_centers_indices_))
for cluster_id in np.unique(affprop.labels_):
    exemplar = word_array[affprop.cluster_centers_indices_[cluster_id]]
    cluster = np.unique(word_array[np.nonzero(affprop.labels_==cluster_id)])
    cluster_str = ", ".join(cluster)
    print(" - *%s:* %s" % (exemplar, cluster_str))

Total no. of clusters:  125
 - *dont:* cant, didnt, doesnt, dont, havent, wont
 - *complete:* birdy, clean, clear, complete, correct, exempt, liable, open, qualify, select, sorry, strict, suitable
 - *long:* bully, early, far, hard, kinda, late, long, mcs, nsdotsg, overseas, rly, worst
 - *days:* days, hours, months, thanks, times, ways, weeks, years
 - *good:* advisable, bad, brief, compulsory, cool, curious, dependent, different, dusty, fine, funny, good, great, green, haiz, incamp, mandatory, maximum, meaningful, mindef, miserable, nice, present, proactive, public, regular, right, siong, stringent, tough, true, useless, vague, weird, wrong
 - *week:* april, bit, day, december, hour, july, june, month, night, november, time, way, week, weekend, year
 - *letter:* abuse, background, base, bill, blur, butt, concern, cookhouse, degree, direction, division, dun, experience, eye, factor, fi, force, incident, job, kana, kit, language, letter, mission, mob, moi, mustache, period, prep, privi

#### Evaluation

- Silhouette score
- Calinski Harabasz index
- Davies Bouldin index

In [12]:
from sklearn import metrics
print("Silhouette score:", metrics.silhouette_score(X, affprop.labels_, metric='euclidean'))
print("Calinski Harabasz:", metrics.calinski_harabasz_score(X, affprop.labels_))
print("Davies Bouldin:", metrics.davies_bouldin_score(X, affprop.labels_))

Silhouette score: 0.025242906
Calinski Harabasz: 28.82584207818853
Davies Bouldin: 2.91456112363272


## 2.2 GloVe's Pretrained Vectors + Affinity Propagation

In [13]:
# progress bar
def progress(count, total, status=''):
    bar_len = 60
    filled_len = int(round(bar_len * count / float(total)))
    percents = round(100.0 * count / float(total), 1)
    bar = '=' * filled_len + '-' * (bar_len - filled_len)
    sys.stdout.write('[%s] %s%s ...%s\r' % (bar, percents, '%', status))
    sys.stdout.flush() 

# load dictionary of word vectors based on pretrained Glove model
def loadGloveDict(File):
    print("Loading glove model")
    f = open(File, 'r', encoding = 'utf-8')
    gloveDict = {}
    i = 0
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        progress(i, 400000, status = 'retreiving vectors')
        wordEmbedding = pd.DataFrame([float(value) for value in splitLine[1:]]).T
        gloveDict[word] = wordEmbedding
        i += 1
    print(len(gloveDict), "words loaded")
    return gloveDict

In [16]:
model = loadGloveDict("../glove.6B/glove.6B.300d.txt") #pretrained

Loading glove model


In [18]:
numFeatures = len(words_df['word'])
vectorlist = []
notInCorpus = []

for i in range(0, numFeatures):
    progress(i, numFeatures - 1, status = "concatenating extracted vectors")
    wordAsDF = words_df['word'][[i]]
    try:
        vector = pd.concat([wordAsDF, model[wordAsDF[i]].set_index(wordAsDF.index)], axis = 1)
        vectorlist.append(vector)
    except KeyError:
        notInCorpus.append(wordAsDF[i])

embeddings = pd.concat(vectorlist).reset_index(drop = True).rename(columns = {"word":"feature"})
embeddings



Unnamed: 0,feature,0,1,2,3,4,5,6,7,8,...,290,291,292,293,294,295,296,297,298,299
0,ipt,-0.264510,0.418580,0.210640,0.30751,-0.038163,0.413510,-0.21860,0.128270,-0.008734,...,-0.17047,-0.311660,-0.24170,0.287430,-0.371740,-0.357100,0.212920,0.105580,0.304500,-0.542100
1,sessions,-0.509840,0.008746,-0.102280,0.37741,0.162980,-0.507320,-0.15894,-0.317470,0.233890,...,-0.29631,-0.322150,-0.81685,-0.086836,-0.289170,0.483030,-0.213460,-0.369050,-0.051946,0.011276
2,rt,-0.097004,-0.874420,-0.087863,-0.27042,-0.524880,0.051912,-0.49371,0.497150,-0.002542,...,0.39796,0.042912,-0.12752,0.124230,-0.435650,0.032578,0.013193,0.190810,0.464680,0.597970
3,window,-0.029352,-0.137720,-0.197070,-0.79303,0.146030,0.563230,-0.49493,-0.610630,-0.086160,...,-0.10082,0.076632,-0.17503,0.110900,0.418830,0.296150,-0.233930,0.399510,0.167900,0.456090
4,book,0.048733,-0.055083,0.149470,-0.11269,0.098791,0.543340,-0.51204,0.278820,0.114970,...,-0.11647,-0.072080,-0.41821,0.392380,-0.017030,-0.031026,0.254280,0.513520,0.136670,-0.126390
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1650,vary,-0.412140,-0.018634,0.048448,0.43328,-0.023861,0.495050,0.02622,0.761890,0.011218,...,0.11174,0.002409,-0.02948,0.143860,-0.013612,-0.296920,-0.573060,0.177610,0.294730,-0.350610
1651,mailbox,-0.500810,-0.125950,-0.717040,0.11644,0.212270,0.280370,-0.45449,-0.408650,-0.444210,...,0.22481,0.119780,0.20767,-0.147620,-0.329880,-0.451520,0.621500,-0.174460,-0.331040,-0.341090
1652,forsee,0.013178,-0.235220,0.085450,-0.13470,0.277110,-0.255290,-0.40537,-0.002905,0.205280,...,-0.26494,-0.092792,0.95559,-0.131530,-0.184620,-0.299780,-0.377070,-0.046352,-0.174660,0.252760
1653,save,0.568290,0.101280,-0.839500,0.22182,0.019760,0.034984,-0.02562,0.253890,-0.193850,...,-0.01707,0.041124,-0.14130,-0.140870,-0.123330,-0.448690,0.323660,-0.539140,0.351690,-0.295690


In [19]:
notInCorpus

['ippt',
 'gagt',
 'nsmen',
 'inpro',
 'cmpb',
 'ipts',
 'fccs',
 'orns',
 'covid',
 'ippts',
 'nsman',
 'alrdy',
 'hiit',
 'greent',
 'bday',
 'workyear',
 'mths',
 'ppls',
 'nsportal',
 'ptis',
 'tbh',
 'liddat',
 'occifer',
 'nvm',
 'downpes',
 'calander',
 'burpees',
 'fiit',
 'sibei',
 'llst',
 'singpass',
 'nssc',
 'situp',
 'scgp',
 'wsdip',
 'suay',
 'tcss',
 'platoonmate',
 'paynow',
 'oots',
 'req',
 'redditors',
 'bookouts',
 'impt',
 'nsti',
 'chiongsua',
 'tekan',
 'recuit',
 'gng',
 'wose',
 'thrgh',
 'incamp',
 'fked',
 'bdae',
 'nsdotsg',
 'idti',
 'macdonals',
 'haiz',
 'buibui',
 'mred',
 'saikang',
 'pullups',
 'jiak',
 'buay']

In [22]:
sc_X = StandardScaler()
X_glove = sc_X.fit_transform(embeddings.iloc[:,1:])
X_glove

array([[-0.64602977,  1.17045476,  0.70192671, ...,  0.47917534,
         0.97753791, -1.49455437],
       [-1.4082633 , -0.03487774, -0.31594748, ..., -0.86652199,
        -0.12867643,  0.0841282 ],
       [-0.12559325, -2.63229296, -0.2690515 , ...,  0.72082417,
         1.47464943,  1.75786106],
       ...,
       [ 0.21673918, -0.75238859,  0.29470547, ...,  0.0484093 ,
        -0.50951388,  0.77303879],
       [ 1.94145683,  0.23726694, -2.71399559, ..., -1.34877061,
         1.12398998, -0.79159079],
       [ 0.81536576,  2.3018972 , -1.3847284 , ..., -1.70901818,
        -0.5750899 ,  0.45494926]])

In [23]:
affprop_glove = AffinityPropagation()
affprop_glove.fit(X_glove)



AffinityPropagation()

In [26]:
word_array = np.array(words)
print("Total no. of clusters: ", len(affprop_glove.cluster_centers_indices_))
for cluster_id in np.unique(affprop_glove.labels_):
    exemplar = word_array[affprop_glove.cluster_centers_indices_[cluster_id]]
    cluster = np.unique(word_array[np.nonzero(affprop_glove.labels_ == cluster_id)])
    cluster_str = ", ".join(cluster)
    print(" - *%s:* %s" % (exemplar, cluster_str))

Total no. of clusters:  125
 - *book:* april, better, board, book, chill, condition, contact, cycle, dates, defaulter, different, doubts, effect, fcc, form, ict, injuries, injury, meeting, mo, months, nightmare, nth, number, op, period, person, probably, pti, pushup, saw, slots, specialist, thats, trouble, units, ur, vague, weekday, weekend, yr
 - *years:* action, end, sunday, years
 - *sms:* accept, activate, attendance, beards, benefits, bill, bmt, cash, forgot, gng, goodie, guess, huh, instructor, lan, life, lines, macdonals, pretend, progressive, remain, request, scdf, seperate, sick, skills, smaller, sms, special, tally, volunteer, wait, wake
 - *nsmen:* nsmen
 - *mindef:* appointments, assume, camps, cant, cb, clear, command, cut, easy, enquire, fact, guys, ipt, jobs, mindef, obligations, oc, places, question, return, review, system, time, training, vocation, ways
 - *new:* boy, eg, jog, lessons, new, phones, wonder
 - *liao:* actions, button, cases, closes, cso, depends, detect,

#### Evaluation

In [25]:
from sklearn import metrics
print("Silhouette score:", metrics.silhouette_score(X_glove, affprop_glove.labels_, metric='euclidean'))
print("Calinski Harabasz:", metrics.calinski_harabasz_score(X_glove, affprop_glove.labels_))
print("Davies Bouldin:", metrics.davies_bouldin_score(X_glove, affprop_glove.labels_))

Silhouette score: -0.01159183009910761
Calinski Harabasz: 4.019040661069761
Davies Bouldin: 2.3162252707756577
