# Full Workflow

Feature extraction + Word embedding (only pretrained ones) + Clustering + Evaluation

# 1. Feature Extraction

Feature extraction has already been run; load its CSV output below instead of re-running the extraction code.

In [2]:
import pandas as pd
import numpy as np
import nltk
import regex
import re
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer 
from sklearn.feature_extraction.text import TfidfVectorizer
import stanza
# Fetch the default English Stanza package.
stanza.download('en') # download English model
# Fetch the NLTK packages named below; per the captured log, packages that are
# already present are reported as up-to-date.
nltk.download('stopwords')
nltk.download('punkt')
# Last expression of the cell: its return value is what the cell displays.
nltk.download('averaged_perceptron_tagger')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.2.0.json: 128kB [00:00, 1.22MB/s]
2021-03-26 01:21:26 INFO: Downloading default packages for language: en (English)...
2021-03-26 01:21:27 INFO: File exists: C:\Users\TzeMin\stanza_resources\en\default.zip.
2021-03-26 01:21:35 INFO: Finished downloading models and saved to C:\Users\TzeMin\stanza_resources.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\TzeMin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\TzeMin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\TzeMin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [15]:
import collections
import sys

import matplotlib
import seaborn as sns
import spacy
from gensim.models import Word2Vec
from matplotlib import pyplot as plt
from nltk.stem import PorterStemmer
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize
from tqdm import tqdm

%matplotlib inline

In [6]:
# Read only the 'index' column of the refined-features CSV; that column holds the
# candidate words used throughout the rest of the notebook.
features_csv = "../../output/corpus-refined-features.csv"
refined = pd.read_csv(features_csv, usecols=['index'])
refined

Unnamed: 0,index
0,ippt
1,ipt
2,sessions
3,still
4,rt
...,...
1766,vary
1767,mailbox
1768,forsee
1769,save


In [8]:
# Pull the word column out as a Series; stop words have not been removed yet.
uncleaned_words = refined.loc[:, 'index']
uncleaned_words

0           ippt
1            ipt
2       sessions
3          still
4             rt
          ...   
1766        vary
1767     mailbox
1768      forsee
1769        save
1770         win
Name: index, Length: 1771, dtype: object

In [9]:
# Requires the small English spaCy model; install it from a command prompt with:
#   python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

# Keep only the words that are not in spaCy's default stop-word set.
spacy_stop_words = nlp.Defaults.stop_words
words = [candidate for candidate in uncleaned_words if candidate not in spacy_stop_words]

# Report which words were dropped and how the vocabulary size changed.
removed_words = set(uncleaned_words) - set(words)
print("Words removed were: ", removed_words)
print("From", len(uncleaned_words), "to", len(words))

# Single-column frame of the retained words.
words_df = pd.DataFrame(words)
words_df.columns = ['word']
words_df

Words removed were:  {'serious', 'everything', 'everyone', 'move', 'wherein', 'used', 'others', 'amount', 'still', 'somewhere', 'nothing', 'full', 'much', 'see', 'mine', 'give', 'call', 'next', 'part', 'anyone', 'becomes', 'put', 'yet', 'anything', 'go', 'latter', 'say', 'first', 'seem', 'nobody', 'alone', 'something', 'side', 'show', 'whole', 'get', 'enough', 'take', 'never', 'name', 'last', 'many', 'one', 'someone', 'become', 'keep', 'make', 'back', 'ten', 'none', 'top', 'less'}
From 1771 to 1719


Unnamed: 0,word
0,ippt
1,ipt
2,sessions
3,rt
4,window
...,...
1714,vary
1715,mailbox
1716,forsee
1717,save


# 2. Word Embedding + Clustering

## 2.1 spaCy's Pretrained Vectors + Affinity Propagation

In [10]:
def vectorize(text):
    """Return the vector spaCy assigns to `text`.

    The text is run through the module-level `nlp` pipeline and the `.vector`
    attribute of the resulting document is returned.
    """
    doc = nlp(text)
    return doc.vector

# Stack one spaCy vector per word into a 2-D array (one row per word).
# np.stack needs a real sequence: passing a generator is deprecated since
# NumPy 1.16 and is rejected by newer releases, so build a list first.
X = np.stack([vectorize(word) for word in words])

# Row-normalised copy of X, derived from X rather than running every word
# through spaCy a second time.
X_normalised = normalize(X)

# Cluster the raw (un-normalised) vectors with scikit-learn's default settings.
affprop = AffinityPropagation()
affprop.fit(X)

  if (await self.run_code(code, result,  async_=asy)):


In [11]:
# Print every cluster as " - *exemplar:* member, member, ...".
word_array = np.array(words)
spacy_labels = affprop.labels_
spacy_centers = affprop.cluster_centers_indices_
for cluster_id in np.unique(spacy_labels):
    # The exemplar is the word sitting at this cluster's centre index.
    centre_word = word_array[spacy_centers[cluster_id]]
    # Distinct member words, sorted by np.unique.
    members = np.unique(word_array[spacy_labels == cluster_id])
    print(" - *%s:* %s" % (centre_word, ", ".join(members)))

 - *dont:* cant, didnt, doesnt, dont, havent, wont
 - *complete:* birdy, clean, clear, complete, correct, exempt, liable, open, qualify, select, sorry, strict, suitable
 - *long:* bully, early, far, hard, kinda, late, long, mcs, nsdotsg, overseas, rly, worst
 - *days:* days, hours, months, thanks, times, ways, weeks, years
 - *good:* advisable, bad, brief, compulsory, cool, curious, dependent, different, dusty, fine, funny, good, great, green, haiz, incamp, mandatory, maximum, meaningful, mindef, miserable, nice, present, proactive, public, regular, right, siong, stringent, tough, true, useless, vague, weird, wrong
 - *week:* april, bit, day, december, hour, july, june, month, night, november, time, way, week, weekend, year
 - *letter:* abuse, background, base, bill, blur, butt, concern, cookhouse, degree, direction, division, dun, experience, eye, factor, fi, force, incident, job, kana, kit, language, letter, mission, mob, moi, mustache, period, prep, privilege, programme, quiz, rate,

#### Evaluation

- Silhouette score
- Calinski Harabasz index
- Davies Bouldin index

In [12]:
from sklearn import metrics

# Score the spaCy + Affinity Propagation clustering with three internal indices.
spacy_cluster_labels = affprop.labels_

silhouette = metrics.silhouette_score(X, spacy_cluster_labels, metric='euclidean')
print("Silhouette score:", silhouette)

calinski_harabasz = metrics.calinski_harabasz_score(X, spacy_cluster_labels)
print("Calinski Harabasz:", calinski_harabasz)

davies_bouldin = metrics.davies_bouldin_score(X, spacy_cluster_labels)
print("Davies Bouldin:", davies_bouldin)

Silhouette score: 0.025242906
Calinski Harabasz: 28.82584207818853
Davies Bouldin: 2.91456112363272


## 2.2 GloVe's Pretrained Vectors + Affinity Propagation

In [13]:
# progress bar
def progress(count, total, status=''):
    """Draw a single-line text progress bar on stdout.

    The line ends with a carriage return so the next call overwrites it.

    Parameters
    ----------
    count : number
        Amount of work completed so far.
    total : number
        Amount of work expected. A value <= 0 is shown as a finished bar
        instead of raising ZeroDivisionError.
    status : str
        Free text appended after the percentage.
    """
    bar_len = 60
    if total <= 0:
        # Nothing to wait for (e.g. a one-item loop reporting `n - 1` as total).
        filled_len = bar_len
        percents = 100.0
    else:
        filled_len = int(round(bar_len * count / float(total)))
        # Clamp so the bar is always exactly bar_len characters wide, even when
        # count overshoots total or is negative.
        filled_len = max(0, min(bar_len, filled_len))
        percents = round(100.0 * count / float(total), 1)
    bar = '=' * filled_len + '-' * (bar_len - filled_len)
    sys.stdout.write('[%s] %s%s ...%s\r' % (bar, percents, '%', status))
    sys.stdout.flush()

# load dictionary of word vectors based on pretrained Glove model
def loadGloveDict(File, expectedWords=400000):
    """Load a GloVe text file into a dict mapping word -> embedding.

    Each non-blank line is split on whitespace: the first token is the word and
    the remaining tokens are parsed as floats.

    Parameters
    ----------
    File : str
        Path to the GloVe text file (read as UTF-8).
    expectedWords : int
        Number of lines used as the progress-bar total. It only affects the
        progress display, not what is loaded.

    Returns
    -------
    dict
        word -> one-row pandas DataFrame holding that word's vector
        (columns 0..d-1).
    """
    print("Loading glove model")
    gloveDict = {}
    # The context manager closes the file even if a line fails to parse.
    with open(File, 'r', encoding = 'utf-8') as f:
        for i, line in enumerate(f):
            splitLine = line.split()
            if not splitLine:
                # Blank line (e.g. a trailing newline): there is no word to index.
                continue
            progress(i, expectedWords, status = 'retreiving vectors')
            # Transpose so the vector is a single row rather than a single column.
            wordEmbedding = pd.DataFrame([float(value) for value in splitLine[1:]]).T
            gloveDict[splitLine[0]] = wordEmbedding
    print(len(gloveDict), "words loaded")
    return gloveDict

In [None]:
# Build the word -> embedding lookup from the pretrained GloVe text file
# (the file name suggests the 6B-token, 300-dimensional release).
model = loadGloveDict("../glove.6B/glove.6B.300d.txt") #pretrained

Loading glove model

In [None]:
# Look up the GloVe vector of every retained word. Words that are missing from
# the GloVe dictionary are collected in notInCorpus and left out of `embeddings`.
# words_df has a single column named 'word' (there is no 'index' column).
features = words_df['word']
numFeatures = len(features)
vectorlist = []
foundWords = []
notInCorpus = []

for i, word in enumerate(features):
    # Report i + 1 of numFeatures so the total is never zero inside the loop.
    progress(i + 1, numFeatures, status = "concatenating extracted vectors")
    try:
        vector = model[word]
    except KeyError:
        notInCorpus.append(word)
        continue
    foundWords.append(word)
    # model values are one-row DataFrames; flatten each to a plain 1-D vector.
    vectorlist.append(vector.to_numpy().ravel())

# Build the frame once instead of concatenating one small frame per word:
# column 'feature' holds the word, columns 0..d-1 hold the vector.
embeddings = pd.DataFrame(vectorlist)
embeddings.insert(0, "feature", foundWords)
embeddings

In [None]:
# Display the words for which the GloVe lookup raised KeyError (no vector found).
notInCorpus

In [None]:
# Standardise each embedding dimension before clustering. The first column of
# `embeddings` holds the word itself, so only the remaining columns are scaled.
# StandardScaler comes from sklearn.preprocessing and must be imported with the
# other scikit-learn names; without that import this cell raises NameError.
sc_X = StandardScaler()
X_glove = sc_X.fit_transform(embeddings.iloc[:,1:])
X_glove

In [None]:
# Cluster the scaled GloVe vectors with Affinity Propagation, using the
# estimator's default settings.
affprop_glove = AffinityPropagation()
# Kept as the cell's last expression so the result of fit() is displayed.
affprop_glove.fit(X_glove)

In [None]:
# Print every GloVe cluster as " - *exemplar:* member, member, ...".
# The labels index the rows of X_glove, i.e. the rows of `embeddings`, which
# leaves out words that have no GloVe vector. Take the words from the first
# column of `embeddings` so positions line up; indexing the full `words` list
# would shift every word after the first missing one.
word_array = embeddings.iloc[:, 0].to_numpy()
for cluster_id in np.unique(affprop_glove.labels_):
    exemplar = word_array[affprop_glove.cluster_centers_indices_[cluster_id]]
    cluster = np.unique(word_array[np.nonzero(affprop_glove.labels_ == cluster_id)])
    cluster_str = ", ".join(cluster)
    print(" - *%s:* %s" % (exemplar, cluster_str))