# Unsupervised Text Clustering for Taxonomy Generation

In [1]:
import pandas as pd
import csv
import numpy as np
import os
import re
import spacy
from collections import Counter
from scipy.spatial.distance import euclidean, cosine
from nltk import ngrams, FreqDist

In [2]:
nlp = spacy.load('en_core_web_lg')

### Train word embeddings with Word2Vec

In [37]:
! ./dataset/word2vec -train ./dataset/starwars_text_dataset_cleaned.txt -output ./dataset/starwars_embeddings.txt

Starting training using file ./dataset/starwars_text_dataset_cleaned.txt
Vocab size: 6561
Words in train file: 284807
Alpha: 0.001277  Progress: 98.40%  Words/thread/sec: 125.32k  

### Explore embeddings

In [11]:
word_embd = {}
with open('./dataset/our-l3-0.25-new/embeddings.txt', 'r') as file:
    for line in file:
        items = line.strip().split()
        word = items[0]
        vector = [float(v) for v in items[1:]]
        word_embd[word] = vector

In [12]:
len(word_embd.keys())

5659

In [15]:
len(word_embd['Luke'])

100

In [17]:
vec_a = np.array(word_embd['LukeSkywalker'])
vec_b = np.array(word_embd['Luke'])

dist = np.linalg.norm(vec_a - vec_b)
print(dist)

cos = 1 - euclidean(vec_a, vec_b)
print(cos)

0.8747054911282998
0.12529450887170035


In [26]:
vec_a = np.array(word_embd['luke_skywalker'])
vec_b = np.array(word_embd['enlarge'])

dist = np.linalg.norm(vec_a - vec_b)
print(dist)

cos = 1 - euclidean(vec_a, vec_b)
print(cos)

# try cosine
# top 10 words

0.7411936572596934
0.25880634274030645


In [27]:
vec_a = np.array(word_embd['anakin_skywalker'])
vec_b = np.array(word_embd['anakin'])

dist = np.linalg.norm(vec_a - vec_b)
print(dist)

cos = 1 - euclidean(vec_a, vec_b)
print(cos)

0.9435581309103324
0.05644186908966764


In [28]:
vec_a = np.array(word_embd['anakin_skywalker'])
vec_b = np.array(word_embd['the_ship'])

dist = np.linalg.norm(vec_a - vec_b)
print(dist)

cos = 1 - euclidean(vec_a, vec_b)
print(cos)

0.7665282012574619
0.2334717987425381


### Run Taxogen

### Explore Heirarchy

In [18]:
def list_files(startpath):
    for root, dirs, files in os.walk(startpath):
        level = root.replace(startpath, '').count(os.sep)
        indent = ' ' * 4 * (level)
        print('{} {}{}/'.format(level, indent, os.path.basename(root)))
        subindent = ' ' * 4 * (level + 1)

In [21]:
list_files('./dataset/our-l3-0.25-new')

0 our-l3-0.25-new/
1     TheCloneWars/
1     Them/
2         Ventress/
3             DarthVader/
4                 Fett/
4                 DarthVader/
4                 BobaFett/
4                 TheStory/
3             Ventress/
4                 ThePlanet/
4                 Ventress/
4                 Jabba/
2         Her/
3             Vader/
4                 Finn/
4                 Rose/
4                 Vader/
3             Her/
4                 Her/
4                 Ben/
4                 You/
3             You/
3             Anakin/
1     January/
2         TheClones/
3             References/
4                 Playstation/
4                 References/
3             TheClones/
4                 TheRise/
4                 TheClones/
4                 Place/
4                 ThePhantomMenace/
4                 Skywalker/
4                 TheOldRepublic/
3             ThePhantomMenace/
2         January/
3             March/
4                 January/
4                 Marc

In [33]:
list_files('./dataset/starwars-taxonomy')

0 starwars-taxonomy/
1     the_millennium_falcon/
1     kylo_ren/
2         r2-d2/
3             order/
3             c-3po/
3             the_galaxy/
3             boba_fett/
3             luke_skywalker/
3             anakin_skywalker/
3             solo/
3             obi-wan_kenobi/
3             darth_maul/
3             the_rebel_alliance/
3             princess_leia/
2         kuiil/
3             poe/
3             gideon/
3             he/
3             she/
3             thrawn/
3             rose/
3             kuiil/
3             they/
3             cara/
3             grogu/
2         ben/
2         anakin/
3             the_dark_side/
3             the_millennium_falcon/
3             tatooine/
3             darth_sidious/
3             the_resistance/
3             dooku/
3             lando/
3             ventress/
3             vader/
3             the_first_order/
3             ben/
3             him/
3             han/
3             the_force/
2         princess_lei

### Generate Concept Pairs

In [34]:
concept_pairs = [('kylo_ren', 'r2-d2'), 
                 ('kylo_ren', 'kuiil'), 
                 ('kylo_ren', 'ben'), 
                 ('kylo_ren', 'anakin'), 
                 ('kylo_ren', 'princess_leia'),
                 ('favreau', 'favreau'), 
                 ('favreau', 'we'), 
                 ('favreau', 'november'), 
                 ('favreau', 'you'), 
                 ('favreau', 'disney'),
                 ('favreau', 'lucas'),
                 ('the_events', 'the_last_jedi'), 
                 ('the_events', 'the_galactic_republic'), 
                 ('the_events', 'the_clone_wars'),
                 ('the_events', 'a_new_hope')]
len(concept_pairs)

15

### Cluster Center Label Generation

<!-- 1     kylo_ren/
2         r2-d2/
3             order/
3             c-3po/
3             the_galaxy/
3             boba_fett/
3             luke_skywalker/
3             anakin_skywalker/
3             solo/
3             obi-wan_kenobi/
3             darth_maul/
3             the_rebel_alliance/
3             princess_leia/
2         kuiil/
3             poe/
3             gideon/
3             he/
3             she/
3             thrawn/
3             rose/
3             kuiil/
3             they/
3             cara/
3             grogu/
2         ben/
2         anakin/
3             the_dark_side/
3             the_millennium_falcon/
3             tatooine/
3             darth_sidious/
3             the_resistance/
3             dooku/
3             lando/
3             ventress/
3             vader/
3             the_first_order/
3             ben/
3             him/
3             han/
3             the_force/
2         princess_leia/ -->

In [35]:
# 0 starwars-taxonomy/
# 1     the_millennium_falcon/
# 1     kylo_ren/
# 2         r2-d2/
# 3             order/
# 3             c-3po/
# 3             the_galaxy/
# 3             boba_fett/
# 3             luke_skywalker/
# 3             anakin_skywalker/
# 3             solo/
# 3             obi-wan_kenobi/
# 3             darth_maul/
# 3             the_rebel_alliance/
# 3             princess_leia/
# 2         kuiil/
# 3             poe/
# 3             gideon/
# 3             he/
# 3             she/
# 3             thrawn/
# 3             rose/
# 3             kuiil/
# 3             they/
# 3             cara/
# 3             grogu/
# 2         ben/
# 2         anakin/
# 3             the_dark_side/
# 3             the_millennium_falcon/
# 3             tatooine/
# 3             darth_sidious/
# 3             the_resistance/
# 3             dooku/
# 3             lando/
# 3             ventress/
# 3             vader/
# 3             the_first_order/
# 3             ben/
# 3             him/
# 3             han/
# 3             the_force/
# 2         princess_leia/
# 1     favreau/
# 2         favreau/
# 3             it/
# 3             favreau/
# 3             lucas/
# 2         we/
# 2         november/
# 3             lucasfilm/
# 3             the_series/
# 3             november/
# 2         you/
# 3             abrams/
# 3             i/
# 3             what/
# 2         disney/
# 2         lucas/
# 1     the_events/
# 2         the_last_jedi/
# 3             star_wars/
# 3             a_new_hope/
# 3             the_empire/
# 2         the_galactic_republic/
# 3             the_galactic_republic/
# 3             the_republic/
# 3             part/
# 3             the_star_wars_franchise/
# 3             naboo/
# 2         the_clone_wars/
# 3             comics/
# 3             attack/
# 3             the_end/
# 2         a_new_hope/

In [3]:
cluster_label = 'kylo_ren'
cluster_members = ['r2-d2', 'kuiil', 'ben', 'anakin', 'princess_leia']

In [9]:
dataset = pd.read_csv('./dataset/starwars_text_dataset_cleaned.txt', delimiter='\n', header=None, error_bad_lines=False)
dataset

b'Skipping line 2445: expected 1 fields, saw 2\nSkipping line 3096: expected 1 fields, saw 2\nSkipping line 10258: expected 1 fields, saw 2\nSkipping line 11580: expected 1 fields, saw 2\nSkipping line 16550: expected 1 fields, saw 2\nSkipping line 17640: expected 1 fields, saw 2\n'


Unnamed: 0,0
0,luke_skywalker is a_fictional_character and th...
1,"portrayed by mark_hamill, luke first appeared ..."
2,": the_force awakens (2015),the"
3,"last_jedi (2017), and the_rise of luke_skywalk..."
4,"the_rescue"" (2020), voicing the_character that..."
...,...
20509,"in 2016, serkis was nominated for an_mtv_movie..."
20510,some_viewers felt that snoke's_character_arc w...
20511,various_fan_theories about his_origins were he...
20512,serkis addressed the_criticisms by saying prod...


In [5]:
descriptions = {}
for index, row in dataset.iterrows():
    sentence = dataset.iloc[index][0]
    for member in cluster_members:
        found = re.search("(^|[a-zA-Z\s])"+member+"($|[a-zA-Z\s])", sentence)
        if found:
#             doc = nlp(sentence)
#             tokens = []
#             for sent in doc.sents:
#                 for token in sent:
#                     tokens.append(token.text)
            tokens = sentence.split(' ')
                        
            if not member in descriptions:
                descriptions[member] = [tokens]
            else:
                descriptions[member].append(tokens)

In [6]:
total = 0
for i in descriptions:
    print(i, len(descriptions[i]))
    total += len(descriptions[i])
print('\ntotal: {}'.format(total))

r2-d2 133
princess_leia 74
ben 119
anakin 321
kuiil 89

total: 736


### Find most occuring words to describe cluster member

In [7]:
for member in descriptions:
    words = []
    for i in descriptions[member]:
        words += i
    print(member, len(words))
    most_common_words = [word for word, word_count in Counter(words).most_common(10)]
    print(most_common_words)
    print()

r2-d2 3378
['and', 'r2-d2', 'to', 'in', 'of', 'is', 'c-3po', 'by', 'with', 'on']

princess_leia 2762
['of', 'and', 'princess_leia', 'to', 'in', 'by', 'as', 'is', 'on', 'that']

ben 3644
['and', 'to', 'of', 'ben', 'in', 'as', 'with', 'luke', 'that', 'is']

anakin 8512
['anakin', 'and', 'to', 'of', 'in', 'is', 'as', 'that', 'by', 'with']

kuiil 2330
['kuiil', 'and', 'of', 'to', 'in', 'for', 'on', 'that', 'is', 'as']



### Pointwise Mutual Information (PMI)

$ PMI(x,y) = log [\frac{p(x,y)}{p(x)p(y)}]$

In [60]:
descriptions['anakin'][0]

['jake_lloyd',
 'is',
 'referenced',
 'in',
 'both',
 'the',
 'force',
 'awakens',
 'and',
 'the_jedi,',
 'and',
 'makes',
 'vocal_cameos',
 'as',
 'both_vader',
 'and',
 'anakin',
 'in',
 'the_rise',
 'of',
 'skywalker.']

In [77]:
sentence = descriptions['anakin'][0]

threegram = ngrams(sentence, 3)
fivegram = ngrams(sentence, 5)

In [78]:
fdist = FreqDist()
fdist.update(threegram)
fdist
for k,v in fdist.items():
    print(k,v)

('jake_lloyd', 'is', 'referenced') 1
('is', 'referenced', 'in') 1
('referenced', 'in', 'both') 1
('in', 'both', 'the') 1
('both', 'the', 'force') 1
('the', 'force', 'awakens') 1
('force', 'awakens', 'and') 1
('awakens', 'and', 'the_jedi,') 1
('and', 'the_jedi,', 'and') 1
('the_jedi,', 'and', 'makes') 1
('and', 'makes', 'vocal_cameos') 1
('makes', 'vocal_cameos', 'as') 1
('vocal_cameos', 'as', 'both_vader') 1
('as', 'both_vader', 'and') 1
('both_vader', 'and', 'anakin') 1
('and', 'anakin', 'in') 1
('anakin', 'in', 'the_rise') 1
('in', 'the_rise', 'of') 1
('the_rise', 'of', 'skywalker.') 1


In [79]:
fdist2 = FreqDist()
fdist2.update(fivegram)
fdist2
for k,v in fdist2.items():
    print(k,v)

('jake_lloyd', 'is', 'referenced', 'in', 'both') 1
('is', 'referenced', 'in', 'both', 'the') 1
('referenced', 'in', 'both', 'the', 'force') 1
('in', 'both', 'the', 'force', 'awakens') 1
('both', 'the', 'force', 'awakens', 'and') 1
('the', 'force', 'awakens', 'and', 'the_jedi,') 1
('force', 'awakens', 'and', 'the_jedi,', 'and') 1
('awakens', 'and', 'the_jedi,', 'and', 'makes') 1
('and', 'the_jedi,', 'and', 'makes', 'vocal_cameos') 1
('the_jedi,', 'and', 'makes', 'vocal_cameos', 'as') 1
('and', 'makes', 'vocal_cameos', 'as', 'both_vader') 1
('makes', 'vocal_cameos', 'as', 'both_vader', 'and') 1
('vocal_cameos', 'as', 'both_vader', 'and', 'anakin') 1
('as', 'both_vader', 'and', 'anakin', 'in') 1
('both_vader', 'and', 'anakin', 'in', 'the_rise') 1
('and', 'anakin', 'in', 'the_rise', 'of') 1
('anakin', 'in', 'the_rise', 'of', 'skywalker.') 1


In [103]:
occur_count = 0
for sentence in descriptions['anakin']:
    threegram = ngrams(sentence, 3)
    fivegram = ngrams(sentence, 5)
    
    
    for gram3 in threegram:
        for gram5 in fivegram:
#             print(gram5, gram3)
            if all(a in sentence for a in list(gram5)) and all(a in sentence for a in list(gram3)):
                occur_count += 1
                
joint_prob = occur_count / len(descriptions['anakin'])
print('{}/{} = {}'.format(occur_count, len(descriptions['anakin']), joint_prob))

7228/321 = 22.517133956386292


In [146]:
words = []
for sentence in description_corpus:
    words += sentence
words = list(set(words))

word_word_matrix = pd.DataFrame(-10, index=words, columns=words)
word_word_matrix = word_word_matrix.drop(index='')
word_word_matrix = word_word_matrix.drop(columns='')
word_word_matrix

Unnamed: 0,the_captain,replies,their_real_mother.,used,kueller,princess_leia's_brother_and_jedi_knight_anakin_skywalker_friend.\nthrough,the_baby,lee_towersey,son.,supported,...,"evil"",",'star_wars:\nthe,a_star_wars_encyclopedia,d-o.,shouldn't,an_imperial_star_destroyer.,12,breha_organa,encourages,ever
the_captain,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10,...,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10
replies,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10,...,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10
their_real_mother.,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10,...,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10
used,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10,...,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10
kueller,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10,...,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
an_imperial_star_destroyer.,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10,...,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10
12,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10,...,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10
breha_organa,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10,...,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10
encourages,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10,...,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10


In [107]:
sentence = descriptions['anakin'][0]

bigram = ngrams(sentence, 2)
for gram in bigram:
    print(gram)

('jake_lloyd', 'is')
('is', 'referenced')
('referenced', 'in')
('in', 'both')
('both', 'the')
('the', 'force')
('force', 'awakens')
('awakens', 'and')
('and', 'the_jedi,')
('the_jedi,', 'and')
('and', 'makes')
('makes', 'vocal_cameos')
('vocal_cameos', 'as')
('as', 'both_vader')
('both_vader', 'and')
('and', 'anakin')
('anakin', 'in')
('in', 'the_rise')
('the_rise', 'of')
('of', 'skywalker.')


In [None]:
r2-d2 133
princess_leia 74
ben 119
anakin 321
kuiil 89

In [51]:
133 + 74 + 119 + 321 + 89

736

In [125]:
description_corpus = []

for sentence in descriptions['r2-d2']:
    description_corpus.append(sentence)

for sentence in descriptions['princess_leia']:
    description_corpus.append(sentence)

for sentence in descriptns['ben']:
    description_corpus.append(sentence)

for sentence in descriptions['anakin']:
    description_corpus.append(sentence)

for sentence in descriptions['kuiil']:
    description_corpus.append(sentence)

print(len(description_corpus))

736


$ PMI(w,c) = log [\frac{p(w,c)}{p(w)p(c)}]$

In [None]:
2         r2-d2/
3             order/
3             c-3po/
3             the_galaxy/
3             boba_fett/
3             luke_skywalker/
3             anakin_skywalker/
3             solo/
3             obi-wan_kenobi/
3             darth_maul/
3             the_rebel_alliance/
3             princess_leia/
2         kuiil/
3             poe/
3             gideon/
3             he/
3             she/
3             thrawn/
3             rose/
3             kuiil/
3             they/
3             cara/
3             grogu/
2         ben/
2         anakin/
3             the_dark_side/
3             the_millennium_falcon/
3             tatooine/
3             darth_sidious/
3             the_resistance/
3             dooku/
3             lando/
3             ventress/
3             vader/
3             the_first_order/
3             ben/
3             him/
3             han/
3             the_force/
2         princess_leia/

In [145]:
len(consideration)

41

In [147]:
# N = len(descriptions['anakin'])
N = len(description_corpus)

cols = word_word_matrix.columns.values
    
# word = 'anakin'
# for word in ['r2-d2', 'princess_leia', 'ben', 'anakin', 'kuiil', 'kylo_ren']:

consideration = ['r2-d2', 'order', 'c-3po', 'the_galaxy', 'boba_fett', 'luke_skywalker', 'anakin_skywalker', 'solo', 'obi-wan_kenobi', 'darth_maul', 'the_rebel_alliance', 'princess_leia', 
'kuiil', 'poe', 'gideon', 'he', 'she', 'thrawn', 'rose', 'kuiil', 'they', 'cara', 'grogu', 'ben', 'anakin', 'the_dark_side', 'the_millennium_falcon', 'tatooine', 'darth_sidious', 'the_resistance', 'dooku', 'lando', 'ventress', 'vader', 'the_first_order', 'ben', 'him', 'han', 'the_force', 'princess_leia', 'kylo_ren']

for word in consideration:
    print('working on {}'.format(word))
    for col in cols:
        p_w = 0
        p_c = 0
        p_w_c = 0
        for i, sentence in enumerate(description_corpus):
            if word in sentence:
                p_w += 1

            if col in sentence:
                p_c += 1

            if word in sentence and col in sentence:
                p_w_c += 1

        p_w = p_w / N
        p_c = p_c / N
        p_w_c = p_w_c / N
        pmi = np.log(p_w_c / (p_w * p_c))
        word_word_matrix.loc[col, word] = format_int(pmi)
    

working on r2-d2




working on order
working on c-3po
working on the_galaxy
working on boba_fett
working on luke_skywalker
working on anakin_skywalker
working on solo
working on obi-wan_kenobi
working on darth_maul
working on the_rebel_alliance
working on princess_leia
working on kuiil
working on poe
working on gideon
working on he
working on she
working on thrawn
working on rose
working on kuiil
working on they
working on cara
working on grogu
working on ben
working on anakin
working on the_dark_side
working on the_millennium_falcon
working on tatooine
working on darth_sidious
working on the_resistance
working on dooku
working on lando
working on ventress
working on vader
working on the_first_order
working on ben
working on him
working on han
working on the_force
working on princess_leia
working on kylo_ren


In [135]:
for word in ['r2-d2', 'princess_leia', 'ben', 'anakin', 'kuiil', 'kylo_ren']:
    index = word_word_matrix[word].argmax()
    print('Closest Related Pair: ({}, {}), PMI: {}'.format(word, word_word_matrix.iloc[index].name, word_word_matrix.iloc[index][word]))

Closest Related Pair: (r2-d2, kueller), PMI: 1.6314168191528755
Closest Related Pair: (princess_leia, their_real_mother.), PMI: 2.3107706775804853
Closest Related Pair: (ben, son.), PMI: 2.3107706775804853
Closest Related Pair: (anakin, replies), PMI: 0.8082165103447326
Closest Related Pair: (kuiil, occasionally), PMI: 2.1353220000742925
Closest Related Pair: (kylo_ren, an_agent), PMI: 5.502617830060767


In [148]:
for word in consideration:
    index = word_word_matrix[word].argmax()
    print('Closest Related Pair: ({}, {}), PMI: {}'.format(word, word_word_matrix.iloc[index].name, word_word_matrix.iloc[index][word]))

Closest Related Pair: (r2-d2, kueller), PMI: 1.6314168191528755
Closest Related Pair: (order, princess_leia's_brother_and_jedi_knight_anakin_skywalker_friend.
through), PMI: 4.298645025734831
Closest Related Pair: (c-3po, (anthony_daniels)), PMI: 2.7094098206182498
Closest Related Pair: (the_galaxy, marshal), PMI: 4.404005541392657
Closest Related Pair: (boba_fett, beneficial,), PMI: 5.908082938168931
Closest Related Pair: (luke_skywalker, their_real_mother.), PMI: 3.9621727891136174
Closest Related Pair: (anakin_skywalker, the_dark_nest_trilogy,), PMI: 5.502617830060767
Closest Related Pair: (solo, (2003),), PMI: 4.991792206294776
Closest Related Pair: (obi-wan_kenobi, princess_leia's_brother_and_jedi_knight_anakin_skywalker_friend.
through), PMI: 4.0362807612673395
Closest Related Pair: (darth_maul, the_story_resurrection,), PMI: 6.601230118728877
Closest Related Pair: (the_rebel_alliance, who,), PMI: 5.502617830060767
Closest Related Pair: (princess_leia, their_real_mother.), PMI: 2

In [12]:
def format_int(x):
    if x in [float("-inf"),float("inf")]: return float("nan")
    return x