# w2widget
*Words to Widget*

## Import widget

In [1]:
%load_ext autoreload
%autoreload 1

In [2]:
from w2v_widget.widget import WVWidget
%aimport w2v_widget.widget

## Load data

In [3]:
from nltk.corpus import inaugural, reuters
# One-time setup if the corpus is not installed locally yet:
# import nltk
# nltk.download('inaugural')

### Reuters

In [57]:
# Collect the full text of every Reuters file, one string per file.
# NOTE: the "Inaugural speeches" cell further down rebinds `docs`, so only
# one of the two corpora is in `docs` at any time.
docs = []

for fileid in reuters.fileids():
    with reuters.open(fileid) as stream:
        docs.append(stream.read())

### Inaugural speeches

In [4]:
# Read every inaugural-corpus file from disk, one string per file.
# This rebinds `docs`, replacing anything the Reuters cell stored in it.
# NOTE(review): no encoding is passed to open(), so decoding follows the
# platform default — confirm that matches the corpus files.
docs = []

for speech_path in inaugural.abspaths():
    with open(speech_path, 'r') as speech_file:
        docs.append(speech_file.read())

## Train word2vec model

In [5]:
from gensim.models import Word2Vec

In [6]:
# Instantiates Word2Vec with no arguments. `w2v` is not referenced again in
# the visible notebook, so this looks like leftover scratch — TODO confirm.
w2v = Word2Vec()

In [222]:
# Train Word2Vec on the corpus and keep only the trained word vectors (`.wv`).
# Each document is passed as one list of lower-cased, whitespace-split tokens.
# NOTE(review): gensim's documentation says `seed` alone does not make a run
# fully reproducible when workers > 1 — confirm whether that matters here.
wv_model = Word2Vec(
    [[token.lower().strip() for token in text.split()] for text in docs],
    vector_size=200,
    window=10,
    workers=4,
    seed=42,
    epochs=100,
    min_count=2,
).wv

## Reduce dimensions

In [8]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [223]:
# Normed word vectors from the trained model; these are the input to the
# PCA step in the next cell.
normed_vectors = wv_model.get_normed_vectors()

In [224]:
# Two-stage reduction: PCA compresses the normed word vectors to 50
# components, then t-SNE maps that intermediate embedding down to 2-D.
# PCA is seeded like t-SNE so the whole pipeline is repeatable; scikit-learn
# may choose a randomized SVD solver automatically, and without a
# random_state that stage would differ from run to run.
pca = PCA(n_components=50, random_state=420)
pca_embedding = pca.fit_transform(normed_vectors)

TSNE_embedding = TSNE(
    n_components=2,
    learning_rate='auto',
    init='random',
    random_state=420,
    verbose=1
).fit_transform(pca_embedding)


[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 6777 samples in 0.001s...
[t-SNE] Computed neighbors for 6777 samples in 1.392s...
[t-SNE] Computed conditional probabilities for sample 1000 / 6777
[t-SNE] Computed conditional probabilities for sample 2000 / 6777
[t-SNE] Computed conditional probabilities for sample 3000 / 6777
[t-SNE] Computed conditional probabilities for sample 4000 / 6777
[t-SNE] Computed conditional probabilities for sample 5000 / 6777
[t-SNE] Computed conditional probabilities for sample 6000 / 6777
[t-SNE] Computed conditional probabilities for sample 6777 / 6777
[t-SNE] Mean sigma: 0.244274
[t-SNE] KL divergence after 250 iterations with early exaggeration: 97.813324
[t-SNE] KL divergence after 1000 iterations: 3.700697


In [96]:
# Nearest words to "president" with "vice" as a negative example.
# The keys are wrapped in lists, which is the documented argument form;
# how a bare string is interpreted depends on the installed gensim version.
wv_model.most_similar(positive=['president'], negative=['vice'])

[('case', 0.35096365213394165),
 ('shield', 0.3467693328857422),
 ('person', 0.3253069519996643),
 ('congress', 0.3177087604999542),
 ('veto', 0.30393755435943604),
 ('not,', 0.2921431064605713),
 ('neglect', 0.28800955414772034),
 ('practicable', 0.28466546535491943),
 ('any', 0.28271952271461487),
 ('subject', 0.2781681716442108)]

## doc2vec

https://towardsdatascience.com/word-embeddings-and-document-vectors-when-in-doubt-simplify-8c9aaeec244e

https://towardsdatascience.com/document-embedding-techniques-fed3e7a6a25d

In [152]:
from w2v_widget.utils import Doc2Vec 

In [251]:
from collections import Counter

def inverse_frequency(word_count, total_count, smoothing=1e-3):
    """Return a smoothed inverse-frequency weight for a word.

    The weight is ``smoothing / (smoothing + word_count / total_count)``.
    It equals 1.0 for a word that never occurs and shrinks towards 0 as the
    word's share of the corpus grows.

    Args:
        word_count: Number of occurrences of the word.
        total_count: Total number of word occurrences in the corpus.
        smoothing: Smoothing constant added to the relative frequency.
            Defaults to 1e-3, the value that was previously hard-coded.

    Returns:
        The weight as a float.

    Raises:
        ZeroDivisionError: If ``total_count`` is 0.
    """
    relative_frequency = word_count / total_count
    return smoothing / (smoothing + relative_frequency)

In [274]:
# Every token of every document, lower-cased, in corpus order.
all_words = [
    token.lower().strip()
    for text in docs
    for token in text.split()
]

# Number of occurrences of each distinct token.
c = Counter(all_words)

# Map each token to its inverse_frequency weight over the whole corpus.
# NOTE: `word_if` is not used by any later cell visible in this notebook.
total_count = len(all_words)
word_if = dict(
    (token, inverse_frequency(occurrences, total_count))
    for token, occurrences in c.items()
)

In [227]:
# Wrap the trained word vectors in the project's Doc2Vec helper (imported
# from w2v_widget.utils; its implementation is not visible in this notebook).
dv_model = Doc2Vec(wv_model)

In [228]:
# Pass the corpus to the helper as one token list per document, using the
# same lower-case / whitespace-split tokenization as the Word2Vec training.
dv_model.add_doc2vec(
    [[token.lower().strip() for token in text.split()] for text in docs]
)

100%|██████████| 59/59 [00:00<00:00, 150.51it/s]


In [238]:
# Query the Doc2Vec helper with two words. The recorded output is a list of
# (int, float) pairs — presumably (document index, score); confirm against
# w2v_widget.utils.
dv_model.most_similar(['president', 'vice'])

[(29, 1.1605741435860528),
 (40, 1.122883143061137),
 (31, 1.1149274458343688),
 (34, 1.1056027849790784),
 (56, 1.080133469840932),
 (46, 1.0798061905072247),
 (55, 1.0634980444673026),
 (32, 1.0634620461555329),
 (33, 1.0582027218552403),
 (38, 1.0496658140054815)]

## Display widget

In [232]:
# Build the widget from the trained word vectors and their 2-D t-SNE
# embedding; `two_dim_doc_embedding` and `tokens_with_ws` are left as None.
wv_widget = WVWidget(
    wv_model=wv_model,
    two_dim_word_embedding=TSNE_embedding,
    two_dim_doc_embedding=None,
    tokens_with_ws=None,
)

In [233]:
# Show the widget; the recorded output of this cell is a VBox.
wv_widget.display_widget()

VBox(children=(HTML(value='<style>\n.widget-button {\n    margin-right: 160px;\n}\n\n.widget-select-multiple {…

In [141]:
# Debugging aid: bind the widget to the name `self`, which the inspection
# cells below use — presumably so method-body code can be run as-is.
self = wv_widget

In [143]:
# Inspect the widget's current `skip_words` value.
self.skip_words

['vice']

In [142]:
# Call the load-button handler by hand, passing a plain bool as `change`.
# NOTE(review): the handler may expect a richer event object — confirm.
self.on_load_button_clicked(change=True)

In [134]:
# Inspect the widget's `accept_checkboxes`; the recorded output is a list of
# Checkbox objects.
self.accept_checkboxes

[Checkbox(value=True, indent=False),
 Checkbox(value=False, indent=False),
 Checkbox(value=False, indent=False),
 Checkbox(value=True, indent=False),
 Checkbox(value=True, indent=False),
 Checkbox(value=False, indent=False),
 Checkbox(value=False, indent=False),
 Checkbox(value=False, indent=False),
 Checkbox(value=False, indent=False),
 Checkbox(value=False, indent=False)]

In [120]:
# Debugging call. The recorded run of this cell failed with
# "AttributeError: 'bool' object has no attribute 'value'".
self.update_output()

AttributeError: 'bool' object has no attribute 'value'

In [118]:
# Inspect the current value of the widget's search menu (a tuple in the
# recorded output).
self.search_menu.value

('possibility',)

In [116]:
# Entries of `search_words` that are not in the search menu's current value.
[
    candidate
    for candidate in self.search_words
    if candidate not in self.search_menu.value
]

[]

In [108]:
# Scratch check of string truthiness: '' is falsy, so nothing is printed.
if '':
    print('h')

In [87]:
# The `.value` of every entry in the widget's `checkboxes_text`.
[text_label.value for text_label in wv_widget.checkboxes_text]

['vice (0.47)',
 'senator (0.45)',
 'citizens: (0.43)',
 'ceremony (0.42)',
 'congress, (0.42)',
 'transfer (0.41)',
 'bush, (0.41)',
 'predecessor, (0.4)',
 'presence (0.4)',
 'appoint (0.4)']