# w2widget
*Words to Widget*

## Import widget

In [2]:
%load_ext autoreload
%autoreload 1

In [3]:
from w2v_widget.widget import WVWidget
%aimport w2v_widget.widget

## Load data

In [4]:
from nltk.corpus import inaugural, reuters
# import nltk
# nltk.download('inaugural')

### Reuters

In [57]:
# Collect the raw text of every Reuters file, one string per file id.
docs = []

for fileid in reuters.fileids():
    with reuters.open(fileid) as stream:
        raw_text = stream.read()
    docs.append(raw_text)

### Inaugural speeches

In [11]:
# Collect the raw text of every inaugural address, one string per file id.
# NOTE: this rebinds `docs`, replacing whatever an earlier cell stored in it.
docs = []

for fileid in inaugural.fileids():
    # The context manager closes the stream even if read() raises.
    with inaugural.open(fileid) as f:
        docs.append(f.read())

## Train word2vec model

In [8]:
from gensim.models import Word2Vec

In [9]:
# NOTE(review): no other visible cell reads `w2v`; the vectors used downstream come from `wv_model`.
w2v = Word2Vec()

In [14]:
# One token list per document: split on whitespace, then lower-case each token.
tokenized_docs = [
    [token.lower().strip() for token in text.split()]
    for text in docs
]

# Train on the token lists and keep only the word vectors (`.wv`).
# NOTE(review): `seed` is set but `workers=4`; confirm in the gensim docs whether
# a fully repeatable run also needs a single worker thread.
wv_model = Word2Vec(
    sentences=tokenized_docs,
    vector_size=200,
    window=10,
    min_count=2,
    workers=4,
    seed=42,
    epochs=100,
).wv

## Reduce dimensions

In [17]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [18]:
# Normalized word vectors from the trained model; these are the input to the PCA step.
normed_vectors = wv_model.get_normed_vectors()

In [19]:
# Two-stage reduction: PCA down to 50 components, then t-SNE down to the 2-D
# embedding that is later handed to the widget.
# PCA's automatic solver choice can be a randomized SVD, so it is seeded like
# t-SNE; otherwise the final layout may differ between runs.
pca = PCA(n_components=50, random_state=420)
pca_embedding = pca.fit_transform(normed_vectors)

TSNE_embedding = TSNE(
    n_components=2,
    learning_rate='auto',
    init='random',
    random_state=420,
    verbose=1
).fit_transform(pca_embedding)


[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 6777 samples in 0.001s...
[t-SNE] Computed neighbors for 6777 samples in 2.055s...
[t-SNE] Computed conditional probabilities for sample 1000 / 6777
[t-SNE] Computed conditional probabilities for sample 2000 / 6777
[t-SNE] Computed conditional probabilities for sample 3000 / 6777
[t-SNE] Computed conditional probabilities for sample 4000 / 6777
[t-SNE] Computed conditional probabilities for sample 5000 / 6777
[t-SNE] Computed conditional probabilities for sample 6000 / 6777
[t-SNE] Computed conditional probabilities for sample 6777 / 6777
[t-SNE] Mean sigma: 0.244651
[t-SNE] KL divergence after 250 iterations with early exaggeration: 99.460617
[t-SNE] KL divergence after 1000 iterations: 3.695014


In [96]:
# Keys are passed as lists, the documented argument type. Some gensim 4.x
# releases only auto-wrap a bare string when `negative` is empty, in which case
# a bare string here would be iterated character by character.
wv_model.most_similar(positive=['president'], negative=['vice'])

[('case', 0.35096365213394165),
 ('shield', 0.3467693328857422),
 ('person', 0.3253069519996643),
 ('congress', 0.3177087604999542),
 ('veto', 0.30393755435943604),
 ('not,', 0.2921431064605713),
 ('neglect', 0.28800955414772034),
 ('practicable', 0.28466546535491943),
 ('any', 0.28271952271461487),
 ('subject', 0.2781681716442108)]

## doc2vec

https://towardsdatascience.com/word-embeddings-and-document-vectors-when-in-doubt-simplify-8c9aaeec244e

https://towardsdatascience.com/document-embedding-techniques-fed3e7a6a25d

In [24]:
from w2v_widget.utils import Doc2Vec


Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)



In [90]:
from collections import Counter
from typing import List, Dict

def calculate_inverse_frequency(docs: List[List[str]], smoothing: float = 1e-3) -> Dict[str, float]:
    """Compute a smoothed inverse-frequency weight for every distinct token.

    Each token ``w`` is mapped to ``smoothing / (smoothing + count(w) / total)``,
    where ``total`` is the number of tokens across all documents. The weight
    shrinks as a token becomes more frequent.

    Args:
        docs: Tokenized documents, one list of tokens per document.
        smoothing: Positive smoothing constant. Defaults to ``1e-3``, the value
            that was previously hard-coded.

    Returns:
        Mapping from token to weight, in first-seen order. Empty when ``docs``
        contains no tokens.

    Raises:
        ValueError: If ``smoothing`` is not strictly positive.
    """
    if smoothing <= 0:
        raise ValueError("smoothing must be strictly positive")

    # Count document by document rather than flattening the whole corpus into
    # one temporary list first.
    counts = Counter()
    for doc in docs:
        counts.update(doc)

    total_count = sum(counts.values())

    # With no tokens `counts` is empty, so the division below is never evaluated.
    return {
        word: smoothing / (smoothing + count / total_count)
        for word, count in counts.items()
    }

In [91]:
# Same tokenization as the Word2Vec training cell: whitespace split, lower-cased.
weight_tokens = [
    [token.lower().strip() for token in text.split()]
    for text in docs
]
word_weights = calculate_inverse_frequency(weight_tokens)

In [92]:
# Build the project's Doc2Vec helper from the trained word vectors and the per-word weights.
# NOTE(review): presumably documents are embedded as weighted combinations of word vectors; confirm in w2v_widget.utils.
dv_model = Doc2Vec(wv_model, word_weights)

In [93]:
# Same tokenization as the Word2Vec training cell: whitespace split, lower-cased.
doc_tokens = [
    [token.lower().strip() for token in text.split()]
    for text in docs
]
dv_model.add_doc2vec(doc_tokens)

  0%|          | 0/59 [00:00<?, ?it/s]

In [94]:
# Query the document model with two words; the result pairs an integer with a score.
# NOTE(review): the integers look like document indices; confirm against Doc2Vec.most_similar.
dv_model.most_similar(['light', 'god'])

[(8, 1.273523746656745),
 (30, 1.2454017719649713),
 (13, 1.207218891605),
 (17, 1.1891012644183097),
 (7, 1.187252786978486),
 (27, 1.181893240558797),
 (14, 1.1817671276666513),
 (9, 1.1785176445330041),
 (6, 1.1768039409951228),
 (2, 1.1673041927314738)]

## Display widget

In [20]:
# Build the widget from the word vectors and their 2-D t-SNE coordinates.
# No document embedding or whitespace-preserving tokens are supplied here.
wv_widget = WVWidget(
    wv_model=wv_model,
    two_dim_word_embedding=TSNE_embedding,
    two_dim_doc_embedding=None,
    tokens_with_ws=None,
)

In [21]:
# Render the widget in the cell output.
wv_widget.display_widget()

VBox(children=(HTML(value='<style>\n.widget-button {\n    margin-right: 160px;\n}\n\n.widget-select-multiple {…

In [23]:
# Inspect the widget's current search words; the value depends on prior interaction with the UI.
wv_widget.search_words

['president', 'george']

In [141]:
# Debugging aid: binds the widget to the name `self` at top level.
# NOTE(review): presumably so method-body snippets can be run as-is; the following debug cells rely on this alias.
self = wv_widget

In [143]:
# Inspect the widget's current skip words (requires the `self` alias for the widget).
self.skip_words

['vice']

In [142]:
# Invoke the load-button handler directly instead of clicking the button.
# NOTE(review): a plain bool is passed as `change`; confirm the handler does not read attributes from it.
self.on_load_button_clicked(change=True)

In [134]:
# Inspect the widget's accept checkboxes and their current values.
self.accept_checkboxes

[Checkbox(value=True, indent=False),
 Checkbox(value=False, indent=False),
 Checkbox(value=False, indent=False),
 Checkbox(value=True, indent=False),
 Checkbox(value=True, indent=False),
 Checkbox(value=False, indent=False),
 Checkbox(value=False, indent=False),
 Checkbox(value=False, indent=False),
 Checkbox(value=False, indent=False),
 Checkbox(value=False, indent=False)]

In [120]:
# NOTE(review): the recorded run of this cell raised AttributeError ('bool' object has no attribute 'value').
# Fix the cause in the widget code or drop this cell so the notebook survives Restart & Run All.
self.update_output()

AttributeError: 'bool' object has no attribute 'value'

In [118]:
# Inspect the current selection of the search menu.
self.search_menu.value

('possibility',)

In [116]:
# Search words that are not part of the search menu's current selection.
[
    word for word in self.search_words if word not in self.search_menu.value
]

[]

In [108]:
# Scratch check: an empty string is falsy, so the body never runs and nothing is printed.
if '':
    print('h')

In [87]:
# Text currently shown next to each checkbox; in the recorded output each entry is a word followed by a score.
[label.value for label in wv_widget.checkboxes_text]

['vice (0.47)',
 'senator (0.45)',
 'citizens: (0.43)',
 'ceremony (0.42)',
 'congress, (0.42)',
 'transfer (0.41)',
 'bush, (0.41)',
 'predecessor, (0.4)',
 'presence (0.4)',
 'appoint (0.4)']