# w2widget
*Words to Widget*

Import widget

In [1]:
%load_ext autoreload
%autoreload 1

In [2]:
from w2v_widget.widget import WVWidget
%aimport w2v_widget.widget

## Load data

In [3]:
from nltk.corpus import inaugural, reuters
# import nltk
# nltk.download('inaugural')

### Reuters

In [4]:
# docs = []

# for path in reuters.fileids():
#     with reuters.open(path) as f:
#         docs.append(f.read())

### Inaugural speeches

In [5]:
# Read every inaugural address into memory, one raw string per corpus file.
docs = []

for doc in inaugural.abspaths():
    # The context manager closes the corpus stream even if read() raises.
    with inaugural.open(doc) as f:
        docs.append(f.read())

## Text preprocessing

In [6]:
from nltk.tokenize import WordPunctTokenizer
from typing import List

In [7]:
from functools import partial

In [8]:
tokenizer = WordPunctTokenizer()

In [9]:
def tokenize_with_ws(text:str, tokenizer) -> List[str]:
    """Tokenize ``text`` chunk by chunk, emitting a ``' '`` token after each chunk.

    ``text`` is split on whitespace and ``tokenizer`` is called on every
    resulting chunk; its return value is concatenated with ``[' ']``, so it
    must be a list. A single-space token follows the tokens of every chunk,
    including the last one.

    Args:
        text: The raw text to tokenize.
        tokenizer: Callable mapping a string to a list of tokens.

    Returns:
        The flat list of tokens interleaved with ``' '`` separators.
    """
    tokens = []
    for chunk in text.split():
        tokens.extend(tokenizer(chunk) + [' '])
    return tokens

tokenizer.tokenize_with_ws = partial(tokenize_with_ws, tokenizer=tokenizer.tokenize)

In [10]:
# One token list per document: keep only tokens for which str.isalnum() is
# true (this drops the ' ' separators and punctuation) and lower-case them.
document_tokens = [
    [tok.lower() for tok in filter(str.isalnum, tokenizer.tokenize_with_ws(text))]
    for text in docs
]

## Train word2vec model

In [11]:
from gensim.models import Word2Vec

In [12]:
# NOTE(review): this default-constructed model is not referenced again in the
# visible cells; the vectors used later come from a separate Word2Vec(...) call.
w2v = Word2Vec()

In [13]:
# Train word2vec on the tokenized documents and keep only the model's `.wv`
# attribute (the trained word vectors) for the rest of the notebook.
# NOTE(review): gensim documents that a fixed `seed` is only fully
# deterministic with workers=1 (and a fixed PYTHONHASHSEED) — confirm whether
# exact run-to-run reproducibility matters here.
wv_model = Word2Vec(
    document_tokens,
    vector_size=200,
    window=10,
    min_count=2,
    epochs=100,
    seed=42,
    workers=4,
).wv

## Reduce dimensions

In [14]:
from sklearn.decomposition import PCA
#from sklearn.manifold import TSNE
from openTSNE import TSNE

In [15]:
normed_vectors = wv_model.get_normed_vectors()

In [16]:
# Reduce the normalized word vectors to 50 principal components before t-SNE.
# random_state pins the randomized SVD solver PCA may pick for this shape, so
# the PCA step is as repeatable as the seeded t-SNE step below.
pca = PCA(n_components=50, random_state=420)
pca_embedding = pca.fit_transform(normed_vectors)

# Fit a 2-D t-SNE embedding on the PCA-reduced vectors.
TSNE_embedding = TSNE(
    n_components=2,
    learning_rate='auto',
    random_state=420,
    verbose=1
).fit(pca_embedding)

# NOTE(review): this projects the same points that were just fitted, and the
# widget cell further down is given TSNE_embedding rather than this array —
# confirm two_dim_word_vectors is still needed.
two_dim_word_vectors = TSNE_embedding.transform(pca_embedding)

--------------------------------------------------------------------------------
TSNE(random_state=420, verbose=1)
--------------------------------------------------------------------------------
===> Finding 90 nearest neighbors using Annoy approximate search using euclidean distance...
   --> Time elapsed: 2.59 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.39 seconds
===> Calculating PCA-based initialization...
   --> Time elapsed: 0.02 seconds
===> Running optimization with exaggeration=12.00, lr=455.42 for 250 iterations...
Iteration   50, KL divergence 5.0334, 50 iterations in 3.2400 sec
Iteration  100, KL divergence 5.0654, 50 iterations in 3.6330 sec
Iteration  150, KL divergence 5.0624, 50 iterations in 3.3560 sec
Iteration  200, KL divergence 5.0419, 50 iterations in 3.2810 sec
Iteration  250, KL divergence 5.0444, 50 iterations in 3.4100 sec
   --> Time elapsed: 16.92 seconds
===> Running optimization with exaggeration=1.00, lr=455.42 for 500 iterations..

In [17]:
wv_model.most_similar(positive='president')

[('chief', 0.522323727607727),
 ('neill', 0.516529381275177),
 ('bush', 0.514725387096405),
 ('senator', 0.4798869788646698),
 ('magistrate', 0.4788762927055359),
 ('vice', 0.4769551157951355),
 ('dole', 0.46583324670791626),
 ('guests', 0.45091012120246887),
 ('mathias', 0.4420539438724518),
 ('speaker', 0.43907949328422546)]

## doc2vec

https://towardsdatascience.com/word-embeddings-and-document-vectors-when-in-doubt-simplify-8c9aaeec244e

https://towardsdatascience.com/document-embedding-techniques-fed3e7a6a25d

In [18]:
from w2v_widget.utils import Doc2Vec, calculate_inverse_frequency
%aimport w2v_widget.utils

  from tqdm.autonotebook import tqdm


In [19]:
word_weights = calculate_inverse_frequency(document_tokens)

In [20]:
dv_model = Doc2Vec(wv_model, word_weights)

In [21]:
# Reuse the exact tokens the word vectors and word weights were built from.
# Re-splitting the raw text on whitespace leaves punctuation attached
# ("senate," vs "senate"), so those words would not match the vocabulary.
dv_model.add_doc2vec(document_tokens)

  0%|          | 0/59 [00:00<?, ?it/s]

In [22]:
dv_model.reduce_dimensions()

Perplexity value 30 is too high. Using perplexity 19.33 instead


In [23]:
dv_tsne_embedding = dv_model.TSNE_embedding_array

## Display widget

In [24]:
# Build the widget from the trained word vectors, the 2-D word embedding, the
# document-vector model with its 2-D embedding array, and the per-document
# token lists that still contain the ' ' separators.
# NOTE(review): two_dim_word_embedding receives TSNE_embedding, the object
# returned by TSNE(...).fit(), not two_dim_word_vectors — confirm which one
# WVWidget expects.
wv_widget = WVWidget(wv_model=wv_model,
                     two_dim_word_embedding=TSNE_embedding,
                     dv_model=dv_model,
                     two_dim_doc_embedding=dv_tsne_embedding, 
                     tokens_with_ws = [tokenizer.tokenize_with_ws(doc) for doc in docs],
                     initial_search_words=['president', 'vice']
)

In [25]:
wv_widget.display_widget()

VBox(children=(HTML(value='<style>\n.widget-button {\n    margin-right: 160px;\n}\n\n.widget-select-multiple {…

In [31]:
from ipywidgets.embed import embed_minimal_html, dependency_state

In [32]:
embed_minimal_html('export.html', views=wv_widget.view, title='w2widget', state=dependency_state(wv_widget.view))

In [36]:
wv_widget.add_document_embedding_traces()


Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.



In [284]:
# Highlight 'and' in the first document; only that document is tokenized
# instead of tokenizing the whole corpus and discarding all but element 0.
wv_widget.html_format_text(tokenizer.tokenize_with_ws(docs[0]), ['and'])

'Fellow-Citizens of the Senate <span style="color:teal">and</span> of the House of Representatives: Among the vicissitudes incident to life no event could have filled me with greater anxieties than that of which the notification was transmitted by your order, <span style="color:teal">and</span> received on the 14th day of the present month. On the one hand, I was summoned by my Country, whose voice I can never hear but with veneration <span style="color:teal">and</span> love, from a retreat which I had chosen with the fondest predilection, <span style="color:teal">and</span>, in my flattering hopes, with an immutable decision, as the asylum of my declining years -- a retreat which was rendered every day more necessary as well as more dear to me by the addition of habit to inclination, <span style="color:teal">and</span> of frequent interruptions in my health to the gradual waste committed on it by time. On the other hand, the magnitude <span style="color:teal">and</span> difficulty of 

In [91]:
wv_widget.dv_figure_widget.__sizeof__()

32

In [71]:
# NOTE(review): preprocess_doc is not defined in any visible cell, so this line
# depends on kernel state from elsewhere — define or import it before this cell.
wv_widget.tokens_with_ws = [preprocess_doc(doc) for doc in docs]

In [76]:
wv_widget.add_document_embedding_traces()

In [None]:
from IPython.core.display import HTML

In [96]:
import ipywidgets as widgets

In [140]:
from plotly import graph_objects as go

In [155]:
caption = widgets.Label(value='The slider value is in its initial position.')
slider = widgets.IntSlider(min=-5, max=5, value=1, description='Slider')

def handle_slider_change(change):
    """Set the caption text to say whether the slider's new value is negative.

    ``change.new`` is compared against zero; zero itself is reported as
    'nonnegative'.
    """
    if change.new < 0:
        sign = 'negative'
    else:
        sign = 'nonnegative'
    caption.value = 'The slider value is ' + sign

slider.observe(handle_slider_change, names='value')

display(caption, slider)

Label(value='The slider value is in its initial position.')

IntSlider(value=1, description='Slider', max=5, min=-5)

In [171]:
print(tabs)

Tab(children=(FigureWidget({
    'data': [], 'layout': {'autosize': True, 'template': '...'}
}), FigureWidget({
    'data': [], 'layout': {'template': '...'}
})))


In [194]:
def on_tabs_change(change):
    """Observer callback: set ``f2.layout.autosize`` to True.

    ``change`` is accepted because observers are called with it, but it is
    not read here.
    """
    f2.layout.autosize = True

f1 = go.FigureWidget()
f2 = go.FigureWidget()
tabs = widgets.Tab(
    children=[f1, f2]
)
tabs.observe(on_tabs_change, names='selected_index')
tabs

Tab(children=(FigureWidget({
    'data': [], 'layout': {'template': '...'}
}), FigureWidget({
    'data': [], …

In [180]:
f2.show()