# w2widget
*Words to Widget*

Enable autoreload and import the widget class.

In [1]:
%load_ext autoreload
%autoreload 1

In [2]:
from w2v_widget.widget import WVWidget
%aimport w2v_widget.widget

## Load data

In [3]:
from nltk.corpus import inaugural, reuters
# import nltk
# nltk.download('inaugural')

### Reuters

In [4]:
# Disabled alternative corpus: uncommenting this builds `docs` from the
# Reuters corpus, one raw string per file id.
# docs = []

# for path in reuters.fileids():
#     with reuters.open(path) as f:
#         docs.append(f.read())

### Inaugural speeches

In [5]:
# Read every inaugural address into memory, one raw string per speech.
# The context manager closes each corpus stream even when `read()` raises,
# and iterating over `fileids()` hands the corpus reader's `open()` the file
# identifiers it is documented to accept.
docs = []

for fileid in inaugural.fileids():
    with inaugural.open(fileid) as f:
        docs.append(f.read())

## Train word2vec model

In [6]:
from gensim.models import Word2Vec

In [7]:
# NOTE(review): this default-constructed model is not referenced by any later
# cell visible here (the trained vectors come from the next cell) — confirm
# whether it can be dropped.
w2v = Word2Vec()

In [8]:
# Train word2vec on lower-cased, whitespace-split tokens (one token list per
# document) and keep only the resulting keyed vectors (`.wv`).
# NOTE(review): the gensim documentation states that a fully reproducible run
# needs a single worker thread in addition to `seed` — confirm whether
# `workers=4` is acceptable for this notebook.
wv_model = Word2Vec(
    [[token.lower().strip() for token in text.split()] for text in docs],
    vector_size=200,
    window=10,
    workers=4,
    seed=42,
    epochs=100,
    min_count=2,
).wv

## Reduce dimensions

In [9]:
from sklearn.decomposition import PCA
#from sklearn.manifold import TSNE
from openTSNE import TSNE

In [10]:
# Normalised copies of the word vectors; these are the input to the PCA step
# in the next cell.
normed_vectors = wv_model.get_normed_vectors()

In [11]:
# Two-stage reduction: PCA down to 50 components, then t-SNE down to 2-D.
# `random_state` is set on PCA as well because scikit-learn may pick the
# randomized SVD solver, which would otherwise let the projection vary
# between runs.
pca = PCA(n_components=50, random_state=420)
pca_embedding = pca.fit_transform(normed_vectors)

# `fit` returns the optimised 2-D embedding of the training points.
TSNE_embedding = TSNE(
    n_components=2,
    learning_rate='auto',
    random_state=420,
    verbose=1
).fit(pca_embedding)

# `transform` is intended for placing *new* points into an existing embedding.
# Calling it on the training data runs a second optimisation and only
# approximates the coordinates `TSNE_embedding` already holds, so reuse the
# fitted embedding directly.
two_dim_word_vectors = TSNE_embedding

--------------------------------------------------------------------------------
TSNE(random_state=420, verbose=1)
--------------------------------------------------------------------------------
===> Finding 90 nearest neighbors using Annoy approximate search using euclidean distance...
   --> Time elapsed: 6.55 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 1.19 seconds
===> Calculating PCA-based initialization...
   --> Time elapsed: 0.02 seconds
===> Running optimization with exaggeration=12.00, lr=564.75 for 250 iterations...
Iteration   50, KL divergence 5.3205, 50 iterations in 7.8918 sec
Iteration  100, KL divergence 5.2963, 50 iterations in 5.2588 sec
Iteration  150, KL divergence 5.3292, 50 iterations in 4.5732 sec
Iteration  200, KL divergence 5.3288, 50 iterations in 4.3540 sec
Iteration  250, KL divergence 5.3485, 50 iterations in 4.3112 sec
   --> Time elapsed: 26.39 seconds
===> Running optimization with exaggeration=1.00, lr=564.75 for 500 iterations..

In [12]:
# Pass the query words as lists, the documented form for `most_similar`.
# NOTE(review): some gensim 4.x releases appear to iterate a bare string
# character by character when `negative` is also supplied — the list form
# avoids depending on that behaviour.
wv_model.most_similar(positive=['president'], negative=['vice'])

[('case', 0.33405646681785583),
 ('constitution', 0.3333568871021271),
 ('station,', 0.29565098881721497),
 ('vote', 0.2892177700996399),
 ('respects', 0.2876661717891693),
 ('route', 0.28618890047073364),
 ('acquiesced', 0.28555968403816223),
 ('conscious', 0.2761351764202118),
 ('sovereign', 0.2750607132911682),
 ('influence', 0.270988792181015)]

## doc2vec

Background reading on building document vectors from word embeddings:

- https://towardsdatascience.com/word-embeddings-and-document-vectors-when-in-doubt-simplify-8c9aaeec244e
- https://towardsdatascience.com/document-embedding-techniques-fed3e7a6a25d

In [13]:
from w2v_widget.utils import Doc2Vec, calculate_inverse_frequency
%aimport w2v_widget.utils

  from tqdm.autonotebook import tqdm


In [14]:
# Per-word weights computed from the documents, tokenised the same way as for
# word2vec training (lower-cased, whitespace-split).
word_weights = calculate_inverse_frequency(
    [[token.lower().strip() for token in text.split()] for text in docs]
)

In [15]:
# Project-local Doc2Vec helper (imported from w2v_widget.utils), built from the
# trained word vectors and the per-word weights.
dv_model = Doc2Vec(wv_model, word_weights)

In [16]:
# Hand the tokenised documents (lower-cased, whitespace-split) to the Doc2Vec
# helper.
dv_model.add_doc2vec(
    [[token.lower().strip() for token in text.split()] for text in docs]
)

  0%|          | 0/59 [00:00<?, ?it/s]

In [18]:
# Reduce the document vectors; presumably this populates
# `TSNE_embedding_array`, which the next cell reads — TODO confirm.
dv_model.reduce_dimensions()

In [19]:
# Keep a handle on the helper's reduced document embedding; it is passed to the
# widget as `two_dim_doc_embedding`.
dv_tsne_embedding = dv_model.TSNE_embedding_array

## Display widget

In [20]:
# Assemble the widget from the word model, the document model and their 2-D
# embeddings.
# NOTE(review): `tokens_with_ws` receives the raw document strings rather than
# token lists — confirm this is what WVWidget expects.
wv_widget = WVWidget(
    wv_model=wv_model,
    two_dim_word_embedding=TSNE_embedding,
    dv_model=dv_model,
    two_dim_doc_embedding=dv_tsne_embedding,
    tokens_with_ws=docs,
)

In [71]:
# Render the interactive widget in the notebook output.
wv_widget.display_widget()

VBox(children=(HTML(value='<style>\n.widget-button {\n    margin-right: 160px;\n}\n\n.widget-select-multiple {…

In [54]:
# Scratch: clear the name before the tab container is rebuilt in a later cell.
plot_tab = None

In [59]:
from traitlets import TraitError

In [67]:
# Build a two-tab container with one boxed figure widget per tab.
plot_tab = widgets.Tab()
plot_tab.children = (widgets.Box([wv_widget.wv_figure_widget]), widgets.Box([wv_widget.dv_figure_widget]))
# Set titles through the public `set_title` API rather than assigning the
# private `_titles` trait, which is an implementation detail of the Tab widget.
for index, title in enumerate(("Words", "Documents")):
    plot_tab.set_title(index, title)

In [70]:
# Scratch leftover: evaluates to an empty tuple.
()

()

In [65]:
# Variant without Box wrappers: attach the two figure widgets directly as the
# tab's children.
plot_tab.children = (wv_widget.wv_figure_widget, wv_widget.dv_figure_widget)

In [68]:
# Display the tab container.
plot_tab

Tab(children=(Box(children=(FigureWidget({
    'data': [{'marker': {'color': 'green'},
              'mode': '…

In [71]:
# Assigning a list to the dict-typed `_titles` trait raises TraitError.
# Set each tab title through the public `set_title` API instead.
for index, title in enumerate(['Word', 'Document']):
    wv_widget.plot_tab.set_title(index, title)

In [74]:
import ipywidgets

In [67]:
# Inspect the widget's own tab container.
wv_widget.plot_tab

Tab(layout=Layout(margin='0px 50px 0px 0px'))

In [59]:
# A leading comma is a SyntaxError. The saved output is a tuple of figure
# widgets, so the first element is restored here.
# NOTE(review): the first element is assumed to be the word figure, matching
# the pair used when populating the tab children — confirm.
wv_widget.wv_figure_widget, wv_widget.dv_figure_widget

(FigureWidget({
     'data': [{'marker': {'color': 'green'},
               'mode': 'markers',
               'name': 'Similar',
               'text': [],
               'type': 'scatter',
               'uid': '725b2e07-3bb6-4af9-a46b-6fa610bb1873',
               'x': array([], dtype=float64),
               'y': array([], dtype=float64)},
              {'marker': {'color': 'blue'},
               'mode': 'markers',
               'name': 'Query',
               'text': [],
               'type': 'scatter',
               'uid': '638ba9d4-b17f-4020-8ede-9186d39ccb33',
               'x': array([], dtype=float64),
               'y': array([], dtype=float64)},
              {'marker': {'color': 'orange'},
               'mode': 'markers',
               'name': 'Topic',
               'text': [],
               'type': 'scatter',
               'uid': '2b97e341-c636-4f0c-bef9-17eb145c48fe',
               'x': array([], dtype=float64),
               'y': array([], dtype=float64)},
 

In [25]:
from IPython.display import display
import ipywidgets as widgets

In [46]:
# Scratch: a standalone, empty Tab container for layout experiments.
tab = widgets.Tab()

In [47]:
# Display the experimental tab container.
tab

Tab()

In [49]:
# Fill the experimental tab with two text fields pre-set to 'x1' and 'x2'.
tab.children = [widgets.Text(label) for label in ('x1', 'x2')]

In [54]:
# Layout experiment: place the tab container next to a plain text field in a GridBox.
widgets.GridBox([tab, widgets.Text('test')])

GridBox(children=(Tab(children=(Text(value='x1'), Text(value='x2'))), Text(value='test')))

In [42]:
# Layout experiment: wrap the widget's tab container in a Box.
# NOTE(review): in ipywidgets, margin and padding are normally set through
# `layout=widgets.Layout(...)`; it is unclear that a `style` dict on a Box has
# any effect — confirm.
widgets.Box(wv_widget.plot_tab, style={'margin':0, 'padding':0})

Box(children=(Tab(children=(FigureWidget({
    'data': [{'marker': {'color': 'green'},
              'mode': '…

In [142]:
# NOTE(review): the saved output for this cell is a TypeError saying
# `generate_plot_figure()` does not accept an `xy_range` keyword, so this call
# does not match the method's signature at the time it was run.
wv_widget.generate_plot_figure(
    "Embedding of doc2vec-space",
    xy_range = wv_widget.dv_get_axis_range()
)

TypeError: generate_plot_figure() got an unexpected keyword argument 'xy_range'

In [138]:
# Inspect the axis range computed for the document embedding (the saved output
# is a tuple whose first element is a PartialTSNEEmbedding).
wv_widget.dv_get_axis_range()

(PartialTSNEEmbedding([ 1.4241119 ,  1.57582199,  0.87745116,  1.58169787,
                        1.25462676,  1.54895266,  1.07024655,  0.53830533,
                        0.6841267 ,  0.78573008,  0.81127594,  0.90592049,
                        1.14196477,  0.02201661, -0.26609285,  0.98473553,
                        0.86947298, -0.00229526, -1.00784356,  1.55266983,
                       -0.20643438,  0.44639983,  0.46315278, -0.19492664,
                        0.0435651 , -0.29481291,  0.04428599, -0.37549166,
                       -0.16064737, -0.15761923, -0.68825903,  0.08589982,
                       -0.25373281, -0.94174959, -0.44463227, -0.56580457,
                       -0.51645391,  0.00731056,  0.27862024, -1.11932547,
                       -1.57258535, -1.24824686, -1.42303123, -1.23983342,
                       -0.12657384, -0.28795227,  0.37171875, -0.484488  ,
                       -0.49594469, -0.14436149, -0.58453489, -0.3686936 ,
                        0

In [126]:
# Inspect the widget's current topic words.
wv_widget.topic_words

['god', 'light', 'fire']

In [137]:
# Manually invoke the widget's document-embedding trace setup (debugging aid).
wv_widget.add_document_embedding_traces()

In [133]:
# Re-check the document-embedding axis range (same call as in an earlier cell).
wv_widget.dv_get_axis_range()

(PartialTSNEEmbedding([ 1.4241119 ,  1.57582199,  0.87745116,  1.58169787,
                        1.25462676,  1.54895266,  1.07024655,  0.53830533,
                        0.6841267 ,  0.78573008,  0.81127594,  0.90592049,
                        1.14196477,  0.02201661, -0.26609285,  0.98473553,
                        0.86947298, -0.00229526, -1.00784356,  1.55266983,
                       -0.20643438,  0.44639983,  0.46315278, -0.19492664,
                        0.0435651 , -0.29481291,  0.04428599, -0.37549166,
                       -0.16064737, -0.15761923, -0.68825903,  0.08589982,
                       -0.25373281, -0.94174959, -0.44463227, -0.56580457,
                       -0.51645391,  0.00731056,  0.27862024, -1.11932547,
                       -1.57258535, -1.24824686, -1.42303123, -1.23983342,
                       -0.12657384, -0.28795227,  0.37171875, -0.484488  ,
                       -0.49594469, -0.14436149, -0.58453489, -0.3686936 ,
                        0

In [23]:
# Inspect the widget's current search words.
wv_widget.search_words

['president', 'george']

In [141]:
# Debugging aid: alias the widget as `self` so the following cells can use
# `self.<attribute>` expressions on it at notebook level.
self = wv_widget

In [143]:
# Inspect the widget's current skip words.
self.skip_words

['vice']

In [142]:
# Trigger the load-button handler by hand.
# NOTE(review): `change=True` passes a plain bool; a real widget callback
# presumably receives a richer object — confirm the handler tolerates this.
self.on_load_button_clicked(change=True)

In [134]:
# Inspect the widget's accept checkboxes and their current values.
self.accept_checkboxes

[Checkbox(value=True, indent=False),
 Checkbox(value=False, indent=False),
 Checkbox(value=False, indent=False),
 Checkbox(value=True, indent=False),
 Checkbox(value=True, indent=False),
 Checkbox(value=False, indent=False),
 Checkbox(value=False, indent=False),
 Checkbox(value=False, indent=False),
 Checkbox(value=False, indent=False),
 Checkbox(value=False, indent=False)]

In [120]:
# NOTE(review): the saved output for this cell is
# "AttributeError: 'bool' object has no attribute 'value'", i.e. something
# reached during this call was a bool where an object with `.value` was
# expected — the cause is not visible in this notebook.
self.update_output()

AttributeError: 'bool' object has no attribute 'value'

In [118]:
# Inspect the current selection of the search menu.
self.search_menu.value

('possibility',)

In [116]:
# Search words that are not part of the search menu's current selection.
[term for term in self.search_words if term not in self.search_menu.value]

[]

In [108]:
# Scratch check: an empty string is falsy, so nothing is printed.
if '':
    print('h')

In [87]:
# Collect the text currently shown next to each checkbox.
[text_widget.value for text_widget in wv_widget.checkboxes_text]

['vice (0.47)',
 'senator (0.45)',
 'citizens: (0.43)',
 'ceremony (0.42)',
 'congress, (0.42)',
 'transfer (0.41)',
 'bush, (0.41)',
 'predecessor, (0.4)',
 'presence (0.4)',
 'appoint (0.4)']