# w2widget
*Words to Widget*

Import widget

In [1]:
# Enable IPython's autoreload extension. Mode 1 reloads only the modules
# registered with %aimport, before each cell is executed.
%load_ext autoreload
%autoreload 1

In [3]:
from w2widget.widget import Widget
%aimport w2widget.widget

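## Load data

Three alternative corpora are loaded below. Each subsection assigns `docs`, so run only the one you want to use.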

In [4]:
from nltk.corpus import inaugural, twitter_samples, reuters

# import nltk
# nltk.download('twitter_samples')


### Twitter samples

In [5]:
import json


In [70]:
# Set the reader's private `_readme` attribute to 'README.txt' before calling
# readme(), then print the corpus description.
# NOTE(review): presumably a workaround for the reader not locating the README
# on its own — confirm against the installed NLTK version.
twitter_samples._readme = 'README.txt'
print(twitter_samples.readme())

Twitter Samples

These samples of Tweets (or 'status updates') were collected from the
Twitter Streaming and REST APIs (see
https://dev.twitter.com/overview/documentation,
https://dev.twitter.com/streaming/overview). Each file consists of
line-separated JSON-formatted tweets, i.e. one Tweet per line. For a
detailed description of the JSON fields in a Tweet, see
https://dev.twitter.com/overview/api/tweets.

Any use of this data is subject to the Twitter Developer Agreement and
Developer Policy:
https://dev.twitter.com/overview/terms/agreement-and-policy.

####################################################
* tweets.20150430-223406.json:

Collected from the public stream of the Streaming API using the
'statuses / filter' endpoint. The value of 'track' was set to the
following keywords: 

"david cameron, miliband, milliband, sturgeon, clegg, farage, tory,
tories, ukip, snp, libdem"

####################################################
* positive_tweets.json
* negative_tweets.json

These 

In [4]:
# List the absolute paths of the files in the twitter_samples corpus.
twitter_samples.abspaths()


[FileSystemPathPointer('C:\\Users\\tobia\\AppData\\Roaming\\nltk_data\\corpora\\twitter_samples\\negative_tweets.json'),
 FileSystemPathPointer('C:\\Users\\tobia\\AppData\\Roaming\\nltk_data\\corpora\\twitter_samples\\positive_tweets.json'),
 FileSystemPathPointer('C:\\Users\\tobia\\AppData\\Roaming\\nltk_data\\corpora\\twitter_samples\\tweets.20150430-223406.json')]

In [33]:
# Parse the last file listed by abspaths(); every line of the stream is
# stripped and decoded as one JSON document.
tweets_path = twitter_samples.abspaths()[-1]
with twitter_samples.open(tweets_path) as stream:
    tweets = [json.loads(raw_line.strip()) for raw_line in stream]


In [34]:
# Keep the text of every tweet that carries no "retweeted_status" key.
docs = [
    status["text"]
    for status in tweets
    if "retweeted_status" not in status
]


### Reuters

In [6]:
# Read every Reuters file into memory, one string per file id.
docs = []
for fileid in reuters.fileids():
    with reuters.open(fileid) as stream:
        docs.append(stream.read())


### Inaugural speeches

In [5]:
# Read every inaugural address into memory, one string per file.
docs = []

for speech_path in inaugural.abspaths():
    # The context manager closes the stream even if read() raises.
    with inaugural.open(speech_path) as stream:
        docs.append(stream.read())


## Text preprocessing

In [36]:
from nltk.tokenize import WordPunctTokenizer
from typing import List
from functools import partial


In [8]:
# Tokenizer instance shared by the preprocessing cells in this section.
tokenizer = WordPunctTokenizer()

def tokenize_with_ws(text: str, tokenizer) -> List[str]:
    """Tokenize ``text`` chunk by chunk, adding a ``" "`` token after each chunk.

    Args:
        text: Raw text; it is split on whitespace with ``str.split()``.
        tokenizer: Callable applied to each chunk; it must return a list of
            tokens.

    Returns:
        A flat list holding the tokens of every chunk in order, with a single
        ``" "`` token after each chunk's tokens (including the last chunk).
    """
    tokens = []
    for chunk in text.split():
        tokens.extend(tokenizer(chunk) + [" "])
    return tokens


# Attach a whitespace-preserving variant to the tokenizer instance: the partial
# pre-binds the instance's own `tokenize` method as the `tokenizer` argument.
tokenizer.tokenize_with_ws = partial(tokenize_with_ws, tokenizer=tokenizer.tokenize)


In [38]:
# Tokenize each document once, keeping the " " separator tokens.
tokens_with_ws = [tokenizer.tokenize_with_ws(doc) for doc in docs]

# Derive the training tokens from that same pass instead of tokenizing every
# document a second time: lower-case the tokens and keep only alphanumeric
# ones, which drops the " " separators and punctuation.
document_tokens = [
    [token.lower() for token in doc_tokens if token.isalnum()]
    for doc_tokens in tokens_with_ws
]

## Train word2vec model

In [10]:
from gensim.models import Word2Vec


In [11]:
# NOTE(review): `w2v` is constructed without a corpus and is not referenced by
# any other cell visible here — looks like a leftover; confirm before removing.
w2v = Word2Vec()


In [12]:
# Fit word2vec on the lower-cased alphanumeric tokens and keep only the
# resulting word vectors (`.wv`).
# NOTE(review): gensim documents that runs are only fully reproducible with a
# single worker thread, so `seed` alone does not pin the vectors here.
wv_model = Word2Vec(
    sentences=document_tokens,
    vector_size=200,
    window=10,
    min_count=2,
    epochs=10,
    seed=42,
    workers=4,
).wv


In [20]:
# Sanity check: nearest neighbours of 'oil' in the trained embedding.
wv_model.most_similar('oil')

[('petroleum', 0.6585260629653931),
 ('natural', 0.6534021496772766),
 ('pipeline', 0.5819165706634521),
 ('energy', 0.5693051218986511),
 ('crude', 0.5351841449737549),
 ('fuel', 0.5322396159172058),
 ('ecuador', 0.510562539100647),
 ('liquids', 0.48409831523895264),
 ('deliverability', 0.48102378845214844),
 ('pipelines', 0.47501084208488464)]

## Reduce dimensions

In [13]:
from sklearn.decomposition import PCA

# from sklearn.manifold import TSNE
from openTSNE import TSNE


In [14]:
# Normalised copies of the word vectors; these are the input to the PCA step.
normed_vectors = wv_model.get_normed_vectors()


In [15]:
# Compress the word vectors to 50 principal components before t-SNE.
# random_state pins the result whenever scikit-learn selects a randomized SVD
# solver, so this step is seeded like the t-SNE step below.
pca = PCA(n_components=50, random_state=420)
pca_embedding = pca.fit_transform(normed_vectors)

# Fit a 2-D openTSNE embedding on the PCA output.
TSNE_embedding = TSNE(
    n_components=2, learning_rate="auto", random_state=420, verbose=1
).fit(pca_embedding)

# NOTE(review): transform() re-projects the training points into the fitted
# embedding; the fitted `TSNE_embedding` presumably already holds their
# coordinates — confirm whether this extra pass is needed.
two_dim_word_vectors = TSNE_embedding.transform(pca_embedding)


--------------------------------------------------------------------------------
TSNE(random_state=420, verbose=1)
--------------------------------------------------------------------------------
===> Finding 90 nearest neighbors using Annoy approximate search using euclidean distance...
   --> Time elapsed: 15.20 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 1.20 seconds
===> Calculating PCA-based initialization...
   --> Time elapsed: 0.04 seconds
===> Running optimization with exaggeration=12.00, lr=1664.00 for 250 iterations...
Iteration   50, KL divergence 6.0804, 50 iterations in 4.6534 sec
Iteration  100, KL divergence 6.0569, 50 iterations in 5.5207 sec
Iteration  150, KL divergence 6.0559, 50 iterations in 4.5588 sec
Iteration  200, KL divergence 6.0559, 50 iterations in 4.1589 sec
Iteration  250, KL divergence 6.0559, 50 iterations in 4.1326 sec
   --> Time elapsed: 23.02 seconds
===> Running optimization with exaggeration=1.00, lr=1664.00 for 500 iteration

## doc2vec

Background reading:

- https://towardsdatascience.com/word-embeddings-and-document-vectors-when-in-doubt-simplify-8c9aaeec244e
- https://towardsdatascience.com/document-embedding-techniques-fed3e7a6a25d

In [21]:
%aimport w2v_widget.utils
from w2widget.doc2vec import Doc2Vec, calculate_inverse_frequency

In [22]:
# Word weights computed from the tokenized corpus; passed to Doc2Vec in the next cell.
word_weights = calculate_inverse_frequency(document_tokens)


In [33]:
# Wrap the trained word vectors and the word weights in the project's Doc2Vec helper.
dv_model = Doc2Vec(wv_model, word_weights)

# NOTE(review): the two calls below look stateful and order-dependent
# (add documents first, then reduce) — verify in w2widget.doc2vec.
dv_model.add_doc2vec(document_tokens)

dv_model.reduce_dimensions()

# Presumably populated by reduce_dimensions() — verify in w2widget.doc2vec.
dv_tsne_embedding = dv_model.TSNE_embedding_array


  0%|          | 0/10788 [00:00<?, ?it/s]

## Display widget

In [39]:
# Assemble the widget from the word vectors, the fitted t-SNE embedding, the
# document model with its 2-D embedding, and the whitespace-preserving tokens.
# NOTE(review): `two_dim_word_embedding` receives the fitted `TSNE_embedding`
# object rather than `two_dim_word_vectors` — confirm this is intended.
wv_widget = Widget(
    wv_model=wv_model,
    two_dim_word_embedding=TSNE_embedding,
    dv_model=dv_model,
    two_dim_doc_embedding=dv_tsne_embedding,
    tokens_with_ws=tokens_with_ws,
    initial_search_words=[],
)

# Render the widget in the notebook output area.
wv_widget.display_widget()


VBox(children=(HTML(value='<style>\n.widget-button {\n    margin-right: 160px;\n}\n\n.widget-select-multiple {…

<IPython.core.display.Javascript object>

In [77]:
# Inspect the widget's `queries` attribute.
# NOTE(review): presumably filled through interaction with the widget UI, so
# the value is not reproducible from code alone — confirm.
wv_widget.queries

{'test': ['cameron', 'snap', 'followback']}

In [78]:
# Inspect the widget's `topics` attribute.
# NOTE(review): presumably filled through interaction with the widget UI, so
# the value is not reproducible from code alone — confirm.
wv_widget.topics

{}

In [63]:
from ipywidgets.embed import embed_minimal_html, dependency_state


In [64]:
# Write the widget view to "export.html" with the page title "w2widget".
# `state` is limited to what dependency_state() returns for that view.
embed_minimal_html(
    "export.html",
    views=wv_widget.view,
    title="w2widget",
    state=dependency_state(wv_widget.view),
)
