# w2widget
*Words to Widget*

Import widget

In [69]:
%load_ext autoreload
%autoreload 1

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
from w2v_widget.widget import WVWidget
%aimport w2v_widget.widget

## Load data

In [3]:
from nltk.corpus import inaugural, twitter_samples

# import nltk
# nltk.download('twitter_samples')


### Twitter samples

In [None]:
import json


In [70]:
twitter_samples._readme = 'README.txt'
print(twitter_samples.readme())

Twitter Samples

These samples of Tweets (or 'status updates') were collected from the
Twitter Streaming and REST APIs (see
https://dev.twitter.com/overview/documentation,
https://dev.twitter.com/streaming/overview). Each file consists of
line-separated JSON-formatted tweets, i.e. one Tweet per line. For a
detailed description of the JSON fields in a Tweet, see
https://dev.twitter.com/overview/api/tweets.

Any use of this data is subject to the Twitter Developer Agreement and
Developer Policy:
https://dev.twitter.com/overview/terms/agreement-and-policy.

####################################################
* tweets.20150430-223406.json:

Collected from the public stream of the Streaming API using the
'statuses / filter' endpoint. The value of 'track' was set to the
following keywords: 

"david cameron, miliband, milliband, sturgeon, clegg, farage, tory,
tories, ukip, snp, libdem"

####################################################
* positive_tweets.json
* negative_tweets.json

These 

In [4]:
twitter_samples.abspaths()


[FileSystemPathPointer('C:\\Users\\tobia\\AppData\\Roaming\\nltk_data\\corpora\\twitter_samples\\negative_tweets.json'),
 FileSystemPathPointer('C:\\Users\\tobia\\AppData\\Roaming\\nltk_data\\corpora\\twitter_samples\\positive_tweets.json'),
 FileSystemPathPointer('C:\\Users\\tobia\\AppData\\Roaming\\nltk_data\\corpora\\twitter_samples\\tweets.20150430-223406.json')]

In [33]:
# Use the last path returned by abspaths() as the tweet source.
tweets_path = twitter_samples.abspaths()[-1]

# The file stores one JSON-encoded tweet per line. Blank lines are skipped so
# json.loads() is never handed an empty string (which would raise).
with twitter_samples.open(tweets_path) as f:
    tweets = [json.loads(line) for line in map(str.strip, f) if line]


In [34]:
# Collect the "text" field of every tweet that has no "retweeted_status" key.
docs = [
    status["text"]
    for status in tweets
    if "retweeted_status" not in status
]


### Reuters

In [4]:
# docs = []

# for path in reuters.fileids():
#     with reuters.open(path) as f:
#         docs.append(f.read())


### Inaugural speeches

In [5]:
# NOTE(review): this rebinds `docs`. When the notebook is run top to bottom,
# the tweet texts assigned to `docs` in the Twitter section are replaced by
# the inaugural speeches loaded here — confirm which corpus is intended.
docs = []

for speech_path in inaugural.abspaths():
    # The context manager closes the stream even if read() raises.
    with inaugural.open(speech_path) as f:
        docs.append(f.read())


## Text preprocessing

In [35]:
from nltk.tokenize import WordPunctTokenizer
from typing import List


In [36]:
from functools import partial


In [37]:
tokenizer = WordPunctTokenizer()


In [38]:
def tokenize_with_ws(text: str, tokenizer) -> List[str]:
    """Tokenize ``text`` chunk by chunk, emitting a ``" "`` after every chunk.

    ``text`` is split on whitespace; each resulting chunk is passed to
    ``tokenizer`` (a callable returning a list of tokens) and a single-space
    token is appended after that chunk's tokens. Consequently the result also
    ends with ``" "`` whenever ``text`` contains at least one chunk.

    Args:
        text: The raw string to tokenize.
        tokenizer: Callable mapping a string to a list of token strings.

    Returns:
        The flat list of tokens with ``" "`` separators interleaved.
    """
    tokens: List[str] = []
    for chunk in text.split():
        tokens += tokenizer(chunk) + [" "]
    return tokens


# Attach a pre-bound helper to this tokenizer instance so that later cells can
# call tokenizer.tokenize_with_ws(text) without passing the tokenize callable.
tokenizer.tokenize_with_ws = partial(tokenize_with_ws, tokenizer=tokenizer.tokenize)


In [39]:
# One list of lower-cased tokens per document. Only tokens passing
# str.isalnum() are kept, so the " " separators and punctuation are dropped.
document_tokens = [
    [token.lower() for token in filter(str.isalnum, tokenizer.tokenize_with_ws(doc))]
    for doc in docs
]


## Train word2vec model

In [40]:
from gensim.models import Word2Vec


In [41]:
# NOTE(review): `w2v` is not referenced again in the visible notebook; the
# vectors used downstream come from the Word2Vec(...) call in the next cell.
# This looks like a leftover — confirm and remove.
w2v = Word2Vec()


In [42]:
# Train word2vec on the tokenised documents and keep only its `.wv` attribute.
# workers=1: per the gensim documentation, a fixed `seed` only gives a
# reproducible run when training is limited to a single worker thread
# (PYTHONHASHSEED must also be fixed across interpreter launches).
wv_model = Word2Vec(
    document_tokens,
    vector_size=200,
    window=10,
    workers=1,
    seed=42,
    epochs=100,
    min_count=2,
).wv


In [76]:
wv_model.most_similar('cameron')

[('snap', 0.3149271309375763),
 ('followback', 0.29635483026504517),
 ('slip', 0.2953941822052002),
 ('banana', 0.28195929527282715),
 ('slaughtered', 0.2637789845466614),
 ('trip', 0.256316214799881),
 ('bbcquestiontime', 0.24670615792274475),
 ('lip', 0.24356338381767273),
 ('close', 0.24038226902484894),
 ('crosby', 0.2403440773487091)]

## Reduce dimensions

In [43]:
from sklearn.decomposition import PCA

# from sklearn.manifold import TSNE
from openTSNE import TSNE


In [44]:
normed_vectors = wv_model.get_normed_vectors()


In [45]:
# Reduce the normalised word vectors to 50 components before running t-SNE.
# random_state is pinned because PCA's automatic solver selection may pick a
# randomized SVD, which would otherwise make the projection vary between runs.
pca = PCA(n_components=50, random_state=420)
pca_embedding = pca.fit_transform(normed_vectors)

# Fit the 2-D t-SNE embedding on the PCA-reduced vectors.
TSNE_embedding = TSNE(
    n_components=2, learning_rate="auto", random_state=420, verbose=1
).fit(pca_embedding)

# NOTE(review): `two_dim_word_vectors` is not referenced again in the visible
# notebook (the widget receives `TSNE_embedding`). Calling transform() on the
# same data the embedding was fitted on presumably re-embeds those points as
# new samples — confirm whether this extra pass is needed.
two_dim_word_vectors = TSNE_embedding.transform(pca_embedding)


--------------------------------------------------------------------------------
TSNE(random_state=420, verbose=1)
--------------------------------------------------------------------------------
===> Finding 90 nearest neighbors using Annoy approximate search using euclidean distance...
   --> Time elapsed: 3.42 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.37 seconds
===> Calculating PCA-based initialization...
   --> Time elapsed: 0.01 seconds
===> Running optimization with exaggeration=12.00, lr=414.33 for 250 iterations...
Iteration   50, KL divergence 4.8902, 50 iterations in 4.0526 sec
Iteration  100, KL divergence 4.9038, 50 iterations in 4.0115 sec
Iteration  150, KL divergence 4.9060, 50 iterations in 4.9995 sec
Iteration  200, KL divergence 4.8923, 50 iterations in 5.1460 sec
Iteration  250, KL divergence 4.9091, 50 iterations in 4.2149 sec
   --> Time elapsed: 22.43 seconds
===> Running optimization with exaggeration=1.00, lr=414.33 for 500 iterations..

## doc2vec

https://towardsdatascience.com/word-embeddings-and-document-vectors-when-in-doubt-simplify-8c9aaeec244e

https://towardsdatascience.com/document-embedding-techniques-fed3e7a6a25d

In [48]:
from w2v_widget.utils import Doc2Vec, calculate_inverse_frequency
%aimport w2v_widget.utils

  from tqdm.autonotebook import tqdm


In [49]:
word_weights = calculate_inverse_frequency(document_tokens)


In [50]:
dv_model = Doc2Vec(wv_model, word_weights)


In [51]:
# Feed lower-cased, whitespace-split words of every document to the model.
# NOTE(review): these tokens keep attached punctuation, whereas
# `document_tokens` (used to train the word vectors and the word weights)
# holds alphanumeric tokens only — confirm the mismatch is intended.
dv_model.add_doc2vec(
    [[word.lower() for word in doc.split()] for doc in docs]
)


  0%|          | 0/6723 [00:00<?, ?it/s]

In [52]:
dv_model.reduce_dimensions()


In [53]:
dv_tsne_embedding = dv_model.TSNE_embedding_array


## Display widget

In [58]:
# Assemble the widget from the trained word vectors, the fitted t-SNE word
# embedding, the document-vector model and its 2-D embedding.
wv_widget = WVWidget(
    wv_model=wv_model,
    # Passes the fitted TSNE_embedding object itself, not two_dim_word_vectors.
    two_dim_word_embedding=TSNE_embedding,
    dv_model=dv_model,
    two_dim_doc_embedding=dv_tsne_embedding,
    # Re-tokenises every document, this time keeping the " " separator tokens.
    tokens_with_ws=[tokenizer.tokenize_with_ws(doc) for doc in docs],
    # NOTE(review): confirm both words exist in the vocabulary of whichever
    # corpus `docs` currently holds.
    initial_search_words=["president", "climate"],
)


In [60]:
wv_widget.display_widget()


VBox(children=(HTML(value='<style>\n.widget-button {\n    margin-right: 160px;\n}\n\n.widget-select-multiple {…

<IPython.core.display.Javascript object>

Add a word:
- to queries
- to topics when saving


In [77]:
wv_widget.queries

{'test': ['cameron', 'snap', 'followback']}

In [78]:
wv_widget.topics

{}

In [63]:
from ipywidgets.embed import embed_minimal_html, dependency_state


In [64]:
# Export the widget to "export.html". The same wv_widget.view is passed both
# as the views to embed and as the input to dependency_state().
# NOTE(review): presumably dependency_state() limits the saved state to what
# those views need — confirm against the ipywidgets embedding documentation.
embed_minimal_html(
    "export.html",
    views=wv_widget.view,
    title="w2widget",
    state=dependency_state(wv_widget.view),
)
