# w2widget
*Words to Widget*

Set up autoreload and import the widget together with its dependencies.

In [1]:
# Load IPython's autoreload extension so edits to the local package are picked
# up without restarting the kernel.
%load_ext autoreload
# Mode 1: only modules registered with %aimport are reloaded before each cell.
%autoreload 1

In [2]:
import json

import pandas as pd
from gensim.models import Word2Vec
from nltk.tokenize import WordPunctTokenizer

from w2widget.utils import tokenizer_with_ws
from w2widget.widget import Widget

%aimport w2widget.utils
%aimport w2widget.widget


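## Load data

Each subsection below loads a different corpus into `docs`. Run only the one you want: every subsection overwrites `docs`, so the last one executed wins.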

In [1]:
from nltk.corpus import inaugural, twitter_samples, reuters

# import nltk
# nltk.download('twitter_samples')


### Twitter samples

In [70]:
# Set the corpus reader's private `_readme` attribute to the README file name,
# then print the README text.
# NOTE(review): presumably a workaround for the reader not knowing its README
# file — confirm this is still required with the installed NLTK version.
twitter_samples._readme = 'README.txt'
print(twitter_samples.readme())

Twitter Samples

These samples of Tweets (or 'status updates') were collected from the
Twitter Streaming and REST APIs (see
https://dev.twitter.com/overview/documentation,
https://dev.twitter.com/streaming/overview). Each file consists of
line-separated JSON-formatted tweets, i.e. one Tweet per line. For a
detailed description of the JSON fields in a Tweet, see
https://dev.twitter.com/overview/api/tweets.

Any use of this data is subject to the Twitter Developer Agreement and
Developer Policy:
https://dev.twitter.com/overview/terms/agreement-and-policy.

####################################################
* tweets.20150430-223406.json:

Collected from the public stream of the Streaming API using the
'statuses / filter' endpoint. The value of 'track' was set to the
following keywords: 

"david cameron, miliband, milliband, sturgeon, clegg, farage, tory,
tories, ukip, snp, libdem"

####################################################
* positive_tweets.json
* negative_tweets.json

These 

In [4]:
# List the absolute paths of the files that make up the twitter_samples corpus.
twitter_samples.abspaths()


[FileSystemPathPointer('C:\\Users\\tobia\\AppData\\Roaming\\nltk_data\\corpora\\twitter_samples\\negative_tweets.json'),
 FileSystemPathPointer('C:\\Users\\tobia\\AppData\\Roaming\\nltk_data\\corpora\\twitter_samples\\positive_tweets.json'),
 FileSystemPathPointer('C:\\Users\\tobia\\AppData\\Roaming\\nltk_data\\corpora\\twitter_samples\\tweets.20150430-223406.json')]

In [33]:
# Load the general-stream tweet file, which stores one JSON-encoded tweet per line.
# The file is selected by its corpus file id instead of by its position in
# `abspaths()`, so the cell does not depend on the order the corpus lists its files.
TWEETS_FILEID = "tweets.20150430-223406.json"
tweets_path = twitter_samples.abspath(TWEETS_FILEID)

with twitter_samples.open(tweets_path) as f:
    # Blank lines are skipped because `json.loads("")` raises.
    tweets = [json.loads(line) for line in f if line.strip()]


In [34]:
# Keep the text of every tweet that is not a retweet, i.e. every record
# without a "retweeted_status" key.
docs = []
for tweet in tweets:
    if "retweeted_status" in tweet:
        continue
    docs.append(tweet["text"])


### Reuters

In [4]:
def _read_reuters_document(fileid):
    """Return the full text of one Reuters corpus file."""
    with reuters.open(fileid) as stream:
        return stream.read()


# One string per Reuters file, in the order given by `reuters.fileids()`.
docs = [_read_reuters_document(fileid) for fileid in reuters.fileids()]


In [5]:
# Print the README that ships with the Reuters corpus.
print(reuters.readme())


      The Reuters-21578 benchmark corpus, ApteMod version

This is a publically available version of the well-known Reuters-21578
"ApteMod" corpus for text categorization.  It has been used in
publications like these:

 * Yiming Yang and X. Liu. "A re-examination of text categorization
   methods".  1999.  Proceedings of 22nd Annual International SIGIR.
   http://citeseer.nj.nec.com/yang99reexamination.html

 * Thorsten Joachims. "Text categorization with support vector
   machines: learning with many relevant features".  1998. Proceedings
   of ECML-98, 10th European Conference on Machine Learning.
   http://citeseer.nj.nec.com/joachims98text.html

ApteMod is a collection of 10,788 documents from the Reuters financial
newswire service, partitioned into a training set with 7769 documents
and a test set with 3019 documents.  The total size of the corpus is
about 43 MB.  It is also available for download from
http://kdd.ics.uci.edu/databases/reuters21578/reuters21578.html ,
which includ

### Inaugural speeches

In [5]:
# Read every inaugural address into memory, one string per corpus file.
docs = []

for doc in inaugural.abspaths():
    # The context manager closes the stream even if `read()` raises.
    with inaugural.open(doc) as f:
        docs.append(f.read())


### NPR
Data source (Kaggle, NPR interview transcripts): https://www.kaggle.com/datasets/shuyangli94/interview-npr-media-dialog-transcripts?resource=download&select=headlines.csv

In [43]:
# Read up to the first 200,000 rows of the NPR utterances CSV.
df = pd.read_csv('data/npr/utterances.csv', nrows=200_000)

In [44]:
# Use the non-null values of the 'utterance' column as the documents.
docs = df['utterance'].dropna().to_list()

## Text preprocessing

In [6]:
from tqdm.notebook import tqdm

In [7]:
# Base tokenizer from NLTK.
tokenizer = WordPunctTokenizer()

# Attach a second tokenize function, built by `tokenizer_with_ws` from the plain
# `tokenize` method, as an attribute on this tokenizer instance.
# NOTE(review): presumably it keeps whitespace alongside the tokens, going by
# the name — confirm in w2widget.utils.
tokenizer.tokenize_with_ws = tokenizer_with_ws(tokenizer.tokenize)

In [45]:
def _lowercase_alnum_tokens(doc):
    """Tokenize `doc` and return its alphanumeric tokens, lower-cased."""
    kept = []
    for token in tokenizer.tokenize(doc):
        if token.isalnum():
            kept.append(token.lower())
    return kept


# One token list per document; tqdm shows progress over `docs`.
document_tokens = [
    _lowercase_alnum_tokens(doc)
    for doc in tqdm(docs, smoothing=0, desc='tokenizing')
]

tokenizing:   0%|          | 0/199999 [00:00<?, ?it/s]

In [46]:
# Run the `tokenize_with_ws` function attached to the tokenizer over every
# document, with a progress bar.
docs_with_progress = tqdm(docs, smoothing=0, desc='tokenizing with ws')
tokens_with_ws = list(map(tokenizer.tokenize_with_ws, docs_with_progress))

tokenizing with ws:   0%|          | 0/199999 [00:00<?, ?it/s]

## Train word2vec model

In [47]:
# Create a Word2Vec model with default settings and no training data.
# NOTE(review): `w2v` is not referenced by any other cell shown in this
# notebook — confirm it is still needed.
w2v = Word2Vec()


In [48]:
# Hyper-parameters for the Word2Vec run, grouped so they are easy to tune.
# NOTE(review): gensim's documentation indicates that a fixed `seed` gives a
# fully reproducible run only with a single worker thread — confirm whether
# exact reproducibility matters here before relying on `seed=42` with 4 workers.
w2v_params = {
    "vector_size": 200,
    "window": 10,
    "workers": 4,
    "seed": 42,
    "epochs": 20,
    "min_count": 2,
}

# Train on the tokenized documents and keep only the word vectors (`.wv`).
wv_model = Word2Vec(document_tokens, **w2v_params).wv


In [49]:
# Show the words most similar to 'russia' in the trained word vectors.
wv_model.most_similar('russia')

[('ukraine', 0.7251643538475037),
 ('putin', 0.6974132061004639),
 ('russian', 0.6672940254211426),
 ('iran', 0.6205578446388245),
 ('kremlin', 0.618765652179718),
 ('moscow', 0.6179807782173157),
 ('crimea', 0.6139547824859619),
 ('nato', 0.6061310768127441),
 ('russians', 0.6041618585586548),
 ('kiev', 0.591190755367279)]

## Reduce dimensions

In [24]:
from sklearn.decomposition import PCA

# from sklearn.manifold import TSNE
from openTSNE import TSNE


In [50]:
# Fetch the normed word vectors from the model; these feed the PCA step.
normed_vectors = wv_model.get_normed_vectors()


In [51]:
# Reduce the word vectors to 50 principal components before running t-SNE.
# `random_state` is fixed so the result is reproducible when scikit-learn picks
# a randomized SVD solver, in line with the seeds already set for Word2Vec and TSNE.
pca = PCA(n_components=50, random_state=42)
pca_embedding = pca.fit_transform(normed_vectors)


In [52]:
# Configure t-SNE for a 2-D embedding with a fixed seed and verbose logging.
tsne = TSNE(n_components=2, learning_rate="auto", random_state=420, verbose=1)

# Fit on the PCA-reduced word vectors.
TSNE_embedding = tsne.fit(pca_embedding)

# Map the same PCA-reduced vectors through the fitted embedding.
# NOTE(review): openTSNE describes `transform` as a way to place *new* points
# into an existing embedding; here it is applied to the training data itself —
# confirm this second pass is intended rather than using `TSNE_embedding` directly.
wv_tsne_embedding = TSNE_embedding.transform(pca_embedding)

--------------------------------------------------------------------------------
TSNE(random_state=420, verbose=1)
--------------------------------------------------------------------------------
===> Finding 90 nearest neighbors using Annoy approximate search using euclidean distance...
   --> Time elapsed: 33.49 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 2.70 seconds
===> Calculating PCA-based initialization...
   --> Time elapsed: 0.12 seconds
===> Running optimization with exaggeration=12.00, lr=3931.42 for 250 iterations...
Iteration   50, KL divergence 7.0574, 50 iterations in 7.9650 sec
Iteration  100, KL divergence 7.0596, 50 iterations in 8.2880 sec
Iteration  150, KL divergence 7.0575, 50 iterations in 7.9140 sec
Iteration  200, KL divergence 7.0573, 50 iterations in 7.3550 sec
Iteration  250, KL divergence 7.0568, 50 iterations in 7.0270 sec
   --> Time elapsed: 38.55 seconds
===> Running optimization with exaggeration=1.00, lr=3931.42 for 500 iteration

## doc2vec (optional)

Background reading on building document vectors from word embeddings:

- https://towardsdatascience.com/word-embeddings-and-document-vectors-when-in-doubt-simplify-8c9aaeec244e
- https://towardsdatascience.com/document-embedding-techniques-fed3e7a6a25d

In [None]:
%aimport w2v_widget.utils
from w2widget.doc2vec import Doc2Vec, calculate_inverse_frequency

In [None]:
# Compute per-word weights from the tokenized documents.
# NOTE(review): presumably inverse-frequency weights, going by the function
# name — confirm in w2widget.doc2vec.
word_weights = calculate_inverse_frequency(document_tokens)


In [None]:
# Build the document-vector model from the word vectors and the word weights.
dv_model = Doc2Vec(wv_model, word_weights)

# Add document vectors for the tokenized documents.
dv_model.add_doc2vec(document_tokens)

# Run the model's dimensionality reduction; its result is read from
# `TSNE_embedding_array` below.
dv_model.reduce_dimensions()

# NOTE(review): presumably a 2-D t-SNE embedding of the documents, going by the
# attribute name — confirm in w2widget.doc2vec.
dv_tsne_embedding = dv_model.TSNE_embedding_array


  0%|          | 0/10788 [00:00<?, ?it/s]

## Save data

In [53]:
import pickle
from pathlib import Path

# All artifacts of this run are written to one directory.
OUTPUT_DIR = Path("data/npr")
# Create the directory up front so the first write does not fail when it is missing.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)


def _save_pickle(obj, filename, label):
    """Pickle `obj` to `OUTPUT_DIR / filename`, printing "Saving <label>" first."""
    print(f"Saving {label}")
    with open(OUTPUT_DIR / filename, "wb") as f:
        pickle.dump(obj, f)


_save_pickle(tokens_with_ws, "tokens_with_ws.pkl", "tokens with ws")
_save_pickle(wv_model, "wv_model.pkl", "wv_model")
_save_pickle(wv_tsne_embedding, "wv_tsne_embedding.pkl", "wv_tsne embeddings")

# Optional: enable these only after running the doc2vec section.
# _save_pickle(dv_model, "dv_model.pkl", "dv_model")

# print("Saving dv_tsne embeddings")
# with open(OUTPUT_DIR / "dv_tsne_embedding.json", "w") as f:
#     json.dump(dv_tsne_embedding, f)


Saving tokens with ws
Saving wv_model
Saving wv_tsne embeddings


## Display widget

In [42]:
# Arguments for the widget. The doc2vec inputs are left as None, so only the
# word-level data is passed in.
# NOTE(review): this passes `TSNE_embedding`, while the artifact saved to disk
# is `wv_tsne_embedding` — confirm which of the two the widget should receive.
widget_options = dict(
    wv_model=wv_model,
    two_dim_word_embedding=TSNE_embedding,
    tokens_with_ws=tokens_with_ws,
    dv_model=None,
    two_dim_doc_embedding=None,
    initial_search_words=[],
)
wv_widget = Widget(**widget_options)

# Render the widget in the notebook output.
wv_widget.display_widget()


VBox(children=(HTML(value='<style>\n.widget-button {\n    margin-right: 160px;\n}\n\n.widget-select-multiple {…

<IPython.core.display.Javascript object>

In [77]:
# Inspect the `queries` attribute of the widget after interacting with it.
wv_widget.queries

{'test': ['cameron', 'snap', 'followback']}

In [78]:
# Inspect the `topics` attribute of the widget after interacting with it.
wv_widget.topics

{}

In [63]:
from ipywidgets.embed import embed_minimal_html, dependency_state


In [64]:
# Export the widget view to a standalone HTML file. The embedded state is
# collected with `dependency_state` from the same view.
export_options = {
    "views": wv_widget.view,
    "title": "w2widget",
    "state": dependency_state(wv_widget.view),
}
embed_minimal_html("export.html", **export_options)
