# Table of Contents
* [Wordcloud by Author](#1)
* [Word Frequencies by Author](#2)
* [Word2Vec - Word Embeddings](#3)
* [Visualize using UMAP](#4)

In [None]:
# PACKAGES

# standard
import numpy as np
import pandas as pd
import time
import random

# plots
import matplotlib.pyplot as plt
import seaborn as sns

# NLP
import nltk
from nltk.tokenize import word_tokenize

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

from collections import Counter

# H2O
import h2o
from h2o.estimators import H2OWord2vecEstimator

# UMAP
import umap

In [None]:
# read data: load the corpus CSV and report how long the read took
csv_path = '../input/political-though-work-corpus/political_thought_works_corpus.csv'
t1 = time.time()
df = pd.read_csv(csv_path)
elapsed = time.time() - t1
print('Elapsed time: ', np.round(elapsed, 2))

In [None]:
# clean up: remove the 'Unnamed: 0' and 'text' columns in a single call
df = df.drop(columns=['Unnamed: 0', 'text'])
# replace special character (overwrites the author stored at index label 1)
df.loc[1, 'authors'] = 'Niccolo Machiavelli'

In [None]:
# add features: size of each row's cleaned text
text_col = df['text_clean']
df['n_char'] = text_col.str.len()             # number of characters
df['n_word'] = text_col.str.split().map(len)  # number of whitespace-separated words

In [None]:
# show overview: the bare expression renders the frame (including the
# n_char / n_word columns added above) as this cell's output
df

In [None]:
# plot number of words: one bar per book title
fig, ax = plt.subplots(figsize=(12,4))
ax.bar(df.book_title, df.n_word)
ax.set_title('Number of words')
ax.tick_params(axis='x', labelrotation=90)  # vertical labels
ax.grid()
plt.show()

In [None]:
# extract list of authors: one entry per author, in order of first appearance.
# De-duplicated so that an author with several rows is not processed (and
# plotted) more than once by the per-author loops below.
authors = df.authors.unique().tolist()
print(authors)

<a id='1'></a>
# Wordcloud by Author

In [None]:
# stop words shipped with the wordcloud package
stopwords = set(STOPWORDS)

t1 = time.time()
for author in authors:
    author_rows = df[df.authors == author]

    print('Author = ', author.upper(), ':')

    # merge all texts of this author into one string and render the wordcloud
    text = ' '.join(author_rows.text_clean)
    renderer = WordCloud(stopwords=stopwords, max_font_size=50, max_words=500,
                         width=600, height=400, background_color='white')
    wordcloud = renderer.generate(text)

    fig, ax = plt.subplots(figsize=(12,8))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    plt.show()
elapsed = time.time() - t1
print('Elapsed time: ', np.round(elapsed, 2))

<a id='2'></a>
# Word Frequencies by Author

In [None]:
for a in authors:
    # All rows of this author. Every text is joined so the counts cover the
    # author's complete corpus (the same treatment the wordcloud cell uses)
    # instead of silently using only the first row.
    df_temp = df[df.authors==a]
    txt = ' '.join(df_temp.text_clean)

    # token frequencies: the ten most common tokens, most frequent first
    tokens = word_tokenize(txt)
    freq10 = dict(Counter(tokens).most_common(10))
    # plot
    plt.figure(figsize=(14,4))
    plt.bar(list(freq10.keys()), list(freq10.values()))
    plt.title('Author='+a)
    plt.grid()
    plt.show()

<a id='3'></a>
# Word2Vec - Word Embeddings

#### Using code from: https://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-science/word2vec.html

In [None]:
# start H2O (needed by the H2OFrame upload and Word2Vec cells below)
h2o.init()

In [None]:
# utility function for tokenization
def tokenize(sentences, stop_word = stopwords): # use stop words from wordcloud package
    """Split text into cleaned, lower-cased tokens.

    Parameters
    ----------
    sentences : H2O frame column holding the text to split
        (assumed to be a string column -- TODO confirm).
    stop_word : collection of words to discard. Defaults to the notebook-level
        ``stopwords`` object as it exists when this function is defined.

    Returns
    -------
    H2O frame of tokens that are at least two characters long, contain no
    digit and are not stop words. Rows that are NA are kept by the length and
    stop-word filters; presumably they mark the boundaries between the input
    texts -- verify against the H2O Word2Vec documentation.
    """
    # split on runs of non-word characters, then lower-case
    lowered = sentences.tokenize('\\W+').tolower()

    # keep tokens with at least two characters (NA rows pass through)
    long_enough = (lowered.nchar() >= 2) | (lowered.isna())
    candidates = lowered[long_enough, :]

    # discard tokens that contain a digit
    no_digit = candidates.grep('[0-9]', invert=True, output_logical=True)
    candidates = candidates[no_digit, :]

    # discard stop words (NA rows pass through)
    keep = (candidates.isna()) | (~ candidates.isin(stop_word))
    return candidates[keep, :]

In [None]:
# upload data to H2O environment (author label and cleaned text only)
h2o_columns = ['authors', 'text_clean']
text_h2o = h2o.H2OFrame(df[h2o_columns])

In [None]:
# tokenize text and report how long it took
t1 = time.time()
words = tokenize(text_h2o['text_clean'])
elapsed = time.time() - t1
print('Elapsed time:', np.round(elapsed, 2), 'secs')

In [None]:
# train Word2Vec model
# NOTE(review): this seeds Python's `random` module only -- confirm whether it
# has any influence on the H2O training run.
random.seed(1234)

# hyper-parameters collected in one place so they are easy to tune
w2v_params = dict(vec_size=50,
                  window_size=5,
                  sent_sample_rate=0.001,
                  init_learning_rate=0.025,
                  epochs=10)

t1 = time.time()
w2v_model = H2OWord2vecEstimator(**w2v_params)
w2v_model.train(training_frame=words)
elapsed = time.time() - t1
print('Elapsed time:', np.round(elapsed, 2), 'secs')

In [None]:
# check model: look up 5 synonyms for 'knowledge' as a quick sanity check
# of the trained embeddings; the result is shown as this cell's output
w2v_model.find_synonyms('knowledge', count = 5)

In [None]:
# create vector representation for each author: word vectors aggregated with 'AVERAGE'
doc_vectors = w2v_model.transform(words, aggregate_method = 'AVERAGE')
# and add authors to vectors
text_vec = doc_vectors.cbind(text_h2o['authors'])
text_vec

In [None]:
# vector features (columns w/o the label)
label_col = 'authors'
features = list(text_vec.columns)
features.remove(label_col)

# convert H2O frame to Pandas data frame and drop rows with missing values
df_text_vec = text_vec.as_data_frame().dropna(axis=0)

<a id='4'></a>
# Visualize using UMAP

In [None]:
# run UMAP algorithm to get a low dimensional (in our case 2D) representation
dim_reducer = umap.UMAP(n_components=2, n_neighbors=4, random_state=111)

t1 = time.time()
text_vec_umap = dim_reducer.fit_transform(df_text_vec[features])
elapsed = time.time() - t1
print('Elapsed time:', np.round(elapsed, 2), 'secs')

# convert result matrix to data frame and add the authors again
df_text_vec_umap = pd.DataFrame(text_vec_umap, columns=['x','y']).assign(
    authors=df_text_vec.authors.tolist()
)

In [None]:
# plot author vectors: one marker per row of the 2D embedding, coloured by author
fig, ax = plt.subplots(figsize=(12,8))
sns.scatterplot(data=df_text_vec_umap, x='x', y='y',
                hue='authors', alpha=1, s=1000, ax=ax)
# legend placed to the right of the axes
ax.legend(loc='center left', bbox_to_anchor=(1.05, 0.5), ncol=1)
ax.grid()
plt.show()