**Word Vectors from the Mahabharata**

In [None]:
from __future__ import absolute_import, division, print_function

In [None]:
import codecs
import glob
import logging
import multiprocessing
import os
import pprint
import re

In [None]:
import nltk
import gensim.models.word2vec as w2v
import sklearn.manifold
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
%pylab inline

Set up logging

In [None]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
# List the available input files. Uses os.listdir instead of shelling out to
# `ls`, so the cell also works on platforms without that command.
print("\n".join(sorted(os.listdir("../input"))))

Prepare Corpus
Load books from files

In [None]:
book_filenames = sorted(glob.glob("../input/*.txt"))

In [None]:
print("Found books:")
book_filenames

Combine the books into one string

In [None]:
# Read every book as UTF-8 text, reporting the running corpus size after each
# file, then concatenate the pieces into one string.
book_texts = []
corpus_length = 0
for book_filename in book_filenames:
    print("Reading '{0}'...".format(book_filename))
    with codecs.open(book_filename, "r", "utf-8") as book_file:
        book_text = book_file.read()
    book_texts.append(book_text)
    corpus_length += len(book_text)
    print("Corpus is now {0} characters long".format(corpus_length))
    print()
corpus_raw = u"".join(book_texts)

Split the corpus into sentences

In [None]:
# Load NLTK's English Punkt sentence tokenizer from the local NLTK data.
# NOTE(review): this needs the 'punkt' resource to be installed already
# (e.g. via nltk.download('punkt')) — confirm it is available in this environment.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [None]:
raw_sentences = tokenizer.tokenize(corpus_raw)

In [None]:
# Convert one raw sentence into a list of words.
# Everything that is not an ASCII letter (digits, punctuation, hyphens,
# whitespace) is treated as a separator and dropped.
def sentence_to_wordlist(raw):
    """Return the maximal runs of ASCII letters found in ``raw``, in order."""
    return re.findall("[a-zA-Z]+", raw)

In [None]:
# One word list per non-empty raw sentence.
sentences = [
    sentence_to_wordlist(raw_sentence)
    for raw_sentence in raw_sentences
    if len(raw_sentence) > 0
]

In [None]:
print(raw_sentences[5])
print(sentence_to_wordlist(raw_sentences[5]))

In [None]:
# Total number of word tokens across all sentences.
# The lengths are materialised as a list on purpose: after `%pylab inline`
# the name `sum` may refer to numpy's sum, which expects a sequence.
sentence_lengths = [len(words) for words in sentences]
token_count = sum(sentence_lengths)
print("The book corpus contains {0:,} tokens".format(token_count))

Train Word2Vec

In [None]:
# Word2Vec hyperparameters; they are passed to the model constructor in the
# following cell.

# Dimensionality of the resulting word vectors.
# More dimensions can capture more detail but are more expensive to train.
num_features = 300
# Minimum word count threshold (passed as min_count).
min_word_count = 3

# Number of threads to run in parallel.
# More workers generally means faster training.
num_workers = multiprocessing.cpu_count()

# Context window length.
context_size = 7

# Downsample setting for frequent words.
# NOTE(review): 1e-3 matches gensim's documented default, while its docs cite
# (0, 1e-5) as the useful range — confirm which is intended here.
downsampling = 1e-3

# Seed for the random number generator.
# NOTE(review): gensim documents that fully reproducible runs additionally
# require a single worker thread (and a fixed PYTHONHASHSEED on Python 3);
# with num_workers > 1 the results may still vary between runs.
seed = 1

In [None]:
# Configure the model from the hyperparameters above. No sentences are passed
# here; the vocabulary is built and the model is trained in later cells.
# sg=1 selects the skip-gram training algorithm.
# NOTE(review): `size` is the pre-4.0 gensim keyword (renamed `vector_size`
# in gensim 4.0) — confirm the installed gensim version.
mahabharata2vec = w2v.Word2Vec(
    sg=1,
    seed=seed,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context_size,
    sample=downsampling
)

In [None]:
mahabharata2vec.build_vocab(sentences)

In [None]:
print("Word2Vec vocabulary length:", len(mahabharata2vec.wv.vocab))

Start training, this might take a minute or two...

In [None]:
# gensim >= 2.0 raises ValueError unless the corpus size and the number of
# epochs are given explicitly. corpus_count was recorded by build_vocab() and
# `iter` holds the epoch count the model was configured with.
mahabharata2vec.train(
    sentences,
    total_examples=mahabharata2vec.corpus_count,
    epochs=mahabharata2vec.iter
)

Save to file, can be useful later

In [None]:
if not os.path.exists("trained"):
    os.makedirs("trained")

In [None]:
mahabharata2vec.save(os.path.join("trained", "mahabharata2vec.w2v"))

Explore the trained model.

In [None]:
mahabharata2vec = w2v.Word2Vec.load(os.path.join("trained", "mahabharata2vec.w2v"))

Compress the word vectors into 3D space with t-SNE and plot them (two components as the x/y position, the third as the point colour)

In [None]:
# Reduce the word vectors to three t-SNE components: later cells plot the
# first two as x/y and use the third ("z") to colour the points.
# random_state is fixed so repeated runs use the same t-SNE seed.
tsne = sklearn.manifold.TSNE(n_components=3, random_state=0)

In [None]:
all_word_vectors_matrix = mahabharata2vec.wv.syn0

Train t-SNE, this could take a minute or two...

In [None]:
# NOTE: despite the "_2d" suffix, each row has three coordinates, because the
# TSNE instance was created with n_components=3.
all_word_vectors_matrix_2d = tsne.fit_transform(all_word_vectors_matrix)

Plot the big picture

In [None]:
# One row per vocabulary word: the word plus its three t-SNE coordinates,
# looked up through the word's row index in the embedding matrix.
point_rows = []
for word in mahabharata2vec.wv.vocab:
    coords = all_word_vectors_matrix_2d[mahabharata2vec.wv.vocab[word].index]
    point_rows.append((word, coords[0], coords[1], coords[2]))

points = pd.DataFrame(point_rows, columns=["word", "x", "y", "z"])

In [None]:
points.head(10)

In [None]:
sns.set_context("poster")

In [None]:
# Overview plot: x/y give the position, the third t-SNE component the colour.
points.plot.scatter(x="x", y="y", c="z", s=10, figsize=(12, 12))

In [None]:
def plot_region(x_bounds, y_bounds):
    """Scatter-plot and label the words that fall inside a rectangle.

    x_bounds and y_bounds are (low, high) pairs; both ends are inclusive.
    Reads the notebook-level ``points`` frame (columns word, x, y).
    """
    in_x_range = points.x.between(x_bounds[0], x_bounds[1])
    in_y_range = points.y.between(y_bounds[0], y_bounds[1])
    region = points[in_x_range & in_y_range]

    ax = region.plot.scatter("x", "y", s=35, figsize=(10, 8))
    # Put each word next to its dot, nudged slightly so the text does not overlap it.
    for _, row in region.iterrows():
        ax.text(row.x + 0.005, row.y + 0.005, row.word, fontsize=11)

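Zoom in on one region of the plot to see which words ended up together. (The original caption, "People related to Kingsguard ended up together", appears to be left over from a Game of Thrones corpus and does not describe this Mahabharata data; the bounds below may need adjusting.)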

In [None]:
plot_region(x_bounds=(4.0, 4.2), y_bounds=(-0.5, -0.1))

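Zoom in on a second region of the plot. (The original caption, "Food products are grouped nicely as well. Aerys (The Mad King) being close to "roasted" also looks sadly correct", appears to be left over from a Game of Thrones corpus and does not describe this Mahabharata data.)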

In [None]:
plot_region(x_bounds=(0, 1), y_bounds=(4, 4.5))

Explore semantic similarities between book characters. Words closest to the given word

In [None]:
# Query through the model's KeyedVectors (.wv), as the rest of the notebook
# does; the model-level most_similar shortcut is deprecated in gensim.
mahabharata2vec.wv.most_similar("Krishna")

In [None]:
# Query through the model's KeyedVectors (.wv), as the rest of the notebook
# does; the model-level most_similar shortcut is deprecated in gensim.
mahabharata2vec.wv.most_similar("Arjuna")

In [None]:
# Query through the model's KeyedVectors (.wv), as the rest of the notebook
# does; the model-level most_similar shortcut is deprecated in gensim.
mahabharata2vec.wv.most_similar("Karna")

In [None]:
# Query through the model's KeyedVectors (.wv), as the rest of the notebook
# does; the model-level most_similar shortcut is deprecated in gensim.
mahabharata2vec.wv.most_similar("Vrishasena")

Linear relationships between word pairs

In [None]:
def nearest_similarity_cosmul(start1, end1, end2):
    """Complete the analogy "start1 is to end1 as ? is to end2".

    Queries the trained vectors with ``most_similar_cosmul`` using ``end2``
    and ``start1`` as positive terms and ``end1`` as the negative term,
    prints the resulting analogy and returns the top-ranked word.
    """
    # Go through .wv like the rest of the notebook; the model-level shortcut
    # is deprecated in gensim.
    similarities = mahabharata2vec.wv.most_similar_cosmul(
        positive=[end2, start1],
        negative=[end1]
    )
    start2 = similarities[0][0]
    # Name the substituted values explicitly instead of passing **locals().
    print("{start1} is related to {end1}, as {start2} is related to {end2}".format(
        start1=start1, end1=end1, start2=start2, end2=end2
    ))
    return start2

In [None]:
nearest_similarity_cosmul("Dhritarastra", "Pandu", "Nakula")
nearest_similarity_cosmul("Bhima", "Arjuna", "Ambika")

In [None]:
from nltk.tag import pos_tag

sentence = "Vrishasena Ambalika at the death of Duhshasana and Chitrasena rushed against Nakula desiring to fight with his father's enemy. A fierce battle then ensued between those two heroes. Vrishasena managed to kill Nakula's horses and pierce him with many arrows. Descending from his chariot, Nakula took up his sword and shield, and making his way toward Vrishasena, he severed the heads of two thousand horsemen. Vrishasena, seeing Nakula coming towards him whirling that sword like a discus, shattered the sword and shield with four crescent shaped arrows. Nakula then quickly ascended Bhima's chariot. As Arjuna came near, Nakula requested him Please slay this sinful person Arjuna then ordered Lord Krishna Proceed toward the son of Karna."
# Tokenize with NLTK rather than str.split(): splitting on whitespace leaves
# punctuation and possessives glued to the words ("enemy.", "Nakula's",
# "Vrishasena,"), which pollutes both the tagger input and the noun list.
tagged_sent = pos_tag(nltk.word_tokenize(sentence))
print(tagged_sent)

# Keep only the tokens tagged as singular proper nouns (Penn Treebank "NNP").
propernouns = [word for word, pos in tagged_sent if pos == 'NNP']
print(propernouns)