In [1]:
import re
import nltk
import numpy as np
import matplotlib.pyplot as plt
import random

In [2]:
from gensim.models import Word2Vec

In [3]:
from sklearn.decomposition import IncrementalPCA   
from sklearn.manifold import TSNE 

In [4]:
# Read the whole book into memory. The context manager closes the file handle
# as soon as the text has been read instead of leaving it open until garbage
# collection.
# NOTE(review): no encoding is passed, so the platform default is used --
# confirm the file's encoding and pass it explicitly if this must be portable.
with open("hunger_games.txt", 'r') as source_file:
    file = source_file.read()

In [5]:
# Flatten line breaks and tabs into single spaces so the text is one long string.
# "\\d" keeps the original literal backslash-d target while avoiding the
# invalid escape sequence "\d", which triggers a SyntaxWarning on recent
# Python versions. str.replace matches literal text, not regex classes, so
# digits are left untouched by this loop.
# NOTE(review): if stripping digits was the intent, use re.sub(r"\d", " ", file).
for char in ["\n", "\r", "\\d", "\t"]:
    file = file.replace(char, " ")

In [6]:
# Sanity check: show the first 100 characters of the loaded text.
print(file[:100])

The Second Book of THE HUNGER GAMES     New York Times Bestsel ling Author   SUZHNNE  COLLINS     PA


In [7]:
def sample_clean_text(text: str):
    """Split ``text`` into sentences of lower-cased word tokens.

    Sentence boundaries come from ``nltk.sent_tokenize`` and the tokens of each
    sentence from ``nltk.word_tokenize``. Every token is lower-cased; nothing is
    filtered out, so whatever the tokenizer emits is kept.

    Returns:
        A list with one entry per sentence, each entry being the list of
        lower-cased tokens of that sentence.
    """
    lowered_sentences = []
    for sentence in nltk.sent_tokenize(text):
        sentence_tokens = nltk.word_tokenize(sentence)
        lowered_sentences.append([token.lower() for token in sentence_tokens])
    return lowered_sentences

In [8]:
# Tokenize the loaded text into a list of sentences, each a list of lower-cased tokens.
tokens = sample_clean_text(text = file)

In [9]:
# Show the first 10 tokenized sentences to check the tokenizer output.
print(tokens[:10])

[['the', 'second', 'book', 'of', 'the', 'hunger', 'games', 'new', 'york', 'times', 'bestsel', 'ling', 'author', 'suzhnne', 'collins', 'parti', '``', 'the', 'spark', "''", '2', '|', 'p', 'a', 'g', 'e', 'catching', 'fire', '-', 'suzanne', 'collins', 'i', 'clasp', 'the', 'flask', 'between', 'my', 'hands', 'even', 'though', 'the', 'warmth', 'from', 'the', 'tea', 'has', 'long', 'since', 'leached', 'into', 'the', 'frozen', 'air', '.'], ['my', 'muscles', 'are', 'clenched', 'tight', 'against', 'the', 'cold', '.'], ['if', 'a', 'pack', 'of', 'wild', 'dogs', 'were', 'to', 'appear', 'at', 'this', 'moment', ',', 'the', 'odds', 'of', 'scaling', 'a', 'tree', 'before', 'they', 'attacked', 'are', 'not', 'in', 'my', 'favor', '.'], ['i', 'should', 'get', 'up', ',', 'move', 'around', ',', 'and', 'work', 'the', 'stiffness', 'from', 'my', 'limbs', '.'], ['but', 'instead', 'i', 'sit', ',', 'as', 'motionless', 'as', 'the', 'rock', 'beneath', 'me', ',', 'while', 'the', 'dawn', 'begins', 'to', 'lighten', 'the',

In [10]:
# Train 100-dimensional word vectors on the tokenized sentences.
# seed=1 spells out gensim's default seed, and workers=1 removes the
# thread-scheduling jitter that otherwise makes every training run produce
# different vectors (and therefore different similarity scores).
# NOTE(review): gensim's documentation also requires a fixed PYTHONHASHSEED
# for results to be reproducible across interpreter launches.
model = Word2Vec(tokens, vector_size=100, seed=1, workers=1)

In [11]:
# Inspect the learned vocabulary: a mapping from each token to its integer index.
model.wv.key_to_index

{'.': 0,
 ',': 1,
 'the': 2,
 'i': 3,
 'to': 4,
 'and': 5,
 'a': 6,
 'of': 7,
 '``': 8,
 "''": 9,
 "'s": 10,
 'it': 11,
 'in': 12,
 'my': 13,
 'he': 14,
 'that': 15,
 'but': 16,
 "n't": 17,
 'me': 18,
 '?': 19,
 'on': 20,
 'with': 21,
 'you': 22,
 'we': 23,
 'for': 24,
 'is': 25,
 'peeta': 26,
 'his': 27,
 'be': 28,
 'as': 29,
 'do': 30,
 'at': 31,
 'have': 32,
 'they': 33,
 'says': 34,
 'from': 35,
 'out': 36,
 'what': 37,
 'him': 38,
 'so': 39,
 'her': 40,
 'up': 41,
 'can': 42,
 'fire': 43,
 'was': 44,
 'she': 45,
 'not': 46,
 'this': 47,
 'no': 48,
 'if': 49,
 'there': 50,
 'catching': 51,
 'are': 52,
 'one': 53,
 'collins': 54,
 'all': 55,
 '-': 56,
 'suzanne': 57,
 'p': 58,
 'g': 59,
 'e': 60,
 '|': 61,
 "'m": 62,
 'know': 63,
 'say': 64,
 'about': 65,
 'into': 66,
 'when': 67,
 'us': 68,
 'haymitch': 69,
 'just': 70,
 'finnick': 71,
 'will': 72,
 'them': 73,
 'back': 74,
 'has': 75,
 'think': 76,
 'then': 77,
 'by': 78,
 'see': 79,
 'an': 80,
 "'re": 81,
 'even': 82,
 'like': 83

In [12]:
# Look up the learned vector for "capitol"; norm=True requests the normalized
# form of the vector (see gensim's KeyedVectors.get_vector).
model.wv.get_vector("capitol", norm=True)

array([-0.02056501,  0.02521762,  0.08708081, -0.06241399,  0.07681879,
       -0.11091646,  0.01648419,  0.23703468, -0.12767258, -0.09835222,
       -0.06744481, -0.13844848, -0.06691328,  0.08572461, -0.01493865,
       -0.05268228, -0.02760391,  0.01515875, -0.05464501, -0.23441707,
        0.01035415, -0.0018388 ,  0.08443313, -0.12946843,  0.03940867,
       -0.07967506, -0.08660537, -0.01997358, -0.11127333,  0.00869636,
        0.12442708,  0.00741584,  0.07809718, -0.11941067,  0.02704737,
        0.07131161,  0.05456196, -0.05791068,  0.00138192, -0.1547464 ,
        0.06471459, -0.12059753, -0.04943063, -0.02008227,  0.17525671,
       -0.00127438, -0.13704754, -0.07511146,  0.05318543,  0.02483948,
        0.11846557, -0.18212734, -0.03388246, -0.02541975, -0.09762754,
        0.03423771, -0.01218472, -0.02731437, -0.1869811 ,  0.14908195,
        0.08523446,  0.04481921,  0.12299781, -0.0010716 , -0.13977379,
        0.08425236,  0.11889565,  0.18687363, -0.10255188,  0.21

In [13]:
# Similarity score between the learned vectors for 'katniss' and 'girl'.
model.wv.similarity('katniss', 'girl')

0.7582978

In [14]:
# Similarity score between the learned vectors for 'peeta' and 'home'.
model.wv.similarity('peeta', 'home')

0.93652195

In [15]:
def reduce_dimensions(model):
    """Project the model's word vectors onto two dimensions with t-SNE.

    Reads ``model.wv.vectors`` and ``model.wv.index_to_key``, fits
    ``TSNE(n_components=2, random_state=0)`` on the vectors and splits the
    result into coordinate lists.

    Returns:
        ``(x_vals, y_vals, labels)``: two lists holding the first and second
        projected coordinate of every vector, and a NumPy array with the
        matching keys in the same order.
    """
    target_dims = 2

    embedding_matrix = np.asarray(model.wv.vectors)
    labels = np.asarray(model.wv.index_to_key)

    # random_state=0 pins the seed handed to t-SNE.
    projector = TSNE(n_components=target_dims, random_state=0)
    projected = projector.fit_transform(embedding_matrix)

    x_vals = []
    y_vals = []
    for point in projected:
        x_vals.append(point[0])
        y_vals.append(point[1])
    return x_vals, y_vals, labels


# Compute the 2-D projection of the trained model's vectors once; the plotting
# code below reuses these coordinates and labels.
x_vals, y_vals, labels = reduce_dimensions(model)

In [18]:
def plot_with_plotly(x_vals, y_vals, labels, plot_in_notebook=True):
    """Draw the words as a text-only plotly scatter plot.

    Args:
        x_vals: x coordinate of every word.
        y_vals: y coordinate of every word.
        labels: text drawn at each point.
        plot_in_notebook: when true, initialise plotly's notebook mode and
            render with ``iplot``; otherwise call ``plot`` with the filename
            ``word-embedding-plot.html``.
    """
    # plotly is imported lazily so it is only needed when this function is called.
    from plotly.offline import init_notebook_mode, iplot, plot
    import plotly.graph_objs as go

    figure_data = [go.Scatter(x=x_vals, y=y_vals, mode='text', text=labels)]

    if not plot_in_notebook:
        plot(figure_data, filename='word-embedding-plot.html')
        return

    init_notebook_mode(connected=True)
    iplot(figure_data, filename='word-embedding-plot')


def plot_with_matplotlib(x_vals, y_vals, labels):
    """Scatter-plot the 2-D word embedding and annotate a random sample of words.

    Args:
        x_vals: x coordinate of every word.
        y_vals: y coordinate of every word.
        labels: word for each point, in the same order as the coordinates.

    At most 25 points are labelled. The choice is made with a private RNG
    seeded with 0, which picks the same points as ``random.seed(0)`` followed
    by ``random.sample`` but leaves the global ``random`` state untouched.
    When fewer than 25 labels exist, all of them are annotated instead of
    raising ``ValueError``.

    NOTE(review): the figure is neither shown nor saved here -- outside a
    notebook the caller still has to call ``plt.show()`` or ``plt.savefig()``.
    """
    import matplotlib.pyplot as plt
    import random

    max_annotations = 25

    plt.figure(figsize=(12, 12))
    plt.scatter(x_vals, y_vals)

    # random.sample raises ValueError when asked for more items than exist,
    # so cap the sample size at the number of available labels.
    sample_size = min(max_annotations, len(labels))
    rng = random.Random(0)
    for i in rng.sample(range(len(labels)), sample_size):
        plt.annotate(labels[i], (x_vals[i], y_vals[i]))

# Detect an IPython/Jupyter session: calling get_ipython() only succeeds when
# that name is defined; any exception is treated as "not in IPython".
try:
    get_ipython()
    running_in_ipython = True
except Exception:
    running_in_ipython = False

# Interactive plotly rendering inside IPython, static matplotlib otherwise.
plot_function = plot_with_plotly if running_in_ipython else plot_with_matplotlib

plot_function(x_vals, y_vals, labels)
