In [1]:
# Bridge between Py2 and Py3
from __future__ import absolute_import, division, print_function
# Word encoding
import codecs
# Filename pattern matching (globbing)
import glob
# Concurrency - multiprocessing (used here for the CPU count)
import multiprocessing
# OS
import os
# Pretty printing
import pprint
# Regular expression
import re
# NLP toolkit
import nltk
# Word to vec
import gensim.models.word2vec as w2v
# Dimensionality reduction 
import sklearn.manifold
# Math
import numpy as np
# Plotting
import matplotlib.pyplot as plt
# Pandas
import pandas as pd
# Visualization
import seaborn as sns

In [2]:
# Process data
# Clean data: fetch the NLTK resources used by the cells below

nltk.download('punkt') # Pretrained Punkt sentence tokenizer
nltk.download('stopwords') # and,the,a,an -- NOTE(review): stop words are not referenced in the visible cells

[nltk_data] Downloading package punkt to /home/shankar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/shankar/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# Get books: every .txt file under data/, sorted so the books are always
# read in the same order
book_filenames = sorted(glob.glob('data/*.txt'))
# Display the list as the cell output
book_filenames

['data/got1.txt',
 'data/got2.txt',
 'data/got3.txt',
 'data/got4.txt',
 'data/got5.txt']

In [4]:
# Build one unicode string holding the text of every book, in order,
# reporting the running corpus size after each file
corpus_raw = u""
for book_filename in book_filenames:
    print("Reading '{0}'..".format(book_filename))
    # Decode as UTF-8 while reading; the handle is closed before we append
    with codecs.open(book_filename, mode="r", encoding="utf-8") as book_file:
        book_text = book_file.read()
    corpus_raw = corpus_raw + book_text
    print("Corpus is now {0} characters long".format(len(corpus_raw)))
    

Reading 'data/got1.txt'..
Corpus is now 1770659 characters long
Reading 'data/got2.txt'..
Corpus is now 4071041 characters long
Reading 'data/got3.txt'..
Corpus is now 6391405 characters long
Reading 'data/got4.txt'..
Corpus is now 8107945 characters long
Reading 'data/got5.txt'..
Corpus is now 9719485 characters long


In [5]:
# Split the corpus into sentences: load the English Punkt model that the
# nltk.download('punkt') call fetched
# NOTE(review): this resource is a pickle; newer NLTK releases reportedly ship
# Punkt in a different (non-pickle) format -- confirm against the installed version
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [6]:
# Run the sentence tokenizer over the whole corpus
raw_sentences = tokenizer.tokenize(corpus_raw)
# Preview only the first few sentences: displaying the full list would dump
# the entire corpus into the cell output and bloat the notebook
raw_sentences[:10]

[u'This edition contains the complete text of the original hardcover edition.',
 u'NOT ONE WORD HAS BEEN OMITTED.',
 u'A CLASH OF KINGS\n\nA Bantam Spectra Book\n\nPUBLISHING HISTORY\n\nBantam Spectra hardcover edition published February 1999\n\nBantam Spectra paperback edition / September 2000\n\nSPECTRA and the portrayal of a boxed \u201cs\u201d are trademarks of Bantam Books, a division of Random House, Inc.\n\nAll rights reserved.',
 u'Copyright \xa9 1999 by George R. R. Martin.',
 u'Maps by James Sinclair.',
 u'Heraldic crest by Virginia Norey.',
 u'Library of Congress Catalog Card Number: 98-37954.',
 u'No part of this book may be reproduced or transmitted in any form or by any means, electronic or mechanical, including photocopying, recording, or by any information storage and retrieval system, without permission in writing from the publisher.',
 u'Visit our website at www.bantamdell.com\n\nBantam Books, the rooster colophon, Spectra and the portrayal of a boxed \u201cs\u201d ar

In [7]:
# List of words
def sentence_to_wordlist(raw):
    """Split one raw sentence into a list of purely alphabetic words.

    Every character outside a-z / A-Z is turned into a space before
    splitting, so digits, punctuation and non-ASCII letters all act as
    separators (e.g. "it's" becomes ["it", "s"]). Case is preserved.

    raw: the sentence text.
    Returns the list of words; empty when the sentence has no ASCII letters.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", raw)
    return letters_only.split()

In [8]:
# Tokenize every non-empty raw sentence into its list of words
sentences = [
    sentence_to_wordlist(raw_sentence)
    for raw_sentence in raw_sentences
    if raw_sentence
]

In [9]:
# Spot-check: one raw sentence as returned by the sentence tokenizer
raw_sentences[5]

u'Heraldic crest by Virginia Norey.'

In [10]:
# Spot-check: the same sentence after word splitting (punctuation is dropped)
sentence_to_wordlist(raw_sentences[5])

[u'Heraldic', u'crest', u'by', u'Virginia', u'Norey']

In [11]:
# Spot-check: the stored token list at the same index
sentences[5]

[u'Heraldic', u'crest', u'by', u'Virginia', u'Norey']

In [12]:
# Total number of word tokens across all tokenized sentences
token_count = sum(len(sentence) for sentence in sentences)
token_count

1818103

In [7]:
# Dimensionality of the resulting word vectors (passed to Word2Vec as `size`).
num_features = 300

# Minimum word count threshold (passed to Word2Vec as `min_count`).
# Presumably words seen fewer times are left out of the vocabulary -- confirm
# against the gensim documentation.
min_word_count = 3

# Number of worker threads: one per CPU reported by the OS.
num_workers = multiprocessing.cpu_count()

# Context window length (passed to Word2Vec as `window`).
context_size = 7

# Downsampling rate for frequent words (passed to Word2Vec as `sample`).
downsampling = 1e-3

# Seed for the model's random number generator.
# NOTE(review): with more than one worker, runs may still not be exactly
# reproducible despite the fixed seed -- confirm against the gensim docs.
seed = 1

# Display the detected worker count as the cell output.
num_workers

8

In [14]:
# Create the (still untrained) Word2Vec model from the settings above.
# NOTE(review): `size` is the keyword of the gensim release this notebook was
# run with; later releases are said to rename it -- confirm before upgrading.
thrones2vec = w2v.Word2Vec(
    # vector shape and context
    size=num_features,
    window=context_size,
    # vocabulary pruning and frequent-word subsampling
    min_count=min_word_count,
    sample=downsampling,
    # training algorithm (sg=1 selects skip-gram in gensim), RNG seed, threads
    sg=1,
    seed=seed,
    workers=num_workers,
)

In [15]:
# Build vocab: scan the tokenized sentences so the model knows its vocabulary
# before the training cell runs on the same sentences
thrones2vec.build_vocab(sentences)

In [16]:
# Number of distinct words kept in the model's vocabulary
# NOTE(review): later gensim releases reportedly expose the vocabulary on
# `thrones2vec.wv` instead -- confirm against the installed version
len(thrones2vec.vocab)

17277

In [17]:
# Train with sentences; the displayed return value is whatever train() reports
# (presumably the count of words processed -- confirm).
# NOTE(review): later gensim releases reportedly require explicit
# `total_examples` and `epochs` arguments here -- confirm before upgrading.
thrones2vec.train(sentences)

7022609

In [18]:
# Save model: make sure the output directory exists first
model_dir = 'trained'
if not os.path.exists(model_dir):
    os.makedirs(model_dir)

In [19]:
# Persist the trained model to trained/thrones2vec.w2v
thrones2vec.save(os.path.join("trained","thrones2vec.w2v"))

In [20]:
# Load the model back from the file written by the save cell, replacing the
# in-memory instance
# NOTE(review): gensim's save/load is pickle-based as far as I know -- only
# load model files from a trusted source; confirm
thrones2vec = w2v.Word2Vec.load(os.path.join("trained","thrones2vec.w2v"))

In [21]:
# Convert to 2D
# tsne = sklearn.manifold.TSNE(n_components=2, random_state=0)
# tsne2 = sklearn.manifold.TSNE(n_components=100, random_state=0)

In [22]:
# all_word_vectors_matrix = thrones2vec.syn0

In [23]:
# all_word_vectors_matrix_2d = tsne.fit_transform(all_word_vectors_matrix)

In [24]:
# points = pd.DataFrame(
#    [
#        (word,coords[0],coords[1])
#        for word,coords in [
#            (word, all_word_vectors_matrix_2d[thrones2vec.vocab[word].index])
#            for word in thrones2vec.vocab
#        ]
#    ],
#    columns = ["word","x","y"]
#)

In [25]:
# Words most similar to "Stark" with their similarity scores.
# Tokens are never lower-cased by sentence_to_wordlist, so lookups are
# case-sensitive: query "Stark", not "stark".
thrones2vec.most_similar("Stark")

[(u'Eddard', 0.7246720790863037),
 (u'Winterfell', 0.648485004901886),
 (u'Brandon', 0.6379424929618835),
 (u'Hornwood', 0.636419415473938),
 (u'Starks', 0.633171021938324),
 (u'Lyanna', 0.6311416625976562),
 (u'beheaded', 0.6298810243606567),
 (u'Arryn', 0.6290252208709717),
 (u'Karstark', 0.6211104393005371),
 (u'executed', 0.6201556921005249)]

In [29]:
#distance, similarity, and ranking
def nearest_similarity_cosmul(start1, end1, end2):
    """Complete the analogy "start1 is to end1 as ? is to end2".

    Queries the module-level `thrones2vec` model with
    most_similar_cosmul(positive=[end2, start1], negative=[end1]), prints the
    full candidate list followed by a sentence describing the top match, and
    returns that top-ranked word.

    start1, end1: the known related pair.
    end2: the word whose counterpart is wanted.
    Returns the first word of the first (best) candidate.
    """
    similarities = thrones2vec.most_similar_cosmul(
        positive=[end2, start1], negative=[end1]
    )
    best_match = similarities[0]
    start2 = best_match[0]
    print(similarities)
    sentence = "{start1} is related to {end1}, as {start2} is related to {end2}".format(
        start1=start1, end1=end1, start2=start2, end2=end2
    )
    print(sentence)
    return start2

In [30]:
# Three example analogies; each call prints its candidates and a summary line.
# Only the last call's return value is shown as the cell result.
nearest_similarity_cosmul("Stark", "Winterfell", "Riverrun")
nearest_similarity_cosmul("Jaime", "sword", "wine")
nearest_similarity_cosmul("Arya", "Nymeria", "dragons")

[(u'Tully', 0.9138249754905701), (u'Blackwood', 0.8708528280258179), (u'Cleos', 0.8612934947013855), (u'Emmon', 0.8566479086875916), (u'Alys', 0.8455457091331482), (u'Genna', 0.8430482149124146), (u'Edmure', 0.8379801511764526), (u'Tytos', 0.8373609781265259), (u'Blackfish', 0.8346664905548096), (u'Roslin', 0.8338011503219604)]
Stark is related to Winterfell, as Tully is related to Riverrun
[(u'Tyrion', 0.9685537219047546), (u'drank', 0.9576870799064636), (u'drinking', 0.9499372243881226), (u'cup', 0.9494040012359619), (u'Cersei', 0.9483168721199036), (u'Shae', 0.9450653195381165), (u'sipped', 0.9335622787475586), (u'hippocras', 0.9299818277359009), (u'drained', 0.923220157623291), (u'ale', 0.9200564622879028)]
Jaime is related to sword, as Tyrion is related to wine
[(u'Dany', 0.8065468668937683), (u'lust', 0.7817673087120056), (u'dragon', 0.7735683917999268), (u'tricks', 0.7723599672317505), (u'magic', 0.7684629559516907), (u'reasons', 0.762871265411377), (u'sale', 0.7619982957839966)

u'Dany'