## 1. Set up environment

Load the dataset

In [14]:
# Import the libraries
import re
import requests
import numpy as np

In [8]:
url = 'https://raw.githubusercontent.com/alexisperrier/intro2nlp/master/data/Shakespeare_alllines.txt'

# Download the corpus. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops the notebook on an HTTP
# error instead of letting an error page be tokenized as if it were the text.
r = requests.get(url, timeout=30)
r.raise_for_status()

# Read the file into a list of lines; characters outside ASCII are dropped
lines = r.text.encode('ascii',errors='ignore').decode('utf-8').split("\n")

Remove punctuation and keep only the verses that have more than one token, to reduce the size of the corpus.

In [9]:
# Punctuation characters that get replaced by a space before tokenizing.
# Note: the `,-.` run inside the class is a range (`,` through `.`), which
# covers exactly the three characters `,`, `-` and `.`.
punct_pattern = re.compile(r'[\!"#$%&\*+,-./:;<=>?@^_`()|~=]')

# Simple tokenizer: runs of word characters delimited by word boundaries
token_pattern = re.compile(r'\b\w+\b')

# One token list per verse; verses with a single token (or none) are discarded
sentences = [
    verse_tokens
    for verse_tokens in (
        token_pattern.findall(punct_pattern.sub(' ', raw_line))
        for raw_line in lines
    )
    if len(verse_tokens) > 1
]

print("This gives : ", len(sentences), "sentences")

This gives :  108805 sentences


Let's train the `Word2Vec` model, which we call `bard2Vec`.

In [11]:
from gensim.models import Word2Vec

# Hyperparameters of the first model, gathered in one place
skipgram_params = dict(
    min_count=3,     # ignore words that appear fewer than 3 times
    vector_size=50,  # dimensionality of the word embeddings
    sg=1,            # skip-grams
    window=7,        # context window for words during training
    epochs=40,       # number of training epochs over the corpus
)

bard2Vec = Word2Vec(sentences, **skipgram_params)

Once the training is done, we can explore the results by looking at the words most similar to a few chosen words.

In [15]:
def similar_words(word, model=None, topn=10):
    """Print the tokens that `model` ranks as most similar to `word`.

    Parameters
    ----------
    word : str
        Query token, passed to ``model.wv.most_similar``.
    model : optional
        Any object exposing ``.wv.most_similar(word, topn=...)``. When omitted,
        the notebook-level ``bard2Vec`` model is used, so existing
        ``similar_words(word)`` calls behave as before.
    topn : int, default 10
        Number of neighbours requested from ``most_similar``.

    Returns
    -------
    None
        The neighbours are printed, one per line, with the score rounded to
        two decimals.
    """
    # Resolve the default at call time so the function can be defined before
    # (or independently of) the global model.
    if model is None:
        model = bard2Vec

    print("----most similar words to : ", word)
    for (token, score) in model.wv.most_similar(word, topn=topn):
        print(f"\t{token:>10} {np.round(score, 2)}")
    print()
    
# Inspect the neighbourhood of a few hand-picked words
for query in ('King', 'sword', 'husband', 'Hamlet'):
    similar_words(query)

----most similar words to :  King
	     Henry 0.84
	    Edward 0.82
	   Richard 0.75
	   England 0.71
	     Pepin 0.7
	    Naples 0.69
	    Fourth 0.68
	   Warwick 0.68
	     Ghost 0.66
	     Sixth 0.66

----most similar words to :  sword
	      head 0.75
	  Parthian 0.67
	    finger 0.67
	   stirrup 0.65
	       leg 0.65
	       Tie 0.64
	     edged 0.64
	       axe 0.63
	    pistol 0.63
	      hand 0.63

----most similar words to :  husband
	      wife 0.87
	  daughter 0.78
	    mother 0.78
	  mistress 0.77
	    father 0.77
	    master 0.77
	   Orlando 0.74
	       son 0.74
	    sister 0.72
	bequeathed 0.72

----most similar words to :  Hamlet
	   Laertes 0.69
	  Gertrude 0.69
	    cousin 0.67
	     chuck 0.67
	  Gramercy 0.66
	  Eglamour 0.64
	   bawcock 0.63
	  Popilius 0.63
	  Hereford 0.63
	   Stanley 0.63



The results depend on how we trained the model. Let's compare with a second model that uses CBOW instead of skip-grams, a larger context window, and more training epochs.

In [16]:
from gensim.models import Word2Vec

# Hyperparameters of the second model.
# NOTE: the variable below is `bard2vec` (lower-case "v"); it is a different
# name from the earlier `bard2Vec`, so both models stay in the kernel.
cbow_params = dict(
    min_count=3,     # same
    vector_size=50,  # same
    sg=0,            # cbow instead of skip-grams
    window=10,       # larger context windows
    epochs=100,      # longer training
)

bard2vec = Word2Vec(sentences, **cbow_params)

In [17]:
def similar_words(word, model=None, topn=10):
    """Print the tokens that `model` ranks as most similar to `word`.

    Parameters
    ----------
    word : str
        Query token, passed to ``model.wv.most_similar``.
    model : optional
        Any object exposing ``.wv.most_similar(word, topn=...)``. When omitted,
        the notebook-level ``bard2vec`` model is used, so existing
        ``similar_words(word)`` calls behave as before.
    topn : int, default 10
        Number of neighbours requested from ``most_similar``.

    Returns
    -------
    None
        The neighbours are printed, one per line, with the score rounded to
        two decimals.
    """
    # Resolve the default at call time so the function can be defined before
    # (or independently of) the global model.
    if model is None:
        model = bard2vec

    print("-- most similar words to: ", word)
    for (token, score) in model.wv.most_similar(word, topn=topn):
        print(f"\t{token:>10} {np.round(score,2)}")
    print()
    
# Run the same queries against the second model for a side-by-side comparison
probe_words = ('King', 'sword', 'husband', 'Hamlet')
for probe in probe_words:
    similar_words(probe)

-- most similar words to:  King
	    Fourth 0.62
	      vial 0.62
	     Queen 0.6
	  Scotland 0.59
	     Sixth 0.57
	   mockery 0.56
	     Gaunt 0.55
	    Europa 0.54
	      Earl 0.53
	   Macduff 0.53

-- most similar words to:  sword
	      head 0.77
	    weapon 0.7
	     horse 0.68
	    dagger 0.67
	    rapier 0.66
	     heart 0.65
	    finger 0.64
	     knife 0.64
	      life 0.64
	    tongue 0.62

-- most similar words to:  husband
	      wife 0.86
	  mistress 0.85
	    mother 0.84
	  daughter 0.82
	    friend 0.81
	   brother 0.8
	    sister 0.8
	       son 0.8
	    master 0.78
	    father 0.78

-- most similar words to:  Hamlet
	Northumberland 0.65
	  Polonius 0.63
	Canterbury 0.61
	 Worcester 0.6
	  Gertrude 0.6
	   Suffolk 0.54
	Gloucester 0.54
	  Clifford 0.54
	  Hastings 0.53
	 Demetrius 0.53

