## Import modules

In [28]:
import os
import re
import string
from time import time

import gensim
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser
from matplotlib import pyplot
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

%matplotlib inline

# Record the library version used for this run.
# The TensorFlow line was dropped: `tf` is never imported or used in this
# notebook, so printing its version raised a NameError on a fresh kernel.
print(f"Gensim version: {gensim.__version__}")

In [29]:
# Fetch the NLTK resources (no-op when they are already up to date).
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
# Go through `nltk.corpus` explicitly: the previous `stopwords.words(...)` call
# read from the very name it was rebinding, so running this cell a second time
# failed because `stopwords` was already a plain list.
stopwords = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package punkt to /home/tdelatte/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/tdelatte/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/tdelatte/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Preprocessing

In [15]:
def clean_doc(doc):
    """Normalise a raw document into a single-space separated string.

    The text is lowercased, every run of ASCII digits is removed, every
    character in ``string.punctuation`` is stripped, and the remaining
    whitespace-separated tokens are re-joined with single spaces.

    Args:
        doc (str): Raw document text.

    Returns:
        str: The cleaned text. Tokens that consisted only of digits and/or
        punctuation disappear entirely instead of leaving empty slots
        (and therefore double spaces) in the result.
    """
    # Lowercase
    doc = doc.lower()
    # Remove numbers
    doc = re.sub(r"[0-9]+", "", doc)
    # Remove punctuation; the translation table is built once for the whole
    # document rather than once per token.
    doc = doc.translate(str.maketrans('', '', string.punctuation))
    # split() without arguments discards empty tokens and collapses whitespace.
    return ' '.join(doc.split())

def read_files(path, skip_suffixes=(".bin", ".gz")):
    """Load, clean and tokenize every text file directly inside ``path``.

    Args:
        path (str): Directory containing the raw documents. Sub-directories
            are ignored.
        skip_suffixes (tuple of str): File-name endings (compared
            case-insensitively) to leave out. The default skips binary
            artefacts such as the GoogleNews ``.bin.gz`` archive that this
            notebook stores in the same directory and that cannot be decoded
            as UTF-8 text.

    Returns:
        list of list of str: One token list per file, in sorted file-name
        order so that repeated runs see the documents in the same order.
    """
    skipped = tuple(suffix.lower() for suffix in skip_suffixes)
    # os.listdir() returns entries in arbitrary order; sort for reproducibility.
    file_names = sorted(
        name for name in os.listdir(path)
        if os.path.isfile(os.path.join(path, name))
        and not name.lower().endswith(skipped)
    )
    clean = []
    for name in file_names:
        with open(os.path.join(path, name), encoding='utf-8') as f:
            raw_text = f.read()
        # The file is closed before the (slower) cleaning and tokenization.
        clean.append(gensim.utils.simple_preprocess(clean_doc(raw_text)))
    return clean

In [3]:
# Folder that holds the raw text files making up the corpus
TEXT_DIR = "/home/tdelatte/projects/notebooks/data/word_embeddings/"

In [17]:
# Read every file under TEXT_DIR and turn each one into a list of tokens
docs = read_files(TEXT_DIR)

print(f"Number of documents: {len(docs)}")

Number of documents: 7


In [32]:
# Get Google word embeddings
!wget -P projects/notebooks/data/ -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

--2020-06-17 15:53:20--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.137.62
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.137.62|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1647046227 (1.5G) [application/x-gzip]
Saving to: ‘projects/notebooks/data/GoogleNews-vectors-negative300.bin.gz’


2020-06-17 16:29:36 (739 KB/s) - ‘projects/notebooks/data/GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]



In [43]:
# Path of the pre-trained GoogleNews vectors (word2vec binary format, gzipped)
EMBEDDING_FILE = '/home/tdelatte/projects/notebooks/data/word_embeddings/GoogleNews-vectors-negative300.bin.gz'
# Load the full pre-trained vector set as stand-alone KeyedVectors.
# NOTE(review): `google_word2vec` is not referenced by any later cell visible
# in this notebook — confirm it is still needed before paying for the load.
google_word2vec = KeyedVectors.load_word2vec_format(fname=EMBEDDING_FILE, binary=True)

2020-06-17 16:35:04,055 : INFO : loading projection weights from /home/tdelatte/projects/notebooks/data/word_embeddings/GoogleNews-vectors-negative300.bin.gz
2020-06-17 16:56:02,161 : INFO : loaded (3000000, 300) matrix from /home/tdelatte/projects/notebooks/data/word_embeddings/GoogleNews-vectors-negative300.bin.gz


In [44]:
%%time
google_model = Word2Vec(size = 300, window=5, min_count = 20, workers = -1)
google_model.build_vocab(docs)

google_model.intersect_word2vec_format(EMBEDDING_FILE, lockf=1.0, binary=True)

google_model.train(docs, total_examples=google_model.corpus_count, epochs = 5)

2020-06-17 16:56:36,964 : INFO : collecting all words and their counts
2020-06-17 16:56:36,966 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-06-17 16:56:37,486 : INFO : collected 22425 word types from a corpus of 1103523 raw words and 7 sentences
2020-06-17 16:56:37,489 : INFO : Loading a fresh vocabulary
2020-06-17 16:56:37,542 : INFO : effective_min_count=20 retains 3747 unique words (16% of original 22425, drops 18678)
2020-06-17 16:56:37,543 : INFO : effective_min_count=20 leaves 1030346 word corpus (93% of original 1103523, drops 73177)
2020-06-17 16:56:37,611 : INFO : deleting the raw counts dictionary of 22425 items
2020-06-17 16:56:37,613 : INFO : sample=0.001 downsamples 53 most-common words
2020-06-17 16:56:37,615 : INFO : downsampling leaves estimated 754522 word corpus (73.2% of prior 1030346)
2020-06-17 16:56:38,477 : INFO : estimated required memory for 3747 words and 300 dimensions: 10866300 bytes
2020-06-17 16:56:38,478 : INFO : resetti

CPU times: user 7min 18s, sys: 0 ns, total: 7min 18s
Wall time: 9min 42s


(0, 0)

In [45]:
# Ten nearest neighbours of "harry" in the embedding space
google_model.wv.most_similar(positive=["harry"], topn=10)

2020-06-17 17:07:17,167 : INFO : precomputing L2-norms of word weight vectors


[('neville', 0.5812051296234131),
 ('mr', 0.5559774041175842),
 ('dennis', 0.5308670997619629),
 ('james', 0.5300297737121582),
 ('davies', 0.5283474922180176),
 ('arry', 0.520870566368103),
 ('johnson', 0.5157880783081055),
 ('roberts', 0.4989515244960785),
 ('dont', 0.49838465452194214),
 ('charlie', 0.49837130308151245)]

In [46]:
# Ten nearest neighbours of "voldemort" in the embedding space
google_model.wv.most_similar(positive=["voldemort"], topn=10)

[('spinnet', 0.17876383662223816),
 ('krum', 0.1773030161857605),
 ('umbridge', 0.17520183324813843),
 ('frog', 0.16392986476421356),
 ('lupin', 0.15984302759170532),
 ('arithmancy', 0.15299880504608154),
 ('figure', 0.15231657028198242),
 ('mum', 0.15043088793754578),
 ('newt', 0.1451343595981598),
 ('lily', 0.14472904801368713)]

In [47]:
# Ten nearest neighbours of "basilisk" in the embedding space
google_model.wv.most_similar(positive=["basilisk"], topn=10)

[('dementors', 0.6436929702758789),
 ('hippogriff', 0.5677863955497742),
 ('dementor', 0.5650946497917175),
 ('creature', 0.5508249998092651),
 ('gargoyle', 0.5491913557052612),
 ('spider', 0.5463742017745972),
 ('serpent', 0.5414613485336304),
 ('goblin', 0.5300490260124207),
 ('horcrux', 0.5296100378036499),
 ('spiders', 0.5049328804016113)]

In [48]:
# Ten nearest neighbours of "wand" in the embedding space
google_model.wv.most_similar(positive=["wand"], topn=10)

[('wands', 0.7150397300720215),
 ('broom', 0.531521201133728),
 ('broomstick', 0.46065953373908997),
 ('dementors', 0.4059273600578308),
 ('hippogriff', 0.38803809881210327),
 ('basilisk', 0.3835200071334839),
 ('enchantments', 0.38186854124069214),
 ('sword', 0.38140869140625),
 ('dementor', 0.3698751926422119),
 ('gently', 0.3696576654911041)]

In [49]:
# Similarity score between the vectors of the two words
sim = google_model.wv.similarity('harry', 'ron')
print(f"Similarity between 'harry' and 'ron' is {sim}")

Similarity between 'harry' and 'ron' is 0.47980862855911255


In [50]:
# Ask the model which word fits least with the others.
# Query through `.wv` like the other cells: calling doesnt_match() on the
# model object itself is deprecated and was what triggered the warning
# printed under this cell.
odd = google_model.wv.doesnt_match(['harry', 'ron', 'hermione', 'library'])
print(f"word that does not belong in the given list is : {odd}")

word that does not belong in the given list is : library


  """Entry point for launching an IPython kernel.
  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


In [51]:
# Analogy query: woman + king - man, top 5 candidates.
# Goes through `.wv`; the model-level most_similar_cosmul() is deprecated.
print(google_model.wv.most_similar_cosmul(positive=["woman", "king"], negative=["man"], topn=5))

[('prince', 0.800979733467102), ('knight', 0.6788281202316284), ('mistress', 0.6786445379257202), ('witch', 0.6756536364555359), ('doge', 0.6740080118179321)]


  """Entry point for launching an IPython kernel.


In [52]:
# Analogy query: gryffindor + malfoy - potter, top 5 candidates.
# Goes through `.wv`; the model-level most_similar_cosmul() is deprecated.
print(google_model.wv.most_similar_cosmul(positive=["gryffindor", "malfoy"], negative=["potter"], topn=5))

[('deluminator', 0.6585240960121155), ('risk', 0.6410618424415588), ('directly', 0.6293377876281738), ('lockhart', 0.6256893873214722), ('ogden', 0.6213969588279724)]


  """Entry point for launching an IPython kernel.


## T-SNE Visualization

In [1]:
# Inspired by code here: https://www.kaggle.com/jeffd23/visualizing-word-vectors-with-t-sne
def tsne_plot(model):
    """Create a t-SNE model of the word vectors and plot it.

    Every word in ``model.wv.vocab`` is projected to two dimensions with
    t-SNE (PCA initialisation, 1000 iterations, fixed ``random_state`` so
    the layout is repeatable) and drawn on a 16x16 inch scatter plot, with
    each point annotated by its word.

    Args:
        model: Trained model exposing ``wv.vocab`` and ``wv[word]``
            (e.g. a gensim ``Word2Vec`` instance).

    Returns:
        None. The figure is rendered as a side effect.
    """
    labels = list(model.wv.vocab)
    # Index through `.wv` (model[word] is deprecated) and hand scikit-learn a
    # real 2-D array instead of a Python list of vectors.
    vectors = np.array([model.wv[word] for word in labels])

    tsne_model = TSNE(n_components=2, init="pca", n_iter=1000, random_state=32)
    coords = tsne_model.fit_transform(vectors)

    plt.figure(figsize=(16, 16))
    # One scatter call for all points instead of one call per word.
    plt.scatter(coords[:, 0], coords[:, 1])
    for label, (x, y) in zip(labels, coords):
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.show()

In [None]:
%time
tsne_plot(google_model)