In [1]:
!pip install -U gensim

Collecting gensim
  Using cached gensim-4.1.2-cp38-cp38-macosx_10_9_x86_64.whl (24.0 MB)
Collecting smart-open>=1.8.1
  Using cached smart_open-5.2.1-py3-none-any.whl (58 kB)
Installing collected packages: smart-open, gensim
Successfully installed gensim-4.1.2 smart-open-5.2.1


In [13]:
%%capture

# Fetch spaCy's small English pipeline. In this notebook spaCy is only
# referenced by the commented-out PoS-tagging experiment further down.
!python -m spacy download en_core_web_sm
import spacy

Dataset: WikiText-103, a word-level language-modeling corpus — https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/

<div class="cite2c-biblio"></div>

# Import libs

In [4]:
%%capture

from IPython.display import clear_output

import hashlib
import re
import sys
import tarfile
from collections import Counter, defaultdict
from pathlib import Path

from time import time

import matplotlib.pyplot as plt
import requests
from IPython.display import Image

from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec
import multiprocessing

import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# Download only the NLTK resources behind the names imported above
# (stopwords -> stopwords corpus, word_tokenize -> punkt, WordNetLemmatizer -> wordnet)
# instead of the complete 'all' collection.
for _nltk_resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(_nltk_resource)

# Read the Data

In [5]:
import os
# Display the working directory: the relative 'wikitext-103/...' paths used to
# load the data are resolved against it.
os.getcwd()

'/Users/Oleg_Litvinov1/Documents/Code/fact-ai'

In [6]:
from pathlib import Path

# Read train, val, and test sets into string objects.
# The encoding is given explicitly: without it Path.read_text() falls back to
# the locale's preferred encoding, so any non-ASCII characters in the corpus
# could fail to decode (or decode differently) on another machine.
train_data = Path('wikitext-103/wiki.train.tokens').read_text(encoding='utf-8')
val_data = Path('wikitext-103/wiki.valid.tokens').read_text(encoding='utf-8')
test_data = Path('wikitext-103/wiki.test.tokens').read_text(encoding='utf-8')

# Preprocessing

In [7]:
# English stop words from NLTK, held in a set; preprocess() reads this when
# filtering tokens.
stop_words = set(stopwords.words('english'))

def preprocess(raw_str: str, stopword_set=None):
    """Turn a raw corpus string into a list of cleaned, tokenised lines.

    Every line of ``raw_str`` is lowercased, has the ``<unk>`` placeholder
    removed, is stripped of every character other than spaces, ASCII letters
    and digits, split on single spaces, and filtered against the stop words.
    No lemmatisation is performed and the whole input is processed.

    Args:
        raw_str: The dataset as a single string, one passage per line.
        stopword_set: Optional collection of words to drop. When omitted, the
            notebook-level ``stop_words`` set is used.

    Returns:
        A list with one entry per line that is non-blank after cleaning. Each
        entry is the list of remaining tokens; it is empty when the line
        consisted solely of stop words.
    """
    if stopword_set is None:
        stopword_set = stop_words

    sentences_filtered = []
    for line in raw_str.splitlines():
        lowered = line.lower()

        # Remove the placeholder as a whole token *before* punctuation is
        # stripped. Matching the bare substring 'unk' would also mangle real
        # words ('drunk' -> 'dr', 'bunker' -> 'ber').
        without_unk = lowered.replace('<unk>', '')

        # After lowercasing only a-z can remain of the ASCII letters.
        cleaned = re.sub('[^ a-z0-9]', '', without_unk)
        if cleaned.strip() == '':
            continue

        # `w and ...` discards the empty strings produced by consecutive
        # spaces without rebuilding a stop-word set for every single word.
        sentences_filtered.append(
            [w for w in cleaned.split(' ') if w and w not in stopword_set]
        )

    return sentences_filtered

In [60]:
%%time

# Clean and tokenise the full training split; the result is what the
# vocabulary build and the training run consume.
train_sentences = preprocess(train_data)

CPU times: user 2min 5s, sys: 3.55 s, total: 2min 9s
Wall time: 2min 11s


In [14]:
# # Only use the PoS tagger, or processing will take very long
# nlp = spacy.load('en_core_web_sm', disable=[
#     'parser',
#     'entity',
#     'ner',
#     'entity_linker',
#     'entity_ruler',
#     'textcat',
#     'textcat_multilabel',
#     'morphologizer',
#     'senter',
#     'sentencizer',
#     'tok2vec',
#     'transformers'
# ])

# tsn = [nlp(' '.join(s)) for s in train_sentences[:10]]

In [3]:
# %%time

# import re

# # Remove casing, punctuation, special characters, and stop words and also lemmatize the words on a subset of the first 110 articles in the train data
# # my_new_text = re.sub('[^ a-zA-Z0-9]|unk', '', train_data[:10000000])
# sentences = [re.sub('[^ a-zA-Z0-9]|unk', '', s) for s in sentences]

# sentences = [word_tokenize(d.lower()) for s in sentences]

# stop_words = set(stopwords.words('english'))
# filtered_sentence = (w for s in sentences for w in s if w not in stop_words)

# lemma = WordNetLemmatizer()
# normalized = " ".join(lemma.lemmatize(word) for word in filtered_sentence)

CPU times: user 12.1 s, sys: 229 ms, total: 12.3 s
Wall time: 12.3 s


In [62]:
# Configure the model only: no corpus is passed here, so the vocabulary build
# and the training run are triggered explicitly in separate steps.
model = Word2Vec(
    # sentences=word_tokens,  # a list of lists of tokens
    sg=1,
    negative=5,
    vector_size=100,
    window=5,
    min_count=10,
    workers=multiprocessing.cpu_count(),
)

In [63]:
# Time the vocabulary scan over the preprocessed training sentences.
t = time()
model.build_vocab(train_sentences, progress_per=1000)
elapsed_mins = round((time() - t) / 60, 2)

print('Time to build vocab: {} mins'.format(elapsed_mins))

Time to build vocab: 0.18 mins


In [67]:
# https://stackoverflow.com/questions/52038651/loss-does-not-decrease-during-training-word2vec-gensim
# init callback class
class callback(CallbackAny2Vec):
    """
    Callback to print the training loss of each epoch.

    ``model.get_latest_training_loss()`` is treated as a running total, so the
    value printed for an epoch is the total at the end of that epoch minus the
    total recorded at the end of the previous one.
    """
    def __init__(self):
        # Number of epochs reported so far by this instance.
        self.epoch = 0
        # Running total seen at the end of the previous epoch. Defined here so
        # the attribute exists before the first epoch finishes.
        self.loss_previous_step = 0.0

    def on_train_begin(self, model):
        # Word2Vec.train() starts its running loss from zero again, so a
        # baseline left over from an earlier train() call would yield bogus
        # (negative) per-epoch values if this instance were reused.
        self.loss_previous_step = 0.0

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        # In the first epoch the baseline is 0.0, so this prints the total itself.
        epoch_loss = loss - self.loss_previous_step
        print('Loss after epoch {}: {}'.format(self.epoch, epoch_loss))
        self.epoch += 1
        self.loss_previous_step = loss

In [68]:
# Run the training passes over the corpus and time them; the callback prints
# one loss line per epoch.
t = time()
model.train(
    train_sentences,
    total_examples=model.corpus_count,
    epochs=10,
    report_delay=1,
    compute_loss=True,
    callbacks=[callback()],
)
elapsed_mins = round((time() - t) / 60, 2)

print('Time to train the model: {} mins'.format(elapsed_mins))

Loss after epoch 0: 44224868.0
Loss after epoch 1: 23500244.0
Loss after epoch 2: 2723640.0
Loss after epoch 3: 2600856.0
Loss after epoch 4: 2449144.0
Loss after epoch 5: 2347256.0
Loss after epoch 6: 2194144.0
Loss after epoch 7: 2098144.0
Loss after epoch 8: 2006360.0
Loss after epoch 9: 2000000.0
Time to train the model: 12.83 mins


In [69]:
# Persist the trained model in the working directory so it can be reloaded
# later without retraining.
model.save("word2vec.model")

In [70]:
# Sanity check: list the words whose vectors are closest to "murder".
model.wv.most_similar(positive=["murder"])

[('murders', 0.8885542154312134),
 ('murderer', 0.8399878144264221),
 ('kidnapping', 0.8296462297439575),
 ('murdering', 0.8261697292327881),
 ('convicted', 0.8101226687431335),
 ('murdered', 0.7853724956512451),
 ('rape', 0.7829334139823914),
 ('arrest', 0.7744185328483582),
 ('robbery', 0.7712156772613525),
 ('crime', 0.7708020806312561)]

In [71]:
# Same query with two positive words, "domestic" and "work", combined.
model.wv.most_similar(positive=["domestic", "work"])

[('working', 0.5911438465118408),
 ('welfare', 0.5858847498893738),
 ('works', 0.5836688876152039),
 ('production', 0.5688177347183228),
 ('prioritisation', 0.5571842193603516),
 ('craftspeople', 0.5515726804733276),
 ('domestically', 0.5431331396102905),
 ('employment', 0.5428930521011353),
 ('importing', 0.5414029955863953),
 ('finance', 0.5345153212547302)]

# Example of model loading

In [104]:
# Reload the model saved earlier and repeat a query as a round-trip check:
# the neighbours should match those returned by the in-memory model.
model_loaded = Word2Vec.load("word2vec.model")

model_loaded.wv.most_similar(positive=["domestic", "work"])

[('working', 0.5911438465118408),
 ('welfare', 0.5858847498893738),
 ('works', 0.5836688876152039),
 ('production', 0.5688177347183228),
 ('prioritisation', 0.5571842193603516),
 ('craftspeople', 0.5515726804733276),
 ('domestically', 0.5431331396102905),
 ('employment', 0.5428930521011353),
 ('importing', 0.5414029955863953),
 ('finance', 0.5345153212547302)]