In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import nltk
import time
from nltk.corpus import gutenberg
import gensim
import json
import random
import numpy as np
from tensorboardX import SummaryWriter
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
import scripts

# Set random seeds for reproducibility

In [3]:
# Seed every RNG the notebook relies on (PyTorch, NumPy, Python's `random`)
# with the same value so repeated runs start from the same random state.
SEED = 0
for seed_fn in (torch.manual_seed, np.random.seed, random.seed):
    seed_fn(SEED)

# settings for training

In [4]:
# Load the vocabulary from disk; its length becomes 'vocab_size' in `settings`.
# The encoding is pinned to UTF-8 (the JSON standard) so the file is decoded the
# same way regardless of the platform's default locale encoding.
with open('vocab.json', 'r', encoding='utf-8') as f:
    vocab = json.load(f)

In [5]:
# Shared configuration dict: the same object is passed to every
# scripts.pytorch_model(...) call and to PreProcessing.train_gensim(...).
# NOTE(review): how each key is consumed is defined in `scripts`, not here — confirm there.
settings = dict(
    vocab_size=len(vocab),  # number of entries in the loaded vocabulary
    window_size=5,
    num_epochs=3,
    embedding_dim=50,
    batch_size=512,
    num_heads=12,
    dim_head=128,
    learning_rate=2e-3,
)

# initialize models

In [15]:
# Build the 'CBoW' model from the shared `settings`. Per the recorded output,
# construction echoes the settings and reports corpus statistics (sentences
# fetched, unique words, training pairs).
# NOTE(review): the first argument presumably selects the model variant inside `scripts` — confirm.
CBoW_model = scripts.pytorch_model('CBoW', settings)

{'vocab_size': 51156, 'window_size': 5, 'num_epochs': 3, 'embedding_dim': 50, 'batch_size': 512, 'num_heads': 12, 'dim_head': 128, 'learning_rate': 0.002}

98552 sentences fetched.

51156 unique words found in corpus
2621762 pairs found for training


In [13]:
# Build the 'MSE' model from the same shared `settings`. Per the recorded output,
# construction echoes the settings and reports corpus statistics (sentences
# fetched, unique words, training pairs).
# NOTE(review): 'MSE' presumably selects a mean-squared-error objective inside `scripts` — confirm.
MSE_model = scripts.pytorch_model('MSE', settings)

{'vocab_size': 51156, 'window_size': 5, 'num_epochs': 3, 'embedding_dim': 50, 'batch_size': 512, 'num_heads': 12, 'dim_head': 128, 'learning_rate': 0.002}

98552 sentences fetched.

51156 unique words found in corpus
2621762 pairs found for training


In [14]:
# Build the 'COS' model from the same shared `settings` as the other two variants.
# NOTE(review): 'COS' presumably selects a cosine-based objective inside `scripts` — confirm.
COS_model = scripts.pytorch_model('COS', settings)

{'vocab_size': 51156, 'window_size': 5, 'num_epochs': 3, 'embedding_dim': 50, 'batch_size': 512, 'num_heads': 12, 'dim_head': 128, 'learning_rate': 0.002}

98552 sentences fetched.

51156 unique words found in corpus
2621762 pairs found for training


# train the models if needed

In [8]:
# Train the gensim Word2Vec baseline only when its model file is missing, in line
# with the "train if needed" checkpoint guards used for the PyTorch models, so a
# full Restart & Run All does not retrain it every time.
# NOTE(review): assumes train_gensim saves to the path given as its first argument
# (load_gensim is later called with the same path) — confirm in `scripts`. If it
# does not, the guard never triggers and behaviour is unchanged.
GENSIM_MODEL_PATH = 'word2vec_gensim_new.model'
prepro = scripts.PreProcessing()
if not os.path.exists(GENSIM_MODEL_PATH):
    prepro.train_gensim(GENSIM_MODEL_PATH, settings)

Total number of sentences found: 98552.



In [17]:
# Train each PyTorch model only when its checkpoint path is absent.
# The order (MSE, CBoW, COS) is the order in which training runs.
pending = (
    ('MSE_ckpts', MSE_model),
    ('CBoW_ckpts', CBoW_model),
    ('COS_ckpts', COS_model),
)
for ckpt_path, net in pending:
    if os.path.exists(ckpt_path):
        continue  # checkpoints already on disk: skip training
    net.train()

# load weights from ckpts

In [20]:
# NOTE(review): checkpoint loading is currently disabled. When the *_ckpts paths
# already exist, the training cell skips train(), so unless pytorch_model restores
# weights on its own the three models keep the weights they were constructed with
# — confirm before comparing results.
# 'epoch-0.pt' is presumably the checkpoint written after the first epoch
# ('num_epochs' is 3) — verify which epoch should be loaded before re-enabling.
# MSE_model.load_state_dict(torch.load('MSE_ckpts/epoch-0.pt'))
# COS_model.load_state_dict(torch.load('COS_ckpts/epoch-0.pt'))
# CBoW_model.load_state_dict(torch.load('CBoW_ckpts/epoch-0.pt'))

# Compare results from these four models by getting synonyms and similarities

In [11]:
# Load a previously saved gensim Word2Vec model from disk.
# NOTE(review): this reads 'word2vec_gensim.model', whereas the training cell is
# given 'word2vec_gensim_new.model' (loaded separately into `gensim_model` below)
# — confirm that comparing these two files is intended.
w2v_model = gensim.models.Word2Vec.load('word2vec_gensim.model')

In [14]:
# Top-3 nearest neighbours of 'man' in the loaded gensim model.
w2v_model.wv.most_similar(positive=['man'], topn=3)

[('woman', 0.7312480211257935),
 ('person', 0.7193930149078369),
 ('body', 0.7164236307144165)]

In [18]:
# Distance between the two words; the recorded value (~0.2688) is 1 minus the
# 'woman' similarity (~0.7312) reported by most_similar for 'man'.
w2v_model.wv.distance('man', 'woman')

0.2687519334169457

In [26]:
# Separate PreProcessing instance used as a wrapper for querying a saved gensim
# model (see the load_gensim / get_distance / get_embedding calls that follow).
gensim_model = scripts.PreProcessing()

In [27]:
# Load the model file whose path was passed to train_gensim in the training cell.
gensim_model.load_gensim('word2vec_gensim_new.model')

In [30]:
# Distance via the wrapper; the recorded result is identical to the value shown
# for gensim_model.model.wv.distance('man', 'woman') on the same pair.
gensim_model.get_distance('man', 'woman')

0.2273521626677446

In [53]:
# Mean of the element-wise squared difference between the two word embeddings.
man_vec = gensim_model.get_embedding('man')
woman_vec = gensim_model.get_embedding('woman')
((man_vec - woman_vec) ** 2).mean()

0.7096199989318848

In [55]:
# Same distance query issued through the wrapper's `.model.wv` (presumably the
# underlying gensim model — confirm); the recorded value equals get_distance's.
gensim_model.model.wv.distance('man', 'woman')

0.2273521626677446

In [50]:
# Similarity for the same pair; the recorded value (~0.7726) is 1 minus the
# recorded distance (~0.2274).
gensim_model.model.wv.similarity('man', 'woman')

0.7726478373322554

In [58]:
# Sum of the element-wise difference between two arrays.
# NOTE(review): `tmp1` and `tmp2` are not defined in any visible cell; unless they
# are defined elsewhere, this cell raises NameError on Restart & Run All.
# Define them explicitly (with descriptive names) or remove this cell.
(tmp1 - tmp2).sum()

0.6428664