<a href="https://colab.research.google.com/github/RohanHanda/NLPUsingDL/blob/main/Word2VecBasic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from __future__ import print_function, division
from builtins import range
import json
import numpy as np
import matplotlib.pyplot as plt
from scipy.special import expit as sigmoid
from datetime import datetime
from scipy.spatial.distance import cosine as cos_dist
from sklearn.metrics.pairwise import pairwise_distances
from glob import glob
import os
import sys
import string

In [None]:
# Put the Drive notebook folder on the import path so that the
# `machine_learning_examples` package imported in the next cell can be resolved.
# NOTE(review): this absolute path presumably only exists when Google Drive is
# mounted under /content/drive (Colab) — confirm before running elsewhere.
sys.path.append(os.path.abspath('/content/drive/MyDrive/Colab Notebooks'))


In [None]:
from machine_learning_examples.rnn_class.brown import get_sentences_with_word2idx_limit_vocab as get_brown

In [None]:
def remove_punctuation_2(s):
  """Strip every character in string.punctuation using the Python 2 str.translate signature."""
  return s.translate(None, string.punctuation)


def remove_punctuation_3(s):
  """Strip every character in string.punctuation using a Python 3 deletion table."""
  deletion_table = str.maketrans('', '', string.punctuation)
  return s.translate(deletion_table)


# Bind the public name to whichever variant matches the running interpreter.
remove_punctuation = (
  remove_punctuation_2 if sys.version.startswith("2") else remove_punctuation_3
)

In [None]:
def _iter_wiki_sentences(files):
  """Yield one list of lower-cased, punctuation-free tokens per usable line of `files`.

  A line is skipped when its first character is one of ``[ * - | = \\ { }`` or
  when it has fewer than two tokens after cleaning. Each file is opened in a
  ``with`` block so its handle is closed as soon as the file has been read.
  """
  for path in files:
    with open(path) as f:
      for line in f:
        if line and line[0] not in '[*-|=\\{\\}':
          s = remove_punctuation(line).lower().split()
          if len(s)>1:
            yield s


def get_wiki():
  """Load the enwiki text dumps as index-encoded sentences with a capped vocabulary.

  The corpus is read twice: once to count word frequencies and once to encode
  every sentence, so the raw token lists never have to be held in memory.

  Returns:
    (sents, word2idx) where ``sents`` is a list of lists of word indices and
    ``word2idx`` maps each kept word to its index. The vocabulary holds the
    ``V - 1`` most frequent words plus ``'<UNK>'`` (always the last index);
    every word outside the vocabulary is encoded as ``'<UNK>'``.
  """
  V = 20000
  files = glob('/content/drive/MyDrive/Colab Notebooks/machine_learning_examples/large_files/enwiki*.txt')

  # Pass 1: word frequencies over the whole corpus.
  all_word_count = {}
  for s in _iter_wiki_sentences(files):
    for word in s:
      all_word_count[word] = all_word_count.get(word, 0) + 1
  print("Finished Counting")

  # Never ask for more words than the corpus contains.
  V = min(V, len(all_word_count))
  all_word_count = sorted(all_word_count.items(), key=lambda x:x[1], reverse=True)
  top_words = [w for w,count in all_word_count[:V-1] + [('<UNK>', 0)]]
  word2idx = {w:i for i,w in enumerate(top_words)}
  unk = word2idx['<UNK>']

  # Pass 2: encode each sentence, mapping out-of-vocabulary words to <UNK>.
  sents = []
  for s in _iter_wiki_sentences(files):
    sents.append([word2idx.get(w, unk) for w in s])
  return sents, word2idx

In [None]:
def _save_model(savedir, word2idx, W, V):
  """Write word2idx.json and weights.npz (W stored as arr_0, V as arr_1) into savedir."""
  if not os.path.exists(savedir):
    os.makedirs(savedir)
  with open(os.path.join(savedir, 'word2idx.json'), 'w') as f:
    json.dump(word2idx, f)
  np.savez(os.path.join(savedir, 'weights.npz'), W, V)


def train_model(savedir):
  """Train the embedding matrices on the corpus returned by get_wiki() and save them.

  For every position of every (subsampled) sentence, one update is made with the
  true word and label 1, and one with a single randomly drawn word and label 0,
  both against the same context words. The learning rate decays linearly from
  0.025 to 0.0001 over the epochs.

  Args:
    savedir: directory that receives ``word2idx.json`` and ``weights.npz``.
      It is created if missing and rewritten after every epoch, so an
      interrupted run still leaves the most recent completed epoch on disk.

  Returns:
    (word2idx, W, V) with ``W`` of shape (vocab_size, D) and ``V`` of shape
    (D, vocab_size).
  """
  sentences, word2idx = get_wiki()
  vocab_size = len(word2idx)

  # Hyperparameters.
  window_size =5
  learning_rate = 0.025
  final_learning_rate = 0.0001
  epochs = 20
  D = 50
  learning_rate_delta = (learning_rate - final_learning_rate)/epochs

  W = np.random.randn(vocab_size,D)
  V = np.random.randn(D,vocab_size)

  p_neg = get_negative_sampling_distribution(sentences, vocab_size)
  costs = []
  total_words = sum(len(sentence) for sentence in sentences)
  print("Total Words:",total_words)

  # Subsampling: drop probability per word, derived from p_neg. For rare words
  # this is negative, which makes the keep test below always succeed.
  threshold = 1e-5
  p_drop = 1-np.sqrt(threshold/p_neg)

  # Started once before the loop, so the printed dt is cumulative.
  t0 = datetime.now()

  for epoch in range(epochs):
    np.random.shuffle(sentences)
    cost = 0
    counter = 0
    for sentence in sentences:
      sentence = [w for w in sentence if np.random.random()<(1-p_drop[w])]
      if len(sentence)<2:
        continue
      randomly_ordered_positions = np.random.choice(len(sentence),size = len(sentence),replace=False)
      for pos in randomly_ordered_positions:
        word = sentence[pos]
        context_words = get_context(pos, sentence, window_size)
        # One negative input word is drawn per position.
        neg_word = np.random.choice(vocab_size, p = p_neg)
        targets = np.array(context_words)

        cost += sgd(word,targets,1,learning_rate,W,V)
        cost += sgd(neg_word,targets,0,learning_rate,W,V)
      counter+=1
      if counter%100 == 0:
        sys.stdout.write("Processed %s/%s\r"%(counter,len(sentences)))
        sys.stdout.flush()
    dt = datetime.now() - t0
    print("Epoch Complete: ",epoch,"cost: ",cost,"dt: ",dt)

    costs.append(cost)
    learning_rate -= learning_rate_delta

    # Checkpoint after every epoch; the last write is the final model.
    _save_model(savedir, word2idx, W, V)

  # Plot only after the model is safely on disk.
  plt.plot(costs)
  plt.show()
  return word2idx,W,V

In [None]:
def get_negative_sampling_distribution(sentences, vocab_size):
  """Build the distribution used to draw negative samples.

  Each word's probability is proportional to its corpus count raised to the
  power 0.75, which flattens the distribution relative to raw frequencies.

  Args:
    sentences: iterable of sentences, each an iterable of integer word indices
      in ``[0, vocab_size)``.
    vocab_size: length of the returned vector.

  Returns:
    1-D float array of length ``vocab_size`` that sums to 1 and is strictly
    positive everywhere.
  """
  word_freq = np.zeros(vocab_size)
  for sentence in sentences:
    for word in sentence:
      word_freq[word]+=1
  p_neg = word_freq**0.75
  # A tiny epsilon keeps words that never occur at a non-zero probability.
  p_neg += 1e-12
  p_neg/=p_neg.sum()
  assert np.all(p_neg>0)
  return p_neg

In [None]:
def get_context(pos, sentence, window_size):
  """Return the words within `window_size` positions of `pos`, excluding `pos` itself.

  The window is symmetric: up to ``window_size`` words on the left and up to
  ``window_size`` words on the right, clipped at the sentence boundaries.

  Args:
    pos: index of the centre word in ``sentence``.
    sentence: sequence of words (or word indices).
    window_size: maximum number of context words taken on each side.

  Returns:
    List of context words in sentence order.
  """
  start = max(0,pos-window_size)
  # The slice end is exclusive, so +1 is needed to include the word that is
  # exactly window_size positions to the right of pos.
  end_ = min(len(sentence),pos+window_size+1)
  return [
    ctx_word
    for ctx_pos, ctx_word in enumerate(sentence[start:end_], start = start)
    if ctx_pos != pos
  ]

In [None]:
def sgd(input_, targets, label, learning_rate,W,V):
  """Apply one gradient-descent step for a single input word and update W, V in place.

  The model output for each target is ``sigmoid(W[input_] . V[:, target])`` and
  the objective is binary cross-entropy against the shared ``label``.

  Args:
    input_: row index into ``W`` (the input word).
    targets: sequence of column indices into ``V`` (the context words). May be
      empty, in which case nothing is updated and 0 is returned.
    label: 1 for a true (word, context) pair, 0 for a negative sample.
    learning_rate: step size.
    W: array indexed as ``W[input_]``; the selected row is modified in place.
    V: array indexed as ``V[:, targets]``; the selected columns are modified
      in place.

  Returns:
    The summed binary cross-entropy (non-negative, lower is better) measured
    before the update.
  """
  # An empty np.array defaults to float dtype, which is not a valid index.
  targets = np.asarray(targets, dtype=int)

  # Gather the target columns once; fancy indexing returns a copy, so this
  # stays the pre-update value for both the forward pass and gW.
  V_targets = V[:,targets]
  activation = W[input_].dot(V_targets)
  prob = sigmoid(activation)
  error = prob-label

  # Both gradients are computed before either matrix is modified.
  gV = np.outer(W[input_], error)
  gW = np.sum(error*V_targets, axis = 1)

  # NOTE: with fancy-index in-place subtraction, an index repeated in `targets`
  # receives only one of its updates.
  V[:,targets]-=learning_rate*gV
  W[input_]-=learning_rate*gW

  # Negated log-likelihood, so the reported cost decreases as the fit improves.
  cost = -(label*np.log(prob+1e-10)+(1-label)*np.log(1-prob+1e-10))
  return cost.sum()

In [None]:
def load_model(savedir):
  """Load a model from the files `word2idx.json` and `weights.npz` in `savedir`.

  Args:
    savedir: directory containing both files; the archive must hold the two
      weight arrays under the keys ``arr_0`` and ``arr_1``.

  Returns:
    (word2idx, W, V) where ``word2idx`` is the decoded JSON mapping, ``W`` is
    ``arr_0`` and ``V`` is ``arr_1``.
  """
  with open(os.path.join(savedir, 'word2idx.json')) as f:
    word2idx = json.load(f)
  # np.load on an .npz returns a lazy archive object that keeps the file open;
  # read both arrays inside the context manager so the handle is closed.
  with np.load(os.path.join(savedir, 'weights.npz')) as npz:
    W = npz['arr_0']
    V = npz['arr_1']
  return word2idx,W,V

In [None]:
def analogy(pos1, neg1, pos2, neg2, word2idx, idx2word,W):
  """Print the result of the analogy ``pos1 - neg1 = ? - neg2`` for embedding matrix W.

  The query vector is ``W[pos1] - W[neg1] + W[neg2]``. The function prints the
  nearest word by cosine distance that is not one of the three query words,
  the ten nearest words with their distances, and the cosine distance from
  the query vector to the expected answer ``pos2``.

  Args:
    pos1, neg1, pos2, neg2: the four words of the analogy; ``pos2`` is the
      expected answer.
    word2idx: mapping word -> row index of ``W``.
    idx2word: mapping row index -> word.
    W: 2-D embedding matrix with one row per word.

  Returns:
    None. If any of the four words is missing from ``word2idx`` a message is
    printed and the function returns early.
  """
  V,D = W.shape
  print("Testing: %s - %s = %s -%s"%(pos1, neg1, pos2, neg2))
  for w in (pos1,neg1,pos2,neg2):
    if w not in word2idx:
      print("%s is not in word2idx"%w)
      return
  p1 = W[word2idx[pos1]]
  n1 = W[word2idx[neg1]]
  p2 = W[word2idx[pos2]]
  n2 = W[word2idx[neg2]]
  vec = p1-n1+n2

  # Cosine distance from the query vector to every row of W.
  distances = pairwise_distances(vec.reshape(1,D),W,metric='cosine').reshape(V)
  idx = distances.argsort()[:10]

  # The query words themselves are usually the closest rows; skip them.
  keep_out = set(word2idx[w] for w in (pos1,neg1,neg2))
  best_idx = -1
  for i in idx:
    if i not in keep_out:
      best_idx = i
      break
  print("Got: %s-%s = %s-%s"%(pos1, neg1, idx2word[best_idx], neg2))
  print("closet 10:")
  for i in idx:
    print(idx2word[i], distances[i])
  # Interpolate pos2 into the label; previously the literal "%s" was printed.
  print("Dist to %s:" % pos2, cos_dist(p2,vec))


In [None]:
def test_model(word2idx, W,V):
  """Run a fixed set of word analogies against two embedding variants.

  The analogies are evaluated first with ``W`` alone and then with the average
  of ``W`` and ``V.T``; results are printed by ``analogy``.

  Args:
    word2idx: mapping word -> index.
    W: embedding matrix with one row per word.
    V: matrix whose transpose has the same shape as ``W``.
  """
  idx2word = {i:w for w,i in word2idx.items()}

  # (pos1, neg1, pos2, neg2) tuples: pos1 - neg1 = pos2 - neg2.
  analogies = (
    ('king','man','queen','woman'),
    ('king','prince','queen','princess'),
    ('miami','florida','dallas','texas'),
    ('einstein','scientist','picasso','painter'),
    ('japan','sushi','germany','bratwurst'),
    ('man','woman','he','she'),
    ('man','woman','uncle','aunt'),
    ('man','woman','brother','sister'),
  )

  for We in (W,(W+V.T)/2):
    print("********************")
    for pos1, neg1, pos2, neg2 in analogies:
      analogy(pos1, neg1, pos2, neg2, word2idx, idx2word, We)


In [None]:
# Train from scratch, writing the vocabulary and weights into ./w2v_model, then
# print the analogy checks for the trained matrices.
# NOTE: the recorded output below this cell shows more than an hour per epoch.
word2idx, W,V = train_model('w2v_model')
test_model(word2idx,W,V)

Finished Counting
Total Words: 57868900
Epoch Complete:  0 cost:  -131383093.98033111 dt:  1:15:46.398878
Epoch Complete:  1 cost:  -111979227.2218891 dt:  2:29:30.003137


In [None]:
# IPython shell capture: run `nvidia-smi` and collect its output as a list of lines.
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
# An output containing the word 'failed' is treated as "no GPU attached";
# otherwise the full nvidia-smi report is printed.
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU runtime')
else:
  print(gpu_info)