In [None]:
# This notebook was run on Kaggle
import torch
import torch.nn as nn
import random
import numpy as np
from sklearn.decomposition import PCA
import string
# for progress bars
from tqdm import tqdm
import json
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

In [None]:
# Prefer the GPU whenever PyTorch can see one; otherwise stay on the CPU.
use_cuda = torch.cuda.is_available()
device = "cuda" if use_cuda else "cpu"
print('using device: cuda' if use_cuda else 'using device: cpu')

In [None]:
# loading data
# The file is read line by line, each line being parsed as one JSON object;
# only the 'reviewText' field of every record is kept.
sentences = []
# An explicit encoding keeps decoding independent of the machine's locale.
with open('../input/electronics-5json/Electronics_5.json', encoding='utf-8') as f:
    for jsonObj in f:
        # A blank line (e.g. a trailing newline) is not valid JSON; skip it.
        if not jsonObj.strip():
            continue
        sent = json.loads(jsonObj)
        # Skip records that carry no review text instead of raising KeyError.
        review_text = sent.get('reviewText')
        if review_text is not None:
            sentences.append(review_text)

In [None]:
# Store the frequency of each kept token. Tokens whose frequency is below a
# particular threshold (e.g. 5) are removed from the vocabulary later on.
frequency = {}
num_of_training_sentences = 5000

stop_words = set(stopwords.words('english'))

# Maps every character of string.punctuation to a space, so that punctuation
# separates tokens instead of gluing neighbouring words together.
punctuation_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

# Lower-case and strip punctuation from the first num_of_training_sentences
# reviews. Slicing (rather than indexing 0..N-1) keeps this working when the
# corpus holds fewer reviews than requested.
normalized_sentences = [
    sent.lower().translate(punctuation_to_space)
    for sent in sentences[:num_of_training_sentences]
]

pure_sentences = []
for sent in normalized_sentences:
    # Keep purely alphabetic tokens that are not English stop words.
    kept = [w for w in sent.split() if w.isalpha() and w not in stop_words]
    for w in kept:
        frequency[w] = frequency.get(w, 0) + 1

    # Each kept token is followed by a single space (including the last one).
    pure_sentences.append(''.join(w + ' ' for w in kept))

In [None]:
# Minimum number of occurrences a token needs to enter the vocabulary.
threshold = 5

# Tokens that are frequent enough, in the order they were first counted.
words = [w for w, count in frequency.items() if count >= threshold]

# Token -> integer id. Id 0 is reserved for the 'PAD' entry, so real tokens
# are numbered from 1 upwards.
index_dict = {'PAD': 0}
index_dict.update((w, position) for position, w in enumerate(words, start=1))
cnt = len(index_dict)  # next unused id, as left behind by the counting loop

vocab_size = 1 + len(words)  # the extra slot is 'PAD' at index 0

# Every sentence reduced to the list of its in-vocabulary tokens.
thresh_sentences = [
    [w for w in sent.split() if w in index_dict]
    for sent in pure_sentences
]
    

# generating context-target pairs
window_size = 2
negative_sampling = 5 #number of negative samples which needs to be added

# Fixed length of every training row: context slots + target + negatives.
context_len = 1 + 2*window_size + negative_sampling

# The rejection sampling below needs enough distinct non-PAD words to fill a
# full row; without them the `while` loop could never terminate.
if vocab_size - 1 < context_len:
    raise ValueError(
        'vocabulary has {} words but at least {} are needed to draw '
        'negative samples'.format(vocab_size - 1, context_len)
    )

inp_contexts = []
for tokens in thresh_sentences:
    sent_len = len(tokens)
    for ind in range(sent_len):
        start = max(0, ind-window_size)
        end = min(sent_len, ind+window_size+1)

        # Row layout: a,b,c,d,  word,  k negative samples
        # where a,b,c,d are the context words around position `ind`.
        context_indices = [index_dict[tokens[i]] for i in range(start, end) if i != ind]

        # A target without any context word (single-token sentence) would be
        # predicted from padding only, so it yields no training row.
        if not context_indices:
            continue

        # index of the word
        context_indices.append(index_dict[tokens[ind]])

        # Draw k random words that are neither the target, one of its context
        # words, nor an already drawn negative. Sampling starts at 1 so the
        # PAD entry (index 0) is never used as a negative sample.
        for _ in range(negative_sampling):
            while True:
                k = random.randint(1, vocab_size-1)
                if k not in context_indices:
                    break
            context_indices.append(k)

        # Left-pad with PAD (0) so the target always sits at position
        # 2*window_size, even when the window is cut off by a sentence edge.
        if len(context_indices) < context_len:
            context_indices = [0]*(context_len - len(context_indices)) + context_indices

        inp_contexts.append(context_indices)


In [None]:
embedding_dims = 300
class CBOW(torch.nn.Module):
    """Continuous-bag-of-words scorer for rows built with negative sampling.

    An input row is a 1-D LongTensor of vocabulary indices laid out as
    ``[context words..., target word, negative samples...]``. The context
    embeddings are averaged and every candidate (target followed by the
    negatives) is scored by its dot product with that average.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of every embedding vector.
        context_size: number of leading positions of a row that hold context
            words. The default of 4 matches the previously hard-coded slice.
        padding_idx: index used to pad short contexts. Its embedding is kept
            at zero and it is left out of the context average.
    """
    def __init__(self, vocab_size, embedding_dim, context_size=4, padding_idx=0):
        super(CBOW, self).__init__()
        self.context_size = context_size
        self.padding_idx = padding_idx
        # padding_idx makes the PAD row a constant zero vector that receives
        # no gradient, so padding cannot drift into a meaningful embedding.
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.activation_function = nn.ReLU()


    def forward(self, inputs):
        """Score the target and negative samples of one row.

        Args:
            inputs: 1-D LongTensor ``[context..., target, negatives...]``.

        Returns:
            Tensor of shape ``(len(inputs) - context_size, 1)`` holding the
            ReLU-activated dot product of each candidate with the averaged
            context vector; row 0 belongs to the target word.
        """
        context = inputs[:self.context_size]
        candidates = inputs[self.context_size:]

        # Average over the real context words only. clamp() avoids a division
        # by zero when every context slot is padding.
        is_real = (context != self.padding_idx).unsqueeze(1)
        context_sum = (self.embeddings(context) * is_real).sum(0)
        cbow = (context_sum / is_real.sum().clamp(min=1)).view(-1,1)

        mat = self.embeddings(candidates)
        dot_product = torch.matmul(mat,cbow)
        return self.activation_function(dot_product)
        


# Build the network on the selected device, with its loss and optimiser.
model = CBOW(vocab_size, embedding_dims).to(device)
loss_function = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

# Regression target shared by every training row: a column vector with 1 for
# the first candidate (the true word) and 0 for each negative sample.
target = torch.zeros(negative_sampling + 1, 1)
target[0, 0] = 1.0
target = target.to(device)

In [None]:
num_of_epochs = 10
model.train()
for epoch in range(num_of_epochs):
    total_loss = 0.0
    optimizer.zero_grad()
    for inp in tqdm(inp_contexts):
        context_tensor = torch.tensor(inp, dtype=torch.long, device=device)
        output = model(context_tensor)
        loss = loss_function(output, target)
        # Back-propagate per sample: gradients accumulate in .grad, which
        # gives the same summed gradient as calling backward() once on the
        # sum of all losses, but lets each sample's autograd graph be freed
        # immediately instead of keeping the whole epoch in memory.
        loss.backward()
        total_loss += loss.item()

    # optimize at the end of each epoch (one step on the accumulated gradient)
    optimizer.step()
    # The tqdm timing above includes back-propagation but not this step.
    print('epoch {}: total loss {:.4f}'.format(epoch + 1, total_loss))

In [None]:
# Look up and display the learned embedding vector of the word 'camera'.
camera_index = torch.tensor([index_dict['camera']]).to(device)
print(model.embeddings(camera_index))

In [None]:
# Saving the model
# The whole model object is handed to torch.save (not only its state_dict)
# and written to the file "q2.pt".
torch.save(model, "q2.pt")

In [None]:
# Reload the model from "q2.pt", the file this notebook writes with
# torch.save, so a fresh Restart & Run All does not depend on some other
# checkpoint being present.
# map_location lets a checkpoint saved on a GPU be opened on a CPU-only machine.
# weights_only=False is needed because the file is a pickle of the full model
# object; unpickling executes code, so only load checkpoints you trust.
model = torch.load('q2.pt', map_location=device, weights_only=False)
model.eval()

In [None]:
# Embedding table of the model as a gradient-free CPU tensor (one row per vocabulary index).
embedds = model.embeddings.weight.detach().cpu()

In [None]:
def top_10(word, k=10):
    """Print the words closest to `word` and plot them in 2-D.

    Closeness is the cosine similarity between rows of the global `embedds`
    table. The query word itself and the 'PAD' entry are not reported as
    neighbours. The query word and its neighbours are projected to two
    dimensions with PCA and drawn as a labelled scatter plot.

    Args:
        word: query token; must be a key of the global `index_dict`.
        k: how many neighbours to report (fewer if the vocabulary is smaller).

    Raises:
        KeyError: if `word` is not in the vocabulary.
    """
    if word not in index_dict:
        raise KeyError('{} is not in the vocabulary'.format(word))

    matrix = embedds.numpy()
    query_idx = index_dict[word]

    # One vectorised call scores the query against every vocabulary row.
    sims = cosine_similarity(matrix[query_idx].reshape(1, -1), matrix)[0]

    # Rank all indices by decreasing similarity, dropping the query itself
    # (always similarity 1) and the padding entry.
    index_to_word = {idx: w for w, idx in index_dict.items()}
    excluded = {query_idx, index_dict['PAD']}
    ranked = [idx for idx in np.argsort(-sims).tolist() if idx not in excluded]
    close_words = [index_to_word[idx] for idx in ranked[:k]]

    # output the k closest words
    print('Top {} closest word for the word {} are:'.format(k, word))
    print(close_words)

    # The query word is plotted alongside its neighbours for reference.
    plot_words = [word] + close_words
    if len(plot_words) < 2:
        # PCA to two components needs at least two points.
        return
    vecs = matrix[[index_dict[w] for w in plot_words]]

    pca = PCA(n_components = 2)
    vecs = pca.fit_transform(vecs)

    plt.figure(figsize=(10,10))
    plt.scatter(vecs[:,0], vecs[:,1])
    for label, (x,y) in zip(plot_words, vecs):
        plt.text(x, y, label)
    plt.show()

In [None]:
# noun
top_10('camera')

In [None]:
# noun
top_10('android')

In [None]:
# verb: show the nearest neighbours of 'play'
top_10('play')

In [None]:
# adjective: show the nearest neighbours of 'happy'
top_10('happy')

In [None]:
# adjective: show the nearest neighbours of 'quick'
top_10('quick')

In [None]:
# adjective ('sad' is an adjective, not a verb)
top_10('sad')