<a href="https://colab.research.google.com/github/SnkhchyanV/NLP/blob/main/Word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import numpy as np
import tensorflow as tf
import os
from google.colab import drive
drive.mount('/content/drive')

import zipfile

import gensim
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize
import random
from collections import deque
from itertools import chain


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [14]:
data_path = '/content/drive/MyDrive/DataSets/corpus_100k.zip'
zip_ref = zipfile.ZipFile(data_path , 'r') #Opens the zip file in read mode
zip_ref.extractall('/content/sample_data/corpus_100k') #Extracts the files into the /corpus_100k folder
zip_ref.close()


In [15]:
# with open('/content/sample_data/corpus_100k/corpus_100k', 'r', encoding='utf-8', errors='replace') as f:
#   sentences = [s.strip() for s in f.readlines()]

# print(f'Number of sentences: {len(sentences)}')
# use_first_n = 300000
# sentences = sentences[:use_first_n]
# print(f'Using: {len(sentences)}')

In [16]:
# def data_preprocessing(data):
#   translation_table = str.maketrans('', '', "«»()+-=-,՝.․։՜՛֊՟՚")
#   sentences = [(sentence).lower().translate(translation_table) for sentence in data]
#   return sentences

# data = data_preprocessing(sentences)

# def create_vocabulary(sentences):
#   i = 0
#   word2idx ={}
#   idx2word = {}
#   unique = set()
#   freqs = {}

#   for sentence in sentences:
#     for word in sentence.split(' '):
#       if (word not in unique) and (len(word.strip()) != 0):
#         word2idx[word] = i
#         idx2word[i] = word
#         i += 1
#         unique.add(word)
#         freqs[word] = 1
#       elif word in unique:
#         freqs[word] += 1
#   return unique, word2idx, idx2word, freqs

# vocab, word2idx, idx2word, freqs = create_vocabulary(data)

# # Altering the distribution to perform negative sampling
# totalWords = sum([freq**(3/4) for freq in freqs.values()])
# wordProb = {word:(freq)**(3/4)/totalWords for word, freq in freqs.items()}

In [17]:
with open('/content/sample_data/corpus_100k/corpus_100k', 'r', encoding='utf-8', errors='replace') as f:
    sentences = [s.strip() for s in f.readlines()]

use_first_n = 300000
sentences = sentences[:use_first_n]

def data_preprocessing(data):
    translation_table = str.maketrans('', '', "«»()+-=-,՝.․։՜՛֊՟՚")
    sentences = [sentence.lower().translate(translation_table) for sentence in data]
    return sentences

data = data_preprocessing(sentences)

def create_vocabulary(sentences):
    i = 0
    word2idx = {}
    idx2word = {}
    unique = set()
    freqs = {}

    for sentence in sentences:
        for word in sentence.split():
            if (word not in unique) and (len(word.strip()) != 0):
                word2idx[word] = i
                idx2word[i] = word
                i += 1
                unique.add(word)
                freqs[word] = 1
            elif word in unique:
                freqs[word] += 1
    return unique, word2idx, idx2word, freqs

vocab, word2idx, idx2word, freqs = create_vocabulary(data)
totalWords = sum([freq ** (3/4) for freq in freqs.values()])
wordProb = {word: (freq) ** (3/4) / totalWords for word, freq in freqs.items()}


In [18]:
def skipgram_data_generator(sentences, window_size, batch_size, vocab):
    for sentence in sentences:
        padded_sentence = [None] * (window_size) + sentence + [None] * (window_size)
        window = deque(maxlen=2 * window_size + 1)
        for word in padded_sentence:
            if len(window) == 2 * window_size + 1:
                target_word = window[window_size]
                context_words = list(chain(list(window)[:window_size], list(window)[window_size + 1:]))
                for context_word in context_words:
                    if target_word in vocab and context_word in vocab:
                        yield target_word, context_word
            window.append(word)

        if None in window:
            yield None, None


data = [sentence.split() for sentence in data ]

window_size = 2
batch_size = 32


batches = []
current_batch = []

for target, context in skipgram_data_generator(data[:10], window_size, batch_size, vocab):
    if target is not None and context is not None:
        current_batch.append((target, context))
    if len(current_batch) == batch_size:
        batches.append(current_batch)
        current_batch = []

if current_batch:
    batches.append(current_batch)


values = list(wordProb.keys())
probabilities = list(wordProb.values())
neg_samples = []
num_neg_samples = 10
targets = []
contexts = []
labels = []

for i, batch in enumerate(batches):
    for target, context in batch:
      context_arr = []
      targets.append(word2idx[target])
      context_arr.append(word2idx[context])
      i = 0
      while(i < num_neg_samples):
          word = random.choices(values, weights=probabilities, k=1)[0]
          if word != target and word != context:
            context_arr.append(word2idx[word])
            i += 1

      label_arr = np.zeros(len(context_arr), dtype=int)
      label_arr[0] = 1
      contexts.append(context_arr)
      labels.append(list(label_arr))


In [20]:
class Word2Vec(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim=200):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.vocab_size = vocab_size
        self.hidden_weights = tf.Variable(tf.random.uniform(
            shape=(self.vocab_size, embedding_dim),
            minval=-0.5 / embedding_dim,
            maxval=0.5 / embedding_dim,
        ))
        self.output_weights = tf.Variable(tf.random.uniform(
            shape=(embedding_dim, self.vocab_size),
            minval=-0.5 / embedding_dim,
            maxval=0.5 / embedding_dim,
        ))

    def call(self, data):
        hidden_layer = tf.transpose(tf.nn.embedding_lookup(self.hidden_weights, data))
        output_init = tf.linalg.matmul(hidden_layer, self.output_weights)

        y = tf.nn.softmax(output_init)

        return y, hidden_layer, output_init

    def training(self, targ, cont, lab, num_iter=10000):
        for _ in range(num_iter):
            for i, word_idx in enumerate(targ):
                cont_pos = cont[i][0]
                context_indices_neg = cont[i][1:]

                with tf.GradientTape() as tape:
                    pos_logits = tf.reduce_sum(tf.multiply(tf.nn.embedding_lookup(self.output_weights, cont_pos), self.hidden_weights[word_idx]))
                    neg_logits = tf.linalg.matmul(tf.nn.embedding_lookup(self.output_weights, context_indices_neg), tf.transpose(tf.expand_dims(self.hidden_weights[word_idx], axis=1)))

                    loss = tf.reduce_mean(-tf.math.log(tf.nn.sigmoid(pos_logits)) - tf.math.log(tf.nn.sigmoid(-neg_logits)))

                gradients = tape.gradient(loss, self.trainable_variables)
                optimizer = tf.optimizers.Adam(learning_rate=1)
                optimizer.apply_gradients(zip(gradients, self.trainable_variables))
                print(loss)



word2vec = Word2Vec(len(vocab))
word2vec.training(targets, contexts, labels)

InvalidArgumentError: ignored