In [None]:
import struct
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
!pip install pyarrow




In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
wikitext_2 = '/content/drive/MyDrive/AI/train-00000-of-00001.parquet'
wikitext_103_1 = '/content/drive/MyDrive/AI/train-00001-of-00002.parquet'
wikitext_103 = '/content/drive/MyDrive/AI/train-00000-of-00002 (1).parquet'

wikitext_2 = pd.read_parquet('/content/drive/MyDrive/AI/train-00000-of-00001.parquet')
wikitext_103_1 = pd.read_parquet('/content/drive/MyDrive/AI/train-00001-of-00002.parquet')
wikitext_103 = pd.read_parquet('/content/drive/MyDrive/AI/train-00000-of-00002 (1).parquet')



In [None]:
wikitext_test = '/content/drive/MyDrive/AI/test-00000-of-00001 (2).parquet'
wikitext_test = pd.read_parquet('/content/drive/MyDrive/AI/test-00000-of-00001 (2).parquet')

Concating all the data a one place

In [None]:

all_data = pd.concat([wikitext_2, wikitext_103_1, wikitext_103, wikitext_test], ignore_index=True)

In [None]:

all_data_train = all_data.sample(n=1000, random_state=42)


#Tokenization


*** Converts text to lowercase and remove non-alphabetic characters such as punctuation, numbers, and etc. Further, it will convert text into number of tokens while removing the stop words using nltk package for stop words and create a vocabulary. ***



In [None]:
from nltk.corpus import stopwords
import nltk
from nltk.stem import PorterStemmer
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
import re

def custom_tokenize(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    return tokens

all_data_train["tokens"] = all_data_train["text"].apply(custom_tokenize)

#create a vocabulary
all_tokens = [token for tokens in all_data_train["tokens"] for token in tokens]

print(f"Total number of tokens: {len(all_tokens)}")

Total number of tokens: 28896


*** This part will store the unique words in vocab using list(set()) and will also add <unknown> token at the 0th index which makes sure that if the mpdel encounters any word that is not present in vocab will be having same vector representation as of <unknown> which can further be considered as one of the consequences of using Word2Vec.***


In [None]:
import collections
from collections import Counter

#get us the most commmon unique words using the Counter
word_counts = Counter(all_tokens)

#get the unique words in the vocab
vocab = list(set(all_tokens))
vocab_size = len(vocab)
print(f"Vocabulary Size: {len(vocab)}")

#store the <unknown> at the 0th index
vocab = {"<unknown>": 0}
for i, (word, _) in enumerate(word_counts.most_common(vocab_size - 1), start=1):
    vocab[word] = i
vocab_size = len(vocab)

#reverse methodolgy for getting similar words at the end of the code
index2word = {index: word for word, index in vocab.items()}

Vocabulary Size: 10070


#Skip-Gram Model

*** This section will generate skip-gram pairs with trhe window size of 2 's which means each center word wil be forming 4 unique pairs. If the token or lets say center word is not present in vocab list then it will skip that word and generate for other.***

In [None]:
window_size = 2

def generate_skip_grams(tokens, vocab):
    pairs = []
    for i, center_word in enumerate(tokens):
        if center_word not in vocab:
            continue  # Skip words not in the 1000-word vocab

        context_words = tokens[max(0, i-window_size):i] + tokens[i+1:i+window_size+1]
        for context_word in context_words:
            if context_word in vocab:
                pairs.append((center_word, context_word))
    return pairs


# Generate skip-gram pairs for training data
skip_grams = []
for df in [all_data_train]:
    for tokens in df["tokens"]:
        skip_grams.extend(generate_skip_grams(tokens, vocab))


print("Sample Skip-Gram Pairs:", skip_grams[:10])
print("Total Pairs:", len(skip_grams))

Sample Skip-Gram Pairs: [('summer', 'olympics'), ('olympics', 'summer'), ('championship', 'standings'), ('championship', 'race'), ('standings', 'championship'), ('standings', 'race'), ('race', 'championship'), ('race', 'standings'), ('smyth', 'report'), ('report', 'smyth')]
Total Pairs: 111730


#Create a Custom Dataset

***

*** (__init__) method also known as constructor will have the skip-gram pairs consisting of center word and context word. It will aslo map uniwue words to the indices present in the dictionary. (__len__) method will be used to return the number of samples present in the vocab so that Pytorch knows how many samokes are available. (__getitem__) will take index value and will correspond it to the center and context word pair, and if the word is not present it wil pass it to index value 0 which indicates <unknown>. Finally, it will create a DataLoader that will be consisting of custom data with shffle = true which means it will shuffle the training data after each epoch which ensures some sort of randomness.***


In [None]:
import torch
from torch.utils.data import Dataset

class SkipGramDataset(Dataset):
    def __init__(self, skip_grams, vocab):
        self.skip_grams = skip_grams
        self.vocab = vocab
        self.vocab_size = len(vocab)

    def __len__(self):
        return len(self.skip_grams)

    def __getitem__(self, idx):
        center_word, context_word = self.skip_grams[idx]
        #0th index indicates the the unknown word whicxh means
        #if it encounters any word out of the vocab list then it will have same vector representation as of unknown
        center_idx = self.vocab.get(center_word, 0)
        context_idx = self.vocab.get(context_word, 0)

        return center_idx, context_idx


# Create an instance of the custom dataset
dataset = SkipGramDataset(skip_grams, vocab)

#Create a dataloader
from torch.utils.data import DataLoader
dataloader = DataLoader(dataset, batch_size=128, shuffle=True)



#Word2Vec Model Architecture

*** The model architecture of Word2Vec will be consisting of on ehidden layer where we will apply ReLU activation function and one embedding layer which is also word embedding in Word2Vec. The dimensional space will be 300 which is one of the standard values defined by Google. Also, in Word2Vec model we will be using Softmax function which generate probablities of each word and help us to get words having higher probablity as an output.***

In [None]:
#architecture of the neural network
import torch.nn as nn
class MyNetwork(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(MyNetwork, self).__init__()
        #Embedding layer will covert word index into vector
        #the hidden layer will have the ReLU activation function
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(embedding_dim, 300),
            nn.ReLU(),
            nn.Linear(300, vocab_size)
        )

    def forward(self, center_word):
        embedding_layer = self.embedding(center_word)
        hidden_layer = self.linear_relu_stack(embedding_layer)
        prob_dist = torch.softmax(hidden_layer, dim=1)

        return prob_dist


In [None]:
model = MyNetwork(vocab_size, embedding_dim=300)
print(model)

MyNetwork(
  (embedding): Embedding(10070, 300)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=300, out_features=300, bias=True)
    (1): ReLU()
    (2): Linear(in_features=300, out_features=10070, bias=True)
  )
)


In [None]:
for name, param in model.named_parameters():
  print(f'Layer: {name} | Size: {param.size()} | Values : {param[:2]} \n')

Layer: embedding.weight | Size: torch.Size([10070, 300]) | Values : tensor([[-4.9781e-01, -7.7608e-01, -6.5185e-01,  1.6786e+00, -1.1290e+00,
         -9.9998e-01,  3.1039e-01,  4.6455e-01, -1.1168e+00,  1.1845e+00,
          3.7747e-01, -5.0946e-01,  8.8729e-01, -1.7501e+00, -3.3604e-01,
         -3.8515e-01, -5.9830e-01,  4.5290e-01,  7.5620e-01, -1.0394e+00,
         -3.7295e-01, -4.5421e-01,  9.0775e-02, -4.9140e-01,  1.3058e+00,
          4.6818e-01,  7.3596e-01, -8.1127e-01,  6.9989e-01, -5.6058e-02,
         -2.0802e-01,  1.3760e+00,  8.7889e-01, -3.1846e-01,  4.9374e-01,
          1.1827e+00,  1.4793e+00,  3.7898e-01, -8.4512e-01,  6.2153e-01,
          7.5080e-01,  9.3488e-01, -6.8032e-01, -7.7463e-02, -1.1154e+00,
         -6.4441e-01, -3.8082e-01,  9.6739e-01, -1.6951e+00,  1.5994e-01,
         -4.7873e-01,  3.8184e-01, -8.0511e-01,  4.4277e-01,  1.1747e+00,
         -9.4026e-01, -8.8750e-01,  1.8181e-02, -7.3827e-01,  1.1808e-01,
          4.7645e-01, -1.8497e+00,  1.0097e+

#Training and Evaluation

In [None]:
#hyperparameters
learning_rate = 0.0001
#batch size has already been defined

#loss function for th multi-class clasificiation problem
loss_fn = nn.CrossEntropyLoss() #since this is a multi class clasificiation problem we will use Cross Entropy loss

#optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [None]:
#just for the personal refernce to know how many times the loop will work
epochs = 5
num_batches = len(dataloader)
print(num_batches)

num_prints_per_epoch = num_batches//100
print(f"{num_prints_per_epoch}")

total_prints  = num_prints_per_epoch * epochs
print(f"{total_prints}")



873
8
40


*** Forward pass will pass the middle word into the model to get predictions and will calculate loss compared to output and true context word. Whereas, backward pass will update parameters using optimizer.step()***


In [None]:
# Training Loop
num_epochs = 5


for epoch in range(num_epochs):
    total_loss = 0

    for batch_idx, (center, context) in enumerate(dataloader):
        optimizer.zero_grad() #clears previous gradients to prevent accumulation
        center = center.long()
        context = context.long()

        # Forward pass
        output = model(center)
        loss = loss_fn(output, context)

        # Backward pass and optimization
        loss.backward()
        optimizer.step() #update parameters

        total_loss += loss.item()

        #will print in gaps of 100 batches
        if batch_idx % 100 == 0:
            print(f"Batch {batch_idx}, Loss: {loss.item():.4f}")

    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")



Batch 0, Loss: 9.2173
Batch 100, Loss: 9.2173
Batch 200, Loss: 9.2173
Batch 300, Loss: 9.2173
Batch 400, Loss: 9.2173
Batch 500, Loss: 9.2172
Batch 600, Loss: 9.2171
Batch 700, Loss: 9.2168
Batch 800, Loss: 9.2158
Epoch 1/5, Loss: 9.2169
Batch 0, Loss: 9.2101
Batch 100, Loss: 9.2136
Batch 200, Loss: 9.2172
Batch 300, Loss: 9.2124
Batch 400, Loss: 9.2092
Batch 500, Loss: 9.2086
Batch 600, Loss: 9.2172
Batch 700, Loss: 9.2163
Batch 800, Loss: 9.2107
Epoch 2/5, Loss: 9.2125
Batch 0, Loss: 9.2120
Batch 100, Loss: 9.2094
Batch 200, Loss: 9.2172
Batch 300, Loss: 9.2022
Batch 400, Loss: 9.2150
Batch 500, Loss: 9.2168
Batch 600, Loss: 9.2088
Batch 700, Loss: 9.1945
Batch 800, Loss: 9.2153
Epoch 3/5, Loss: 9.2110
Batch 0, Loss: 9.2090
Batch 100, Loss: 9.2159
Batch 200, Loss: 9.2095
Batch 300, Loss: 9.2064
Batch 400, Loss: 9.2053
Batch 500, Loss: 9.2091
Batch 600, Loss: 9.2079
Batch 700, Loss: 9.2111
Batch 800, Loss: 9.2097
Epoch 4/5, Loss: 9.2101
Batch 0, Loss: 9.2134
Batch 100, Loss: 9.2174
Ba

#Inference words

***Using cosine similarity which means if closer to 1 → more similar words
and if closer to 0 → unrelated words ***

In [None]:
import torch.nn.functional as F

#function to throw a statement if word is not present in vocabulary
def find_similar_words(word, top_k=10):
    if word not in vocab:
        print("Word not in vocabulary!")
        return

    word_idx = torch.tensor([vocab[word]], dtype=torch.long)
    word_vector = model.embedding(word_idx).detach() #this will be used for word embedding

    # use cosine similarity
    similarities = []
    for i in range(vocab_size):
        other_vector = model.embedding(torch.tensor([i], dtype=torch.long)).detach() #embedding every other word in vocab
        similarity = F.cosine_similarity(word_vector, other_vector).item()
        similarities.append((index2word[i], similarity)) #this will store word and similarity score in similarities list

    # Sort by similarity and return top-K words
    similarities = sorted(similarities, key=lambda x: x[1], reverse=True)
    return [word for word, _ in similarities[:top_k]]

#example of vocab not present in dictionary
print(find_similar_words("religion", top_k=10))
#example 2
print(find_similar_words("film", top_k=10))

Word not in vocabulary!
None
['film', 'riverbanks', 'termination', 'yucatn', 'birth', 'marvel', 'rode', 'bring', 'helpful', 'terri']
