In [53]:
import numpy as np 
import pandas as pd
import torch
from datasets import load_dataset

newsqa = load_dataset("StellarMilk/newsqa")
print(newsqa)

train = newsqa["train"]

DatasetDict({
    train: Dataset({
        features: ['paragraph', 'questions', 'answers', 'questions_answers'],
        num_rows: 10327
    })
    validation: Dataset({
        features: ['paragraph', 'questions', 'answers', 'questions_answers'],
        num_rows: 574
    })
    test: Dataset({
        features: ['paragraph', 'questions', 'answers', 'questions_answers'],
        num_rows: 574
    })
})


## Data Preprocessing

In [54]:
df = train.to_pandas()
df.head()
df.isnull().sum() 

paragraph            0
questions            0
answers              0
questions_answers    0
dtype: int64

#### no null rows

#### We use a contraction map to map popular contractions to their full forms
#### Then convert the text to lower case; extract only letters from it as its enough for word2vec

In [55]:
import pandas as pd
import re

CONTRACTION_MAP = {
    "ain't": "am not", "aren't": "are not", "can't": "cannot",
    "can't've": "cannot have", "'cause": "because", "could've": "could have",
    "couldn't": "could not", "couldn't've": "could not have", "didn't": "did not",
    "doesn't": "does not", "don't": "do not", "hadn't": "had not",
    "hadn't've": "had not have", "hasn't": "has not", "haven't": "have not",
    "he'd": "he would", "he'd've": "he would have", "he'll": "he will",
    "he'll've": "he will have", "he's": "he is", "how'd": "how did",
    "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
    "I'd": "I would", "I'd've": "I would have", "I'll": "I will",
    "I'll've": "I will have", "I'm": "I am", "I've": "I have",
    "isn't": "is not", "it'd": "it would", "it'd've": "it would have",
    "it'll": "it will", "it'll've": "it will have", "it's": "it is",
    "let's": "let us", "ma'am": "madam", "mayn't": "may not",
    "might've": "might have", "mightn't": "might not", "mightn't've": "might not have",
    "must've": "must have", "mustn't": "must not", "mustn't've": "must not have",
    "needn't": "need not", "needn't've": "need not have", "o'clock": "of the clock",
    "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not",
    "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would",
    "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have",
    "she's": "she is", "should've": "should have", "shouldn't": "should not",
    "shouldn't've": "should not have", "so've": "so have", "so's": "so is",
    "that'd": "that would", "that'd've": "that would have", "that's": "that is",
    "there'd": "there would", "there'd've": "there would have", "there's": "there is",
    "they'd": "they would", "they'd've": "they would have", "they'll": "they will",
    "they'll've": "they will have", "they're": "they are", "they've": "they have",
    "to've": "to have", "wasn't": "was not", "we'd": "we would",
    "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have",
    "we're": "we are", "we've": "we have", "weren't": "were not",
    "what'll": "what will", "what'll've": "what will have", "what're": "what are",
    "what's": "what is", "what've": "what have", "when's": "when is",
    "when've": "when have", "where'd": "where did", "where's": "where is",
    "where've": "where have", "who'll": "who will", "who'll've": "who will have",
    "who's": "who is", "who've": "who have", "why's": "why is",
    "why've": "why have", "will've": "will have", "won't": "will not",
    "won't've": "will not have", "would've": "would have", "wouldn't": "would not",
    "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
    "y'all'd've": "you all would have", "y'all're": "you all are", "y'all've": "you all have",
    "you'd": "you would", "you'd've": "you would have", "you'll": "you will",
    "you'll've": "you all have", "you're": "you are", "you've": "you have"
}

# above is generated by gpt
# Regex pattern to find contractions quickly
CONTRACTIONS_RE = re.compile('({})'.format('|'.join(re.escape(key) for key in CONTRACTION_MAP.keys())), re.IGNORECASE)

def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    def replace(match):
        # Get the matched contraction, convert to lowercase, and look up in map
        try:
            return contraction_mapping[match.group(0).lower()]
        except KeyError:
            return
    # Apply the replacement using the regex pattern
    return CONTRACTIONS_RE.sub(replace, text)

#above was done just to reduce number of tokens; as certain letters like 's', 'll' etc would also be counted.

def clean_text(text):
    text = expand_contractions(text)
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    return text

df['questions_answers'] = df['questions_answers'].apply(clean_text)

# Vocabulary Creation

In [56]:
import re

text = " ".join(df['questions_answers'].tolist())

In [57]:
tokens = list(set(text.split()))             # first to set then to list as set narrows all the tokens from the corpus to only unique tokens
len(tokens)

34041

# TF-IDF vectorization
### Formula: TF * IDF where,
#### TF (token, doc) = no. of times token occurs in that particular document / total no. of tokens in that document
#### IDF (token) = log (Total no. of documents / no. of documents that contain that token)

In [58]:
from collections import defaultdict
N = len(df) 

# This is for counting how many documents each token appears in at least once.
doc_freq = defaultdict(int)

for doc_text in df['questions_answers']:
    doc_text = re.sub(r'[^a-z\s]', '', doc_text.lower())
    doc_text = re.sub(r'\s+', ' ', doc_text).strip()
    for token in set(doc_text.split()):
        doc_freq[token] += 1                       # takes all unique tokens in the doc and updates the frequency of all unique tokens in that document by 1

# calculate the final IDF score for each token
idf_scores = {token: np.log(N / (doc_freq[token] + 1)) for token in tokens}

tfidf_matrix = np.zeros((len(df), len(tokens)))

for doc_idx, i in enumerate(df['questions_answers']):
    doc = "".join(i)
    doc = doc.lower()
    doc = re.sub(r'[^a-z\s]', '', doc)
    doc = re.sub(r'\s+', ' ', doc).strip()

    doc_words = doc.split()                    # all the tokens in the doc
    n_doc = len(doc_words)                     # total no. of words/tokens in the document

    # Avoid division by zero if a document is empty after processing()
    if n_doc == 0:
        continue
        
    for token_idx, j in enumerate(tokens):
        # TF calculation
        n_j = doc_words.count(j)               # no. of times token/word 'j' occurs in that particular document
        tf = n_j / n_doc                       # TF calculation
        idf = idf_scores[j]                    # Extracting idf which was pre computed
        
        tfidf_matrix[doc_idx, token_idx] = tf * idf

print("TF-IDF Matrix Shape:", tfidf_matrix.shape)   

TF-IDF Matrix Shape: (10327, 34041)


# Example:
### finding cosine similarity between two similar words

In [59]:
ar1 = np.array(tfidf_matrix[:, tokens.index('iphone')])
ar2 = np.array(tfidf_matrix[:, tokens.index('apple')])
dot_product = ar1@ar2
norm_ar1 = np.linalg.norm(ar1)
norm_ar2 = np.linalg.norm(ar2)
if norm_ar1 > 0 and norm_ar2 > 0:
    cosine_sim = dot_product / (norm_ar1 * norm_ar2)
else:
    cosine_sim = 0.0 
cosine_sim


np.float64(0.2578297499808587)

# Word2Vec, CBOW torch implementation:
### CBOW- Continuous Bag Of Words- uses a sliding window across the corpus; takes centre word as target and passes vocab indices of surrounding words(radius= r) and passes through embedding layer to get vector embeddings of those words then to an output layer which gives a one hot enc to extract centre word

### Model Architecture:

In [60]:
import torch.nn as nn

class W2V(nn.Module):
    def __init__(self, vocab_size, emb_size):
        super().__init__()
        self.embed = nn.Embedding(num_embeddings = vocab_size, embedding_dim = emb_size)
        self.out = nn.Linear(emb_size, vocab_size)
    
    def forward(self, indices):
        embs = self.embed(indices)
        avg = embs.mean(axis = 1)            # mean across the surrounding words
        return self.out(avg)

In [61]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [62]:
vocab_size = len(tokens)
emb_size = 200
vocab_size+=1

In [63]:
idx2val = {key: val for key, val in enumerate(tokens)} #as the names suggest
val2idx = {val: key for key, val in enumerate(tokens)}
idx2val[vocab_size-1] = ''
val2idx[''] = vocab_size - 1 

In [64]:
context_size = 10
window_rad = (context_size - 1) // 2  

pad_index = val2idx['']               # padding token

contexts_data = []
targets = []

for doc in df['questions_answers']:
    doc_tokens = doc.split()
    
    padded_toks = [''] * window_rad + doc_tokens + [''] * window_rad   #padding added this way to handle case when target word is at the start(in first radius) or end(last radius)
    #Slide the window over the doc_tokens
    for i in range(len(doc_tokens)):
        target_word = doc_tokens[i]   #target
        
        center_position = i + window_rad            # where centre of the padded_toks represent the target
        
        words_before = padded_toks[center_position - window_rad : center_position]
        words_after = padded_toks[center_position + 1 : center_position + 1 + window_rad]
        
        context_words = words_before + words_after
        
        try:
            target_index = val2idx[target_word]
            context_indices = [val2idx[word] for word in context_words]
            targets.append(target_index)
            contexts_data.append(context_indices)

        except KeyError as e:        # if word not present in vocab
            pass

In [65]:
contexts_data[:5], targets[:5]  # list of lists

([[34041, 34041, 34041, 34041, 16818, 24139, 30891, 7454],
  [34041, 34041, 34041, 16042, 24139, 30891, 7454, 11648],
  [34041, 34041, 16042, 16818, 30891, 7454, 11648, 26755],
  [34041, 16042, 16818, 24139, 7454, 11648, 26755, 16042],
  [16042, 16818, 24139, 30891, 11648, 26755, 16042, 25135]],
 [16042, 16818, 24139, 30891, 7454])

## Dataset wrapper and dataloader creation
#### dataloader essentially allows model to select data in batches and allow parallel proccessing

In [66]:
from torch.utils.data import Dataset, DataLoader

class Word2VecDataset(Dataset):
    def __init__(self, contexts_data, targets):
        self.contexts_data = torch.LongTensor(contexts_data)        # as they were initially list of lists
        self.targets = torch.LongTensor(targets)

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, idx):
        return self.contexts_data[idx], self.targets[idx]

In [67]:
dataset = Word2VecDataset(contexts_data, targets)

batch_size = 64
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

## Model Initialization and training

In [68]:
model = W2V(vocab_size, emb_size).to(device)

In [69]:
def train_fn(model, optimizer, loss_fn, dataloader, device):

    model.train()
    total_loss = 0

    for contexts, targets in dataloader:

        contexts = contexts.to(device)
        targets = targets.to(device)
        
        optimizer.zero_grad()

        outputs = model(contexts)
        loss = loss_fn(outputs, targets)

        loss.backward()
        
        optimizer.step()
        
        total_loss += loss.item()
        
    return total_loss / len(dataloader)

In [71]:
import torch.optim as optim

num_epochs = 70
optimizer = optim.Adam(params = model.parameters(), lr = 0.003)
loss_fn = nn.CrossEntropyLoss()
for epoch in range(num_epochs):
    
    avg_loss = train_fn(model, optimizer, loss_fn, dataloader, device)
    
    print(f"Epoch {epoch+1:02d}/{num_epochs} | Average Training Loss: {avg_loss:.4f}")


Epoch 01/70 | Average Training Loss: 4.8486
Epoch 02/70 | Average Training Loss: 3.8633
Epoch 03/70 | Average Training Loss: 3.2094
Epoch 04/70 | Average Training Loss: 2.7709
Epoch 05/70 | Average Training Loss: 2.4714
Epoch 06/70 | Average Training Loss: 2.2579
Epoch 07/70 | Average Training Loss: 2.1032
Epoch 08/70 | Average Training Loss: 1.9801
Epoch 09/70 | Average Training Loss: 1.8851
Epoch 10/70 | Average Training Loss: 1.8084
Epoch 11/70 | Average Training Loss: 1.7459
Epoch 12/70 | Average Training Loss: 1.6946
Epoch 13/70 | Average Training Loss: 1.6509
Epoch 14/70 | Average Training Loss: 1.6123
Epoch 15/70 | Average Training Loss: 1.5789
Epoch 16/70 | Average Training Loss: 1.5513
Epoch 17/70 | Average Training Loss: 1.5256
Epoch 18/70 | Average Training Loss: 1.5026
Epoch 19/70 | Average Training Loss: 1.4819
Epoch 20/70 | Average Training Loss: 1.4648
Epoch 21/70 | Average Training Loss: 1.4480
Epoch 22/70 | Average Training Loss: 1.4305
Epoch 23/70 | Average Training L

In [72]:
torch.save(model.state_dict(), "word2vec_params.pth")             # to save the model

In [73]:
loaded_model = W2V(vocab_size, emb_size) 
loaded_model.load_state_dict(torch.load("word2vec_params.pth"))
loaded_model.eval()

W2V(
  (embed): Embedding(34042, 200)
  (out): Linear(in_features=200, out_features=34042, bias=True)
)

## Example:
#### checking cosine similarity between two similar words

In [74]:
import torch.nn.functional as F

embedding_weights = loaded_model.embed.weight
idx1 = val2idx['iphone']
idx2 = val2idx['apple']
emb1 = embedding_weights[idx1]
emb2 = embedding_weights[idx2]
emb1_unsqueezed = emb1.unsqueeze(0)
emb2_unsqueezed = emb2.unsqueeze(0)
similarity = F.cosine_similarity(emb1_unsqueezed, emb2_unsqueezed)
print(similarity)

tensor([0.0781], grad_fn=<SumBackward1>)


#### Better results could have been acheived with better tuning and longer training