## Import Libraries

In [3]:
!pip install pandas
!pip install pyarrow
!pip install numpy
!pip install sentencepiece
!pip install torch

import pandas as pd
import pyarrow as pa
import numpy as np

import sentencepiece as spm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

from tqdm import tqdm
import os

Collecting pyarrow
  Downloading pyarrow-15.0.1-cp312-cp312-macosx_11_0_arm64.whl.metadata (3.0 kB)
Downloading pyarrow-15.0.1-cp312-cp312-macosx_11_0_arm64.whl (24.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.2/24.2 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hInstalling collected packages: pyarrow
Successfully installed pyarrow-15.0.1
Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (7.7 kB)
Downloading sentencepiece-0.2.0-cp312-cp312-macosx_11_0_arm64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0mm
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.2.0
Collecting torch
  Downloading torch-2.2.1-cp312-none-macosx_11_0_arm64.whl.metadata (25 kB)
Collecting filelock (from torch)
  Using cached filelock-3.13.1-py3-none-any.whl.metadata (2.8 kB)
Co

## Import Data

In [4]:
# Load the Hacker News data from a CSV file
data = pd.read_csv('hacknews.csv')
# Now, you can use the `data` DataFrame to analyse and manipulate the data.
print(data.head())

FileNotFoundError: [Errno 2] No such file or directory: 'hacknews.csv'

In [None]:
# Keep only the rows whose `type` column marks them as a story
story_mask = data["type"] == "story"
story_data = data[story_mask]

# Of those rows, keep just the title and the score
relevant_data = story_data.loc[:, ["title", "score"]]

# Drop every row where either the title or the score is missing
processed_data = relevant_data.dropna()

print(processed_data.head())

In [None]:
# Convert the cleaned DataFrame to a NumPy array.
# Columns keep the order selected earlier: column 0 is the title, column 1 is the score.
data_array = processed_data.to_numpy()
print(data_array)

## Process Data with SentencePiece

In [None]:
# Join all the titles together, one title per line
corpus = "\n".join(data_array[:, 0])
# Only show a short preview: printing the whole corpus would dump every title
# into the cell output and bloat the notebook.
print(corpus[:500])

In [None]:
# Get the titles (column 0 of the array)
title_array = data_array[:, 0]

# Write the corpus to a file, one title per line.
# The encoding is pinned to UTF-8 so the file does not depend on the platform's
# default text encoding (which is not UTF-8 everywhere, e.g. on Windows).
corpus = "\n".join(title_array)
with open("corpus.txt", "w", encoding="utf-8") as f:
    f.write(corpus)

# Train SentencePiece model; this writes `hackernews.model` and `hackernews.vocab`
spm.SentencePieceTrainer.train(input="corpus.txt", model_prefix='hackernews', vocab_size=10000)

In [None]:
# Load the trained SentencePiece model
sp = spm.SentencePieceProcessor()
sp.load('hackernews.model')

# For every title keep both views of the tokenisation:
#   tokenized_titles -> the string pieces
#   id_titles        -> the vocabulary id of each piece
tokenized_titles = []
id_titles = []
for title in title_array:
    pieces = sp.encode_as_pieces(title)
    tokenized_titles.append(pieces)
    id_titles.append([sp.piece_to_id(piece) for piece in pieces])

## Apply word2vec to tokens

In [None]:
# Hyperparameters shared by the CBOW and Skip-gram models below
vocab_size = sp.get_piece_size()  # number of pieces in the trained SentencePiece model
embedding_dim = 128               # size of each token embedding vector
window_size = 2                   # tokens taken on each side of the centre token
learning_rate = 0.01              # passed to the optimizers as `lr`
epochs = 5                        # passes over the word2vec training pairs
batch_size = 32                   # mini-batch size for the word2vec DataLoaders

### CBOW Model

In [None]:
# Generate all the pairs of tokens and context windows for CBOW
def generate_cbow_pairs(tokenized_sentences, window_size=2):
    """Build the (context, target) training pairs used by CBOW.

    Args:
        tokenized_sentences: iterable of token sequences (one per sentence).
        window_size: how many tokens to take on each side of the target.

    Returns:
        A list of ``(context, target)`` tuples, one per token of every
        sentence. ``context`` is the list of neighbouring tokens that fall
        inside the sentence (so it is shorter near the edges and empty for a
        one-token sentence); ``target`` is the token itself.
    """
    pairs = []
    for sentence in tokenized_sentences:
        n_tokens = len(sentence)
        for position in range(n_tokens):
            # Clamp the window to the sentence boundaries up front
            first = max(position - window_size, 0)
            last = min(position + window_size + 1, n_tokens)
            neighbours = [sentence[i] for i in range(first, last) if i != position]
            pairs.append((neighbours, sentence[position]))
    return pairs

In [None]:
# A `Dataset` class for generating pairs of tokens and context windows for CBOW one at a time, to use in the `DataLoader` to generate mini-batches
class CBOWDataset(Dataset):
    """Serves CBOW ``(context, target)`` pairs as fixed-size tensors.

    Args:
        cbow_pairs: list of ``(context, target)`` tuples, where ``context`` is
            a list of token ids and ``target`` is a single token id.
        context_size: window size on each side of the target; every returned
            context has exactly ``2 * context_size`` entries.
    """

    def __init__(self, cbow_pairs, context_size):
        self.cbow_pairs = cbow_pairs
        self.context_size = context_size

    def __len__(self):
        return len(self.cbow_pairs)

    def __getitem__(self, idx):
        """Return ``(context, target)`` for pair ``idx`` as ``torch.long`` tensors."""
        context, target = self.cbow_pairs[idx]
        full_length = 2 * self.context_size
        # Pad or trim the context to ensure uniform size.
        # A *new* list is built here: the previous `context += [...]` extended the
        # list stored inside `self.cbow_pairs`, silently modifying the caller's data.
        # NOTE(review): 0 is used as the padding index; with SentencePiece's default
        # settings id 0 is `<unk>` rather than a dedicated pad token — confirm.
        if len(context) < full_length:
            context = list(context) + [0] * (full_length - len(context))
        else:
            context = context[:full_length]
        return torch.tensor(context, dtype=torch.long), torch.tensor(target, dtype=torch.long)

In [None]:
# The CBOW model, which is just an embedding layer, an average pooling, and a final output linear layer followed by a softmax activation
class CBOWModel(nn.Module):
    """Continuous bag-of-words model: embed the context, average, project to the vocabulary.

    Args:
        vocab_size: number of rows in the embedding table and size of the output layer.
        embedding_dim: size of each embedding vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, context_words):
        """Return log-probabilities over the vocabulary for each context in the batch."""
        # Average the context embeddings along dim 1, then score every vocabulary entry
        averaged = self.embeddings(context_words).mean(dim=1)
        scores = self.linear(averaged)
        return nn.functional.log_softmax(scores, dim=1)

    def embed(self, word):
        """Switch the model to eval mode and return the embeddings for ``word``."""
        self.eval()
        return self.embeddings(word)

In [None]:
# Turn the id-tokenised Hacker News titles into CBOW training data:
# raw pairs -> Dataset (fixed-size contexts) -> shuffled mini-batches
cbow_pairs = generate_cbow_pairs(id_titles, window_size=window_size)
cbow_dataset = CBOWDataset(cbow_pairs=cbow_pairs, context_size=window_size)
cbow_dataloader = DataLoader(
    dataset=cbow_dataset,
    batch_size=batch_size,
    shuffle=True,
)

In [None]:
# Model, loss, and optimizer
cbow_model = CBOWModel(vocab_size, embedding_dim)
cbow_loss_function = nn.NLLLoss()  # the model already returns log-probabilities
cbow_optimizer = optim.SGD(cbow_model.parameters(), lr=learning_rate)

# torch.save does not create missing parent directories, so make sure the
# checkpoint folder exists before the first epoch tries to write into it.
os.makedirs("cbow_weights", exist_ok=True)

# Training loop
for epoch in range(epochs):
    total_loss = 0
    with tqdm(cbow_dataloader, unit="batch") as tepoch:
        for context, target in tepoch:
            tepoch.set_description(f"Epoch {epoch+1}")

            # Standard step: clear gradients, forward, loss, backward, update
            cbow_model.zero_grad()
            log_probs = cbow_model(context)
            loss = cbow_loss_function(log_probs, target)
            loss.backward()
            cbow_optimizer.step()

            total_loss += loss.item()
            tepoch.set_postfix(loss=loss.item())

    print(f"Epoch {epoch+1} finished with total loss: {total_loss:.4f}")
    # Save the model state after each epoch
    torch.save(cbow_model.state_dict(), os.path.join("cbow_weights", f"cbow_epoch_{epoch+1}.pth"))

### Skip-gram model

In [None]:
# Generate all the pairs of tokens and context windows for Skip-gram
def generate_skip_gram_pairs(tokenized_sentences, window_size=2):
    """Build the (center, context) training pairs used by Skip-gram.

    Args:
        tokenized_sentences: iterable of token sequences (one per sentence).
        window_size: how many positions to look on each side of the center.

    Returns:
        A list of ``(center, context)`` tuples: one tuple for every neighbour
        of every token that lies inside the sentence, in left-to-right order.
    """
    # Relative positions of the neighbours; 0 (the center itself) is excluded
    offsets = [shift for shift in range(-window_size, window_size + 1) if shift != 0]

    pairs = []
    for sentence in tokenized_sentences:
        length = len(sentence)
        for center_pos in range(length):
            center_token = sentence[center_pos]
            for shift in offsets:
                neighbour_pos = center_pos + shift
                if 0 <= neighbour_pos < length:
                    pairs.append((center_token, sentence[neighbour_pos]))
    return pairs

In [None]:
# A `Dataset` class for generating pairs of tokens and context windows for Skip-gram one at a time, to use in the `DataLoader` to generate mini-batches
class SkipGramDataset(Dataset):
    """Serves Skip-gram ``(center, context)`` pairs as ``torch.long`` tensors.

    Args:
        pairs: list of ``(center, context)`` token-id tuples.
    """

    def __init__(self, pairs):
        self.pairs = pairs

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        """Return pair ``idx`` as a tuple of two scalar ``torch.long`` tensors."""
        pair = self.pairs[idx]
        center_tensor = torch.tensor(pair[0], dtype=torch.long)
        context_tensor = torch.tensor(pair[1], dtype=torch.long)
        return center_tensor, context_tensor

In [None]:
# Turn the id-tokenised Hacker News titles into Skip-gram training data:
# raw pairs -> Dataset -> shuffled mini-batches
sg_pairs = generate_skip_gram_pairs(id_titles, window_size=window_size)
sg_dataset = SkipGramDataset(pairs=sg_pairs)
sg_dataloader = DataLoader(
    dataset=sg_dataset,
    batch_size=batch_size,
    shuffle=True,
)

In [None]:
# The Skip-gram model, which is just an embedding layer, and a final output linear layer followed by a softmax activation
class SkipGramModel(nn.Module):
    """Skip-gram model: embed the center token and project it to the vocabulary.

    Args:
        vocab_size: number of rows in the embedding table and size of the output layer.
        embedding_dim: size of each embedding vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, target_word):
        """Return log-probabilities over the vocabulary for each center token in the batch."""
        scores = self.linear(self.embeddings(target_word))
        return nn.functional.log_softmax(scores, dim=1)

    def embed(self, word):
        """Switch the model to eval mode and return the embeddings for ``word``."""
        self.eval()
        return self.embeddings(word)

In [None]:
# Model, loss, and optimizer
sg_model = SkipGramModel(vocab_size, embedding_dim)
sg_loss_function = nn.NLLLoss()  # the model already returns log-probabilities
sg_optimizer = optim.Adam(sg_model.parameters(), lr=learning_rate)

# torch.save does not create missing parent directories, so make sure the
# checkpoint folder exists before the first epoch tries to write into it.
os.makedirs("skipgram_weights", exist_ok=True)

# Training loop
for epoch in range(epochs):
    total_loss = 0
    for center_word, context_word in tqdm(sg_dataloader, desc=f"Epoch {epoch+1}"):
        # Standard step: clear gradients, forward, loss, backward, update
        sg_optimizer.zero_grad()
        log_probs = sg_model(center_word)
        loss = sg_loss_function(log_probs, context_word)
        loss.backward()
        sg_optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}: Total Loss = {total_loss}")

    # Save the model state after each epoch
    torch.save(sg_model.state_dict(), os.path.join("skipgram_weights", f"skipgram_epoch_{epoch+1}.pth"))

## Predicting the Score

In [None]:
# A `Dataset` class for generating pairs of id-tokenised titles and scores one at a time, to use in the `DataLoader` to generate mini-batches
class TitlesAndScores(Dataset):
    """Pairs each title (as a list of token ids) with its score.

    Titles are tokenised once, at construction time, with the module-level
    SentencePiece processor ``sp``.
    """

    def __init__(self, titles, scores):
        """
        Args:
            titles (sequence of str): Raw title strings.
            scores (sequence of float): Score associated with each title, in the same order.
        """
        self.titles = titles
        self.scores = scores

        # Keep both the string pieces and their vocabulary ids for every title
        self.tokenized_titles = []
        self.id_titles = []
        for title in self.titles:
            pieces = sp.encode_as_pieces(title)
            self.tokenized_titles.append(pieces)
            self.id_titles.append([sp.piece_to_id(piece) for piece in pieces])

    def __len__(self):
        return len(self.scores)

    def __getitem__(self, idx):
        """Return ``(token_ids, score)`` for item ``idx``; ids are a plain list, not a tensor."""
        return self.id_titles[idx], self.scores[idx]

In [None]:
# A padding function to make sure that every list of title token ids in a mini-batch is the same length, so that the batch can be tensorised
def pad_collate(batch):
    """Collate ``(token_ids, score)`` samples into two batch tensors.

    Args:
        batch: list of ``(token_ids, score)`` tuples, as produced by the dataset.

    Returns:
        ``(padded_titles, tensor_scores)``: the titles right-padded with 0 to
        the longest title in the batch (batch dimension first), and the scores
        as a float tensor.
    """
    id_titles, scores = zip(*batch)

    # One tensor per title, then pad them all to a common length with 0
    title_tensors = list(map(torch.tensor, id_titles))
    padded_titles = pad_sequence(title_tensors, batch_first=True, padding_value=0)

    tensor_scores = torch.tensor(scores, dtype=torch.float)
    return padded_titles, tensor_scores

In [None]:
# Build the score-prediction dataset from the Hacker News array:
# column 0 holds the titles, column 1 the scores
hn_titles = data_array[:, 0]
hn_scores = data_array[:, 1]
sp_dataset = TitlesAndScores(hn_titles, hn_scores)

# Wrap it in a DataLoader; `pad_collate` pads the titles of each mini-batch to a common length
sp_dataloader = DataLoader(
    dataset=sp_dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=pad_collate,
)

In [96]:
# A model for predicting scores given titles.
# This particular architecture is intended to use one of the word2vec models we trained earlier (CBOW or Skip-gram) -- with its weights frozen -- to embed the input title tokens.
# It then uses a classic neural network architecture to predict the scores given the embeddings.
# This architecture has two hidden layers with ReLU activations, and a final output layer with no activation (a linear output layer) because we are doing a regression task.
class ScorePredictor(nn.Module):
    """Regress a score from a title using frozen word2vec embeddings.

    Args:
        vocab_size: vocabulary size passed to the embedding model's constructor.
        embedding_dim: embedding size passed to the embedding model's constructor.
        hidden_dim_1: width of the first hidden layer.
        hidden_dim_2: width of the second hidden layer.
        model: embedding model *class*, called as ``model(vocab_size, embedding_dim)``;
            it must provide an ``embed`` method.
        weights: path to a saved ``state_dict`` for the embedding model.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim_1, hidden_dim_2, model, weights):
        super(ScorePredictor, self).__init__()
        self.embedding_model = model(vocab_size, embedding_dim)
        self.embedding_model.load_state_dict(torch.load(weights))
        self.embedding_model.eval()
        # Actually freeze the pretrained weights, as the description above promises,
        # so they can never receive gradients even if forward() is changed later.
        for parameter in self.embedding_model.parameters():
            parameter.requires_grad_(False)

        self.hidden_1 = nn.Linear(embedding_dim, hidden_dim_1)
        self.hidden_2 = nn.Linear(hidden_dim_1, hidden_dim_2)

        self.relu = nn.ReLU()
        self.output = nn.Linear(hidden_dim_2, 1)

    def forward(self, titles):
        """Return one score prediction per title, shape ``(batch_size, 1)``.

        ``titles`` is a batch of token-id sequences. The token embeddings are
        averaged along dim 1, so any padding positions take part in the mean.
        """
        with torch.no_grad():  # To ensure no gradients are computed for the embedding model
            embeddings = self.embedding_model.embed(titles)
        pooled_embeddings = embeddings.mean(dim=1)  # Now of shape (batch_size, embedding_dim)
        hidden_1_embeddings = self.hidden_1(pooled_embeddings)
        activated_1_embeddings = self.relu(hidden_1_embeddings)
        hidden_2_embeddings = self.hidden_2(activated_1_embeddings)
        activated_2_embeddings = self.relu(hidden_2_embeddings)
        score_predictions = self.output(activated_2_embeddings)
        return score_predictions

    def predict(self, title):
        """Switch to eval mode and return the forward-pass predictions for ``title``.

        The result was previously computed but not returned, so callers
        always received ``None``.
        """
        self.eval()
        return self.forward(title)

In [None]:
# Hyperparameters for the score predictor
hidden_dim_1= 64   # width of the first hidden layer
hidden_dim_2 = 32  # width of the second hidden layer
sp_epochs = 100    # passes over the (title, score) dataset

### Using the CBOW version of word2vec

In [None]:
# Model, loss, and optimizer
sp_cbow_model = ScorePredictor(vocab_size, embedding_dim, hidden_dim_1, hidden_dim_2, model = CBOWModel, weights = "cbow_weights/cbow_epoch_5.pth")
sp_cbow_loss_function = nn.MSELoss()  # Mean Squared Error Loss for regression
sp_cbow_optimizer = optim.Adam(sp_cbow_model.parameters(), lr=learning_rate)  # Using Adam optimizer

# torch.save does not create missing parent directories, so make sure the
# checkpoint folder exists before the first epoch tries to write into it.
os.makedirs("sp_cbow_weights", exist_ok=True)

# Training Loop
for epoch in range(sp_epochs):
    total_loss = 0
    # Wrap your dataloader with tqdm for a progress bar
    for inputs, targets in tqdm(sp_dataloader, desc=f'Epoch {epoch+1}/{sp_epochs}'):

        inputs = inputs.long()  # embedding lookups need integer indices
        sp_cbow_model.zero_grad()
        outputs = sp_cbow_model(inputs)
        loss = sp_cbow_loss_function(outputs, targets.view(outputs.size()))  # Compute loss, ensuring target shape matches output
        loss.backward()  # Backpropagation
        sp_cbow_optimizer.step()  # Update weights
        total_loss += loss.item()

    # Print the average loss for the epoch
    print(f'Epoch {epoch+1}/{sp_epochs}, Loss: {total_loss/len(sp_dataloader)}')

    # Save the model state after each epoch
    torch.save(sp_cbow_model.state_dict(), os.path.join("sp_cbow_weights", f"sp_cbow_epoch_{epoch+1}.pth"))

### Using the Skipgram version of Word2Vec

In [None]:
# Model, loss, and optimizer
sp_skipgram_model = ScorePredictor(vocab_size, embedding_dim, hidden_dim_1, hidden_dim_2, model = SkipGramModel, weights = "skipgram_weights/skipgram_epoch_5.pth")
sp_skipgram_loss_function = nn.MSELoss()  # Mean Squared Error Loss for regression
sp_skipgram_optimizer = optim.Adam(sp_skipgram_model.parameters(), lr=learning_rate)  # Using Adam optimizer

# torch.save does not create missing parent directories, so make sure the
# checkpoint folder exists before the first epoch tries to write into it.
os.makedirs("sp_skipgram_weights", exist_ok=True)

# Training Loop
for epoch in range(sp_epochs):
    total_loss = 0
    # Wrap your dataloader with tqdm for a progress bar
    for inputs, targets in tqdm(sp_dataloader, desc=f'Epoch {epoch+1}/{sp_epochs}'):

        inputs = inputs.long()  # embedding lookups need integer indices
        sp_skipgram_model.zero_grad()
        outputs = sp_skipgram_model(inputs)
        loss = sp_skipgram_loss_function(outputs, targets.view(outputs.size()))  # Compute loss, ensuring target shape matches output
        loss.backward()  # Backpropagation
        sp_skipgram_optimizer.step()  # Update weights
        total_loss += loss.item()

    # Print the average loss for the epoch
    print(f'Epoch {epoch+1}/{sp_epochs}, Loss: {total_loss/len(sp_dataloader)}')

    # Save the model state after each epoch
    torch.save(sp_skipgram_model.state_dict(), os.path.join("sp_skipgram_weights", f"sp_skipgram_epoch_{epoch+1}.pth"))

In [1]:
# A function for using our model to perform inference
def score_predictor(title, vocab_size, embedding_dim, hidden_dim_1, hidden_dim_2, embed_model, embed_weights, weights):
    """Predict the score of a single title with a trained ``ScorePredictor``.

    Args:
        title: raw title string; tokenised with the module-level SentencePiece processor ``sp``.
        vocab_size, embedding_dim, hidden_dim_1, hidden_dim_2: architecture
            sizes; they must match the ones used when ``weights`` was saved.
        embed_model: embedding model class (e.g. ``CBOWModel`` or ``SkipGramModel``).
        embed_weights: path to the saved embedding model ``state_dict``.
        weights: path to the saved ``ScorePredictor`` ``state_dict``.

    Returns:
        The predicted score as a Python float.

    Raises:
        ValueError: if ``title`` produces no tokens.
    """
    model = ScorePredictor(vocab_size, embedding_dim, hidden_dim_1, hidden_dim_2, embed_model, embed_weights)
    model.load_state_dict(torch.load(weights))
    model.eval()

    token_ids = [sp.piece_to_id(token) for token in sp.encode_as_pieces(title)]
    if not token_ids:
        raise ValueError("title produced no tokens; cannot predict a score")
    # Shape (1, n_tokens): a batch containing just this title
    title_ids = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0)

    with torch.no_grad():  # inference only, no gradients needed
        # Call the model directly instead of going through `predict`, so the
        # forward-pass result is what `.item()` below is applied to.
        score = model(title_ids)

    return score.item()

In [None]:
# Run inference on the made-up example title "Hello world",
# using the Skip-gram embeddings and the matching score-predictor checkpoint
score_prediction = score_predictor(
    "Hello world",
    vocab_size,
    embedding_dim,
    hidden_dim_1,
    hidden_dim_2,
    embed_model=SkipGramModel,
    embed_weights="skipgram_weights/skipgram_epoch_5.pth",
    weights="sp_skipgram_weights/sp_skipgram_epoch_10.pth",
)
print(score_prediction)