In [66]:
import re
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.init as init


In [67]:
# Path to the input text file
input_file_path = "Harry potter lab dataset.txt"

# Function to clean and split text
def clean_and_split_text(file_path):
    # Read the text file
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    
    # Remove punctuation (including speech marks)
    text = re.sub(r'[^\w\s]', '', text.lower())
    
    # Split text by whitespace
    words = text.split()
    
    return words

# Get the cleaned and split words
words = clean_and_split_text(input_file_path)

print(words)

['he', 'found', 'it', 'a', 'lot', 'harder', 'to', 'concentrate', 'on', 'drills', 'that', 'afternoon', 'and', 'when', 'he', 'left', 'the', 'building', 'at', 'five', 'oclock', 'he', 'was', 'still', 'so', 'worried', 'that', 'he', 'walked', 'straight', 'into', 'someone', 'just', 'outside', 'the', 'door', 'sorry', 'he', 'grunted', 'as', 'the', 'tiny', 'old', 'man', 'stumbled', 'and', 'almost', 'fell', 'it', 'was', 'a', 'few', 'seconds', 'before', 'mr', 'dursley', 'realized', 'that', 'the', 'man', 'was', 'wearing', 'a', 'violet', 'cloak', 'he', 'didnt', 'seem', 'at', 'all', 'upset', 'at', 'being', 'almost', 'knocked', 'to', 'the', 'ground', 'on', 'the', 'contrary', 'his', 'face', 'split', 'into', 'a', 'wide', 'smile', 'and', 'he', 'said', 'in', 'a', 'squeaky', 'voice', 'that', 'made', 'passersby', 'stare', 'dont', 'be', 'sorry', 'my', 'dear', 'sir', 'for', 'nothing', 'could', 'upset', 'me', 'today', 'rejoice', 'for', 'you', 'knowwho', 'has', 'gone', 'at', 'last', 'even', 'muggles', 'like', '

In [68]:
len(words)

298

In [69]:
sum(pd.Series(words).duplicated())

127

In [70]:
words_series = pd.Series(words)
unique_words = words_series.drop_duplicates().tolist()
print(unique_words)

['he', 'found', 'it', 'a', 'lot', 'harder', 'to', 'concentrate', 'on', 'drills', 'that', 'afternoon', 'and', 'when', 'left', 'the', 'building', 'at', 'five', 'oclock', 'was', 'still', 'so', 'worried', 'walked', 'straight', 'into', 'someone', 'just', 'outside', 'door', 'sorry', 'grunted', 'as', 'tiny', 'old', 'man', 'stumbled', 'almost', 'fell', 'few', 'seconds', 'before', 'mr', 'dursley', 'realized', 'wearing', 'violet', 'cloak', 'didnt', 'seem', 'all', 'upset', 'being', 'knocked', 'ground', 'contrary', 'his', 'face', 'split', 'wide', 'smile', 'said', 'in', 'squeaky', 'voice', 'made', 'passersby', 'stare', 'dont', 'be', 'my', 'dear', 'sir', 'for', 'nothing', 'could', 'me', 'today', 'rejoice', 'you', 'knowwho', 'has', 'gone', 'last', 'even', 'muggles', 'like', 'yourself', 'should', 'celebrating', 'this', 'happy', 'day', 'hugged', 'around', 'middle', 'off', 'stood', 'rooted', 'spot', 'had', 'been', 'by', 'complete', 'stranger', 'also', 'thought', 'called', 'muggle', 'whatever', 'rattled'

In [71]:
sum(pd.Series(unique_words).duplicated())

0

In [72]:
len(unique_words)

171

In [73]:
word_to_index = {}
def one_hot_vectors(words):
    # Create a dictionary to map each
    # unique word to an index
    for i, word in enumerate(words):
        word_to_index[word] = i
    
    # Create one-hot encoded vectors for
    # each word in the corpus
    one_hot_vectors = []
    for word in words:
        vector = np.zeros(len(words))
        vector[word_to_index[word]] = 1
        one_hot_vectors.append(vector)

    return one_hot_vectors

In [74]:
encoded_words = one_hot_vectors(unique_words)

In [75]:
encoded_words

[array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0.]),
 array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.

In [76]:
len(encoded_words)

171

# QUESTION 5

In [77]:
def create_context_dataset(words, window_size=2):
    context_pairs = []

    for i in range(window_size, len(words) - window_size):
        input_word = words[i]
        context_words = words[i - window_size:i] + words[i + 1:i + 1 + window_size]

        for context_word in context_words:
            context_pairs.append((input_word, context_word))
    
    return context_pairs

context_pairs = create_context_dataset(words)

print(context_pairs[:6])

[('it', 'he'), ('it', 'found'), ('it', 'a'), ('it', 'lot'), ('a', 'found'), ('a', 'it')]


# Question 6

In [78]:
class WordDataset(Dataset):
    def __init__(self, context_pairs, word_to_index):
        self.context_pairs = context_pairs
        self.word_to_index = word_to_index
    
    def __len__(self):
        return len(self.context_pairs)
    
    def __getitem__(self, idx):
        input_word, context_word = self.context_pairs[idx]
        input_idx = self.word_to_index[input_word]
        context_idx = self.word_to_index[context_word]
        return torch.tensor(input_idx), torch.tensor(context_idx)

class Word2VecModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(Word2VecModel, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, input_word):
        embed = self.embeddings(input_word)
        output = self.linear(embed)
        return output
    
def init_weights(m):
    if isinstance(m, nn.Embedding):
        nn.init.normal_(m.weight, mean=0.0, std=0.1)
    elif isinstance(m, nn.Linear):
        nn.init.normal_(m.weight, mean=0.0, std=0.1)

# Convert context pairs to indices
dataset = WordDataset(context_pairs, word_to_index)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

# Define the model
embedding_dim = 100
vocab_size = len(unique_words)
model = Word2VecModel(vocab_size, embedding_dim)
model.apply(init_weights)

# Define loss function and optimizer
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.1)

# Training loop
for epoch in range(50):  # Number of epochs
    total_loss = 0
    for input_word, context_word in dataloader:
        input_word = input_word.long()
        context_word = context_word.long()

        # Forward pass
        output = model(input_word)
        
        # Compute loss
        loss = loss_fn(output, context_word)
        
        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    
    print(f"Epoch {epoch+1}, Loss: {total_loss}")

# Save the trained model
torch.save(model.state_dict(), 'word2vec_model.pth')


Epoch 1, Loss: 108.43371343612671
Epoch 2, Loss: 105.17480993270874
Epoch 3, Loss: 94.25799369812012
Epoch 4, Loss: 88.8466546535492
Epoch 5, Loss: 86.46667766571045
Epoch 6, Loss: 83.16722869873047
Epoch 7, Loss: 80.6899631023407
Epoch 8, Loss: 83.18583106994629
Epoch 9, Loss: 81.4988100528717
Epoch 10, Loss: 82.37760019302368
Epoch 11, Loss: 79.17477202415466
Epoch 12, Loss: 79.15091586112976
Epoch 13, Loss: 80.13649272918701
Epoch 14, Loss: 79.15159845352173
Epoch 15, Loss: 80.89008593559265
Epoch 16, Loss: 80.4545636177063
Epoch 17, Loss: 82.71335196495056
Epoch 18, Loss: 80.34213185310364
Epoch 19, Loss: 82.49196338653564
Epoch 20, Loss: 83.31198906898499
Epoch 21, Loss: 81.21312689781189
Epoch 22, Loss: 80.83932065963745
Epoch 23, Loss: 80.22491836547852
Epoch 24, Loss: 80.91930842399597
Epoch 25, Loss: 80.0064971446991
Epoch 26, Loss: 81.51211357116699
Epoch 27, Loss: 82.15288019180298
Epoch 28, Loss: 82.80036640167236
Epoch 29, Loss: 79.99797487258911
Epoch 30, Loss: 80.5700268