In [105]:
import re
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.init as init


In [106]:
# Path to the input text file
input_file_path = "Harry potter lab dataset.txt"

# Function to clean and split text
def clean_and_split_text(file_path):
    # Read the text file
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    
    # Remove punctuation (including speech marks)
    text = re.sub(r'[^\w\s]', '', text.lower())
    
    # Split text by whitespace
    words = text.split()
    
    return words

# Get the cleaned and split words
words = clean_and_split_text(input_file_path)

print(words)

['he', 'found', 'it', 'a', 'lot', 'harder', 'to', 'concentrate', 'on', 'drills', 'that', 'afternoon', 'and', 'when', 'he', 'left', 'the', 'building', 'at', 'five', 'oclock', 'he', 'was', 'still', 'so', 'worried', 'that', 'he', 'walked', 'straight', 'into', 'someone', 'just', 'outside', 'the', 'door', 'sorry', 'he', 'grunted', 'as', 'the', 'tiny', 'old', 'man', 'stumbled', 'and', 'almost', 'fell', 'it', 'was', 'a', 'few', 'seconds', 'before', 'mr', 'dursley', 'realized', 'that', 'the', 'man', 'was', 'wearing', 'a', 'violet', 'cloak', 'he', 'didnt', 'seem', 'at', 'all', 'upset', 'at', 'being', 'almost', 'knocked', 'to', 'the', 'ground', 'on', 'the', 'contrary', 'his', 'face', 'split', 'into', 'a', 'wide', 'smile', 'and', 'he', 'said', 'in', 'a', 'squeaky', 'voice', 'that', 'made', 'passersby', 'stare', 'dont', 'be', 'sorry', 'my', 'dear', 'sir', 'for', 'nothing', 'could', 'upset', 'me', 'today', 'rejoice', 'for', 'you', 'knowwho', 'has', 'gone', 'at', 'last', 'even', 'muggles', 'like', '

In [107]:
len(words)

298

In [108]:
sum(pd.Series(words).duplicated())

127

In [109]:
words_series = pd.Series(words)
unique_words = words_series.drop_duplicates().tolist()
print(unique_words)

['he', 'found', 'it', 'a', 'lot', 'harder', 'to', 'concentrate', 'on', 'drills', 'that', 'afternoon', 'and', 'when', 'left', 'the', 'building', 'at', 'five', 'oclock', 'was', 'still', 'so', 'worried', 'walked', 'straight', 'into', 'someone', 'just', 'outside', 'door', 'sorry', 'grunted', 'as', 'tiny', 'old', 'man', 'stumbled', 'almost', 'fell', 'few', 'seconds', 'before', 'mr', 'dursley', 'realized', 'wearing', 'violet', 'cloak', 'didnt', 'seem', 'all', 'upset', 'being', 'knocked', 'ground', 'contrary', 'his', 'face', 'split', 'wide', 'smile', 'said', 'in', 'squeaky', 'voice', 'made', 'passersby', 'stare', 'dont', 'be', 'my', 'dear', 'sir', 'for', 'nothing', 'could', 'me', 'today', 'rejoice', 'you', 'knowwho', 'has', 'gone', 'last', 'even', 'muggles', 'like', 'yourself', 'should', 'celebrating', 'this', 'happy', 'day', 'hugged', 'around', 'middle', 'off', 'stood', 'rooted', 'spot', 'had', 'been', 'by', 'complete', 'stranger', 'also', 'thought', 'called', 'muggle', 'whatever', 'rattled'

In [110]:
sum(pd.Series(unique_words).duplicated())

0

In [111]:
len(unique_words)

171

In [112]:
word_to_index = {}
def one_hot_vectors(words):
    # Create a dictionary to map each
    # unique word to an index
    for i, word in enumerate(words):
        word_to_index[word] = i
    
    # Create one-hot encoded vectors for
    # each word in the corpus
    one_hot_vectors = []
    for word in words:
        vector = np.zeros(len(words))
        vector[word_to_index[word]] = 1
        one_hot_vectors.append(vector)

    return one_hot_vectors

In [113]:
encoded_words = one_hot_vectors(unique_words)

In [114]:
encoded_words

[array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0.]),
 array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.

In [115]:
len(encoded_words)

171

# QUESTION 5

In [116]:
def create_context_dataset(words, window_size=2):
    context_pairs = []

    for i in range(window_size, len(words) - window_size):
        input_word = words[i]
        context_words = words[i - window_size:i] + words[i + 1:i + 1 + window_size]

        for context_word in context_words:
            context_pairs.append((input_word, context_word))
    
    return context_pairs

context_pairs = create_context_dataset(words)

for pair in context_pairs:
    print(f"Input (Word): {pair[0]}, Label (Context Word): {pair[1]}")

Input (Word): it, Label (Context Word): he
Input (Word): it, Label (Context Word): found
Input (Word): it, Label (Context Word): a
Input (Word): it, Label (Context Word): lot
Input (Word): a, Label (Context Word): found
Input (Word): a, Label (Context Word): it
Input (Word): a, Label (Context Word): lot
Input (Word): a, Label (Context Word): harder
Input (Word): lot, Label (Context Word): it
Input (Word): lot, Label (Context Word): a
Input (Word): lot, Label (Context Word): harder
Input (Word): lot, Label (Context Word): to
Input (Word): harder, Label (Context Word): a
Input (Word): harder, Label (Context Word): lot
Input (Word): harder, Label (Context Word): to
Input (Word): harder, Label (Context Word): concentrate
Input (Word): to, Label (Context Word): lot
Input (Word): to, Label (Context Word): harder
Input (Word): to, Label (Context Word): concentrate
Input (Word): to, Label (Context Word): on
Input (Word): concentrate, Label (Context Word): harder
Input (Word): concentrate, Labe

# Question 6

In [117]:
class WordDataset(Dataset):
    def __init__(self, context_pairs, word_to_index):
        self.context_pairs = context_pairs
        self.word_to_index = word_to_index
    
    def __len__(self):
        return len(self.context_pairs)
    
    def __getitem__(self, idx):
        input_word, context_word = self.context_pairs[idx]
        input_idx = self.word_to_index[input_word]
        context_idx = self.word_to_index[context_word]
        return torch.tensor(input_idx), torch.tensor(context_idx)

class Word2VecModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(Word2VecModel, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, input_word):
        embed = self.embeddings(input_word)
        output = self.linear(embed)
        return output
    
def init_weights(m):
    if isinstance(m, nn.Embedding):
        nn.init.normal_(m.weight, mean=0.0, std=0.1)
    elif isinstance(m, nn.Linear):
        nn.init.normal_(m.weight, mean=0.0, std=0.1)

# Convert context pairs to indices
dataset = WordDataset(context_pairs, word_to_index)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

# Define the model
embedding_dim = 100
vocab_size = len(unique_words)
model = Word2VecModel(vocab_size, embedding_dim)
model.apply(init_weights)

# Define loss function and optimizer
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.1)

# Training loop
for epoch in range(50):  # Number of epochs
    total_loss = 0
    for input_word, context_word in dataloader:
        input_word = input_word.long()
        context_word = context_word.long()

        # Forward pass
        output = model(input_word)
        
        # Compute loss
        loss = loss_fn(output, context_word)
        
        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    
    print(f"Epoch {epoch+1}, Loss: {total_loss}")

# Save the trained model
torch.save(model.state_dict(), 'word2vec_model.pth')


Epoch 1, Loss: 108.22525262832642
Epoch 2, Loss: 103.54651975631714
Epoch 3, Loss: 94.27450394630432
Epoch 4, Loss: 88.43719935417175
Epoch 5, Loss: 85.4240026473999
Epoch 6, Loss: 84.4989173412323
Epoch 7, Loss: 82.97266411781311
Epoch 8, Loss: 81.56163191795349
Epoch 9, Loss: 81.55856537818909
Epoch 10, Loss: 81.72530007362366
Epoch 11, Loss: 81.84100341796875
Epoch 12, Loss: 81.46742749214172
Epoch 13, Loss: 82.49884915351868
Epoch 14, Loss: 83.39615797996521
Epoch 15, Loss: 83.17169666290283
Epoch 16, Loss: 83.5880081653595
Epoch 17, Loss: 84.22199559211731
Epoch 18, Loss: 82.93829798698425
Epoch 19, Loss: 84.78184533119202
Epoch 20, Loss: 82.0993709564209
Epoch 21, Loss: 81.74229192733765
Epoch 22, Loss: 83.56153559684753
Epoch 23, Loss: 83.00392723083496
Epoch 24, Loss: 80.5343029499054
Epoch 25, Loss: 81.9312071800232
Epoch 26, Loss: 80.12239074707031
Epoch 27, Loss: 76.95668005943298
Epoch 28, Loss: 77.94024705886841
Epoch 29, Loss: 75.46818089485168
Epoch 30, Loss: 75.41433548