In [356]:
import re
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.init as init


In [357]:
# Path to the input text file
input_file_path = "Harry potter lab dataset.txt"

# Function to clean and split text
def clean_and_split_text(file_path):
    # Read the text file
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    
    # Remove punctuation (including speech marks)
    text = re.sub(r'[^\w\s]', '', text.lower())
    
    # Split text by whitespace
    words = text.split()
    
    return words

# Get the cleaned and split words
words = clean_and_split_text(input_file_path)

print(words)

['he', 'found', 'it', 'a', 'lot', 'harder', 'to', 'concentrate', 'on', 'drills', 'that', 'afternoon', 'and', 'when', 'he', 'left', 'the', 'building', 'at', 'five', 'oclock', 'he', 'was', 'still', 'so', 'worried', 'that', 'he', 'walked', 'straight', 'into', 'someone', 'just', 'outside', 'the', 'door', 'sorry', 'he', 'grunted', 'as', 'the', 'tiny', 'old', 'man', 'stumbled', 'and', 'almost', 'fell', 'it', 'was', 'a', 'few', 'seconds', 'before', 'mr', 'dursley', 'realized', 'that', 'the', 'man', 'was', 'wearing', 'a', 'violet', 'cloak', 'he', 'didnt', 'seem', 'at', 'all', 'upset', 'at', 'being', 'almost', 'knocked', 'to', 'the', 'ground', 'on', 'the', 'contrary', 'his', 'face', 'split', 'into', 'a', 'wide', 'smile', 'and', 'he', 'said', 'in', 'a', 'squeaky', 'voice', 'that', 'made', 'passersby', 'stare', 'dont', 'be', 'sorry', 'my', 'dear', 'sir', 'for', 'nothing', 'could', 'upset', 'me', 'today', 'rejoice', 'for', 'you', 'knowwho', 'has', 'gone', 'at', 'last', 'even', 'muggles', 'like', '

In [358]:
len(words)

298

In [359]:
sum(pd.Series(words).duplicated())

127

In [360]:
words_series = pd.Series(words)
unique_words = words_series.drop_duplicates().tolist()
print(unique_words)

['he', 'found', 'it', 'a', 'lot', 'harder', 'to', 'concentrate', 'on', 'drills', 'that', 'afternoon', 'and', 'when', 'left', 'the', 'building', 'at', 'five', 'oclock', 'was', 'still', 'so', 'worried', 'walked', 'straight', 'into', 'someone', 'just', 'outside', 'door', 'sorry', 'grunted', 'as', 'tiny', 'old', 'man', 'stumbled', 'almost', 'fell', 'few', 'seconds', 'before', 'mr', 'dursley', 'realized', 'wearing', 'violet', 'cloak', 'didnt', 'seem', 'all', 'upset', 'being', 'knocked', 'ground', 'contrary', 'his', 'face', 'split', 'wide', 'smile', 'said', 'in', 'squeaky', 'voice', 'made', 'passersby', 'stare', 'dont', 'be', 'my', 'dear', 'sir', 'for', 'nothing', 'could', 'me', 'today', 'rejoice', 'you', 'knowwho', 'has', 'gone', 'last', 'even', 'muggles', 'like', 'yourself', 'should', 'celebrating', 'this', 'happy', 'day', 'hugged', 'around', 'middle', 'off', 'stood', 'rooted', 'spot', 'had', 'been', 'by', 'complete', 'stranger', 'also', 'thought', 'called', 'muggle', 'whatever', 'rattled'

In [361]:
sum(pd.Series(unique_words).duplicated())

0

In [362]:
len(unique_words)

171

In [363]:
word_to_index = {}
def one_hot_vectors(words):
    # Create a dictionary to map each
    # unique word to an index
    for i, word in enumerate(words):
        word_to_index[word] = i
    
    # Create one-hot encoded vectors for
    # each word in the corpus
    one_hot_vectors = []
    for word in words:
        vector = np.zeros(len(words))
        vector[word_to_index[word]] = 1
        one_hot_vectors.append(vector)

    return one_hot_vectors

In [364]:
encoded_words = one_hot_vectors(unique_words)

In [365]:
encoded_words

[array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0.]),
 array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.

In [366]:
len(encoded_words)

171

# QUESTION 5

In [367]:
def create_context_dataset(words, window_size=2):
    context_pairs = []

    for i in range(window_size, len(words) - window_size):
        input_word = words[i]
        context_words = words[i - window_size:i] + words[i + 1:i + 1 + window_size]

        for context_word in context_words:
            context_pairs.append((input_word, context_word))
    
    return context_pairs

context_pairs = create_context_dataset(words)

for pair in context_pairs:
    print(f"Input (Word): {pair[0]}, Label (Context Word): {pair[1]}")

Input (Word): it, Label (Context Word): he
Input (Word): it, Label (Context Word): found
Input (Word): it, Label (Context Word): a
Input (Word): it, Label (Context Word): lot
Input (Word): a, Label (Context Word): found
Input (Word): a, Label (Context Word): it
Input (Word): a, Label (Context Word): lot
Input (Word): a, Label (Context Word): harder
Input (Word): lot, Label (Context Word): it
Input (Word): lot, Label (Context Word): a
Input (Word): lot, Label (Context Word): harder
Input (Word): lot, Label (Context Word): to
Input (Word): harder, Label (Context Word): a
Input (Word): harder, Label (Context Word): lot
Input (Word): harder, Label (Context Word): to
Input (Word): harder, Label (Context Word): concentrate
Input (Word): to, Label (Context Word): lot
Input (Word): to, Label (Context Word): harder
Input (Word): to, Label (Context Word): concentrate
Input (Word): to, Label (Context Word): on
Input (Word): concentrate, Label (Context Word): harder
Input (Word): concentrate, Labe

# Question 6

In [368]:
class WordDataset(Dataset):
    def __init__(self, context_pairs, word_to_index):
        self.context_pairs = context_pairs
        self.word_to_index = word_to_index
    
    def __len__(self):
        return len(self.context_pairs)
    
    def __getitem__(self, idx):
        input_word, context_word = self.context_pairs[idx]
        input_idx = self.word_to_index[input_word]
        context_idx = self.word_to_index[context_word]
        return torch.tensor(input_idx), torch.tensor(context_idx)

class Word2VecModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(Word2VecModel, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, input_word):
        embed = self.embeddings(input_word)
        output = self.linear(embed)
        return output
    
def init_weights(m):
    if isinstance(m, nn.Embedding):
        nn.init.normal_(m.weight, mean=0.0, std=0.1)
    elif isinstance(m, nn.Linear):
        nn.init.normal_(m.weight, mean=0.0, std=0.1)

# Convert context pairs to indices
dataset = WordDataset(context_pairs, word_to_index)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

# Define the model
embedding_dim = 100
vocab_size = len(unique_words)
model = Word2VecModel(vocab_size, embedding_dim)
model.apply(init_weights)

# Define loss function and optimizer
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.1)

# Training loop
for epoch in range(100):  # Number of epochs
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0
    
    for input_word, context_word in dataloader:
        input_word = input_word.long()
        context_word = context_word.long()

        # Forward pass
        output = model(input_word)
        
        # Compute loss
        loss = loss_fn(output, context_word)
        
        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Calculate accuracy
        _, predicted_word = torch.max(output, 1)
        correct_predictions += (predicted_word == context_word).sum().item()
        total_predictions += context_word.size(0)

        total_loss += loss.item()
    
    accuracy = correct_predictions / total_predictions
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}, Accuracy: {accuracy:.4f}")

# Save the trained model
torch.save(model.state_dict(), 'word2vec_model.pth')


Epoch 1, Loss: 108.6520, Accuracy: 0.0272
Epoch 2, Loss: 103.3926, Accuracy: 0.0850
Epoch 3, Loss: 93.6318, Accuracy: 0.0765
Epoch 4, Loss: 88.0157, Accuracy: 0.0867
Epoch 5, Loss: 85.7031, Accuracy: 0.0808
Epoch 6, Loss: 82.6931, Accuracy: 0.0782
Epoch 7, Loss: 81.1249, Accuracy: 0.0859
Epoch 8, Loss: 82.3276, Accuracy: 0.0842
Epoch 9, Loss: 82.9989, Accuracy: 0.0765
Epoch 10, Loss: 80.3996, Accuracy: 0.0978
Epoch 11, Loss: 81.2133, Accuracy: 0.0833
Epoch 12, Loss: 82.6376, Accuracy: 0.0748
Epoch 13, Loss: 82.9301, Accuracy: 0.0901
Epoch 14, Loss: 82.4911, Accuracy: 0.0850
Epoch 15, Loss: 81.4803, Accuracy: 0.0859
Epoch 16, Loss: 80.4400, Accuracy: 0.0842
Epoch 17, Loss: 80.5853, Accuracy: 0.0884
Epoch 18, Loss: 78.6347, Accuracy: 0.0927
Epoch 19, Loss: 81.1277, Accuracy: 0.0893
Epoch 20, Loss: 81.1753, Accuracy: 0.0876
Epoch 21, Loss: 78.7235, Accuracy: 0.0884
Epoch 22, Loss: 80.5589, Accuracy: 0.0731
Epoch 23, Loss: 81.6613, Accuracy: 0.0799
Epoch 24, Loss: 82.6284, Accuracy: 0.0859

# Question 7

In [369]:
import torch

def one_hot_to_index(one_hot_vector):
    # Convert the one-hot vector from numpy array to a torch tensor
    one_hot_tensor = torch.tensor(one_hot_vector, dtype=torch.float32)
    
    # Find the index of the maximum value (which is 1 in a one-hot vector)
    index = torch.argmax(one_hot_tensor).item()
    return index

def get_embedding_from_one_hot(one_hot_vector, model, index_to_word):
    # Convert one-hot vector to index
    index = one_hot_to_index(one_hot_vector)
    
    # Retrieve the word corresponding to this index
    word = index_to_word[index]
    
    # Convert the index to a tensor and pass it through the model's embedding layer
    index_tensor = torch.tensor([index])
    
    # Get the embedding
    with torch.no_grad():  # No need to compute gradients during inference
        embedding = model.embeddings(index_tensor)
    
    return word, embedding

# Create a mapping from index to word
index_to_word = {i: word for i, word in enumerate(unique_words)}

# Get the one-hot vector for a specific word (e.g., the first unique word)
one_hot_vector = encoded_words[0]  # For demonstration, take the first word's one-hot vector

# Get the corresponding word and embedding
word, embedding = get_embedding_from_one_hot(one_hot_vector, model, index_to_word)

# Print the word and its embedding
print(f"Embedding for the word '{word}': {embedding}")


Embedding for the word 'he': tensor([[ 0.0547,  0.0244,  0.2581,  0.1208, -0.2401, -0.0444,  0.0797,  0.0398,
          0.0919, -0.1399, -0.1886,  0.0666,  0.0676,  0.0110, -0.0780, -0.0051,
          0.1710,  0.2429, -0.1419,  0.0851, -0.0924, -0.0696, -0.0182,  0.2324,
          0.0458, -0.2074,  0.0507, -0.0232, -0.0863,  0.0454,  0.1378,  0.1273,
         -0.1693, -0.0571,  0.0798, -0.0428, -0.0892, -0.1560, -1.1240, -0.1941,
         -0.0845, -0.2582, -0.0225, -0.0726,  0.0318,  0.0026, -0.0639, -0.3026,
          0.1058,  0.1128, -0.1308,  0.1383,  0.0948,  0.1203,  0.3060, -0.2140,
         -0.1641, -0.0478,  0.0449, -0.0577,  0.0455,  0.0640,  0.0655,  0.1264,
          0.0300,  0.0281,  0.2063, -0.1871, -0.1599,  1.5239, -0.2781, -0.0196,
         -0.0020, -0.2046,  0.1219,  0.0208, -0.0471, -0.1844, -0.0226,  0.0224,
         -0.0326,  0.0901, -0.0562,  0.3012,  0.0151,  0.0071, -0.0561, -0.0973,
          0.0816,  0.1324,  0.0876,  0.1104, -0.1100, -0.1186,  0.0932, -0.2338,