<a href="https://colab.research.google.com/github/TWaugh12/Projects/blob/main/TheTimeTravelerLanguageModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import numpy as np
import re
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [3]:
corpus = [line.strip() for line in open('./TheTimeMachine.txt') if line.strip()][2:]
print("\n".join(corpus[:10]))

# Tokenize the sentences into words. All lower case. Ignore punctuation.
corpus = [re.sub('[^A-Za-z0-9]+', ' ', line).lower() for line in corpus]
corpus = [re.sub(' +', ' ', line) for line in corpus]
corpus = [word for line in corpus for word in line.split()]


The Time Traveller (for so it will be convenient to speak of him)
was expounding a recondite matter to us. His grey eyes shone and
twinkled, and his usually pale face was flushed and animated. The
fire burned brightly, and the soft radiance of the incandescent
lights in the lilies of silver caught the bubbles that flashed and
passed in our glasses. Our chairs, being his patents, embraced and
caressed us rather than submitted to be sat upon, and there was that
luxurious after-dinner atmosphere when thought roams gracefully
free of the trammels of precision. And he put it to us in this
way--marking the points with a lean forefinger--as we sat and lazily


In [4]:
vocab_size = 2999
tkn_counter = Counter([word for word in corpus])
vocab = {word: idx for idx, (word, _) in enumerate(tkn_counter.most_common(vocab_size))}
vocab["/UNK"] = len(vocab)
print("Total words =", len(vocab))
print("10 most popular words are:", list(vocab.keys())[:10])


Total words = 3000
10 most popular words are: ['the', 'i', 'and', 'of', 'a', 'to', 'was', 'in', 'that', 'my']


In [5]:
class TextCorpusDataset(Dataset):
    def __init__(self, corpus, vocab, snippet_len=50):
        super().__init__()
        self.corpus = corpus
        self.snippet_len = snippet_len

        # Vocabulary (word-to-index mapping)
        self.vocab = vocab

        # Inverse vocabulary (index-to-word mapping)
        self.inv_vocab = {idx: word for word, idx in self.vocab.items()}

    def convert2idx(self, word_sequence):
        return [self.vocab[word if word in self.vocab else "/UNK"] for word in word_sequence]

    def convert2words(self, idx_sequence):
        return [self.inv_vocab[idx] for idx in idx_sequence]

    def __len__(self):
        return len(self.corpus) - self.snippet_len

    def __getitem__(self, idx):
        snippet = self.corpus[idx:idx+self.snippet_len]
        snippet = torch.tensor(self.convert2idx(snippet))
        return snippet

# Test dataset function
dataset = TextCorpusDataset(corpus, vocab, snippet_len=50)
snippet = dataset[1234]
print("\nRandom snippet from the corpus.")
print("  * Token IDS:\t", snippet)
print("  * Words:\t\t", " ".join([dataset.inv_vocab[i] for i in snippet.tolist()]))


Random snippet from the corpus.
  * Token IDS:	 tensor([ 312,   54,   27,   42,  600,    3, 1472,  110,   15,  108,  439,    3,
          18,  108,   72,  130,    4,  849,   51,   52,  370,  187,    3, 1472,
        2275,  231,  182,    0,  235,   17,    4, 1473,   64,   37,  371,  151,
         130,    0,  849,    7,   20, 2276,   26,  188,  219,   63,  140, 1462,
           7,    4])
  * Words:		 course we have no means of staying back for any length of time any more than a savage or an animal has of staying six feet above the ground but a civilized man is better off than the savage in this respect he can go up against gravitation in a


In [6]:
# Define the Word2Vec CBOW model
class Word2Vec_CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, context):
        context_embeds = self.embeddings(context)
        avg_context_embeds = context_embeds.mean(dim=1)
        logits = self.linear(avg_context_embeds)
        return logits

# Hyperparameters
context_len = 2
vocab_size = len(dataset.vocab)
embedding_dim = 128
learning_rate = 5e-3
num_epochs = 100

# Create DataLoader for batch training
dataset = TextCorpusDataset(corpus, vocab, snippet_len=2*context_len + 1)
train_loader = DataLoader(dataset, batch_size=64, shuffle=True)

# Create and train the CBOW model
w2v = Word2Vec_CBOW(vocab_size, embedding_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(w2v.parameters(), lr=learning_rate)
context_idx = [idx for idx in range(2*context_len+1) if idx != context_len]
for epoch in range(num_epochs):
    total_loss = 0
    for snippet in train_loader:
        context = snippet[:, context_idx].to(device)
        target = snippet[:, context_len].to(device)
        logits = w2v(context)
        loss = criterion(logits, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item() / len(train_loader)
    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {total_loss:.4f}')


Epoch [1/100], Loss: 6.0542
Epoch [2/100], Loss: 4.6372
Epoch [3/100], Loss: 3.8251
Epoch [4/100], Loss: 3.1766
Epoch [5/100], Loss: 2.6902
Epoch [6/100], Loss: 2.3215
Epoch [7/100], Loss: 2.0395
Epoch [8/100], Loss: 1.8193
Epoch [9/100], Loss: 1.6479
Epoch [10/100], Loss: 1.5152
Epoch [11/100], Loss: 1.4047
Epoch [12/100], Loss: 1.3189
Epoch [13/100], Loss: 1.2475
Epoch [14/100], Loss: 1.1886
Epoch [15/100], Loss: 1.1378
Epoch [16/100], Loss: 1.0949
Epoch [17/100], Loss: 1.0573
Epoch [18/100], Loss: 1.0262
Epoch [19/100], Loss: 1.0001
Epoch [20/100], Loss: 0.9727
Epoch [21/100], Loss: 0.9518
Epoch [22/100], Loss: 0.9344
Epoch [23/100], Loss: 0.9156
Epoch [24/100], Loss: 0.8993
Epoch [25/100], Loss: 0.8884
Epoch [26/100], Loss: 0.8741
Epoch [27/100], Loss: 0.8583
Epoch [28/100], Loss: 0.8531
Epoch [29/100], Loss: 0.8420
Epoch [30/100], Loss: 0.8327
Epoch [31/100], Loss: 0.8223
Epoch [32/100], Loss: 0.8160
Epoch [33/100], Loss: 0.8083
Epoch [34/100], Loss: 0.8014
Epoch [35/100], Loss: 0

In [7]:
# Function to calculate correclty predicted words
def calculate_accuracy(model, data_loader):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for snippet in data_loader:
            context = snippet[:, context_idx].to(device)
            target = snippet[:, context_len].to(device)
            logits = model(context)
            _, predicted = torch.max(logits.data, 1)
            total += target.size(0)
            correct += (predicted == target).sum().item()
    return correct / total

# Calculate accuracy on the training set
accuracy = calculate_accuracy(w2v, train_loader)
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 84.89%


In [11]:
def predict_missing_word(model, sentence, vocab, context_len):
    model.eval()  # set the model to evaluation mode
    words = sentence.split()
    for i in range(context_len, len(words) - context_len):
        context = words[i-context_len:i] + words[i+1:i+1+context_len]
        context_indices = [vocab[word] for word in context if word in vocab]
        context_tensor = torch.tensor(context_indices, dtype=torch.long).unsqueeze(0).to(device)
        with torch.no_grad():
            logits = model(context_tensor)
        predicted_index = torch.argmax(logits, dim=1).item()
        predicted_word = [word for word, idx in vocab.items() if idx == predicted_index][0]
        print(f"Context: {' '.join(context)} -> Predicted word: {predicted_word}")

In [12]:
# Extract the word embeddings to analyze it
word_embeddings = w2v.embeddings.weight.detach().cpu().numpy()

In [13]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Normalize the embeddings
normalized_embeddings = word_embeddings / np.linalg.norm(word_embeddings, axis=1, keepdims=True)

# Function to find nearest neighbors
def find_nearest_neighbors(word, embeddings, vocab, num_neighbors=5):
    if word not in vocab:
        return f"{word} not in vocabulary."

    word_index = vocab[word]
    word_embedding = embeddings[word_index].reshape(1, -1)

    # Compute similarities
    similarities = cosine_similarity(word_embedding, embeddings)[0]

    # Get the indices of the most similar embeddings
    most_similar_indices = np.argsort(-similarities)[1:num_neighbors+1]

    # Map indices back to words
    index_to_word = {idx: word for word, idx in vocab.items()}
    nearest_neighbors = [index_to_word[idx] for idx in most_similar_indices]

    return nearest_neighbors

# Example usage
word = 'actually'
nearest_neighbors = find_nearest_neighbors(word, normalized_embeddings, dataset.vocab)
print(f"Nearest neighbors of '{word}': {nearest_neighbors}")

Nearest neighbors of 'actually': ['distance', 'rushed', 'curtain', 'lucid', 'frankness']


In [14]:
class NextWordPredictionMLP(nn.Module):
    def __init__(self, num_context, embedding, depth=3, hidden_dim=50):
        super().__init__()
        self.embedding = embedding

        self.mlp = nn.Sequential()
        for d in range(depth):
            if d == 0:
                in_chans = num_context * embedding.embedding_dim
                out_chans = hidden_dim
            elif d == depth - 1:
                in_chans = hidden_dim
                out_chans = embedding.num_embeddings
            else:
                in_chans = out_chans = hidden_dim

            self.mlp.add_module(f'linear{d}', nn.Linear(in_chans, out_chans))
            self.mlp.add_module(f'bn{d}', nn.BatchNorm1d(out_chans))
            self.mlp.add_module(f'act{d}', nn.ReLU(inplace=True))

    def forward(self, context):
        emb = self.embedding(context).flatten(1)
        return self.mlp(emb)

In [15]:
def train_one_epoch(model, loss_fcn, optimizer, dataloader):
    total_loss = 0.
    for it, batch in enumerate(dataloader):
        batch_past = batch[:, :T].to(device)
        batch_now = batch[:, -1].to(device)

        pred_now = model(batch_past)
        l = loss_fcn(pred_now, batch_now)
        total_loss += l.item()

        optimizer.zero_grad()
        l.backward()
        optimizer.step()

    total_loss = total_loss / len(dataloader)
    return total_loss


def fit(model, loss_fcn, dataloader, optimizer, epochs=30):
    for ep in range(epochs):
        loss = train_one_epoch(model, loss_fcn, optimizer, dataloader)
        print(f"[Ep{ep:03}] | Loss {loss:.3f} \t Perplexity  {np.exp(loss):.3f}")


T = 10
dataset = TextCorpusDataset(corpus, vocab, snippet_len=T+1)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, drop_last=True)

model = NextWordPredictionMLP(T, w2v.embeddings, depth=2, hidden_dim=50).to(device)
opt = torch.optim.Adam(model.parameters(), lr=0.0005)
loss_fcn = F.cross_entropy

fit(model, loss_fcn, dataloader, opt, epochs=100)

[Ep000] | Loss 7.636 	 Perplexity  2071.029
[Ep001] | Loss 6.559 	 Perplexity  705.237
[Ep002] | Loss 5.897 	 Perplexity  364.012
[Ep003] | Loss 5.414 	 Perplexity  224.534
[Ep004] | Loss 5.062 	 Perplexity  157.873
[Ep005] | Loss 4.781 	 Perplexity  119.250
[Ep006] | Loss 4.563 	 Perplexity  95.882
[Ep007] | Loss 4.399 	 Perplexity  81.373
[Ep008] | Loss 4.254 	 Perplexity  70.400
[Ep009] | Loss 4.125 	 Perplexity  61.898
[Ep010] | Loss 4.016 	 Perplexity  55.467
[Ep011] | Loss 3.920 	 Perplexity  50.392
[Ep012] | Loss 3.828 	 Perplexity  45.972
[Ep013] | Loss 3.755 	 Perplexity  42.721
[Ep014] | Loss 3.685 	 Perplexity  39.851
[Ep015] | Loss 3.615 	 Perplexity  37.160
[Ep016] | Loss 3.544 	 Perplexity  34.591
[Ep017] | Loss 3.485 	 Perplexity  32.615
[Ep018] | Loss 3.437 	 Perplexity  31.101
[Ep019] | Loss 3.373 	 Perplexity  29.174
[Ep020] | Loss 3.328 	 Perplexity  27.877
[Ep021] | Loss 3.276 	 Perplexity  26.472
[Ep022] | Loss 3.230 	 Perplexity  25.284
[Ep023] | Loss 3.186 	 Perp

In [18]:
with torch.no_grad():
    prompt = " ".join(corpus[:10])
    print("PROMPT:", prompt)
    context = torch.tensor([dataset.vocab[word] for word in prompt.split()]).to(device)
    context = context.unsqueeze(0)  # Reshape it into a batch of 1
    model.train(False)
    for _ in range(100):
        next_word_logits = model(context)
        next_word_idx = next_word_logits[:, :-1].argmax(dim=1)
        next_word = dataset.inv_vocab[next_word_idx[0].item()]
        context = torch.cat((context[:, 1:], next_word_idx.unsqueeze(1)), 1)
        print(next_word, end=' ')

PROMPT: the time traveller for so it will be convenient to
speak of him was not huge fear for that what was change of looked round me i was heard a slight s further and little people there in the shadow for a sudden i gave the two view the fire to make his hand and i felt where is very large as was that the world had always weena proceeded them from weena but really of weena had matches where day i looked under them the long for beautiful day been intense red place to full and the little in a most animals that was time in the thing was 