<a href="https://colab.research.google.com/github/Prarthana-10/GPT-Mastery-SOC/blob/main/22B0327_WEEK2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Ensure you have the necessary libraries
!pip install torch

# Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Define the dataset class
class TextDataset(Dataset):
    """Character-level n-gram dataset built from a single string.

    Each example pairs ``context_size`` consecutive characters (as vocabulary
    indices) with the index of the character that follows them.
    """

    def __init__(self, text, context_size=2):
        self.text = text
        self.context_size = context_size
        # Sorted so that the character <-> index mapping is deterministic.
        self.vocab = sorted(set(text))
        self.char_to_idx = {}
        self.idx_to_char = {}
        for position, symbol in enumerate(self.vocab):
            self.char_to_idx[symbol] = position
            self.idx_to_char[position] = symbol
        self.data = self.create_dataset()

    def create_dataset(self):
        """Return a list of ``(context_indices, target_index)`` pairs."""
        width = self.context_size
        encode = self.char_to_idx
        return [
            (
                [encode[symbol] for symbol in self.text[start:start + width]],
                encode[self.text[start + width]],
            )
            for start in range(len(self.text) - width)
        ]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        context_idxs, target_idx = self.data[idx]
        return torch.tensor(context_idxs), torch.tensor(target_idx)

# Tiny toy corpus; every distinct character becomes one vocabulary entry.
text = "hello world. this is a simple text for training a trigram model."

# One dataset per context length: 1 previous character (bigram) or 2 (trigram).
bigram_dataset, trigram_dataset = (
    TextDataset(text, context_size=width) for width in (1, 2)
)

# Shuffled mini-batch loaders, built with identical settings for both datasets.
bigram_dataloader, trigram_dataloader = (
    DataLoader(dataset, batch_size=32, shuffle=True)
    for dataset in (bigram_dataset, trigram_dataset)
)

# Define the bigram model
class BigramModel(nn.Module):
    """Feed-forward next-character model conditioned on one previous character.

    The context index is one-hot encoded, passed through a 128-unit ReLU
    hidden layer and projected to one logit per vocabulary entry.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # Stored on the instance so forward() does not rely on a module-level
        # ``vocab_size`` variable existing (and matching this model).
        self.vocab_size = vocab_size
        self.fc1 = nn.Linear(vocab_size, 128)
        self.fc2 = nn.Linear(128, vocab_size)

    def forward(self, x):
        """Return unnormalised logits for an integer tensor of context indices.

        The one-hot vectors are summed over dim 1, so a ``(batch, 1)`` input
        becomes a ``(batch, vocab_size)`` feature matrix.
        """
        x = torch.nn.functional.one_hot(x, num_classes=self.vocab_size).float().sum(dim=1)
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Define the trigram model
class TrigramModel(nn.Module):
    """Feed-forward next-character model conditioned on two previous characters.

    Both context indices are one-hot encoded and concatenated, passed through
    a 128-unit ReLU hidden layer and projected to one logit per vocabulary
    entry.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # Stored on the instance so forward() does not rely on a module-level
        # ``vocab_size`` variable existing (and matching this model).
        self.vocab_size = vocab_size
        self.fc1 = nn.Linear(vocab_size * 2, 128)
        self.fc2 = nn.Linear(128, vocab_size)

    def forward(self, x):
        """Return unnormalised logits for a ``(batch, 2)`` integer index tensor."""
        # Flatten the per-position one-hot vectors into one row per example.
        x = torch.nn.functional.one_hot(x, num_classes=self.vocab_size).float().view(x.size(0), -1)
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Number of distinct characters, taken from the trigram dataset's vocabulary.
vocab_size = len(trigram_dataset.vocab)

# Build one untrained network per context length (bigram first, then trigram).
bigram_model, trigram_model = BigramModel(vocab_size), TrigramModel(vocab_size)

# Training function
def train_model(model, dataloader, epochs=10):
    """Train ``model`` with Adam (lr=0.001) on cross-entropy loss.

    Prints the mean per-example training loss after every epoch.

    Returns:
        list[float]: the per-epoch mean losses, in order.

    Raises:
        ValueError: if ``dataloader`` yields no examples.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    model.train()
    history = []
    for epoch in range(epochs):
        total_loss = 0.0
        num_examples = 0
        for inputs, targets in dataloader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            # The criterion returns a batch mean; weight it by the batch size
            # so a smaller final batch does not skew the epoch average.
            batch_size = targets.size(0)
            total_loss += loss.item() * batch_size
            num_examples += batch_size
        if num_examples == 0:
            raise ValueError("dataloader yielded no examples to train on")
        epoch_loss = total_loss / num_examples
        history.append(epoch_loss)
        print(f"Epoch {epoch+1}, Loss: {epoch_loss}")
    return history

# Fit each network on its own loader, announcing which one is being trained.
for banner, network, loader in (
    ("Training Bigram Model", bigram_model, bigram_dataloader),
    ("Training Trigram Model", trigram_model, trigram_dataloader),
):
    print(banner)
    train_model(network, loader)

# Evaluation function
def evaluate_model(model, dataloader):
    """Return the mean per-example cross-entropy of ``model`` over ``dataloader``.

    The model is switched to eval mode and gradients are disabled.

    Raises:
        ValueError: if ``dataloader`` yields no examples.
    """
    # Sum (rather than average) inside each batch so that dividing by the
    # example count gives a true per-example mean even when batch sizes differ.
    criterion = nn.CrossEntropyLoss(reduction="sum")
    model.eval()
    total_loss = 0.0
    num_examples = 0
    with torch.no_grad():
        for inputs, targets in dataloader:
            outputs = model(inputs)
            total_loss += criterion(outputs, targets).item()
            num_examples += targets.size(0)
    if num_examples == 0:
        raise ValueError("dataloader yielded no examples to evaluate")
    return total_loss / num_examples

# Score each trained model on the same loader it was trained on
# (bigram first, then trigram).
bigram_loss, trigram_loss = (
    evaluate_model(bigram_model, bigram_dataloader),
    evaluate_model(trigram_model, trigram_dataloader),
)

print(f"Bigram Model Loss: {bigram_loss}")
print(f"Trigram Model Loss: {trigram_loss}")


Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [None]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split

# Define the dataset class
class TextDataset(Dataset):
    """Character-level n-gram dataset built from a single string.

    Each example pairs ``context_size`` consecutive characters (as vocabulary
    indices) with the index of the character that follows them.
    """

    def __init__(self, text, context_size=2):
        self.text = text
        self.context_size = context_size
        # Sorted so that the character <-> index mapping is deterministic.
        self.vocab = sorted(set(text))
        self.char_to_idx = {}
        self.idx_to_char = {}
        for position, symbol in enumerate(self.vocab):
            self.char_to_idx[symbol] = position
            self.idx_to_char[position] = symbol
        self.data = self.create_dataset()

    def create_dataset(self):
        """Return a list of ``(context_indices, target_index)`` pairs."""
        width = self.context_size
        encode = self.char_to_idx
        return [
            (
                [encode[symbol] for symbol in self.text[start:start + width]],
                encode[self.text[start + width]],
            )
            for start in range(len(self.text) - width)
        ]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        context_idxs, target_idx = self.data[idx]
        return torch.tensor(context_idxs), torch.tensor(target_idx)

# Tiny toy corpus; every distinct character becomes one vocabulary entry.
text = "hello world. this is a simple text for training a trigram model."

# One dataset per context length: 1 previous character (bigram) or 2 (trigram).
bigram_dataset, trigram_dataset = (
    TextDataset(text, context_size=width) for width in (1, 2)
)

# Split the datasets
def split_dataset(dataset, train_ratio=0.8, dev_ratio=0.1, test_ratio=0.1):
    """Randomly partition ``dataset`` into train, dev and test subsets.

    Train and dev sizes are their ratio times ``len(dataset)``, rounded down;
    the test subset receives every remaining example so nothing is dropped.

    Raises:
        ValueError: if any ratio is negative or the three ratios do not sum
            to 1 (previously such input silently produced wrong split sizes).
    """
    import math

    ratios = (train_ratio, dev_ratio, test_ratio)
    if any(ratio < 0 for ratio in ratios) or not math.isclose(sum(ratios), 1.0):
        raise ValueError(
            f"split ratios must be non-negative and sum to 1, got {ratios}"
        )
    total = len(dataset)
    train_size = int(train_ratio * total)
    dev_size = int(dev_ratio * total)
    # Remainder goes to test so rounding never loses examples.
    test_size = total - train_size - dev_size
    return random_split(dataset, [train_size, dev_size, test_size])

# Random train/dev/test partitions, drawn for the bigram data first.
bigram_train, bigram_dev, bigram_test = split_dataset(bigram_dataset)
trigram_train, trigram_dev, trigram_test = split_dataset(trigram_dataset)

# Mini-batch loaders; only the training loaders shuffle.
bigram_train_loader, bigram_dev_loader, bigram_test_loader = (
    DataLoader(subset, batch_size=32, shuffle=reshuffle)
    for subset, reshuffle in ((bigram_train, True), (bigram_dev, False), (bigram_test, False))
)

trigram_train_loader, trigram_dev_loader, trigram_test_loader = (
    DataLoader(subset, batch_size=32, shuffle=reshuffle)
    for subset, reshuffle in ((trigram_train, True), (trigram_dev, False), (trigram_test, False))
)

# Define the bigram model
class BigramModel(nn.Module):
    """Feed-forward next-character model conditioned on one previous character.

    The context index is one-hot encoded, passed through a 128-unit ReLU
    hidden layer and projected to one logit per vocabulary entry.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # Stored on the instance so forward() does not rely on a module-level
        # ``vocab_size`` variable existing (and matching this model).
        self.vocab_size = vocab_size
        self.fc1 = nn.Linear(vocab_size, 128)
        self.fc2 = nn.Linear(128, vocab_size)

    def forward(self, x):
        """Return unnormalised logits for an integer tensor of context indices.

        The one-hot vectors are summed over dim 1, so a ``(batch, 1)`` input
        becomes a ``(batch, vocab_size)`` feature matrix.
        """
        x = torch.nn.functional.one_hot(x, num_classes=self.vocab_size).float().sum(dim=1)
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Define the trigram model
class TrigramModel(nn.Module):
    """Feed-forward next-character model conditioned on two previous characters.

    Both context indices are one-hot encoded and concatenated, passed through
    a 128-unit ReLU hidden layer and projected to one logit per vocabulary
    entry.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # Stored on the instance so forward() does not rely on a module-level
        # ``vocab_size`` variable existing (and matching this model).
        self.vocab_size = vocab_size
        self.fc1 = nn.Linear(vocab_size * 2, 128)
        self.fc2 = nn.Linear(128, vocab_size)

    def forward(self, x):
        """Return unnormalised logits for a ``(batch, 2)`` integer index tensor."""
        # Flatten the per-position one-hot vectors into one row per example.
        x = torch.nn.functional.one_hot(x, num_classes=self.vocab_size).float().view(x.size(0), -1)
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Number of distinct characters, taken from the trigram dataset's vocabulary.
vocab_size = len(trigram_dataset.vocab)

# Build one untrained network per context length (bigram first, then trigram).
bigram_model, trigram_model = BigramModel(vocab_size), TrigramModel(vocab_size)

# Training function
def train_model(model, dataloader, epochs=10):
    """Train ``model`` with Adam (lr=0.001) on cross-entropy loss.

    Prints the mean per-example training loss after every epoch.

    Returns:
        list[float]: the per-epoch mean losses, in order.

    Raises:
        ValueError: if ``dataloader`` yields no examples.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    model.train()
    history = []
    for epoch in range(epochs):
        total_loss = 0.0
        num_examples = 0
        for inputs, targets in dataloader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            # The criterion returns a batch mean; weight it by the batch size
            # so a smaller final batch does not skew the epoch average.
            batch_size = targets.size(0)
            total_loss += loss.item() * batch_size
            num_examples += batch_size
        if num_examples == 0:
            raise ValueError("dataloader yielded no examples to train on")
        epoch_loss = total_loss / num_examples
        history.append(epoch_loss)
        print(f"Epoch {epoch+1}, Loss: {epoch_loss}")
    return history

# Fit each network on its own training split, announcing which one is running.
for banner, network, loader in (
    ("Training Bigram Model", bigram_model, bigram_train_loader),
    ("Training Trigram Model", trigram_model, trigram_train_loader),
):
    print(banner)
    train_model(network, loader)

# Evaluation function
def evaluate_model(model, dataloader):
    """Return the mean per-example cross-entropy of ``model`` over ``dataloader``.

    The model is switched to eval mode and gradients are disabled.

    Raises:
        ValueError: if ``dataloader`` yields no examples.
    """
    # Sum (rather than average) inside each batch so that dividing by the
    # example count gives a true per-example mean even when batch sizes differ.
    criterion = nn.CrossEntropyLoss(reduction="sum")
    model.eval()
    total_loss = 0.0
    num_examples = 0
    with torch.no_grad():
        for inputs, targets in dataloader:
            outputs = model(inputs)
            total_loss += criterion(outputs, targets).item()
            num_examples += targets.size(0)
    if num_examples == 0:
        raise ValueError("dataloader yielded no examples to evaluate")
    return total_loss / num_examples

# Held-out losses: dev split first, then test split, bigram before trigram.
bigram_dev_loss, trigram_dev_loss = (
    evaluate_model(bigram_model, bigram_dev_loader),
    evaluate_model(trigram_model, trigram_dev_loader),
)
bigram_test_loss, trigram_test_loss = (
    evaluate_model(bigram_model, bigram_test_loader),
    evaluate_model(trigram_model, trigram_test_loader),
)

print(f"Bigram Model Dev Loss: {bigram_dev_loss}")
print(f"Trigram Model Dev Loss: {trigram_dev_loss}")
print(f"Bigram Model Test Loss: {bigram_test_loss}")
print(f"Trigram Model Test Loss: {trigram_test_loss}")


Training Bigram Model
Epoch 1, Loss: 2.968363881111145
Epoch 2, Loss: 2.9524093866348267
Epoch 3, Loss: 2.9329882860183716
Epoch 4, Loss: 2.9222463369369507
Epoch 5, Loss: 2.9082382917404175
Epoch 6, Loss: 2.897231698036194
Epoch 7, Loss: 2.8811813592910767
Epoch 8, Loss: 2.876084089279175
Epoch 9, Loss: 2.8507518768310547
Epoch 10, Loss: 2.845628261566162
Training Trigram Model
Epoch 1, Loss: 2.9443992376327515
Epoch 2, Loss: 2.9195446968078613
Epoch 3, Loss: 2.9103747606277466
Epoch 4, Loss: 2.897378087043762
Epoch 5, Loss: 2.877183198928833
Epoch 6, Loss: 2.859540343284607
Epoch 7, Loss: 2.8442124128341675
Epoch 8, Loss: 2.833723306655884
Epoch 9, Loss: 2.8084535598754883
Epoch 10, Loss: 2.793848156929016
Bigram Model Dev Loss: 2.879796028137207
Trigram Model Dev Loss: 2.865471124649048
Bigram Model Test Loss: 2.833972215652466
Trigram Model Test Loss: 2.9337661266326904


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split

# Define the dataset class
class TextDataset(Dataset):
    """Character-level n-gram dataset built from a single string.

    Each example pairs ``context_size`` consecutive characters (as vocabulary
    indices) with the index of the character that follows them.
    """

    def __init__(self, text, context_size=2):
        self.text = text
        self.context_size = context_size
        # Sorted so that the character <-> index mapping is deterministic.
        self.vocab = sorted(set(text))
        self.char_to_idx = {}
        self.idx_to_char = {}
        for position, symbol in enumerate(self.vocab):
            self.char_to_idx[symbol] = position
            self.idx_to_char[position] = symbol
        self.data = self.create_dataset()

    def create_dataset(self):
        """Return a list of ``(context_indices, target_index)`` pairs."""
        width = self.context_size
        encode = self.char_to_idx
        return [
            (
                [encode[symbol] for symbol in self.text[start:start + width]],
                encode[self.text[start + width]],
            )
            for start in range(len(self.text) - width)
        ]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        context_idxs, target_idx = self.data[idx]
        return torch.tensor(context_idxs), torch.tensor(target_idx)

# Tiny toy corpus; every distinct character becomes one vocabulary entry.
text = "hello world. this is a simple text for training a trigram model."

# One dataset per context length: 1 previous character (bigram) or 2 (trigram).
bigram_dataset, trigram_dataset = (
    TextDataset(text, context_size=width) for width in (1, 2)
)

# Split the datasets
def split_dataset(dataset, train_ratio=0.8, dev_ratio=0.1, test_ratio=0.1):
    """Randomly partition ``dataset`` into train, dev and test subsets.

    Train and dev sizes are their ratio times ``len(dataset)``, rounded down;
    the test subset receives every remaining example so nothing is dropped.

    Raises:
        ValueError: if any ratio is negative or the three ratios do not sum
            to 1 (previously such input silently produced wrong split sizes).
    """
    import math

    ratios = (train_ratio, dev_ratio, test_ratio)
    if any(ratio < 0 for ratio in ratios) or not math.isclose(sum(ratios), 1.0):
        raise ValueError(
            f"split ratios must be non-negative and sum to 1, got {ratios}"
        )
    total = len(dataset)
    train_size = int(train_ratio * total)
    dev_size = int(dev_ratio * total)
    # Remainder goes to test so rounding never loses examples.
    test_size = total - train_size - dev_size
    return random_split(dataset, [train_size, dev_size, test_size])

# Random train/dev/test partitions, drawn for the bigram data first.
bigram_train, bigram_dev, bigram_test = split_dataset(bigram_dataset)
trigram_train, trigram_dev, trigram_test = split_dataset(trigram_dataset)

# Mini-batch loaders; only the training loaders shuffle.
bigram_train_loader, bigram_dev_loader, bigram_test_loader = (
    DataLoader(subset, batch_size=32, shuffle=reshuffle)
    for subset, reshuffle in ((bigram_train, True), (bigram_dev, False), (bigram_test, False))
)

trigram_train_loader, trigram_dev_loader, trigram_test_loader = (
    DataLoader(subset, batch_size=32, shuffle=reshuffle)
    for subset, reshuffle in ((trigram_train, True), (trigram_dev, False), (trigram_test, False))
)

# Define the bigram model
class BigramModel(nn.Module):
    """Feed-forward next-character model conditioned on one previous character.

    The context index is one-hot encoded, passed through a 128-unit ReLU
    hidden layer and projected to one logit per vocabulary entry.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # Stored on the instance so forward() does not rely on a module-level
        # ``vocab_size`` variable existing (and matching this model).
        self.vocab_size = vocab_size
        self.fc1 = nn.Linear(vocab_size, 128)
        self.fc2 = nn.Linear(128, vocab_size)

    def forward(self, x):
        """Return unnormalised logits for an integer tensor of context indices.

        The one-hot vectors are summed over dim 1, so a ``(batch, 1)`` input
        becomes a ``(batch, vocab_size)`` feature matrix.
        """
        x = torch.nn.functional.one_hot(x, num_classes=self.vocab_size).float().sum(dim=1)
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Define the trigram model
class TrigramModel(nn.Module):
    """Feed-forward next-character model conditioned on two previous characters.

    Both context indices are one-hot encoded and concatenated, passed through
    a 128-unit ReLU hidden layer and projected to one logit per vocabulary
    entry.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # Stored on the instance so forward() does not rely on a module-level
        # ``vocab_size`` variable existing (and matching this model).
        self.vocab_size = vocab_size
        self.fc1 = nn.Linear(vocab_size * 2, 128)
        self.fc2 = nn.Linear(128, vocab_size)

    def forward(self, x):
        """Return unnormalised logits for a ``(batch, 2)`` integer index tensor."""
        # Flatten the per-position one-hot vectors into one row per example.
        x = torch.nn.functional.one_hot(x, num_classes=self.vocab_size).float().view(x.size(0), -1)
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Number of distinct characters, taken from the trigram dataset's vocabulary.
vocab_size = len(trigram_dataset.vocab)

# Build one untrained network per context length (bigram first, then trigram).
bigram_model, trigram_model = BigramModel(vocab_size), TrigramModel(vocab_size)

# Training function
def train_model(model, dataloader, epochs=10, weight_decay=0.0):
    """Train ``model`` with Adam (lr=0.001) on cross-entropy loss.

    ``weight_decay`` is forwarded to the Adam optimizer. Prints the mean
    per-example training loss after every epoch.

    Returns:
        list[float]: the per-epoch mean losses, in order.

    Raises:
        ValueError: if ``dataloader`` yields no examples.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=weight_decay)
    model.train()
    history = []
    for epoch in range(epochs):
        total_loss = 0.0
        num_examples = 0
        for inputs, targets in dataloader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            # The criterion returns a batch mean; weight it by the batch size
            # so a smaller final batch does not skew the epoch average.
            batch_size = targets.size(0)
            total_loss += loss.item() * batch_size
            num_examples += batch_size
        if num_examples == 0:
            raise ValueError("dataloader yielded no examples to train on")
        epoch_loss = total_loss / num_examples
        history.append(epoch_loss)
        print(f"Epoch {epoch+1}, Loss: {epoch_loss}")
    return history

# Evaluation function
def evaluate_model(model, dataloader):
    """Return the mean per-example cross-entropy of ``model`` over ``dataloader``.

    The model is switched to eval mode and gradients are disabled.

    Raises:
        ValueError: if ``dataloader`` yields no examples.
    """
    # Sum (rather than average) inside each batch so that dividing by the
    # example count gives a true per-example mean even when batch sizes differ.
    criterion = nn.CrossEntropyLoss(reduction="sum")
    model.eval()
    total_loss = 0.0
    num_examples = 0
    with torch.no_grad():
        for inputs, targets in dataloader:
            outputs = model(inputs)
            total_loss += criterion(outputs, targets).item()
            num_examples += targets.size(0)
    if num_examples == 0:
        raise ValueError("dataloader yielded no examples to evaluate")
    return total_loss / num_examples

# Bigram baseline: trained with train_model's default weight_decay of 0.0.
print("Training Bigram Model")
train_model(bigram_model, bigram_train_loader)

# Held-out losses for the baseline, dev split first and then test split.
bigram_dev_loss, bigram_test_loss = (
    evaluate_model(bigram_model, held_out)
    for held_out in (bigram_dev_loader, bigram_test_loader)
)

print(f"Bigram Model Dev Loss: {bigram_dev_loss}")
print(f"Bigram Model Test Loss: {bigram_test_loss}")

# Tune the regularization strength for the trigram model
weight_decays = [0.0, 0.001, 0.01, 0.1, 1.0]
best_weight_decay = 0.0
best_dev_loss = float('inf')
# Keep the network that achieved the best dev loss, so the test score below
# belongs to the model that was actually selected. A retrained copy starts
# from a different random initialisation and need not match that dev loss.
best_trigram_model = None

for weight_decay in weight_decays:
    print(f"\nTraining Trigram Model with weight_decay={weight_decay}")
    model = TrigramModel(vocab_size)
    train_model(model, trigram_train_loader, weight_decay=weight_decay)
    dev_loss = evaluate_model(model, trigram_dev_loader)
    print(f"Dev Loss: {dev_loss}")
    if dev_loss < best_dev_loss:
        best_dev_loss = dev_loss
        best_weight_decay = weight_decay
        best_trigram_model = model

print(f"\nBest weight decay: {best_weight_decay}, Dev Loss: {best_dev_loss}")

# Evaluate the selected trigram model on the test set
if best_trigram_model is None:
    # No candidate produced a comparable dev loss (e.g. every loss was NaN):
    # fall back to training one model with the default setting.
    best_trigram_model = TrigramModel(vocab_size)
    train_model(best_trigram_model, trigram_train_loader, weight_decay=best_weight_decay)
trigram_test_loss = evaluate_model(best_trigram_model, trigram_test_loader)

print(f"Trigram Model Test Loss with best weight decay ({best_weight_decay}): {trigram_test_loss}")


Training Bigram Model
Epoch 1, Loss: 2.9531177282333374
Epoch 2, Loss: 2.935624122619629
Epoch 3, Loss: 2.915940284729004
Epoch 4, Loss: 2.8986111879348755
Epoch 5, Loss: 2.888336658477783
Epoch 6, Loss: 2.8679792881011963
Epoch 7, Loss: 2.857132315635681
Epoch 8, Loss: 2.8377009630203247
Epoch 9, Loss: 2.8311187028884888
Epoch 10, Loss: 2.8222053050994873
Bigram Model Dev Loss: 2.951111078262329
Bigram Model Test Loss: 2.9269378185272217

Training Trigram Model with weight_decay=0.0
Epoch 1, Loss: 2.9539849758148193
Epoch 2, Loss: 2.926152467727661
Epoch 3, Loss: 2.9019569158554077
Epoch 4, Loss: 2.900794267654419
Epoch 5, Loss: 2.8654823303222656
Epoch 6, Loss: 2.8587454557418823
Epoch 7, Loss: 2.8359209299087524
Epoch 8, Loss: 2.814158320426941
Epoch 9, Loss: 2.8017191886901855
Epoch 10, Loss: 2.771131753921509
Dev Loss: 2.8956024646759033

Training Trigram Model with weight_decay=0.001
Epoch 1, Loss: 2.9386545419692993
Epoch 2, Loss: 2.919864296913147
Epoch 3, Loss: 2.9063303470611

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split

# Define the dataset class
class TextDataset(Dataset):
    """Character-level n-gram dataset built from a single string.

    Each example pairs ``context_size`` consecutive characters (as vocabulary
    indices) with the index of the character that follows them.
    """

    def __init__(self, text, context_size=2):
        self.text = text
        self.context_size = context_size
        # Sorted so that the character <-> index mapping is deterministic.
        self.vocab = sorted(set(text))
        self.char_to_idx = {}
        self.idx_to_char = {}
        for position, symbol in enumerate(self.vocab):
            self.char_to_idx[symbol] = position
            self.idx_to_char[position] = symbol
        self.data = self.create_dataset()

    def create_dataset(self):
        """Return a list of ``(context_indices, target_index)`` pairs."""
        width = self.context_size
        encode = self.char_to_idx
        return [
            (
                [encode[symbol] for symbol in self.text[start:start + width]],
                encode[self.text[start + width]],
            )
            for start in range(len(self.text) - width)
        ]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        context_idxs, target_idx = self.data[idx]
        return torch.tensor(context_idxs), torch.tensor(target_idx)

# Tiny toy corpus; every distinct character becomes one vocabulary entry.
text = "hello world. this is a simple text for training a trigram model."

# One dataset per context length: 1 previous character (bigram) or 2 (trigram).
bigram_dataset, trigram_dataset = (
    TextDataset(text, context_size=width) for width in (1, 2)
)

# Split the datasets
def split_dataset(dataset, train_ratio=0.8, dev_ratio=0.1, test_ratio=0.1):
    """Randomly partition ``dataset`` into train, dev and test subsets.

    Train and dev sizes are their ratio times ``len(dataset)``, rounded down;
    the test subset receives every remaining example so nothing is dropped.

    Raises:
        ValueError: if any ratio is negative or the three ratios do not sum
            to 1 (previously such input silently produced wrong split sizes).
    """
    import math

    ratios = (train_ratio, dev_ratio, test_ratio)
    if any(ratio < 0 for ratio in ratios) or not math.isclose(sum(ratios), 1.0):
        raise ValueError(
            f"split ratios must be non-negative and sum to 1, got {ratios}"
        )
    total = len(dataset)
    train_size = int(train_ratio * total)
    dev_size = int(dev_ratio * total)
    # Remainder goes to test so rounding never loses examples.
    test_size = total - train_size - dev_size
    return random_split(dataset, [train_size, dev_size, test_size])

# Random train/dev/test partitions, drawn for the bigram data first.
bigram_train, bigram_dev, bigram_test = split_dataset(bigram_dataset)
trigram_train, trigram_dev, trigram_test = split_dataset(trigram_dataset)

# Mini-batch loaders; only the training loaders shuffle.
bigram_train_loader, bigram_dev_loader, bigram_test_loader = (
    DataLoader(subset, batch_size=32, shuffle=reshuffle)
    for subset, reshuffle in ((bigram_train, True), (bigram_dev, False), (bigram_test, False))
)

trigram_train_loader, trigram_dev_loader, trigram_test_loader = (
    DataLoader(subset, batch_size=32, shuffle=reshuffle)
    for subset, reshuffle in ((trigram_train, True), (trigram_dev, False), (trigram_test, False))
)

# Define the bigram model
class BigramModel(nn.Module):
    """Next-character model: 128-d embedding lookup, ReLU, linear logits."""

    def __init__(self, vocab_size):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=128)
        self.fc2 = nn.Linear(in_features=128, out_features=vocab_size)

    def forward(self, x):
        """Return unnormalised logits for an integer tensor of context indices."""
        # Sum the embeddings over dim 1 (the context positions).
        summed = torch.sum(self.embedding(x), dim=1)
        return self.fc2(torch.relu(summed))

# Define the trigram model
class TrigramModel(nn.Module):
    """Next-character model: two 128-d embeddings concatenated, ReLU, linear logits."""

    def __init__(self, vocab_size):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=128)
        # Input width is 2 * 128: one embedding per context position.
        self.fc2 = nn.Linear(in_features=128 * 2, out_features=vocab_size)

    def forward(self, x):
        """Return unnormalised logits for a ``(batch, 2)`` integer index tensor."""
        rows = x.size(0)
        # Lay the per-position embeddings side by side, one row per example.
        flat = self.embedding(x).view(rows, -1)
        return self.fc2(torch.relu(flat))

# Number of distinct characters, taken from the trigram dataset's vocabulary.
vocab_size = len(trigram_dataset.vocab)

# Build one untrained network per context length (bigram first, then trigram).
bigram_model, trigram_model = BigramModel(vocab_size), TrigramModel(vocab_size)

# Training function
def train_model(model, dataloader, epochs=10, weight_decay=0.0):
    """Train ``model`` with Adam (lr=0.001) on cross-entropy loss.

    ``weight_decay`` is forwarded to the Adam optimizer. Prints the mean
    per-example training loss after every epoch.

    Returns:
        list[float]: the per-epoch mean losses, in order.

    Raises:
        ValueError: if ``dataloader`` yields no examples.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=weight_decay)
    model.train()
    history = []
    for epoch in range(epochs):
        total_loss = 0.0
        num_examples = 0
        for inputs, targets in dataloader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            # The criterion returns a batch mean; weight it by the batch size
            # so a smaller final batch does not skew the epoch average.
            batch_size = targets.size(0)
            total_loss += loss.item() * batch_size
            num_examples += batch_size
        if num_examples == 0:
            raise ValueError("dataloader yielded no examples to train on")
        epoch_loss = total_loss / num_examples
        history.append(epoch_loss)
        print(f"Epoch {epoch+1}, Loss: {epoch_loss}")
    return history

# Evaluation function
def evaluate_model(model, dataloader):
    """Return the mean per-example cross-entropy of ``model`` over ``dataloader``.

    The model is switched to eval mode and gradients are disabled.

    Raises:
        ValueError: if ``dataloader`` yields no examples.
    """
    # Sum (rather than average) inside each batch so that dividing by the
    # example count gives a true per-example mean even when batch sizes differ.
    criterion = nn.CrossEntropyLoss(reduction="sum")
    model.eval()
    total_loss = 0.0
    num_examples = 0
    with torch.no_grad():
        for inputs, targets in dataloader:
            outputs = model(inputs)
            total_loss += criterion(outputs, targets).item()
            num_examples += targets.size(0)
    if num_examples == 0:
        raise ValueError("dataloader yielded no examples to evaluate")
    return total_loss / num_examples

# Bigram baseline: trained with train_model's default weight_decay of 0.0.
print("Training Bigram Model")
train_model(bigram_model, bigram_train_loader)

# Held-out losses for the baseline, dev split first and then test split.
bigram_dev_loss, bigram_test_loss = (
    evaluate_model(bigram_model, held_out)
    for held_out in (bigram_dev_loader, bigram_test_loader)
)

print(f"Bigram Model Dev Loss: {bigram_dev_loss}")
print(f"Bigram Model Test Loss: {bigram_test_loss}")

# Tune the regularization strength for the trigram model
weight_decays = [0.0, 0.001, 0.01, 0.1, 1.0]
best_weight_decay = 0.0
best_dev_loss = float('inf')
# Keep the network that achieved the best dev loss, so the test score below
# belongs to the model that was actually selected. A retrained copy starts
# from a different random initialisation and need not match that dev loss.
best_trigram_model = None

for weight_decay in weight_decays:
    print(f"\nTraining Trigram Model with weight_decay={weight_decay}")
    model = TrigramModel(vocab_size)
    train_model(model, trigram_train_loader, weight_decay=weight_decay)
    dev_loss = evaluate_model(model, trigram_dev_loader)
    print(f"Dev Loss: {dev_loss}")
    if dev_loss < best_dev_loss:
        best_dev_loss = dev_loss
        best_weight_decay = weight_decay
        best_trigram_model = model

print(f"\nBest weight decay: {best_weight_decay}, Dev Loss: {best_dev_loss}")

# Evaluate the selected trigram model on the test set
if best_trigram_model is None:
    # No candidate produced a comparable dev loss (e.g. every loss was NaN):
    # fall back to training one model with the default setting.
    best_trigram_model = TrigramModel(vocab_size)
    train_model(best_trigram_model, trigram_train_loader, weight_decay=best_weight_decay)
trigram_test_loss = evaluate_model(best_trigram_model, trigram_test_loader)

print(f"Trigram Model Test Loss with best weight decay ({best_weight_decay}): {trigram_test_loss}")


Training Bigram Model
Epoch 1, Loss: 3.1371724605560303
Epoch 2, Loss: 3.0789023637771606
Epoch 3, Loss: 2.9869070053100586
Epoch 4, Loss: 2.908730983734131
Epoch 5, Loss: 2.8277335166931152
Epoch 6, Loss: 2.7639886140823364
Epoch 7, Loss: 2.6885894536972046
Epoch 8, Loss: 2.6682519912719727
Epoch 9, Loss: 2.6143622398376465
Epoch 10, Loss: 2.522128939628601
Bigram Model Dev Loss: 2.8781604766845703
Bigram Model Test Loss: 3.0224175453186035

Training Trigram Model with weight_decay=0.0
Epoch 1, Loss: 3.1593868732452393
Epoch 2, Loss: 2.9626407623291016
Epoch 3, Loss: 2.800540566444397
Epoch 4, Loss: 2.662756085395813
Epoch 5, Loss: 2.527352452278137
Epoch 6, Loss: 2.383245825767517
Epoch 7, Loss: 2.2904163599014282
Epoch 8, Loss: 2.186179280281067
Epoch 9, Loss: 2.084089159965515
Epoch 10, Loss: 1.9861007332801819
Dev Loss: 3.4883201122283936

Training Trigram Model with weight_decay=0.001
Epoch 1, Loss: 2.969280242919922
Epoch 2, Loss: 2.78306245803833
Epoch 3, Loss: 2.59988641738891

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split

# Define the dataset class
class TextDataset(Dataset):
    """Character-level n-gram dataset built from a single string.

    Each example pairs ``context_size`` consecutive characters (as vocabulary
    indices) with the index of the character that follows them.
    """

    def __init__(self, text, context_size=2):
        self.text = text
        self.context_size = context_size
        # Sorted so that the character <-> index mapping is deterministic.
        self.vocab = sorted(set(text))
        self.char_to_idx = {}
        self.idx_to_char = {}
        for position, symbol in enumerate(self.vocab):
            self.char_to_idx[symbol] = position
            self.idx_to_char[position] = symbol
        self.data = self.create_dataset()

    def create_dataset(self):
        """Return a list of ``(context_indices, target_index)`` pairs."""
        width = self.context_size
        encode = self.char_to_idx
        return [
            (
                [encode[symbol] for symbol in self.text[start:start + width]],
                encode[self.text[start + width]],
            )
            for start in range(len(self.text) - width)
        ]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        context_idxs, target_idx = self.data[idx]
        return torch.tensor(context_idxs), torch.tensor(target_idx)

# Tiny toy corpus; every distinct character becomes one vocabulary entry.
text = "hello world. this is a simple text for training a trigram model."

# One dataset per context length: 1 previous character (bigram) or 2 (trigram).
bigram_dataset, trigram_dataset = (
    TextDataset(text, context_size=width) for width in (1, 2)
)

# Split the datasets
def split_dataset(dataset, train_ratio=0.8, dev_ratio=0.1, test_ratio=0.1):
    """Randomly partition ``dataset`` into train, dev and test subsets.

    Train and dev sizes are their ratio times ``len(dataset)``, rounded down;
    the test subset receives every remaining example so nothing is dropped.

    Raises:
        ValueError: if any ratio is negative or the three ratios do not sum
            to 1 (previously such input silently produced wrong split sizes).
    """
    import math

    ratios = (train_ratio, dev_ratio, test_ratio)
    if any(ratio < 0 for ratio in ratios) or not math.isclose(sum(ratios), 1.0):
        raise ValueError(
            f"split ratios must be non-negative and sum to 1, got {ratios}"
        )
    total = len(dataset)
    train_size = int(train_ratio * total)
    dev_size = int(dev_ratio * total)
    # Remainder goes to test so rounding never loses examples.
    test_size = total - train_size - dev_size
    return random_split(dataset, [train_size, dev_size, test_size])

# Random train/dev/test partitions, drawn for the bigram data first.
bigram_train, bigram_dev, bigram_test = split_dataset(bigram_dataset)
trigram_train, trigram_dev, trigram_test = split_dataset(trigram_dataset)

# Mini-batch loaders; only the training loaders shuffle.
bigram_train_loader, bigram_dev_loader, bigram_test_loader = (
    DataLoader(subset, batch_size=32, shuffle=reshuffle)
    for subset, reshuffle in ((bigram_train, True), (bigram_dev, False), (bigram_test, False))
)

trigram_train_loader, trigram_dev_loader, trigram_test_loader = (
    DataLoader(subset, batch_size=32, shuffle=reshuffle)
    for subset, reshuffle in ((trigram_train, True), (trigram_dev, False), (trigram_test, False))
)

# Define the bigram model
class BigramModel(nn.Module):
    """Next-character model: 128-d embedding lookup, ReLU, linear logits."""

    def __init__(self, vocab_size):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=128)
        self.fc2 = nn.Linear(in_features=128, out_features=vocab_size)

    def forward(self, x):
        """Return unnormalised logits for an integer tensor of context indices."""
        # Sum the embeddings over dim 1 (the context positions).
        pooled = torch.sum(self.embedding(x), dim=1)
        activated = torch.relu(pooled)
        return self.fc2(activated)

# Define the trigram model
class TrigramModel(nn.Module):
    """Next-character model: two 128-d embeddings concatenated, ReLU, linear logits."""

    def __init__(self, vocab_size):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=128)
        # Input width is 2 * 128: one embedding per context position.
        self.fc2 = nn.Linear(in_features=128 * 2, out_features=vocab_size)

    def forward(self, x):
        """Return unnormalised logits for a ``(batch, 2)`` integer index tensor."""
        rows = x.size(0)
        # Lay the per-position embeddings side by side, one row per example.
        joined = self.embedding(x).view(rows, -1)
        activated = torch.relu(joined)
        return self.fc2(activated)

# Number of distinct characters, taken from the trigram dataset's vocabulary.
vocab_size = len(trigram_dataset.vocab)

# Build one untrained network per context length (bigram first, then trigram).
bigram_model, trigram_model = BigramModel(vocab_size), TrigramModel(vocab_size)

# Training function
def train_model(model, dataloader, epochs=10, weight_decay=0.0):
    """Train ``model`` with Adam (lr=0.001) on cross-entropy loss.

    ``weight_decay`` is forwarded to the Adam optimizer. Prints the mean
    per-example training loss after every epoch.

    Returns:
        list[float]: the per-epoch mean losses, in order.

    Raises:
        ValueError: if ``dataloader`` yields no examples.
    """
    optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=weight_decay)
    model.train()
    history = []
    for epoch in range(epochs):
        total_loss = 0.0
        num_examples = 0
        for inputs, targets in dataloader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = F.cross_entropy(outputs, targets)
            loss.backward()
            optimizer.step()
            # cross_entropy returns a batch mean; weight it by the batch size
            # so a smaller final batch does not skew the epoch average.
            batch_size = targets.size(0)
            total_loss += loss.item() * batch_size
            num_examples += batch_size
        if num_examples == 0:
            raise ValueError("dataloader yielded no examples to train on")
        epoch_loss = total_loss / num_examples
        history.append(epoch_loss)
        print(f"Epoch {epoch+1}, Loss: {epoch_loss}")
    return history

# Evaluation function
def evaluate_model(model, dataloader):
    """Return the mean per-example cross-entropy of ``model`` over ``dataloader``.

    The model is switched to eval mode and gradients are disabled.

    Raises:
        ValueError: if ``dataloader`` yields no examples.
    """
    model.eval()
    total_loss = 0.0
    num_examples = 0
    with torch.no_grad():
        for inputs, targets in dataloader:
            outputs = model(inputs)
            # Sum (rather than average) inside each batch so that dividing by
            # the example count gives a true per-example mean even when batch
            # sizes differ.
            total_loss += F.cross_entropy(outputs, targets, reduction="sum").item()
            num_examples += targets.size(0)
    if num_examples == 0:
        raise ValueError("dataloader yielded no examples to evaluate")
    return total_loss / num_examples

# Bigram baseline: trained with train_model's default weight_decay of 0.0.
print("Training Bigram Model")
train_model(bigram_model, bigram_train_loader)

# Held-out losses for the baseline, dev split first and then test split.
bigram_dev_loss, bigram_test_loss = (
    evaluate_model(bigram_model, held_out)
    for held_out in (bigram_dev_loader, bigram_test_loader)
)

print(f"Bigram Model Dev Loss: {bigram_dev_loss}")
print(f"Bigram Model Test Loss: {bigram_test_loss}")

# Tune the regularization strength for the trigram model
weight_decays = [0.0, 0.001, 0.01, 0.1, 1.0]
best_weight_decay = 0.0
best_dev_loss = float('inf')
# Keep the network that achieved the best dev loss, so the test score below
# belongs to the model that was actually selected. A retrained copy starts
# from a different random initialisation and need not match that dev loss.
best_trigram_model = None

for weight_decay in weight_decays:
    print(f"\nTraining Trigram Model with weight_decay={weight_decay}")
    model = TrigramModel(vocab_size)
    train_model(model, trigram_train_loader, weight_decay=weight_decay)
    dev_loss = evaluate_model(model, trigram_dev_loader)
    print(f"Dev Loss: {dev_loss}")
    if dev_loss < best_dev_loss:
        best_dev_loss = dev_loss
        best_weight_decay = weight_decay
        best_trigram_model = model

print(f"\nBest weight decay: {best_weight_decay}, Dev Loss: {best_dev_loss}")

# Evaluate the selected trigram model on the test set
if best_trigram_model is None:
    # No candidate produced a comparable dev loss (e.g. every loss was NaN):
    # fall back to training one model with the default setting.
    best_trigram_model = TrigramModel(vocab_size)
    train_model(best_trigram_model, trigram_train_loader, weight_decay=best_weight_decay)
trigram_test_loss = evaluate_model(best_trigram_model, trigram_test_loader)

print(f"Trigram Model Test Loss with best weight decay ({best_weight_decay}): {trigram_test_loss}")


Training Bigram Model
Epoch 1, Loss: 2.8900279998779297
Epoch 2, Loss: 2.8204110860824585
Epoch 3, Loss: 2.74286425113678
Epoch 4, Loss: 2.6939122676849365
Epoch 5, Loss: 2.5935546159744263
Epoch 6, Loss: 2.5574045181274414
Epoch 7, Loss: 2.4643744230270386
Epoch 8, Loss: 2.4315407276153564
Epoch 9, Loss: 2.335947036743164
Epoch 10, Loss: 2.3249012231826782
Bigram Model Dev Loss: 3.0295238494873047
Bigram Model Test Loss: 3.198779582977295

Training Trigram Model with weight_decay=0.0
Epoch 1, Loss: 2.9970760345458984
Epoch 2, Loss: 2.837743878364563
Epoch 3, Loss: 2.7239545583724976
Epoch 4, Loss: 2.6064172983169556
Epoch 5, Loss: 2.4771848917007446
Epoch 6, Loss: 2.399138331413269
Epoch 7, Loss: 2.3023072481155396
Epoch 8, Loss: 2.219529628753662
Epoch 9, Loss: 2.163233757019043
Epoch 10, Loss: 2.0930097103118896
Dev Loss: 2.2652432918548584

Training Trigram Model with weight_decay=0.001
Epoch 1, Loss: 3.1107257604599
Epoch 2, Loss: 2.9711945056915283
Epoch 3, Loss: 2.83568572998046