<a href="https://colab.research.google.com/github/UmaNagirireddi/RTML/blob/main/HW5/RTML_5_1_5_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

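**Problem 1: Next-character prediction with a Transformer encoder on a short sample text (sequence lengths 10, 20 and 30)**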

In [None]:
!pip install torchinfo

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.model_selection import train_test_split
import time
import numpy as np
import torchinfo

Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0


In [None]:
# Sample text
text = '''Next character prediction is a fundamental task in the field of natural language processing (NLP) that involves predicting the next character in a sequence of text based on the characters that precede it. This task is essential for various applications, including text auto-completion, spell checking, and even in the development of sophisticated AI models capable of generating human-like text. At its core, next character prediction relies on statistical models or deep learning algorithms to analyze a given sequence of text and predict which character is most likely to follow. These predictions are based on patterns and relationships learned from large datasets of text during the training phase of the model. One of the most popular approaches to next character prediction involves the use of Recurrent Neural Networks (RNNs), and more specifically, a variant called Long Short-Term Memory (LSTM) networks. RNNs are particularly well-suited for sequential data like text, as they can maintain information in 'memory' about previous characters to inform the prediction of the next character. LSTM networks enhance this capability by being able to remember long-term dependencies, making them even more effective for next character prediction tasks. Training a model for next character prediction involves feeding it large amounts of text data, allowing it to learn the probability of each character's appearance following a sequence of characters. During this training process, the model adjusts its parameters to minimize the difference between its predictions and the actual outcomes, thus improving its predictive accuracy over time. Once trained, the model can be used to predict the next character in a given piece of text by considering the sequence of characters that precede it. 
This can enhance user experience in text editing software, improve efficiency in coding environments with auto-completion features, and enable more natural interactions with AI-based chatbots and virtual assistants. In summary, next character prediction plays a crucial role in enhancing the capabilities of various NLP applications, making text-based interactions more efficient, accurate, and human-like. Through the use of advanced machine learning models like RNNs and LSTMs, next character prediction continues to evolve, opening new possibilities for the future of text-based technology.'''

# Turn `text` into (context window, next character) pairs.
max_length = 10  # context width in characters
sequences = [text[start:start + max_length] for start in range(len(text) - max_length)]
labels = list(text[max_length:])  # the character that follows each window, in window order

# Vocabulary: the distinct characters in sorted order, each mapped to its position.
chars = sorted(set(text))
char_to_ix = dict(zip(chars, range(len(chars))))

# Integer-encode the windows and their targets.
X = torch.tensor([[char_to_ix[ch] for ch in window] for window in sequences], dtype=torch.long)
y = torch.tensor([char_to_ix[target] for target in labels], dtype=torch.long)

# Hold out 20% of the samples for validation (seeded shuffle).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Define Transformer model
class CharTransformer(nn.Module):
    """Next-character classifier: embedding -> Transformer encoder -> linear head.

    Args:
        input_size: Number of rows in the embedding table (vocabulary size).
        hidden_size: Embedding width, also used as the encoder's model dimension.
        output_size: Number of output classes (logits per sample).
        num_layers: Number of stacked encoder layers.
        nhead: Number of attention heads per encoder layer.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers, nhead):
        super(CharTransformer, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        # batch_first=True is required here: forward() feeds (batch, seq, hidden).
        # With the PyTorch default (batch_first=False) the encoder would read that
        # tensor as (seq, batch, hidden) and run attention across the *samples* of
        # the batch instead of across the characters of each window.
        encoder_layers = nn.TransformerEncoderLayer(hidden_size, nhead, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)
        self.fc = nn.Linear(hidden_size, output_size)
        # NOTE(review): no positional encoding is added to the embeddings, so the
        # encoder cannot distinguish character order within the window.

    def forward(self, x):
        """Return logits of shape (batch, output_size) for integer ids `x` of shape (batch, seq)."""
        embedded = self.embedding(x)                             # (batch, seq, hidden)
        transformer_output = self.transformer_encoder(embedded)  # (batch, seq, hidden)
        # Classify from the encoder output at the last sequence position.
        output = self.fc(transformer_output[:, -1, :])
        return output

# Hyperparameters for this run.
hidden_size = 128
num_layers = 3
nhead = 2
learning_rate = 0.001
epochs = 50

# The vocabulary size is both the embedding-table size and the number of output classes.
model = CharTransformer(
    input_size=len(chars),
    hidden_size=hidden_size,
    output_size=len(chars),
    num_layers=num_layers,
    nhead=nhead,
)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Layer-by-layer summary, traced with the full training tensor as input.
print(torchinfo.summary(model, input_data=X_train))



Layer (type:depth-idx)                        Output Shape              Param #
CharTransformer                               [1900, 44]                --
├─Embedding: 1-1                              [1900, 10, 128]           5,632
├─TransformerEncoder: 1-2                     [1900, 10, 128]           --
│    └─ModuleList: 2-1                        --                        --
│    │    └─TransformerEncoderLayer: 3-1      [1900, 10, 128]           593,024
│    │    └─TransformerEncoderLayer: 3-2      [1900, 10, 128]           593,024
│    │    └─TransformerEncoderLayer: 3-3      [1900, 10, 128]           593,024
├─Linear: 1-3                                 [1900, 44]                5,676
Total params: 1,790,380
Trainable params: 1,790,380
Non-trainable params: 0
Total mult-adds (G): 3.03
Input size (MB): 0.15
Forward/backward pass size (MB): 1129.12
Params size (MB): 6.37
Estimated Total Size (MB): 1135.64


In [None]:
# Full-batch training: a single optimizer step per epoch on the whole training split,
# followed by an evaluation on the whole validation split.
total_start_time = time.time()
for epoch in range(epochs):
    start_time = time.time()

    # Optimisation step.
    model.train()
    optimizer.zero_grad()
    output = model(X_train)
    loss = criterion(output, y_train)
    loss.backward()
    optimizer.step()

    # Validation pass (eval mode, no gradient tracking).
    model.eval()
    with torch.no_grad():
        val_output = model(X_val)
        val_loss = criterion(val_output, y_val)
        _, predicted = val_output.max(dim=1)
        val_accuracy = predicted.eq(y_val).float().mean()

    # Only every fifth epoch is reported; the time shown covers that epoch alone.
    if (epoch + 1) % 5 != 0:
        continue
    end_time = time.time()
    execution_time = end_time - start_time
    print(f'Epoch {epoch+1}, Loss: {loss.item()}, Validation Loss: {val_loss.item()}, Validation Accuracy: {val_accuracy.item()}, Execution Time: {execution_time} seconds')

# Wall-clock time for the complete loop.
total_end_time = time.time()
total_execution_time = total_end_time - total_start_time
print(f'Total Execution Time: {total_execution_time} seconds')



Layer (type:depth-idx)                        Output Shape              Param #
CharTransformer                               [1900, 44]                --
├─Embedding: 1-1                              [1900, 10, 128]           5,632
├─TransformerEncoder: 1-2                     [1900, 10, 128]           --
│    └─ModuleList: 2-1                        --                        --
│    │    └─TransformerEncoderLayer: 3-1      [1900, 10, 128]           593,024
│    │    └─TransformerEncoderLayer: 3-2      [1900, 10, 128]           593,024
│    │    └─TransformerEncoderLayer: 3-3      [1900, 10, 128]           593,024
├─Linear: 1-3                                 [1900, 44]                5,676
Total params: 1,790,380
Trainable params: 1,790,380
Non-trainable params: 0
Total mult-adds (G): 3.03
Input size (MB): 0.15
Forward/backward pass size (MB): 1129.12
Params size (MB): 6.37
Estimated Total Size (MB): 1135.64
Epoch 5, Loss: 2.901587724685669, Validation Loss: 2.791863441467285, Valida

In [None]:
# Sample text
text = '''Next character prediction is a fundamental task in the field of natural language processing (NLP) that involves predicting the next character in a sequence of text based on the characters that precede it. This task is essential for various applications, including text auto-completion, spell checking, and even in the development of sophisticated AI models capable of generating human-like text. At its core, next character prediction relies on statistical models or deep learning algorithms to analyze a given sequence of text and predict which character is most likely to follow. These predictions are based on patterns and relationships learned from large datasets of text during the training phase of the model. One of the most popular approaches to next character prediction involves the use of Recurrent Neural Networks (RNNs), and more specifically, a variant called Long Short-Term Memory (LSTM) networks. RNNs are particularly well-suited for sequential data like text, as they can maintain information in 'memory' about previous characters to inform the prediction of the next character. LSTM networks enhance this capability by being able to remember long-term dependencies, making them even more effective for next character prediction tasks. Training a model for next character prediction involves feeding it large amounts of text data, allowing it to learn the probability of each character's appearance following a sequence of characters. During this training process, the model adjusts its parameters to minimize the difference between its predictions and the actual outcomes, thus improving its predictive accuracy over time. Once trained, the model can be used to predict the next character in a given piece of text by considering the sequence of characters that precede it. 
This can enhance user experience in text editing software, improve efficiency in coding environments with auto-completion features, and enable more natural interactions with AI-based chatbots and virtual assistants. In summary, next character prediction plays a crucial role in enhancing the capabilities of various NLP applications, making text-based interactions more efficient, accurate, and human-like. Through the use of advanced machine learning models like RNNs and LSTMs, next character prediction continues to evolve, opening new possibilities for the future of text-based technology.'''

# Sliding-window dataset: every sample is `max_length` consecutive characters and
# its label is the single character that comes right after the window.
max_length = 20  # context width in characters
sequences = [text[lo:lo + max_length] for lo in range(len(text) - max_length)]
labels = [text[hi] for hi in range(max_length, len(text))]

# Character vocabulary (sorted, so ids are stable for a given text).
chars = sorted(set(text))
char_to_ix = {symbol: position for position, symbol in enumerate(chars)}

# Integer-encode inputs and targets.
X = torch.tensor(
    [[char_to_ix[symbol] for symbol in window] for window in sequences],
    dtype=torch.long,
)
y = torch.tensor([char_to_ix[symbol] for symbol in labels], dtype=torch.long)

# Seeded 80/20 train/validation split.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:

# Define Transformer model
class CharTransformer(nn.Module):
    """Next-character classifier: embedding -> Transformer encoder -> linear head.

    Args:
        input_size: Number of rows in the embedding table (vocabulary size).
        hidden_size: Embedding width, also used as the encoder's model dimension.
        output_size: Number of output classes (logits per sample).
        num_layers: Number of stacked encoder layers.
        nhead: Number of attention heads per encoder layer.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers, nhead):
        super(CharTransformer, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        # batch_first=True matches the (batch, seq, hidden) tensor produced by the
        # embedding. The PyTorch default (batch_first=False) would treat the batch
        # axis as the sequence axis and attend across unrelated samples.
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(hidden_size, nhead, batch_first=True),
            num_layers,
        )
        self.fc = nn.Linear(hidden_size, output_size)
        # NOTE(review): no positional encoding is added to the embeddings, so the
        # encoder cannot distinguish character order within the window.

    def forward(self, x):
        """Return logits of shape (batch, output_size) for integer ids `x` of shape (batch, seq)."""
        # Classify from the encoder output at the last sequence position.
        return self.fc(self.transformer_encoder(self.embedding(x))[:, -1, :])

# Hyperparameters
hidden_size = 128
num_layers = 3
nhead = 2
learning_rate = 0.001
epochs = 50

# Model, loss, and optimizer. The vocabulary size is used both as the
# embedding-table size and as the number of output classes.
model = CharTransformer(
    input_size=len(chars),
    hidden_size=hidden_size,
    output_size=len(chars),
    num_layers=num_layers,
    nhead=nhead,
)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)




In [None]:

# Full-batch training: one optimizer step per epoch, then a validation pass.
for epoch in range(epochs):
    # Optimisation step on the entire training split.
    model.train()
    optimizer.zero_grad()
    output = model(X_train)
    loss = criterion(output, y_train)
    loss.backward()
    optimizer.step()

    # Validation (eval mode, gradients disabled).
    model.eval()
    with torch.no_grad():
        val_output = model(X_val)
        val_loss = criterion(val_output, y_val)
        val_accuracy = val_output.argmax(dim=1).eq(y_val).float().mean()

    # Report only every fifth epoch.
    if (epoch + 1) % 5 != 0:
        continue
    print(f'Epoch {epoch+1}, Loss: {loss.item()}, Validation Loss: {val_loss.item()}, Validation Accuracy: {val_accuracy.item()}')

Epoch 5, Loss: 2.8753092288970947, Validation Loss: 2.748119354248047, Validation Accuracy: 0.24894514679908752
Epoch 10, Loss: 2.575629711151123, Validation Loss: 2.5266993045806885, Validation Accuracy: 0.26160338521003723
Epoch 15, Loss: 2.4594979286193848, Validation Loss: 2.4707467555999756, Validation Accuracy: 0.26160338521003723
Epoch 20, Loss: 2.3932676315307617, Validation Loss: 2.4332962036132812, Validation Accuracy: 0.2594936788082123
Epoch 25, Loss: 2.347302198410034, Validation Loss: 2.407989025115967, Validation Accuracy: 0.2763713002204895
Epoch 30, Loss: 2.30851149559021, Validation Loss: 2.393947124481201, Validation Accuracy: 0.28270041942596436
Epoch 35, Loss: 2.2774720191955566, Validation Loss: 2.3833768367767334, Validation Accuracy: 0.2637130916118622
Epoch 40, Loss: 2.2483856678009033, Validation Loss: 2.361375570297241, Validation Accuracy: 0.26582279801368713
Epoch 45, Loss: 2.240648031234741, Validation Loss: 2.355828285217285, Validation Accuracy: 0.272151

In [None]:
# Sample text
text = '''Next character prediction is a fundamental task in the field of natural language processing (NLP) that involves predicting the next character in a sequence of text based on the characters that precede it. This task is essential for various applications, including text auto-completion, spell checking, and even in the development of sophisticated AI models capable of generating human-like text. At its core, next character prediction relies on statistical models or deep learning algorithms to analyze a given sequence of text and predict which character is most likely to follow. These predictions are based on patterns and relationships learned from large datasets of text during the training phase of the model. One of the most popular approaches to next character prediction involves the use of Recurrent Neural Networks (RNNs), and more specifically, a variant called Long Short-Term Memory (LSTM) networks. RNNs are particularly well-suited for sequential data like text, as they can maintain information in 'memory' about previous characters to inform the prediction of the next character. LSTM networks enhance this capability by being able to remember long-term dependencies, making them even more effective for next character prediction tasks. Training a model for next character prediction involves feeding it large amounts of text data, allowing it to learn the probability of each character's appearance following a sequence of characters. During this training process, the model adjusts its parameters to minimize the difference between its predictions and the actual outcomes, thus improving its predictive accuracy over time. Once trained, the model can be used to predict the next character in a given piece of text by considering the sequence of characters that precede it. 
This can enhance user experience in text editing software, improve efficiency in coding environments with auto-completion features, and enable more natural interactions with AI-based chatbots and virtual assistants. In summary, next character prediction plays a crucial role in enhancing the capabilities of various NLP applications, making text-based interactions more efficient, accurate, and human-like. Through the use of advanced machine learning models like RNNs and LSTMs, next character prediction continues to evolve, opening new possibilities for the future of text-based technology.'''

# Cut `text` into overlapping windows of `max_length` characters; the label of a
# window is the character immediately following it.
max_length = 30  # context width in characters
sequences = [text[offset:offset + max_length] for offset in range(len(text) - max_length)]
labels = list(text[max_length:])

# Character vocabulary: sorted distinct characters -> consecutive integer ids.
chars = sorted(set(text))
char_to_ix = dict(zip(chars, range(len(chars))))

# Integer-encode the windows and labels.
X = torch.tensor(
    [[char_to_ix[ch] for ch in window] for window in sequences],
    dtype=torch.long,
)
y = torch.tensor([char_to_ix[ch] for ch in labels], dtype=torch.long)

# Seeded 80/20 train/validation split.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)



In [None]:
# Define Transformer model
class CharTransformer(nn.Module):
    """Next-character classifier: embedding -> Transformer encoder -> linear head.

    Args:
        input_size: Number of rows in the embedding table (vocabulary size).
        hidden_size: Embedding width, also used as the encoder's model dimension.
        output_size: Number of output classes (logits per sample).
        num_layers: Number of stacked encoder layers.
        nhead: Number of attention heads per encoder layer.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers, nhead):
        super(CharTransformer, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        # batch_first=True so the encoder interprets its input as (batch, seq, hidden),
        # which is what the embedding produces. The PyTorch default (False) would make
        # attention run across the samples of the batch rather than within each window.
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(hidden_size, nhead, batch_first=True),
            num_layers,
        )
        self.fc = nn.Linear(hidden_size, output_size)
        # NOTE(review): no positional encoding is added to the embeddings, so the
        # encoder cannot distinguish character order within the window.

    def forward(self, x):
        """Return logits of shape (batch, output_size) for integer ids `x` of shape (batch, seq)."""
        embedded = self.embedding(x)                             # (batch, seq, hidden)
        transformer_output = self.transformer_encoder(embedded)  # (batch, seq, hidden)
        # Classify from the encoder output at the last sequence position.
        output = self.fc(transformer_output[:, -1, :])
        return output

# Hyperparameters (this run uses two encoder layers).
hidden_size = 128
num_layers = 2
nhead = 2
learning_rate = 0.001
epochs = 50

# Model, loss, and optimizer. The vocabulary size is used both as the
# embedding-table size and as the number of output classes.
model = CharTransformer(
    input_size=len(chars),
    hidden_size=hidden_size,
    output_size=len(chars),
    num_layers=num_layers,
    nhead=nhead,
)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)



In [None]:
# Full-batch training: one optimizer step per epoch, then a validation pass.
for epoch in range(epochs):
    # Optimisation step on the entire training split.
    model.train()
    optimizer.zero_grad()
    output = model(X_train)
    loss = criterion(output, y_train)
    loss.backward()
    optimizer.step()

    # Validation (eval mode, gradients disabled).
    model.eval()
    with torch.no_grad():
        val_output = model(X_val)
        val_loss = criterion(val_output, y_val)
        val_accuracy = val_output.max(dim=1).indices.eq(y_val).float().mean()

    # Report only every fifth epoch.
    if (epoch + 1) % 5 != 0:
        continue
    print(f'Epoch {epoch+1}, Loss: {loss.item()}, Validation Loss: {val_loss.item()}, Validation Accuracy: {val_accuracy.item()}')


Epoch 5, Loss: 2.7416276931762695, Validation Loss: 2.7078185081481934, Validation Accuracy: 0.26483049988746643
Epoch 10, Loss: 2.500795602798462, Validation Loss: 2.561040163040161, Validation Accuracy: 0.2330508530139923
Epoch 15, Loss: 2.4127399921417236, Validation Loss: 2.5171854496002197, Validation Accuracy: 0.23728813230991364
Epoch 20, Loss: 2.357806921005249, Validation Loss: 2.480773687362671, Validation Accuracy: 0.24788135290145874
Epoch 25, Loss: 2.313812255859375, Validation Loss: 2.462399959564209, Validation Accuracy: 0.25
Epoch 30, Loss: 2.2803871631622314, Validation Loss: 2.457984209060669, Validation Accuracy: 0.24788135290145874
Epoch 35, Loss: 2.2599520683288574, Validation Loss: 2.446197986602783, Validation Accuracy: 0.24788135290145874
Epoch 40, Loss: 2.2401366233825684, Validation Loss: 2.4405295848846436, Validation Accuracy: 0.25211864709854126
Epoch 45, Loss: 2.229083299636841, Validation Loss: 2.4385602474212646, Validation Accuracy: 0.24576270580291748


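**Problem 2: Next-character prediction with a Transformer encoder on the Tiny Shakespeare corpus (sequence lengths 20, 30 and 50)**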

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import requests
import time
import math

# Run on the GPU when CUDA reports one as available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Step 1: Download the dataset
# The corpus is fetched exactly once (the previous expression issued two HTTP
# requests). raise_for_status() aborts on an HTTP error instead of silently
# training on an error page, and the timeout keeps the cell from hanging.
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
sequence_length = 20
response = requests.get(url, timeout=30)
response.raise_for_status()
full_text = response.text
# Trim the text so its length is a multiple of the sequence length.
text = full_text[:sequence_length * (len(full_text) // sequence_length)]

# Step 2: Prepare the dataset
chars = sorted(list(set(text)))
char_to_int = {ch: i for i, ch in enumerate(chars)}

# Encode the text into integers
encoded_text = [char_to_int[ch] for ch in text]

# Create sequences and targets: each window of `sequence_length` ids is paired
# with the id that immediately follows it.
sequences = [encoded_text[i:i+sequence_length] for i in range(len(encoded_text) - sequence_length)]
targets = [encoded_text[i+sequence_length] for i in range(len(encoded_text) - sequence_length)]

# Convert lists to PyTorch tensors
sequences = torch.tensor(sequences, dtype=torch.long)
targets = torch.tensor(targets, dtype=torch.long)

# Step 3: Create a dataset class
class CharDataset(Dataset):
    """Pairs each input sequence with its target so a DataLoader can batch them."""

    def __init__(self, sequences, targets):
        # Stored as given; item i of `targets` is returned together with item i of `sequences`.
        self.sequences, self.targets = sequences, targets

    def __len__(self):
        """Number of samples, taken from the sequence container."""
        n_samples = len(self.sequences)
        return n_samples

    def __getitem__(self, index):
        """Return the `(sequence, target)` pair stored at `index`."""
        sample = self.sequences[index]
        label = self.targets[index]
        return sample, label



In [None]:
# Wrap the tensors in a Dataset so a DataLoader can index them.
dataset = CharDataset(sequences, targets)

# Step 4: Create data loaders
# Random 80/20 partition; the test part receives whatever remains after rounding down.
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset, lengths=[train_size, test_size]
)

# Mini-batches of 128; only the training loader reshuffles each epoch.
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

class CharModel(nn.Module):
    """Next-character classifier: embedding -> 2-layer Transformer encoder -> linear head.

    Args:
        input_size: Number of rows in the embedding table (vocabulary size).
        hidden_size: Embedding width, also used as the encoder's model dimension.
        output_size: Number of output classes (logits per sample).
    """

    def __init__(self, input_size, hidden_size, output_size):
        super(CharModel, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        # batch_first=True is required: forward() feeds (batch, seq, hidden). With the
        # PyTorch default (batch_first=False) the encoder would treat the batch axis as
        # the sequence axis and attend across the samples of a mini-batch instead of
        # across the characters of each window.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_size, nhead=2, dim_feedforward=256, dropout=0.1, batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=2)
        self.fc = nn.Linear(hidden_size, output_size)
        # NOTE(review): no positional encoding is added to the embeddings, so the
        # encoder cannot distinguish character order within the window.

    def forward(self, x):
        """Return logits of shape (batch, output_size) for integer ids `x` of shape (batch, seq)."""
        embedded = self.embedding(x)                             # (batch, seq, hidden)
        transformer_output = self.transformer_encoder(embedded)  # (batch, seq, hidden)
        # Classify from the encoder output at the last sequence position.
        output = self.fc(transformer_output[:, -1, :])
        return output



In [None]:
# Train and evaluate function
def train_evaluate(train_loader, val_loader, device):
    """Train a fresh CharModel for 20 epochs and report its losses.

    A new `CharModel` (hidden size 512, vocabulary taken from the module-level
    `chars`) is trained with Adam (lr=1e-4) and cross-entropy loss. Validation
    runs, and a progress line is printed, every fifth epoch.

    Args:
        train_loader: DataLoader yielding `(inputs, targets)` training batches.
        val_loader: DataLoader yielding `(inputs, targets)` validation batches.
        device: torch.device on which the model and the batches are placed.

    Returns:
        Tuple `(train_loss, val_loss, seconds)`: the sample-weighted mean
        training loss of the last epoch, the sample-weighted mean validation
        loss of the last evaluation, and the wall-clock duration of the loop.
    """
    model = CharModel(len(chars), 512, len(chars)).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.0001)

    start_time = time.time()
    for epoch in range(20):
        model.train()
        train_loss = 0.0
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            output = model(inputs)
            loss = criterion(output, targets)
            loss.backward()
            optimizer.step()
            # Weight by batch size so the epoch mean is exact even when the
            # final batch is smaller than the others.
            train_loss += loss.item() * inputs.size(0)

        epoch_train_loss = train_loss / len(train_loader.dataset)

        if (epoch+1) % 5 == 0:
            # The unused `correct` / `total` counters were dropped: this
            # variant never computes an accuracy.
            model.eval()
            val_loss = 0.0
            with torch.no_grad():
                for inputs, targets in val_loader:
                    inputs, targets = inputs.to(device), targets.to(device)
                    val_output = model(inputs)
                    loss = criterion(val_output, targets)
                    val_loss += loss.item() * inputs.size(0)

            epoch_val_loss = val_loss / len(val_loader.dataset)
            print(f'Epoch {epoch+1}, Train Loss: {epoch_train_loss}, Validation Loss: {epoch_val_loss}')

    end_time = time.time()
    execution_time = end_time - start_time

    return epoch_train_loss, epoch_val_loss, execution_time

# Train and evaluate on the GPU when CUDA is available, otherwise on the CPU.
print("\nTraining models...")
loss, val_loss, execution_time = train_evaluate(
    train_loader,
    test_loader,
    torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)

# Summary of the finished run.
print("\nResults:")
print(
    f"Training Loss: {loss}",
    f"Validation Loss: {val_loss}",
    f"Execution Time: {execution_time} seconds",
    sep="\n",
)



Training models...
Epoch 5, Train Loss: 2.472497434018247, Validation Loss: 2.4687606867340266
Epoch 10, Train Loss: 2.4679200839421753, Validation Loss: 2.46457775841996
Epoch 15, Train Loss: 2.4660417705607576, Validation Loss: 2.4626034912223744
Epoch 20, Train Loss: 2.464618385752994, Validation Loss: 2.462831518978731

Results:
Training Loss: 2.464618385752994
Validation Loss: 2.462831518978731
Execution Time: 1139.7919027805328 seconds


In [None]:
# Step 1: Download the dataset
# The corpus is fetched exactly once (the previous expression issued two HTTP
# requests). raise_for_status() aborts on an HTTP error instead of silently
# training on an error page, and the timeout keeps the cell from hanging.
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
sequence_length = 30  # defined before first use so the truncation cannot drift from it
response = requests.get(url, timeout=30)
response.raise_for_status()
full_text = response.text
text = full_text[:sequence_length * (len(full_text) // sequence_length)]  # Truncate text

# Character mapping to integers
chars = sorted(list(set(text)))
char_to_int = {ch: i for i, ch in enumerate(chars)}

# Encode text into integers
encoded_text = [char_to_int[ch] for ch in text]

# Create sequences and targets: each window of `sequence_length` ids is paired
# with the id that immediately follows it.
sequences = [encoded_text[i:i+sequence_length] for i in range(len(encoded_text)-sequence_length)]
targets = [encoded_text[i+sequence_length] for i in range(len(encoded_text)-sequence_length)]

# Convert lists to PyTorch tensors
sequences, targets = torch.tensor(sequences, dtype=torch.long), torch.tensor(targets, dtype=torch.long)

# Step 3: Create a dataset class
class CharDataset(Dataset):
    """Index-aligned view over input sequences and their targets."""

    def __init__(self, sequences, targets):
        # Both containers are kept as passed in; nothing is copied or converted.
        self.sequences = sequences
        self.targets = targets

    def __len__(self):
        """Sample count, as reported by the sequence container."""
        return len(self.sequences)

    def __getitem__(self, index):
        """Return `(sequences[index], targets[index])`."""
        pair = (self.sequences[index], self.targets[index])
        return pair

# Step 4: Instantiate the dataset and create data loaders
dataset = CharDataset(sequences, targets)

# Random 80/20 partition; the test part receives whatever remains after rounding down.
train_size = int(0.8 * len(dataset))
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset, lengths=[train_size, len(dataset) - train_size]
)

# Mini-batches of 128; only the training loader reshuffles each epoch.
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

# Step 5: Define the model
class CharModel(nn.Module):
    """Next-character classifier: embedding -> Transformer encoder -> linear head.

    Args:
        input_size: Number of rows in the embedding table (vocabulary size).
        hidden_size: Embedding width, also used as the encoder's model dimension.
        output_size: Number of output classes (logits per sample).
        model_type: Accepted for call compatibility; this class always builds a
            Transformer encoder and does not read the value.
        num_layers: Number of stacked encoder layers.
        num_heads: Number of attention heads per encoder layer.
        dim_feedforward: Width of each encoder layer's feed-forward sublayer.
        dropout: Dropout probability inside the encoder layers.
    """

    def __init__(self, input_size, hidden_size, output_size, model_type='Transformer', num_layers=2, num_heads=2, dim_feedforward=256, dropout=0.1):
        super(CharModel, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        # batch_first=True is required: forward() feeds (batch, seq, hidden). With the
        # PyTorch default (batch_first=False) the encoder would treat the batch axis as
        # the sequence axis and attend across the samples of a mini-batch instead of
        # across the characters of each window.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_size,
            nhead=num_heads,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(hidden_size, output_size)
        # NOTE(review): no positional encoding is added to the embeddings, so the
        # encoder cannot distinguish character order within the window.

    def forward(self, x):
        """Return logits of shape (batch, output_size) for integer ids `x` of shape (batch, seq)."""
        embedded = self.embedding(x)                             # (batch, seq, hidden)
        transformer_output = self.transformer_encoder(embedded)  # (batch, seq, hidden)
        # Classify from the encoder output at the last sequence position.
        return self.fc(transformer_output[:, -1, :])




In [None]:
# Train and evaluate function
def train_evaluate(model_type, train_loader, val_loader, device):
    """Train a fresh CharModel for 20 epochs; validate and log every fifth epoch.

    The model is sized from the module-level `chars` and `hidden_size` and is
    optimised with Adam (lr=1e-4) on cross-entropy loss.

    Args:
        model_type: Forwarded to `CharModel` as its `model_type` argument.
        train_loader: DataLoader yielding `(inputs, targets)` training batches.
        val_loader: DataLoader yielding `(inputs, targets)` validation batches.
        device: torch.device on which the model and the batches are placed.

    Returns:
        Tuple `(train_loss, val_loss, val_accuracy, seconds)` from the final epoch.
    """
    model = CharModel(len(chars), hidden_size, len(chars), model_type).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.0001)

    start_time = time.time()
    for epoch in range(20):
        # ---- training pass ----
        model.train()
        train_loss = 0.0
        for batch_inputs, batch_targets in train_loader:
            batch_inputs = batch_inputs.to(device)
            batch_targets = batch_targets.to(device)
            optimizer.zero_grad()
            batch_loss = criterion(model(batch_inputs), batch_targets)
            batch_loss.backward()
            optimizer.step()
            # Weight by batch size so the epoch mean is per-sample.
            train_loss += batch_loss.item() * batch_inputs.size(0)

        epoch_train_loss = train_loss / len(train_loader.dataset)

        # Skip validation unless this is a reporting epoch.
        if (epoch+1) % 5 != 0 and epoch != 19:
            continue

        # ---- validation pass ----
        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0
        with torch.no_grad():
            for batch_inputs, batch_targets in val_loader:
                batch_inputs = batch_inputs.to(device)
                batch_targets = batch_targets.to(device)
                val_output = model(batch_inputs)
                val_loss += criterion(val_output, batch_targets).item() * batch_inputs.size(0)
                predicted = val_output.argmax(dim=1)
                total += batch_targets.size(0)
                correct += (predicted == batch_targets).sum().item()

        epoch_val_loss = val_loss / len(val_loader.dataset)
        epoch_val_accuracy = correct / total

        print(f'Epoch {epoch+1}, Train Loss: {epoch_train_loss}, Validation Loss: {epoch_val_loss}, Validation Accuracy: {epoch_val_accuracy}')

    execution_time = time.time() - start_time

    return epoch_train_loss, epoch_val_loss, epoch_val_accuracy, execution_time



In [None]:
# Define parameters
hidden_size = 512
num_layers = 2
num_heads = 2
dim_feedforward = 256
dropout = 0.1
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
epochs = 20

# Train every listed model type and collect its final metrics by name.
print("\nTraining models for sequence length: 30")
results = {}
for model_type in ['Transformer']:
    print(f"\nTraining {model_type} model...")
    loss, val_loss, val_accuracy, execution_time = train_evaluate(
        model_type, train_loader, test_loader, device
    )
    results[model_type] = dict(
        loss=loss,
        val_loss=val_loss,
        val_accuracy=val_accuracy,
        execution_time=execution_time,
    )

# Report the collected metrics per model type.
print("\nResults for sequence length: 30")
for model_type, data in results.items():
    print(f"\n{model_type} Model:")
    print(
        f"Training Loss: {data['loss']}",
        f"Validation Loss: {data['val_loss']}",
        f"Validation Accuracy: {data['val_accuracy']}",
        f"Execution Time: {data['execution_time']} seconds",
        sep="\n",
    )


Training models for sequence length: 30

Training Transformer model...




Epoch 5, Train Loss: 2.4722231779928423, Validation Loss: 2.4711605255866336, Validation Accuracy: 0.26976527336955547
Epoch 10, Train Loss: 2.4675313118576483, Validation Loss: 2.469579779022486, Validation Accuracy: 0.27063944626750586
Epoch 15, Train Loss: 2.4648077351094395, Validation Loss: 2.4665070803188054, Validation Accuracy: 0.2715136191654563
Epoch 20, Train Loss: 2.463641424682019, Validation Loss: 2.4661055199695974, Validation Accuracy: 0.27044219699822475

Results for sequence length: 30

Transformer Model:
Training Loss: 2.463641424682019
Validation Loss: 2.4661055199695974
Validation Accuracy: 0.27044219699822475
Execution Time: 1469.1312699317932 seconds


In [None]:
# Step 1: Download the dataset
# raise_for_status() aborts on an HTTP error instead of silently training on an
# error page, and the timeout keeps the cell from hanging on a stalled connection.
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
response = requests.get(url, timeout=30)
response.raise_for_status()
text = response.text[:200000]  # Limit text to 200,000 characters

# Step 2: Prepare the dataset
sequence_length = 50
text = text[:sequence_length * (len(text)//sequence_length)]  # Truncate text to fit sequence length
chars = sorted(list(set(text)))
char_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_char = {i: ch for i, ch in enumerate(chars)}  # inverse mapping (id -> character)
encoded_text = [char_to_int[ch] for ch in text]

# Create sequences and targets: each window of `sequence_length` ids is paired
# with the id that immediately follows it.
sequences = [encoded_text[i:i+sequence_length] for i in range(0, len(encoded_text) - sequence_length)]
targets = [encoded_text[i+sequence_length] for i in range(0, len(encoded_text) - sequence_length)]

# Convert lists to PyTorch tensors
sequences = torch.tensor(sequences, dtype=torch.long)
targets = torch.tensor(targets, dtype=torch.long)

# Step 3: Create a dataset class
class CharDataset(Dataset):
    """Dataset of `(sequence, target)` pairs backed by two parallel containers."""

    def __init__(self, sequences, targets):
        # Kept by reference; item i of each container forms one sample.
        self.sequences, self.targets = sequences, targets

    def __len__(self):
        """Sample count, read from the sequence container."""
        size = len(self.sequences)
        return size

    def __getitem__(self, index):
        """Return the sequence at `index` together with its target."""
        sequence = self.sequences[index]
        target = self.targets[index]
        return sequence, target

# Wrap the tensors in a Dataset so a DataLoader can index them.
dataset = CharDataset(sequences, targets)

# Step 4: Create data loaders
batch_size = 128
# Random 80/20 partition; the test part receives whatever remains after rounding down.
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset, lengths=[train_size, test_size]
)
# Only the training loader reshuffles each epoch.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)



In [None]:

class CharModel(nn.Module):
    """Next-character classifier: embedding -> Transformer encoder -> linear head.

    Args:
        input_size: Number of rows in the embedding table (vocabulary size).
        hidden_size: Embedding width, also used as the encoder's model dimension.
        output_size: Number of output classes (logits per sample).
        model_type: Must be 'Transformer'; any other value is rejected.
        num_layers: Number of stacked encoder layers.
        num_heads: Number of attention heads per encoder layer.
        dim_feedforward: Width of each encoder layer's feed-forward sublayer.
        dropout: Dropout probability inside the encoder layers.

    Raises:
        ValueError: If `model_type` is not 'Transformer'.
    """

    def __init__(self, input_size, hidden_size, output_size, model_type='Transformer', num_layers=2, num_heads=2, dim_feedforward=256, dropout=0.1):
        super(CharModel, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        if model_type == 'Transformer':
            # batch_first=True is required: forward() feeds (batch, seq, hidden). With
            # the PyTorch default (batch_first=False) the encoder would treat the batch
            # axis as the sequence axis and attend across the samples of a mini-batch
            # instead of across the characters of each window.
            encoder_layer = nn.TransformerEncoderLayer(
                d_model=hidden_size,
                nhead=num_heads,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
            )
            self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        else:
            raise ValueError("Invalid model type. Choose 'Transformer'.")
        self.fc = nn.Linear(hidden_size, output_size)
        # NOTE(review): no positional encoding is added to the embeddings, so the
        # encoder cannot distinguish character order within the window.

    def forward(self, x):
        """Return logits of shape (batch, output_size) for integer ids `x` of shape (batch, seq)."""
        embedded = self.embedding(x)                             # (batch, seq, hidden)
        transformer_output = self.transformer_encoder(embedded)  # (batch, seq, hidden)
        # Classify from the encoder output at the last sequence position.
        output = self.fc(transformer_output[:, -1, :])
        return output

# Train and evaluate function
def train_evaluate(model_type, train_loader, val_loader, device):
    """Train a fresh CharModel and report its final losses.

    The model is sized from the module-level `chars` and `hidden_size`, and is
    trained for the module-level `epochs` with Adam at the module-level
    `learning_rate` on cross-entropy loss. Validation runs after the first
    epoch, after every fifth epoch, and after the last epoch, so the returned
    validation loss always describes the final model.

    Args:
        model_type: Forwarded to `CharModel` as its `model_type` argument.
        train_loader: DataLoader yielding `(inputs, targets)` training batches.
        val_loader: DataLoader yielding `(inputs, targets)` validation batches.
        device: torch.device on which the model and the batches are placed.

    Returns:
        Tuple `(train_loss, val_loss, seconds)`: sample-weighted mean losses of
        the final epoch and the wall-clock duration of the loop. Both losses
        are NaN when `epochs` is 0.
    """
    model = CharModel(len(chars), hidden_size, len(chars), model_type).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Defined up front so the return statement is valid even if the loop never runs.
    epoch_train_loss = epoch_val_loss = float("nan")

    start_time = time.time()
    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            output = model(inputs)
            loss = criterion(output, targets)
            loss.backward()
            optimizer.step()
            # Weight by batch size so the epoch mean is exact even when the
            # final batch is smaller than the others.
            train_loss += loss.item() * inputs.size(0)

        epoch_train_loss = train_loss / len(train_loader.dataset)

        # Previously the last epoch was validated only when `epochs` was a
        # multiple of 5; otherwise the returned validation loss was stale.
        is_last_epoch = epoch == epochs - 1
        if (epoch + 1) % 5 == 0 or epoch == 0 or is_last_epoch:
            model.eval()
            val_loss = 0.0
            with torch.no_grad():
                for inputs, targets in val_loader:
                    inputs, targets = inputs.to(device), targets.to(device)
                    val_output = model(inputs)
                    loss = criterion(val_output, targets)
                    val_loss += loss.item() * inputs.size(0)

            epoch_val_loss = val_loss / len(val_loader.dataset)

            print(f'Epoch {epoch+1}, Train Loss: {epoch_train_loss}, Validation Loss: {epoch_val_loss}')

    end_time = time.time()
    execution_time = end_time - start_time

    return epoch_train_loss, epoch_val_loss, execution_time



In [None]:
# Define parameters
hidden_size, num_layers, num_heads = 512, 2, 2
dim_feedforward, dropout = 256, 0.1
learning_rate, epochs = 0.0001, 20
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Train and evaluate models for sequence length 50, collecting final metrics by name.
print("\nTraining models for sequence length: 50")
results = {}
for model_type in ['Transformer']:
    print(f"\nTraining {model_type} model...")
    loss, val_loss, execution_time = train_evaluate(
        model_type, train_loader, test_loader, device
    )
    results[model_type] = dict(
        loss=loss,
        val_loss=val_loss,
        execution_time=execution_time,
    )

# Report the collected metrics per model type.
print("\nResults for sequence length: 50")
for model_type, data in results.items():
    print(f"\n{model_type} Model:")
    print(
        f"Training Loss: {data['loss']}",
        f"Validation Loss: {data['val_loss']}",
        f"Execution Time: {data['execution_time']} seconds",
        sep="\n",
    )


Training models for sequence length: 50

Training Transformer model...




Epoch 1, Train Loss: 2.5392365783654203, Validation Loss: 2.4550808295335553
Epoch 5, Train Loss: 2.440501050419675, Validation Loss: 2.4383642554253333
Epoch 10, Train Loss: 2.434343239068091, Validation Loss: 2.4306177715564794
Epoch 15, Train Loss: 2.429223425086065, Validation Loss: 2.428668259513113
Epoch 20, Train Loss: 2.427266352139106, Validation Loss: 2.425524602123546

Results for sequence length: 50

Transformer Model:
Training Loss: 2.427266352139106
Validation Loss: 2.425524602123546
Execution Time: 403.91916823387146 seconds
