In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
import requests
import os

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Step 1: Download the dataset
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
response = requests.get(url)
text = response.text  # This is the entire text data

# Creating character vocabulary
chars = sorted(list(set(text)))
ix_to_char = {i: ch for i, ch in enumerate(chars)}
char_to_ix = {ch: i for i, ch in enumerate(chars)}

# Preparing the dataset
max_length = 20  # Maximum length of input sequences
X = []
y = []

for i in range(len(text) - max_length):
    sequence = text[i:i + max_length]
    label = text[i + max_length]
    X.append([char_to_ix[char] for char in sequence])
    y.append(char_to_ix[label])

X = np.array(X)
y = np.array(y)

# Splitting the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Converting data to PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.long)
y_train = torch.tensor(y_train, dtype=torch.long)
X_val = torch.tensor(X_val, dtype=torch.long)
y_val = torch.tensor(y_val, dtype=torch.long)

# Defining the Transformer model
class CharTransformer(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers, nhead):
        super(CharTransformer, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        encoder_layers = nn.TransformerEncoderLayer(hidden_size, nhead, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        embedded = self.embedding(x)
        transformer_output = self.transformer_encoder(embedded)
        output = self.fc(transformer_output[:, -1, :])  # Get the output of the last Transformer block
        return output

train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)

batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

Using device: cuda:0


In [2]:
for inputs, targets in train_loader:
    print(inputs[:10])
    #print(targets[:10])
    break

tensor([[51, 57,  1, 51, 39, 49, 43,  1, 51, 43, 52,  1, 43, 62, 54, 43, 41, 58,
          1, 39],
        [ 1, 51, 53, 60, 43,  1, 51, 43, 52, 11,  1, 40, 43, 57, 47, 42, 43,  6,
          1, 57],
        [47, 40, 39, 50, 57,  0, 35, 53, 59, 50, 42,  1, 52, 53, 58,  1, 46, 39,
         60, 43],
        [57,  1, 42, 56, 53, 61, 52,  1, 58, 46, 43, 47, 56,  1, 57, 46, 53, 56,
         43, 57],
        [ 1, 58, 46, 63,  1, 50, 53, 56, 42,  6,  1, 58, 46, 63,  1, 49, 47, 52,
         45,  6],
        [43, 43, 42,  1, 44, 56, 47, 43, 52, 42, 57, 10,  1, 57, 59, 40, 48, 43,
         41, 58],
        [43, 56,  1, 63, 53, 59, 56,  1, 45, 56, 39, 60, 47, 58, 63,  1, 53,  5,
         43, 56],
        [18,  1, 13, 33, 25, 17, 30, 24, 17, 10,  0, 20, 43,  1, 51, 43, 39, 52,
         57,  6],
        [44,  1, 51, 43,  8,  0,  0, 28, 17, 32, 30, 33, 15, 20, 21, 27, 10,  0,
         37, 53],
        [63,  1,  5, 52, 53,  5,  1, 58, 53,  1, 51, 63,  1, 42, 43, 51, 39, 52,
         42,  8]])


In [3]:
# Hyperparameters
hidden_size = 16
num_layers = 2
nhead = 2
learning_rate = 0.001
epochs = 15

# Model, loss, and optimizer
model = CharTransformer(len(chars), hidden_size, len(chars), num_layers, nhead).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training the model
for epoch in range(epochs):
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        model.train()
        optimizer.zero_grad()
        output = model(inputs)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()

    # Validation
    model.eval()
    total_val_loss = 0
    total_val_accuracy = 0
    num_batches = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)        
            val_output = model(inputs)
            val_loss = criterion(val_output, labels)
            total_val_loss += val_loss.item()

            _, predicted = torch.max(val_output, 1)
            val_accuracy = (predicted == labels).float().mean()
            total_val_accuracy += val_accuracy.item()
            num_batches += 1
               
    average_val_loss = total_val_loss / num_batches
    average_val_accuracy = total_val_accuracy / num_batches

    if (epoch+1) % 1 == 0:
     print(f'Epoch {epoch+1}, Loss: {loss.item()}, Validation Loss: {average_val_loss}, Validation Accuracy: {average_val_accuracy}')

Epoch 1, Loss: 2.8292577266693115, Validation Loss: 2.3859910989416377, Validation Accuracy: 0.29618175081562464
Epoch 2, Loss: 2.8695273399353027, Validation Loss: 2.3518227858920926, Validation Accuracy: 0.30523731832232753
Epoch 3, Loss: 2.3850204944610596, Validation Loss: 2.331058119554596, Validation Accuracy: 0.31009006263432803


In [None]:
# Prediction function
def predict_next_char(model, char_to_ix, ix_to_char, initial_str, max_length):
    model.to(device)
    model.eval()
    with torch.no_grad():
        # Ensure the input is not shorter than expected
        if len(initial_str) < max_length:
            initial_str = (' ' * (max_length - len(initial_str))) + initial_str
        
        # Convert characters to indices, handling characters not in the dictionary
        initial_indices = [char_to_ix.get(c, char_to_ix[' ']) for c in initial_str[-max_length:]]
        
        initial_input = torch.tensor(initial_indices, dtype=torch.long).unsqueeze(0).to(device)
        prediction = model(initial_input)
        last_timestep_pred = prediction.squeeze(0)[-1]
        predicted_index = torch.argmax(last_timestep_pred, dim=0).item()
        return ix_to_char[predicted_index]

# Predicting the next character
test_str = "This is a simple example to demonstrate how to predict the next char"
predicted_char = predict_next_char(model, char_to_ix, ix_to_char, test_str, max_length)
print(f"Predicted next character: '{predicted_char}'")


NameError: name 'model' is not defined