In [16]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
import numpy as np
import torch.nn as nn
import torch.optim as optim
import string

class TextDataset(Dataset):
    """Character-level dataset serving fixed-length, non-overlapping blocks of a text.

    The vocabulary is the union of the characters found in the text and
    ``string.printable``. Each sample is a 1-D ``torch.long`` tensor holding
    ``block_size`` character indices.
    """

    def __init__(self, text, block_size):
        """Build the vocabulary and split ``text`` into encoded blocks.

        Args:
            text: The corpus as a single string.
            block_size: Number of characters per sample; must be at least 1.

        Raises:
            ValueError: If ``block_size`` is smaller than 1.
        """
        if block_size < 1:
            raise ValueError(f"block_size must be a positive integer, got {block_size}")
        self.block_size = block_size
        self.vocab, self.char_to_index, self.index_to_char = self.build_vocab(text)
        self.data = self.process_text(text)

    def build_vocab(self, text):
        """Return ``(chars, char_to_index, index_to_char)`` for ``text``.

        ``chars`` is a sorted list of *unique* characters: everything seen in
        ``text`` plus all of ``string.printable``. The two mappings are exact
        inverses of each other.
        """
        # Take the set union so a character that is both in the text and in
        # string.printable is listed once; duplicates would inflate the
        # vocabulary size and leave indices that no character maps to.
        chars = sorted(set(text) | set(string.printable))
        char_to_index = {char: i for i, char in enumerate(chars)}
        index_to_char = dict(enumerate(chars))
        return chars, char_to_index, index_to_char

    def process_text(self, text):
        """Encode ``text`` and cut it into consecutive blocks of ``block_size`` indices.

        A trailing remainder shorter than ``block_size`` is discarded, so every
        returned block has exactly ``block_size`` entries.
        """
        indices = [self.char_to_index[char] for char in text]
        # The last valid start offset is len - block_size (inclusive); the +1
        # keeps the final full block when the length is an exact multiple.
        last_start = len(indices) - self.block_size
        return [
            indices[start:start + self.block_size]
            for start in range(0, last_start + 1, self.block_size)
        ]

    def __len__(self):
        """Number of full blocks available."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return block ``idx`` as a 1-D ``torch.long`` tensor of character indices."""
        return torch.tensor(self.data[idx], dtype=torch.long)

In [6]:
# Sampling configuration.
block_size = 64  # characters per training window
batch_size = 64  # windows per mini-batch

# Read the whole essay into memory as a single string.
file_path = 'paul_graham_essay.txt'
with open(file_path, mode='r', encoding='utf-8') as file:
    text = file.read()

train_dataset = TextDataset(text, block_size)

# pad_sequence is used with its default batch_first=False, so every batch
# produced by this loader is laid out as (sequence position, sample).
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=pad_sequence,
)

In [9]:
# Expose the dataset's vocabulary tables as notebook-level names.
chars = train_dataset.vocab
char_to_index = train_dataset.char_to_index
index_to_char = train_dataset.index_to_char

In [13]:
class NextChar(nn.Module):
    """Two-layer MLP that maps a fixed-length window of character ids to logits.

    Each id is embedded, the embeddings of one window are concatenated into a
    single vector, passed through a linear layer with a sine activation, and
    projected to ``vocab_size`` scores.
    """

    def __init__(self, block_size, vocab_size, emb_dim, hidden_size):
        """Create the layers.

        Args:
            block_size: Number of character ids in every input window.
            vocab_size: Number of distinct character ids (also the output width).
            emb_dim: Size of each character embedding.
            hidden_size: Width of the hidden layer.
        """
        super().__init__()
        flattened_width = block_size * emb_dim
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.lin1 = nn.Linear(flattened_width, hidden_size)
        self.lin2 = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Return logits of shape ``(x.shape[0], vocab_size)``.

        ``x`` holds character ids; each row must contain exactly the
        ``block_size`` ids the first linear layer was sized for.
        """
        num_rows = x.shape[0]
        embedded = self.emb(x)
        # Concatenate the per-character embeddings of each row into one vector.
        flat = embedded.view(num_rows, -1)
        hidden = torch.sin(self.lin1(flat))
        return self.lin2(hidden)

In [14]:
# Model and optimiser hyperparameters.
emb_dim = 64          # size of each character embedding
hidden_size = 128     # width of the hidden layer
learning_rate = 1e-3  # step size handed to the optimiser
vocab_size = len(train_dataset.vocab)  # number of entries in the dataset vocabulary

In [17]:
model = NextChar(block_size, vocab_size, emb_dim, hidden_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [18]:
num_epochs = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0

    for batch in train_dataloader:
        inputs = batch.to(device)

        # Target is the next character in the sequence
        targets = inputs[:, 1:].contiguous().view(-1)
        inputs = inputs[:, :-1]

        inputs, targets = inputs.to(device), targets.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.view(-1, vocab_size), targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    average_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {average_loss}")

RuntimeError: mat1 and mat2 shapes cannot be multiplied (64x4032 and 4096x128)