In [2]:
import numpy as np
import torch
from tqdm import tqdm
import torchtext
from torchtext.data import get_tokenizer

In [3]:
filename = "wonderland.txt"
raw_text = open(filename, 'r', encoding='utf-8').read()
raw_text = raw_text.lower()

tokenizer = get_tokenizer("spacy")

raw_text = tokenizer(raw_text)

print(raw_text)





In [4]:
chars = sorted(list(set(raw_text)))
char_to_int = dict((c, i) for i, c in enumerate(chars))

In [5]:
n_chars = len(raw_text)
n_vocab = len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

Total Characters:  41509
Total Vocab:  3140


In [6]:
seq_length = 100
dataX = []
dataY = []
for i in range(0, n_chars - seq_length, 1):
    seq_in = raw_text[i:i + seq_length]
    seq_out = raw_text[i + seq_length]
    dataX.append([char_to_int[char] for char in seq_in])
    dataY.append(char_to_int[seq_out])
n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)

Total Patterns:  41409


In [7]:
X = torch.tensor(dataX, dtype=torch.float32).reshape(n_patterns, seq_length, 1)
X = X / float(n_vocab)
y = torch.tensor(dataY)

In [8]:
lookback = 1
print(X.shape, y.shape)

torch.Size([41409, 100, 1]) torch.Size([41409])


In [9]:
import torch.nn as nn

class LSTMModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.lstm = nn.LSTM(input_size=1, hidden_size=100, num_layers=2, batch_first=True)
        self.linear = nn.Linear(100, n_vocab)
    def forward(self, x):
        x, _ = self.lstm(x)
        x = x[:, -1, :]
        x = self.linear(x)
        return x

In [16]:
import numpy as np
import torch.optim as optim
import torch.utils.data as data

def my_collate(batch):

    # Preparing input sequences
    x = [item[0] for item in batch]
    x = torch.stack(x)
    # Preparing target values
    y = [item[1] for item in batch]
    y = torch.stack(y)

    return [x, y]

device = torch.device("cuda:0")

model = LSTMModel().to(device)
optimizer = optim.Adam(model.parameters())
loss_fn = nn.CrossEntropyLoss()
loader = tqdm(data.DataLoader(data.TensorDataset(X, y), shuffle=True, batch_size=128))

best_model = None
best_loss = np.inf

n_epochs = 40
for epoch in range(n_epochs):
    model.float()
    model.train()
    for X_batch, y_batch in loader:
        y_pred = model(X_batch.float().to(device))
        loss = loss_fn(y_pred.to(device), y_batch.long().to(device))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    loss = 0
    with torch.no_grad():
        for X_batch, y_batch in loader:
            y_pred = model(X_batch.float().to(device))
            loss += loss_fn(y_pred.to(device), y_batch.long().to(device))
        if loss < best_loss:
            best_loss = loss
            best_model = model.state_dict()
        print("Epoch %d: Cross-entropy: %.4f" % (epoch, loss))

torch.save([best_model, char_to_int], "single-tokenized_all.pth")

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [10]:
seq_length = 100
start = np.random.randint(0, len(raw_text)-seq_length)
prompt = raw_text[start:start+seq_length]

In [14]:
import numpy as np
import torch
import torch.nn as nn

best_model, char_to_int = torch.load("single-tokenized_all.pth")
n_vocab = len(char_to_int)
int_to_char = dict((i, c) for c, i in char_to_int.items())

model = LSTMModel()
model.load_state_dict(best_model)

filename = "wonderland.txt"
seq_length = 100
raw_text = open(filename, 'r', encoding='utf-8').read()
raw_text = raw_text.lower()
start = np.random.randint(0, len(raw_text)-seq_length)
prompt = 'it repeated again '
prompt = tokenizer(prompt)
print("xd " + str(prompt) + " dx ")
pattern = [char_to_int[c] for c in prompt]

model.eval()
print('Prompt: "%s"' % prompt)
with torch.no_grad():
    for i in range(1000):
        # format input array of int into PyTorch tensor
        x = np.reshape(pattern, (1, len(pattern), 1)) / float(n_vocab)
        x = torch.tensor(x, dtype=torch.float32)
        # generate logits as output from the model
        prediction = model(x)
        # convert logits into one character
        index = int(prediction.argmax())
        result = int_to_char[index]
        print(result, end="")
        # append the new character into the prompt for the next iteration
        pattern.append(index)
        pattern = pattern[1:]
print()
print("Done.")

xd ['it', 'repeated', 'again'] dx 
Prompt: "['it', 'repeated', 'again']"

andthe

““gutenberg,andthe

““gutenberg,andthe

““gutenberg,andthe

““gutenberg,andthe

““gutenberg,andthe

““gutenberg,andthe

““gutenberg,andthe

““gutenberg,andthe

““gutenberg,andthe

““gutenberg,andthe

““gutenberg,andthe

““gutenberg,andthe

““gutenberg,andthe

““gutenberg,andthe

““gutenberg,andthe

““gutenberg,andthe

““gutenberg,andthe

““gutenberg,andthe

““gutenberg,andthe

““gutenberg,andthe

““gutenberg,andthe

““gutenberg,andthe

““gutenberg,andthe

““gutenberg,andthe

““gutenberg,andthe

““gutenberg,andthe

““gutenberg,andthe

““gutenberg,andthe

““gutenberg,andthe

““gutenberg,andthe

““gutenberg,andthe

““gutenberg,andthe

““gutenberg,andthe

““gutenberg,andthe

““gutenberg,andthe

““gutenberg,andthe

““gutenberg,andthe

““gutenberg,andthe

““gutenberg,andthe

““gutenberg,andthe

““gutenberg,andthe

““gutenberg,andthe

““gutenberg,andthe

““gutenberg,andthe

““gutenberg,andthe

““gutenberg,andthe

In [10]:
print("Torch version:",torch.__version__)

print("Is CUDA enabled?",torch.cuda.is_available())

Torch version: 2.0.1+cpu
Is CUDA enabled? False
