In [None]:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

from collections import defaultdict

# Frequency of every pre-token (as produced by the GPT-2 pre-tokenizer) in the
# training text. These counts weight the pair statistics during BPE training.
word_freqs = defaultdict(int)

with open("input.txt","r") as file:
  contents = file.read()

# The corpus must be an iterable of *documents*. Binding it to the raw string
# (as before) made the loop below iterate character by character, so every
# "word" was a single character and no symbol pair could ever be counted.
corpus = [contents]

for text in corpus:
    words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    # The pre-tokenizer yields (word, (start, end)) tuples; only the word is needed.
    new_words = [word for word, offset in words_with_offsets]
    for word in new_words:
        word_freqs[word] += 1

# Base alphabet: every distinct character seen in any pre-token, sorted.
# A set gives O(1) membership instead of rescanning a list per character.
alphabet = sorted({letter for word in word_freqs for letter in word})

# Initial vocabulary: the end-of-text special token followed by the alphabet.
vocab = ["<|endoftext|>"] + alphabet

# Every word starts out split into single characters; merges refine this later.
splits = {word: list(word) for word in word_freqs}

def compute_pair_freqs(splits):
    """Count how often each adjacent symbol pair occurs across the corpus.

    Args:
        splits: mapping from word to its current list of symbols.

    Returns:
        A ``defaultdict(int)`` mapping ``(left, right)`` symbol pairs to their
        total count, where each word contributes its frequency taken from the
        module-level ``word_freqs`` table.
    """
    totals = defaultdict(int)
    for token, count in word_freqs.items():
        pieces = splits[token]
        # Zipping a sequence with its own tail walks the neighbouring pairs;
        # it yields nothing for words made of fewer than two symbols.
        for neighbours in zip(pieces, pieces[1:]):
            totals[neighbours] += count
    return totals

# Pair statistics for the initial, character-level splits.
pair_freqs = compute_pair_freqs(splits)

# Show the first six pairs (dict insertion order) as a sanity check.
for key in list(pair_freqs)[:6]:
    print(f"{key}: {pair_freqs[key]}")

# Most frequent pair; on ties the pair seen first wins. With no pairs at all
# the placeholders ("" and None) are kept.
best_pair, max_freq = max(
    pair_freqs.items(),
    key=lambda entry: entry[1],
    default=("", None),
)



def merge_pair(a, b, splits):
    """Replace every adjacent occurrence of ``a, b`` with the merged symbol ``a + b``.

    Iterates over the words of the module-level ``word_freqs`` table and
    rewrites each word's entry in ``splits``.

    Args:
        a: left symbol of the pair to merge.
        b: right symbol of the pair to merge.
        splits: mapping from word to its current list of symbols; updated in place.

    Returns:
        The same ``splits`` mapping, after the merge has been applied.
    """
    for word in word_freqs:
        split = splits[word]
        if len(split) == 1:
            continue

        i = 0
        while i < len(split) - 1:
            if split[i] == a and split[i + 1] == b:
                # Stay on the same index: the merged symbol may be followed by
                # another occurrence that has to be checked from here.
                split = split[:i] + [a + b] + split[i + 2 :]
            else:
                i += 1
        splits[word] = split
    return splits


# BPE training loop. It used to be indented underneath `return splits` inside
# merge_pair, which made it unreachable; it now runs at module level.
vocab_size = 50

# Learned merge rules, in the order they were learned: (left, right) -> merged.
merges = {}

while len(vocab) < vocab_size:
    pair_freqs = compute_pair_freqs(splits)
    if not pair_freqs:
        # Every word is a single symbol already; nothing left to merge, and
        # looping on would never grow the vocabulary.
        break
    best_pair = ""
    max_freq = None
    # The search for the most frequent pair has to happen once per iteration.
    for pair, freq in pair_freqs.items():
        if max_freq is None or max_freq < freq:
            best_pair = pair
            max_freq = freq
    splits = merge_pair(*best_pair, splits)
    merges[best_pair] = best_pair[0] + best_pair[1]
    vocab.append(best_pair[0] + best_pair[1])

def tokenize(text, merges=None):
    """Tokenize ``text`` by applying BPE merge rules to its pre-tokens.

    The text is pre-tokenized with the module-level ``tokenizer``, each
    pre-token is split into characters, and the merge rules are then applied
    in the order in which the mapping yields them.

    Args:
        text: the string to tokenize.
        merges: optional mapping ``(left, right) -> merged`` of merge rules,
            e.g. a table produced by BPE training. When omitted, the single
            built-in rule ``("Ġ", "t") -> "Ġt"`` is used, which matches the
            previous behaviour of this function.

    Returns:
        A flat list of token strings.
    """
    if merges is None:
        merges = {("Ġ", "t"): "Ġt"}
    # Use the public accessor, as the training code does, rather than the
    # private `_tokenizer` attribute.
    pre_tokenize_result = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    splits = [list(word) for word, _offset in pre_tokenize_result]
    for pair, merge in merges.items():
        for idx, split in enumerate(splits):
            i = 0
            while i < len(split) - 1:
                if split[i] == pair[0] and split[i + 1] == pair[1]:
                    split = split[:i] + [merge] + split[i + 2 :]
                else:
                    i += 1
            splits[idx] = split

    # Flatten in one pass; sum(splits, []) copies the accumulator on every step.
    return [token for split in splits for token in split]



In [None]:
from transformers import AutoTokenizer

# Load the pretrained "bert-base-cased" tokenizer. This rebinds the name
# `tokenizer`, which the first cell bound to the GPT-2 tokenizer and which
# the tokenize() helper defined there reads as a global.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
# NOTE(review): a list of two strings is passed here rather than a single
# string; confirm against the tokenizer's `tokenize` documentation that this
# is the intended input form.
tokens = tokenizer.tokenize(["pi","phi"])

print(tokens)



# Map the tokens to vocabulary ids, then decode the ids back to text.
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)
decoded = tokenizer.decode(ids)
print(decoded)

In [None]:

import torch
from tqdm.notebook import tqdm
import string

# Hyperparameters for the character-level RNN experiment.
BATCH_SIZE = 128  # batch_size handed to the DataLoader below
SEQ_LEN = 4  # number of characters in each input/target window
EPOCHS = 10  # NOTE: reassigned to 50 immediately before the training loop
LEARNING_RATE = 1e-3  # lr handed to the AdamW optimizer below
device = 'cpu'  # device the model is moved to with .to(device)

# Read the whole training text into memory as one string.
with open("input.txt","r") as file:
  contents = file.read()

# Character vocabulary: every character of string.printable, in that order.
letters = string.printable
char_to_id = {symbol: position for position, symbol in enumerate(letters)}

print(char_to_id)

# Inverse lookup, id -> character.
id_to_char = dict(enumerate(letters))
# Append one extra symbol, 'Ċ', at the next free id in both directions.
char_to_id.update({'Ċ': len(char_to_id)})
id_to_char.update({len(id_to_char): 'Ċ'})

class Dataset(torch.utils.data.Dataset):
    """Sliding-window next-character dataset over a string.

    Item ``i`` is the pair ``(input, target)`` where ``input`` holds the ids of
    ``contents[i : i + seq_len]`` and ``target`` holds the ids of the same
    window shifted one character to the right.

    Args:
        contents: the training text, kept as a string.
        seq_len: number of characters in each window.
        char_to_id: mapping from character to integer id.
    """

    def __init__(self, contents, seq_len, char_to_id):
        # The text stays a string; characters are mapped to ids per item.
        self.contents = contents
        self.seq_len = seq_len
        self.char_to_id = char_to_id

    def __getitem__(self, i):
        """Return the ``(input, target)`` id tensors for window ``i``.

        Negative indices count from the end, as for a list.

        Raises:
            IndexError: if ``i`` is outside the dataset. Without this, slicing
                silently returns truncated windows and direct iteration over
                the dataset never terminates.
            KeyError: if the window contains a character missing from
                ``char_to_id``.
        """
        size = len(self)
        if i < 0:
            i = i + size
        if not 0 <= i < size:
            raise IndexError(f"index {i} is out of range for a dataset of {size} windows")
        # One slice of seq_len + 1 characters covers both the input and the
        # target, so every character is converted to an id only once.
        window = self.contents[i : i + self.seq_len + 1]
        ids = [self.char_to_id[x] for x in window]
        return torch.tensor(ids[:-1]), torch.tensor(ids[1:])

    def __len__(self):
        """Number of full windows that still have a next character as target."""
        # Clamp at zero: len() must not be negative when the text is shorter
        # than one window.
        return max(0, len(self.contents) - self.seq_len)

# Wrap the raw text as (input, target) windows of SEQ_LEN character ids.
dataset = Dataset(contents, SEQ_LEN, char_to_id)
# shuffle=False: windows are batched in index order, i.e. in text order.
dataloader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False)

# %%
class Model(torch.nn.Module):
    """Character-level RNN: a recurrent layer followed by a linear read-out.

    Args:
        input_size: size of each input vector (the one-hot width).
        output_size: number of output classes.
        hidden_dim: size of the recurrent hidden state.
        n_layers: number of stacked recurrent layers.
    """

    def __init__(self, input_size, output_size, hidden_dim, n_layers):
        super(Model, self).__init__()

        self.input_size = input_size
        self.output_size = output_size
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        # To try an LSTM, swap in torch.nn.LSTM here; forward() would then have
        # to pass and return a (hidden, cell_state) tuple.
        self.rnn = torch.nn.RNN(input_size, hidden_dim, n_layers, batch_first=True)
        self.fc = torch.nn.Linear(hidden_dim, output_size)

    def forward(self, x, hidden=None):
        """Run the network over a batch of sequences.

        Args:
            x: tensor of shape (N/batch_size, L/seq_length, H_in/input_size).
            hidden: optional initial hidden state of shape
                (n_layers, batch_size, hidden_dim). When omitted, a zero state
                is created on the same device and with the same dtype as ``x``.

        Returns:
            ``(out, hidden)``: the per-step outputs of the linear layer and
            the final hidden state.
        """
        if hidden is None:
            batch_size = x.size(0)
            # Allocate next to the input so the model also works off-CPU.
            hidden = torch.zeros(
                self.n_layers, batch_size, self.hidden_dim,
                device=x.device, dtype=x.dtype,
            )
        out, hidden = self.rnn(x, hidden)
        out = self.fc(out)
        return out, hidden

    def predict(self, input: str, length: int, verbose=False) -> str:
        """Sample ``length`` characters that continue the prompt ``input``.

        The hidden state is first warmed up on the prompt and then carried
        from step to step, so each sampled character depends on everything
        before it. Characters are converted with the module-level
        ``char_to_id`` / ``id_to_char`` tables.

        Args:
            input: non-empty prompt string.
            length: number of characters to generate.
            verbose: when true, print progress for every step.

        Returns:
            The prompt followed by the generated characters.

        Raises:
            IndexError: if ``input`` is empty.
            KeyError: if the prompt contains a character missing from
                ``char_to_id``.
        """
        if not input:
            raise IndexError("predict() needs a non-empty prompt")

        result = input
        num_classes = len(char_to_id)
        device = next(self.parameters()).device

        # Generation needs no gradients.
        with torch.no_grad():
            prompt_ids = torch.tensor([char_to_id[ch] for ch in input], device=device)

            # Feed everything but the last prompt character to build up the
            # hidden state; the last character is the first generation input.
            hidden = None
            if len(input) > 1:
                context = torch.nn.functional.one_hot(prompt_ids[:-1], num_classes=num_classes)
                _, hidden = self(context[None, :, :].float())
            current_id = prompt_ids[-1]

            for i in range(length):
                if verbose:
                    print("Iteration", i)
                # Convert to one-hot (shape: [num_classes])
                inp_vec = torch.nn.functional.one_hot(current_id, num_classes=num_classes)

                # Add empty dimensions (shape: [1, 1, num_classes])
                inp_vec = inp_vec[None, None, :]
                pred_vec, hidden = self(inp_vec.float(), hidden)

                # Sample the next id from the predicted distribution.
                pred_id = torch.multinomial(torch.nn.functional.softmax(
                    pred_vec, dim=-1)[0, 0, :], num_samples=1)
                if verbose:
                    print("Predicted ID:", pred_id)

                result += id_to_char[pred_id.item()]
                current_id = pred_id[0]
                if verbose:
                    print("Current result:", result)

        return result

# %%
# One-hot inputs and class logits both span the character vocabulary; the RNN
# uses a hidden size of 10 and a single layer.
model = Model(len(id_to_char), len(id_to_char), 10, 1).to(device)
# Sample from the untrained model as a baseline.
model.predict("abc", 10)

# %%
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

# %%
# Pull one batch to inspect what the dataloader yields.
sample = next(iter(dataloader))
a, b = sample
# a

# %%
# NOTE: this overrides the EPOCHS value set in the configuration block above.
EPOCHS = 50
for epoch in tqdm(range(EPOCHS)):
    # Wrap the dataloader itself so the bar knows the number of batches;
    # wrapping enumerate(...) hides the length from tqdm.
    for inp, label in tqdm(dataloader, leave=False):
        optimizer.zero_grad()
        # Keep the batch on the same device as the model.
        inp = inp.to(device)
        label = label.to(device)

        # [N, L] ids -> [N, L, num_classes] one-hot vectors
        inp = torch.nn.functional.one_hot(inp, num_classes=len(char_to_id))
        pred, _ = model(inp.float())

        # Flatten batch and time so every position is one classification:
        # label [N, L] -> [N * L], pred [N, L, num_classes] -> [N * L, num_classes]
        label = label.reshape(-1)
        pred = pred.reshape(-1, pred.size(2))

        # Compute loss and backpropagate
        loss = loss_fn(pred, label)
        loss.backward()
        optimizer.step()
    if epoch % 2 == 0:
        # Sample a continuation every other epoch to watch training progress.
        print(model.predict("We are", 30))

# %%
