## Util Definitions


In [8]:
import random
import torch

def extract_test_and_vald_set(text, test_size = 0.1, validation_size = 0.1):
    total_length = len(text)
    test_length = int(total_length * test_size)
    val_length = int(total_length * validation_size)

    test_start_index = random.randint(0, total_length - test_length)
    test_end_index = test_start_index + test_length
    test_text = text[test_start_index:test_end_index]

    total_length_without_test = total_length - test_length
    val_start_index = random.randint(0, total_length_without_test - val_length)
    val_end_index = val_start_index + val_length
    validation_text = text[val_start_index:val_end_index]   

    train_text = text[:val_start_index] + text[val_end_index:]  

    return train_text, test_text, validation_text

def encode_vocab_to_index(vocabulary):
    stoi = {token:i for i, token in enumerate(vocabulary) }
    def encode(token_list):
        return [stoi[token] for token in token_list]
    return encode

def decode_index_to_vocab(vocabulary):
    itos = {i: token for i, token in enumerate(vocabulary)}
    def decode(index_list):
        return [itos[i] for i in index_list]
    return decode

def save_vocabulary_or_merges(vocabulary, file_path):
    with open(file_path, 'w', encoding='utf-8') as file:
        for token in vocabulary:
            file.write(f"{token}\n")

def save_document(text, file_path):
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(text)

def open_text_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

### Hypterparameter

In [9]:
batch_size = 4
block_size = 8

## Data Preprocessing

### BPE

In [10]:
from collections import defaultdict

class BPE():

    def __init__(self):
        pass

    def learner(corpus, merge_count=150):
        corpus = corpus.lower()
        words = [list(word) + ['_'] for word in corpus.split()]

        merges = []

        for m in range(merge_count):
            vocab = defaultdict(int)
            for word in words:
                for i in range(len(word) - 1):
                    pair = (word[i], word[i+1])
                    vocab[pair] += 1

            most_frequent = max(vocab, key=vocab.get)
            merges.append(most_frequent)

            new_token = ''.join(most_frequent)
            new_words = []
            for word in words:
                new_word = []
                i = 0
                while i < len(word):
                    # Merge durchführen
                    if i < len(word) - 1 and (word[i], word[i+1]) == most_frequent:
                        new_word.append(new_token)
                        i += 2
                    else:
                        new_word.append(word[i])
                        i += 1
                new_words.append(new_word)
            words = new_words  # Corpus aktualisieren

            print(f"Iteration {m+1}: merged {most_frequent}")

        vocabulary = set()
        for word in words:
            for token in word:
                vocabulary.add(token)
        #vocabulary = sorted(vocabulary)


        return merges, vocabulary

    def segmenter(corpus, merges):
        words = [list(word) + ['_'] for word in corpus.lower().split()]
        for merge in merges:
            new_token = ''.join(merge)
            new_words = []
            for word in words:
                new_word = []
                i = 0
                while i < len(word):
                    if i < len(word) - 1 and (word[i], word[i+1]) == merge:
                        new_word.append(new_token)
                        i += 2
                    else:
                        new_word.append(word[i])
                        i += 1
                new_words.append(new_word)
            words = new_words
            tokenised_corpus = [''.join(word).strip('_') for word in words]
            flat_tokens = [token for word in words for token in word if token != '_']
        return flat_tokens

    #text_path = 'data/shakespeare.txt'
    #corpus = open_text_file(text_path)

    #train_corpus, test_corpus = extract_test_set(corpus, test_size=0.1)
    #save_document(train_corpus, 'data/train_corpus.txt')
    #save_document(test_corpus, 'data/test_corpus.txt')

    #train_corpus = open_text_file('data/train_corpus.txt')

    #new_corpus, merges, tokens = learner(train_corpus, merge_count=200)
    #save_vocabulary(tokens, 'data/vocabulary.txt')



### Splitting Corpus in train, test, and validation

In [11]:
# text, test_text, validation_text = extract_test_and_vald_set(open_text_file('data/shakespeare.txt'))
train_corpus = open_text_file('data/Shakespeare_clean_train.txt')
test_corpus = open_text_file('data/Shakespeare_clean_test.txt')
validation_corpus = open_text_file('data/Shakespeare_clean_valid.txt')

### Training

In [12]:

# Take a look at the text
print("Training Text Sample:", train_corpus[:100])
# Take a look at the current characters that occur
chars = sorted(list(set(train_corpus)))
print("Init Vocabulary:", ''.join(chars))
print("Vocabulary size:", len(chars))

# Train the BPE model
merges, vocabulary = BPE.learner(train_corpus, merge_count=150)

# Save the vocabulary and merges
save_vocabulary_or_merges(vocabulary, 'data/vocabulary.txt')
save_vocabulary_or_merges(merges, 'data/merges.txt')

# Apply the Segmenter to the training corpus
tokenised_train_corpus = BPE.segmenter(train_corpus, merges)
tokenised_validation_corpus = BPE.segmenter(validation_corpus, merges)
tokenised_test_corpus = BPE.segmenter(test_corpus, merges)

print("Tokenised Training Corpus Sample:", tokenised_train_corpus[:100])  # Print first 100 tokens

# Encode and decode the vocabulary to a list of indices
encode_vocab = encode_vocab_to_index(vocabulary)
decode_vocab = decode_index_to_vocab(vocabulary)

# Encode and decode the training corpus
train_ids = encode_vocab(tokenised_train_corpus)
test_ids = encode_vocab(tokenised_test_corpus)
validation_ids = encode_vocab(tokenised_validation_corpus)

print("Encoded Training Corpus Sample:", train_ids[:100]) 
text =  decode_vocab(train_ids)
print("decoded Training Ids:", text[:100])  # Print first 100 decoded tokens




Training Text Sample: The Tragedy of Antony and Cleopatra Dramatis Personae MARK ANTONY OCTAVIUS CAESAR M. AEMILIUS LEPIDU
Init Vocabulary:  !&',-.12689:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Vocabulary size: 67
Iteration 1: merged ('e', '_')
Iteration 2: merged ('t', 'h')
Iteration 3: merged (',', '_')
Iteration 4: merged ('t', '_')
Iteration 5: merged ('s', '_')
Iteration 6: merged ('d', '_')
Iteration 7: merged ('a', 'n')
Iteration 8: merged ('e', 'r')
Iteration 9: merged ('o', 'u')
Iteration 10: merged ('i', 'n')
Iteration 11: merged ('o', '_')
Iteration 12: merged ('y', '_')
Iteration 13: merged ('e', 'n')
Iteration 14: merged ('.', '_')
Iteration 15: merged ('o', 'r')
Iteration 16: merged ('a', 'r')
Iteration 17: merged ('o', 'n')
Iteration 18: merged ('l', 'l')
Iteration 19: merged ('th', 'e_')
Iteration 20: merged ('h', 'a')
Iteration 21: merged ('an', 'd_')
Iteration 22: merged ('e', 's')
Iteration 23: merged ('i', 's_')
Iteration 24: merged ('f', '_')
Iteratio

In [13]:
# convert the training corpus to a tensor
train_data = torch.tensor(train_ids, dtype=torch.long)
test_data = torch.tensor(test_ids, dtype=torch.long)
validation_data = torch.tensor(validation_ids, dtype=torch.long)

print(train_data.shape, train_data.dtype)
print("Training Corpus Encoded Sample:", train_data[:100])  # Print first 100 encoded tokens

torch.Size([438197]) torch.int64
Training Corpus Encoded Sample: tensor([ 97,  51, 124,  85,  89,  39,  41, 107,   8,   7,  41, 144, 171, 130,
         54,  57,   2, 135, 150,  39, 124, 172,  51, 125,  57,  82,  83,   7,
        127,  35, 185,  79, 132,   8,   7,  41,  54, 171, 109, 163, 105,  76,
        113,  79, 185, 168, 127,  89,  70, 161, 105, 130,  57, 189,  39, 105,
         51,  84,  56, 185, 163, 135,  83, 168,  19, 165,  51, 105,  57,  17,
         57,  89, 189, 105,  39,  17, 189, 141, 105, 184,  54, 123,  79, 123,
        105,  38, 184, 141,  29, 105,  82,  54,  42,  83, 171,  79, 105,  39,
         82, 171])


In [14]:
input = train_data[:block_size]
expected_output = train_data[1:block_size+1]

for t in range(block_size):
    context = input[:t+1]
    target = expected_output[t] 
    print(f"Context: {context}, Target: {target}")


Context: tensor([97]), Target: 51
Context: tensor([97, 51]), Target: 124
Context: tensor([ 97,  51, 124]), Target: 85
Context: tensor([ 97,  51, 124,  85]), Target: 89
Context: tensor([ 97,  51, 124,  85,  89]), Target: 39
Context: tensor([ 97,  51, 124,  85,  89,  39]), Target: 41
Context: tensor([ 97,  51, 124,  85,  89,  39,  41]), Target: 107
Context: tensor([ 97,  51, 124,  85,  89,  39,  41, 107]), Target: 8


In [15]:
def get_batch(split):
    if split == 'train':
        data = train_data
    elif split == 'val':
        data = validation_data
    #elif split == 'test':
    #    data = test_ids
    else:
        raise ValueError("Invalid split. Choose from 'train', 'val', or 'test'.")

    intx = torch.randint(len(data) - block_size, (batch_size,))
    input = torch.stack([data[i:i+block_size] for i in intx])
    expected_output = torch.stack([data[i+1:i+block_size+1] for i in intx])

    return input, expected_output

inputMatrix, outputMatrix = get_batch('train')
print("Input Matrix Shape:", inputMatrix.shape)  
print("Input Matrix:", inputMatrix)
print("Output Matrix Shape:", outputMatrix.shape)
print("Output Matrix:", outputMatrix) 

Input Matrix Shape: torch.Size([4, 8])
Input Matrix: tensor([[127,  11,  97,  32,  54,  41, 120, 170],
        [ 27,  72,  89, 131,  56,  57, 136,  97],
        [ 47,  27,  85,  25, 158,  97, 137, 161],
        [ 83, 171, 184,  35,  38, 168, 127, 130]])
Output Matrix Shape: torch.Size([4, 8])
Output Matrix: tensor([[ 11,  97,  32,  54,  41, 120, 170, 171],
        [ 72,  89, 131,  56,  57, 136,  97,  57],
        [ 27,  85,  25, 158,  97, 137, 161,  76],
        [171, 184,  35,  38, 168, 127, 130, 165]])


In [16]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

### Testing

## Neural Network

In [17]:
# hyperparameters
batch_size = 32 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for predictions?
max_iters = 3000
eval_interval = 300
learning_rate = 1e-2
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
# ------------

In [18]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super(BigramLanguageModel, self).__init__()
        self.vocab_size = vocab_size
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        logits = self.token_embedding_table(idx) 
        B, T, C = logits.shape # (batch_size, block_size, vocab_size)
        logits = logits.view(B*T, self.vocab_size)  # reshape to (batch_size * block_size, vocab_size)

        if targets is None:
            loss = None
        else:
            targets = targets.view(B*T) # reshape to (batch_size * block_size,)
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, _ = self(idx)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)  # sample
            idx = torch.cat((idx, idx_next), dim=1)
        return idx
    

In [19]:
vocab_size = len(vocabulary)

model = BigramLanguageModel(vocab_size=vocab_size)
m = model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for step in range(max_iters):

    # Alle eval_interval Schritte: Trainings- und Validierungs-Loss ausgeben
    if step % eval_interval == 0:
        losses = estimate_loss()
        print(f"[Step {step}] Train Loss: {losses['train']:.4f} | Val Loss: {losses['val']:.4f}")

    # Batch aus Trainingsdaten holen
    input_batches, expected_output_batches = get_batch('train')

    # Modell vorwärtslaufen lassen + Loss berechnen
    logits, loss = model(input_batches, expected_output_batches)

    # Gradienten zurücksetzen
    optimizer.zero_grad(set_to_none=True)

    # Backpropagation
    loss.backward()

    optimizer.step()


context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode_vocab(m.generate(context, max_new_tokens=500)[0].tolist()))

[Step 0] Train Loss: 5.7060 | Val Loss: 5.7148
[Step 300] Train Loss: 4.3800 | Val Loss: 4.4093
[Step 600] Train Loss: 3.9480 | Val Loss: 3.9647
[Step 900] Train Loss: 3.7993 | Val Loss: 3.8291
[Step 1200] Train Loss: 3.7490 | Val Loss: 3.7635
[Step 1500] Train Loss: 3.7109 | Val Loss: 3.7355
[Step 1800] Train Loss: 3.6913 | Val Loss: 3.7275
[Step 2100] Train Loss: 3.6804 | Val Loss: 3.7229
[Step 2400] Train Loss: 3.6760 | Val Loss: 3.7084
[Step 2700] Train Loss: 3.6625 | Val Loss: 3.7073


IndexError: too many indices for tensor of dimension 2

In [None]:
import numpy as np
import tqdm

class N_Gram_Basic:

    def __init__(self, n, vocab):
        # n = 1 -> unigram P(wt​​)
        # n = 2 -> bigram  P(wt​∣​wt−1​)
        # n = 3 -> trigram P(wt​∣wt−2​,wt−1​)

        assert(1 <= n)


        self.n = n 
        self.vocab = vocab 
        
        self.vocab_size = len(vocab)

        # e.g. P(wt​∣wt−2​,wt−1​) -> order: wt−2​,wt−1, wt
        self.cnts = {}
 
        self.map_token_to_id = {
            token:id for id, token in enumerate(vocab)
        }



    def train(self, corpus_tokenized):
        
        tokens = corpus_tokenized[0:self.n]
        token_ids_window = [self.map_token_to_id[id] for id in tokens]
        
        print(f"Train {self.n}-Gram:")

        for token in tqdm.tqdm(corpus_tokenized[self.n:], position=0, leave=True):
       
            try:
                self.cnts[tuple(token_ids_window)] += 1

            except KeyError:
                self.cnts[tuple(token_ids_window)] = 1
            
            token_id = self.map_token_to_id[token]
            token_ids_window.append(token_id)

            token_ids_window.pop(0)

       


    def get_prob(self, token_ids_window):

        #
        # Laplace smoothing
        #

        try:
            cnt_target = self.cnts[tuple(token_ids_window)] + 1
        except KeyError:
            cnt_target = 1

        # Prevent side effect
        token_ids_window_tmp = token_ids_window.copy()

        cnt_list = []
        for token_id in range(self.vocab_size):
            
            token_ids_window_tmp[-1] = token_id

            try:
                cnt = self.cnts[tuple(token_ids_window_tmp)] + 1
            except KeyError:
                cnt = 1

            cnt_list.append(cnt)
            
        cnts = np.sum(cnt_list)

        if cnts == 0:
            return 1 / self.vocab_size

        prob = cnt_target / cnts 

        return prob
    

    # No Laplace-smoothing -> trigger KeyError (later used for backoff)
    def get_prob_raiseKeyError(self, token_ids_window):

        try:
            cnt_target = self.cnts[tuple(token_ids_window)]
        except KeyError:
            # unigram word not present -> return avg prob
            if self.n == 1:
                p = 0

                for token_id in range(self.vocab_size):
                    p += self.get_prob([token_id])
                
                return p / self.vocab_size 
            
            raise KeyError
        
        # Prevent side effect
        token_ids_window_tmp = token_ids_window.copy()

        cnt_list = []
        for token_id in range(self.vocab_size):
            
            token_ids_window_tmp[-1] = token_id

            try:
                cnt = self.cnts[tuple(token_ids_window_tmp)]
            except KeyError:
                cnt = 0

            cnt_list.append(cnt)
            
        cnts = np.sum(cnt_list)

        if cnts == 0:
            return 1 / self.vocab_size

        prob = cnt_target / cnts 

        return prob


    def get_distri(self, token_ids_window):


        # Prevent side effect
        token_ids_window_tmp = token_ids_window.copy()

        token_ids_window_tmp.append(None) # dummy

        cnt_tokens_list = []
        for token_id in range(self.vocab_size):

            token_ids_window_tmp[-1] = token_id

            try:
                cnt = self.cnts[tuple(token_ids_window_tmp)] + 1
            except KeyError:
                cnt = 1

            cnt_tokens_list.append(cnt)
        
        
        cnt_tokens = np.array(cnt_tokens_list)

        distri = cnt_tokens / np.sum(cnt_tokens)

        return distri


    def perplexity(self, corpus_val_tokenized):
        
        print("Compute perplexity")

        tokens = corpus_val_tokenized[0:self.n]
        token_ids_window = [self.map_token_to_id[id] for id in tokens]

        logit_list = []
        for token in tqdm.tqdm(corpus_val_tokenized[self.n:], position=0, leave=True):

            prob = self.get_prob(token_ids_window)
      
            logit = np.log(prob)
            
            logit_list.append(logit)

            
            token_id = self.map_token_to_id[token]
            token_ids_window.append(token_id)

            token_ids_window.pop(0)

        tmp = - np.average(logit_list)
       
        return np.exp(tmp)