## Setup enviroments - install and import

In [None]:
!pip install torch
!pip install transformers
!pip install datasets

In [1]:
import torch
from transformers import GPT2Tokenizer, AutoTokenizer, GPT2LMHeadModel
from datasets import load_dataset
from collections import defaultdict
import copy
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


## **Task 1: Byte pair encoding**

The goal is to calculate byte pair encodings for a given dataset. For a recap of the concept we refer to lecture 9, slides 22.ff and internet search.

### Task 1.1
First we want to make sure we create a suffieciently large corpus. As dataset you import the dataset "wikitext-2-raw-v1" from huggingface. Make sure to use the 500 first entries of the training dataset and subsequently filter out empty  entries. Also print the first entry of the corpus.

In [2]:
# Load WikiText-2 dataset "wikitext-2-raw-v1"
corpus = []

# TODO: YOUR CODE HERE
# Load the training dataset
dataset_load = load_dataset(path="wikitext", name="wikitext-2-raw-v1", split="train")

Found cached dataset wikitext (/Users/paolobonicco/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)


In [3]:
corpus = []
for el in dataset_load["text"][:500]: # take only the first 500 lines
    if(el not in ['', ' ']): # remove empty lines
        corpus.append(el)
    
print("first entry of corpus: ", corpus[0])

first entry of corpus:   = Valkyria Chronicles III = 



Note that the corpus should have a format like
corpus = [
    "This is the first sentence",
    "This is the second sentence",
    ..
]. For debugging during coding you can also consider working with a smaller corpus, e.g. only 5 sentences

### Task 1.2
Here, you loop through the corpus and count the word frequencies

In [4]:
# Initialize a tokenizer from the transformers library
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Initialize a defaultdict to store word frequencies
word_freqs = defaultdict(int)
# this dictionary gets entries for word_freqs[word]

alphabet = []
vocab = [""]
splits = {}
vocab_size = 200
merges = {}

# We loop through to corups to calculate word frequencies
for text in corpus:
    words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    new_words = [word for word, offset in words_with_offsets]
    # TODO: YOUR CODE HERE to count word frequencies
    for word in new_words:
        word_freqs[word] += 1

The following block creates the alphabet and initial vocabulary

In [5]:
for word in word_freqs.keys():
    for letter in word:
        if letter not in alphabet:
            alphabet.append(letter)
alphabet.sort()

print("alphabet", alphabet)
vocab = ["<|endoftext|>"] + alphabet.copy()
splits = {word: [c for c in word] for word in word_freqs.keys()}

print("splits", splits)
print("len vocab:", len(vocab), "vocab", vocab)
print("len alphabet:", len(alphabet), "alphabet", alphabet)

alphabet ['!', '"', '$', '%', '&', "'", '(', ')', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '~', '¡', '¢', '£', '¤', '¥', '¦', '©', 'ª', '«', '®', '¯', '°', '±', '²', '³', '´', 'µ', '¹', '¼', '½', '¿', 'Â', 'Ã', 'Å', 'â', 'ã', 'ä', 'å', 'æ', 'ç', 'è', 'é', 'ï', 'Ċ', 'Ġ', 'Ģ', 'ģ', 'Ĥ', 'ĥ', 'Ħ', 'ħ', 'Ĩ', 'Ī', 'ī', 'Ĭ', 'ĭ', 'Į', 'į', 'ı', 'Ĳ', 'ĳ', 'Ĵ', 'ĵ', 'Ķ', 'Ĺ', 'ĺ', 'Ļ', 'Ľ', 'Ł', 'ł', 'Ń']
splits {'Ġ=': ['Ġ', '='], 'ĠValkyria': ['Ġ', 'V', 'a', 'l', 'k', 'y', 'r', 'i', 'a'], 'ĠChronicles': ['Ġ', 'C', 'h', 'r', 'o', 'n', 'i', 'c', 'l', 'e', 's'], 'ĠIII': ['Ġ', 'I', 'I', 'I'], 'ĠĊ': ['Ġ', 'Ċ'], 'ĠSenjÅį': ['Ġ', 'S', 'e', 'n', 'j', 'Å', 'į'], 'Ġno': ['Ġ', 'n', 'o'

### Task 1.3
Define a function to compute the frequency of each pair of characters

In [6]:
# Define a function to compute the frequency of each pair of characters
def compute_pair_freqs(splits):
    pair_freqs = defaultdict(int)
    
    # TODO: YOUR CODE HERE to compute the frequency of each pair of characters
    
    for word, clist in splits.items():
        for i in range(len(clist)-1):
            pair_freqs[(clist[i], clist[i+1])] += 1

    return pair_freqs

# Initialize pair_freqs by calling the compute_pair_freqs function
pair_freqs = compute_pair_freqs(splits)

In [7]:
len(pair_freqs)

1040

Here we find the pair of characters that appears most frequently together

In [35]:
for i, key in enumerate(pair_freqs.keys()):
    print(f"{key}: {pair_freqs[key]}")
    if i >= 5:
        break

best_pair = ""
max_freq = None

for pair, freq in pair_freqs.items():
    if max_freq is None or max_freq < freq:
        best_pair = pair
        max_freq = freq
        
print("\nMost frequent pair of characters : ", best_pair, " occurs ", max_freq, " times")

('Ġ', '='): 1
('Ġ', 'V'): 29
('V', 'a'): 10
('a', 'l'): 311
('l', 'k'): 9
('k', 'y'): 5

Most frequent pair of characters :  ('i', 'n')  occurs  582  times


### Task 1.4
Define a function to merge the most frequent pair of characters

In [9]:
# Define a function to merge the most frequent pair of characters
def merge_pair_2(a, b, splits):
    splits_copy = copy.deepcopy(splits)
    for word in word_freqs:
        split = splits_copy[word]
        if len(split) == 1:
            continue

        # TODO: YOUR CODE HERE to merge the most frequent pair of characters in each word
        
        for key, word in splits_copy.items():
            for i in range(len(word)-1):
                if (word[i], word[i+1]) == (a, b):
                    splits_copy[key] = splits_copy[key][:i] + [a+b] + splits_copy[key][i+2:]
                    break

    return splits_copy

**Optimized Version**

In [10]:
def merge_pair(a, b, splits):
    splits_copy = {k: merge_in_word(v, a, b) for k, v in splits.items()}
    return splits_copy

def merge_in_word(word, a, b):
    pair = a + b
    merged_word = []
    i = 0
    while i < len(word):
        try:
            if word[i] == a and word[i + 1] == b:
                merged_word.append(pair)
                i += 2  # skip the next character
            else:
                merged_word.append(word[i])
                i += 1
        except IndexError:  # End of the word
            merged_word.append(word[i])
            break
    return merged_word

#### Test

In [11]:
merged_splits = merge_pair(best_pair[0], best_pair[1], splits)

In [12]:
splits["Ġserving"]

['Ġ', 's', 'e', 'r', 'v', 'i', 'n', 'g']

In [13]:
merged_splits["Ġserving"]

['Ġ', 's', 'e', 'r', 'v', 'in', 'g']

### Task 1.5

Iterate this process until you reach a predefined vocabulary size

##### Define a function to add element to a vocab

In [14]:
def vocab_from_splits(spl, voc):
    """ This function generates a sorted vocabulary list from a dictionary. The dictionary's keys 
    are treated as words and the values of each key are expected to be iterable (e.g., list or string). """
    for word in spl.keys():
        for c in spl[word]:
            if c not in voc:
                voc.append(c)
    voc.sort()
    return voc

##### Define a function that find the most common pair in a text

In [15]:
# Define a function to compute the frequency of each pair of characters
def compute_pair_freqs(spl):
    pair_freqs = defaultdict(int)
        
    for word, clist in spl.items():
        for i in range(len(clist)-1):
            pair_freqs[(clist[i], clist[i+1])] += 1

    return pair_freqs

# Find the most frequent pair of characters in a dict
# with key = pair of characters and value = frequency
def most_common_pair(pair_list):
    best_pair = ""
    max_freq = None

    for pair, freq in pair_list.items():
        if max_freq is None or max_freq < freq:
            best_pair = pair
            max_freq = freq

    return best_pair

##### Tests

In [16]:
# Initialize pair_freqs by calling the compute_pair_freqs function
pair_freqs = compute_pair_freqs(splits)
most_com_pair = most_common_pair(pair_freqs)
print("pair_freqs", pair_freqs)
print("most_com_pair", most_com_pair)

pair_freqs defaultdict(<class 'int'>, {('Ġ', '='): 1, ('Ġ', 'V'): 29, ('V', 'a'): 10, ('a', 'l'): 311, ('l', 'k'): 9, ('k', 'y'): 5, ('y', 'r'): 6, ('r', 'i'): 235, ('i', 'a'): 102, ('Ġ', 'C'): 115, ('C', 'h'): 27, ('h', 'r'): 18, ('r', 'o'): 183, ('o', 'n'): 467, ('n', 'i'): 86, ('i', 'c'): 191, ('c', 'l'): 43, ('l', 'e'): 244, ('e', 's'): 377, ('Ġ', 'I'): 33, ('I', 'I'): 7, ('Ġ', 'Ċ'): 1, ('Ġ', 'S'): 127, ('S', 'e'): 19, ('e', 'n'): 365, ('n', 'j'): 9, ('j', 'Å'): 1, ('Å', 'į'): 3, ('Ġ', 'n'): 74, ('n', 'o'): 75, ('Ġ', '3'): 14, ('Ġ', ':'): 1, ('Ġ', 'U'): 16, ('U', 'n'): 7, ('n', 'r'): 6, ('r', 'e'): 420, ('e', 'c'): 138, ('c', 'o'): 208, ('o', 'r'): 305, ('r', 'd'): 69, ('d', 'e'): 252, ('e', 'd'): 522, ('Ġ', '('): 1, ('Ġ', 'J'): 30, ('J', 'a'): 12, ('a', 'p'): 77, ('p', 'a'): 98, ('a', 'n'): 322, ('n', 'e'): 188, ('s', 'e'): 241, ('Ġ', 'æ'): 1, ('æ', 'Ī'): 2, ('Ī', '¦'): 2, ('¦', 'å'): 1, ('å', 'ł'): 1, ('ł', '´'): 1, ('´', 'ã'): 2, ('ã', 'ģ'): 20, ('ģ', '®'): 6, ('®', 'ã'): 2, ('ã

##### Iterate the dataset until we reach a given vocab size

In [17]:
# YOUR CODE HERE to iterate this process until you reach a predefined vocabulary size
s_final = dict()
for idx, elem in enumerate(sorted(pair_freqs.items(), key=lambda x: x[1], reverse=True)):
  if idx == 0:
    s_final = merge_pair(a=elem[0][0], b=elem[0][1], splits=splits)
  else:
    s_final = merge_pair(a=elem[0][0], b=elem[0][1], splits=s_final)

In [54]:
def merge_pair_max(splits, vocab_mer, max_vocab_size):
    pair_freq = compute_pair_freqs(splits)
    max_pair = most_common_pair(pair_freq)
    merges = {('i', 'n'): 'in'}
    
    with tqdm(total=max_vocab_size, desc='Merging pairs', ncols=80) as pbar:
        
        while len(vocab_mer) < max_vocab_size:
            # print("Pair to merge: ", max_pair[0], max_pair[1])
            splits = merge_pair(max_pair[0], max_pair[1], splits)
            vocab_mer = vocab_from_splits(splits, vocab_mer)
            pair_freq = compute_pair_freqs(splits)
            max_pair = most_common_pair(pair_freq)
            
            merges[(max_pair[0], max_pair[1])] = max_pair[0] + max_pair[1]
            
            pbar.update(len(vocab_mer) - pbar.n)  # Update the progress bar
        
    return splits, vocab_mer, merges

In [55]:
# TODO: YOUR CODE HERE to iterate this process until you reach a predefined vocabulary size
vocab_max_size = 200

vocab = ["<|endoftext|>"] + alphabet.copy()

original_splits = splits.copy()
original_splits_vocab = vocab_from_splits(original_splits, vocab).copy()

print("Length of original_splits_vocab:", len(original_splits_vocab))

# Iterate the processing of merging

#? Uncomment to clear the prevoius vocabulary
# vocab = []

new_merged_splits, new_vocab, merges = merge_pair_max(original_splits, vocab, max_vocab_size=vocab_max_size)

Length of original_splits_vocab: 144


Merging pairs: 100%|██████████████████████████| 200/200 [00:02<00:00, 97.68it/s]


In [56]:
dif_vocab = list(set(new_vocab) - set(original_splits_vocab))

print("\nIn the new merged splits there are", len(dif_vocab), "new characters")
print("\nNew vocab (differences):", dif_vocab, "new vocab len:", len(dif_vocab))
print("\nMerges:", merges)
print("\nNew vocab:", new_vocab)


In the new merged splits there are 56 new characters

New vocab (differences): ['ou', 'Ġm', 'le', 'ent', 're', 'ĠA', 'ĠC', 'Ġd', 'et', 'on', 'in', 'lo', 'ec', 'ing', 'el', 'ed', 'Ġre', 'ro', 'as', 'Ġf', 'ly', 'er', 'ĠS', 'or', 'om', 'st', 'ĠP', 'ers', 'Ġc', 'an', 'al', 'at', 'it', 'en', 'Ġt', 'ri', 'Ġa', 'ar', 'Ġ1', 'ic', 'is', 'Ġe', 'Ġh', 'ti', 'ur', 'Ġw', 'ĠM', 'ac', 'Ġp', 'il', 'ra', 'Ġs', 'Ġb', 'Ġin', 'es', 'ion'] new vocab len: 56

Merges: {('i', 'n'): 'in', ('e', 'r'): 'er', ('e', 'd'): 'ed', ('o', 'n'): 'on', ('e', 's'): 'es', ('e', 'n'): 'en', ('Ġ', 's'): 'Ġs', ('a', 't'): 'at', ('a', 'n'): 'an', ('a', 'l'): 'al', ('o', 'r'): 'or', ('a', 'r'): 'ar', ('in', 'g'): 'ing', ('Ġ', 'c'): 'Ġc', ('Ġ', 'p'): 'Ġp', ('r', 'e'): 're', ('t', 'i'): 'ti', ('i', 's'): 'is', ('Ġ', 'd'): 'Ġd', ('e', 'l'): 'el', ('Ġ', 'm'): 'Ġm', ('Ġ', 'f'): 'Ġf', ('i', 'c'): 'ic', ('i', 't'): 'it', ('Ġ', 't'): 'Ġt', ('o', 'u'): 'ou', ('l', 'e'): 'le', ('Ġ', 'a'): 'Ġa', ('r', 'o'): 'ro', ('Ġ', 'S'): 'ĠS', ('i', '

In [57]:
new_merged_splits

{'Ġ=': ['Ġ', '='],
 'ĠValkyria': ['Ġ', 'V', 'al', 'k', 'y', 'ri', 'a'],
 'ĠChronicles': ['ĠC', 'h', 'r', 'on', 'ic', 'l', 'es'],
 'ĠIII': ['Ġ', 'I', 'I', 'I'],
 'ĠĊ': ['Ġ', 'Ċ'],
 'ĠSenjÅį': ['ĠS', 'en', 'j', 'Å', 'į'],
 'Ġno': ['Ġ', 'n', 'o'],
 'Ġ3': ['Ġ', '3'],
 'Ġ:': ['Ġ', ':'],
 'ĠUnrecorded': ['Ġ', 'U', 'n', 're', 'c', 'or', 'd', 'ed'],
 'Ġ(': ['Ġ', '('],
 'ĠJapanese': ['Ġ', 'J', 'a', 'p', 'an', 'es', 'e'],
 'ĠæĪ¦åł´ãģ®ãĥ´ãĤ¡ãĥ«ãĤŃãĥ¥ãĥªãĤ¢': ['Ġ',
  'æ',
  'Ī',
  '¦',
  'å',
  'ł',
  '´',
  'ã',
  'ģ',
  '®',
  'ã',
  'ĥ',
  '´',
  'ã',
  'Ĥ',
  '¡',
  'ã',
  'ĥ',
  '«',
  'ã',
  'Ĥ',
  'Ń',
  'ã',
  'ĥ',
  '¥',
  'ã',
  'ĥ',
  'ª',
  'ã',
  'Ĥ',
  '¢'],
 '3': ['3'],
 'Ġ,': ['Ġ', ','],
 'Ġlit': ['Ġ', 'l', 'it'],
 'Ġ.': ['Ġ', '.'],
 'Ġof': ['Ġ', 'o', 'f'],
 'Ġthe': ['Ġt', 'h', 'e'],
 'ĠBattlefield': ['Ġ', 'B', 'at', 't', 'le', 'f', 'i', 'el', 'd'],
 'Ġ)': ['Ġ', ')'],
 'Ġcommonly': ['Ġc', 'om', 'm', 'on', 'ly'],
 'Ġreferred': ['Ġre', 'f', 'er', 'r', 'ed'],
 'Ġto': ['Ġt', 'o'],
 'Ġa

We provide the tokenize function to you

In [58]:
darmstadt_sent = ("Darmstadt holds the official title City of Science (German: Wissenschaftsstadt) as it is a major centre of scientific institutions, universities, and high-technology companies.")

In [60]:
def tokenize(text, merges):
    pre_tokenize_result = tokenizer._tokenizer.pre_tokenizer.pre_tokenize_str(text)
    pre_tokenized_text = [word for word, offset in pre_tokenize_result]
    splits = [[l for l in word] for word in pre_tokenized_text]
    for pair, merge in merges.items():
        for idx, split in enumerate(splits):
            i = 0
            while i < len(split) - 1:
                if split[i] == pair[0] and split[i + 1] == pair[1]:
                    split = split[:i] + [merge] + split[i + 2 :]
                else:
                    i += 1
            splits[idx] = split

    return sum(splits, [])

tok_sent = tokenize(darmstadt_sent, merges)
print(tok_sent)
print(len(tok_sent))

['D', 'ar', 'm', 'st', 'a', 'd', 't', 'Ġh', 'o', 'l', 'd', 's', 'Ġt', 'h', 'e', 'Ġ', 'o', 'f', 'f', 'ic', 'i', 'al', 'Ġ', 'ti', 't', 'le', 'ĠC', 'it', 'y', 'Ġ', 'o', 'f', 'ĠS', 'c', 'i', 'en', 'c', 'e', 'Ġ', '(', 'G', 'er', 'm', 'an', ':', 'Ġ', 'W', 'is', 's', 'en', 's', 'c', 'h', 'a', 'f', 't', 's', 'st', 'a', 'd', 't', ')', 'Ġa', 's', 'Ġ', 'it', 'Ġ', 'is', 'Ġa', 'Ġm', 'a', 'j', 'or', 'Ġc', 'ent', 're', 'Ġ', 'o', 'f', 'Ġs', 'c', 'i', 'en', 'ti', 'f', 'ic', 'Ġin', 's', 'ti', 't', 'u', 'ti', 'on', 's', ',', 'Ġ', 'u', 'n', 'i', 'v', 'ers', 'i', 'ti', 'es', ',', 'Ġ', 'an', 'd', 'Ġh', 'i', 'g', 'h', '-', 't', 'ec', 'h', 'n', 'o', 'lo', 'g', 'y', 'Ġc', 'om', 'p', 'an', 'i', 'es', '.']
128


### Task 1.6
Now, repeat the byte pair encoding for vocab sizes of 1000 and 5000. What do you notice?

In [61]:
# Use this chunk for the answer
vocab_1000 = original_splits_vocab.copy()
new_merged_splits_1000, new_vocab_1000, merges_1000 = merge_pair_max(original_splits, vocab_1000, max_vocab_size=1000)

print("len of new_vocab:", len(new_vocab_1000))
print("new_merged_splits:", new_merged_splits_1000)

Merging pairs: 100%|████████████████████████| 1000/1000 [00:46<00:00, 21.66it/s]

len of new_vocab: 1000
new_merged_splits: {'Ġ=': ['Ġ', '='], 'ĠValkyria': ['ĠValky', 'ri', 'a'], 'ĠChronicles': ['ĠCh', 'ron', 'ic', 'les'], 'ĠIII': ['ĠI', 'II'], 'ĠĊ': ['Ġ', 'Ċ'], 'ĠSenjÅį': ['ĠS', 'en', 'j', 'Å', 'į'], 'Ġno': ['Ġno'], 'Ġ3': ['Ġ3'], 'Ġ:': ['Ġ', ':'], 'ĠUnrecorded': ['ĠUn', 'rec', 'ord', 'ed'], 'Ġ(': ['Ġ', '('], 'ĠJapanese': ['ĠJ', 'ap', 'an', 'ese'], 'ĠæĪ¦åł´ãģ®ãĥ´ãĤ¡ãĥ«ãĤŃãĥ¥ãĥªãĤ¢': ['Ġ', 'æ', 'Ī', '¦', 'å', 'ł', '´', 'ãģ®', 'ãĥ', '´', 'ãĤ', '¡', 'ãĥ', '«', 'ãĤ', 'Ń', 'ãĥ', '¥', 'ãĥ', 'ª', 'ãĤ', '¢'], '3': ['3'], 'Ġ,': ['Ġ', ','], 'Ġlit': ['Ġlit'], 'Ġ.': ['Ġ', '.'], 'Ġof': ['Ġo', 'f'], 'Ġthe': ['Ġth', 'e'], 'ĠBattlefield': ['ĠB', 'attle', 'f', 'ield'], 'Ġ)': ['Ġ', ')'], 'Ġcommonly': ['Ġcomm', 'on', 'ly'], 'Ġreferred': ['Ġre', 'fer', 'red'], 'Ġto': ['Ġto'], 'Ġas': ['Ġas'], 'Ġoutside': ['Ġ', 'outs', 'ide'], 'ĠJapan': ['ĠJ', 'ap', 'an'], 'Ġis': ['Ġ', 'is'], 'Ġa': ['Ġa'], 'Ġtactical': ['Ġt', 'ac', 'tical'], 'Ġrole': ['Ġro', 'le'], 'Ġ@-@': ['Ġ', '@', '-', '@'], 'Ġplaying




In [62]:
darm_sent_1000 = tokenize(darmstadt_sent, merges_1000)

print(darm_sent_1000)
len(darm_sent_1000)

['D', 'ar', 'm', 'st', 'ad', 't', 'Ġh', 'old', 's', 'Ġth', 'e', 'Ġoffic', 'ial', 'Ġ', 'tit', 'le', 'ĠC', 'ity', 'Ġo', 'f', 'ĠS', 'ci', 'ence', 'Ġ', '(', 'G', 'er', 'man', ':', 'ĠW', 'iss', 'ens', 'ch', 'a', 'f', 'ts', 'st', 'ad', 't', ')', 'Ġas', 'Ġ', 'it', 'Ġ', 'is', 'Ġa', 'Ġma', 'j', 'or', 'Ġcent', 're', 'Ġo', 'f', 'Ġs', 'ci', 'entif', 'ic', 'Ġin', 's', 'tit', 'u', 'tions', ',', 'Ġun', 'iv', 'ers', 'ities', ',', 'Ġ', 'and', 'Ġhigh', '-', 't', 'ech', 'n', 'ology', 'Ġcomp', 'an', 'ies', '.']


80

In [28]:
# Use this chunk for the answer
vocab_5000 = original_splits_vocab.copy()
new_merged_splits_5000, new_vocab_5000, merges_5000 = merge_pair_max(original_splits, vocab_5000, max_vocab_size=5000)

print("len of new_vocab:", len(new_vocab_5000))
print("new_merged_splits:", new_merged_splits_5000)

Merging pairs:   0%|                                   | 0/5000 [00:00<?, ?it/s]

Merging pairs: 100%|████████████████████████| 5000/5000 [09:27<00:00,  8.81it/s]

len of new_vocab: 5000
new_merged_splits: {'Ġ=': ['Ġ='], 'ĠValkyria': ['ĠValkyria'], 'ĠChronicles': ['ĠChronicles'], 'ĠIII': ['ĠIII'], 'ĠĊ': ['ĠĊ'], 'ĠSenjÅį': ['ĠSenjÅį'], 'Ġno': ['Ġno'], 'Ġ3': ['Ġ3'], 'Ġ:': ['Ġ:'], 'ĠUnrecorded': ['ĠUnrecorded'], 'Ġ(': ['Ġ('], 'ĠJapanese': ['ĠJapanese'], 'ĠæĪ¦åł´ãģ®ãĥ´ãĤ¡ãĥ«ãĤŃãĥ¥ãĥªãĤ¢': ['ĠæĪ¦åł´ãģ®ãĥ´ãĤ¡ãĥ«ãĤŃãĥ¥ãĥªãĤ¢'], '3': ['3'], 'Ġ,': ['Ġ,'], 'Ġlit': ['Ġlit'], 'Ġ.': ['Ġ.'], 'Ġof': ['Ġof'], 'Ġthe': ['Ġthe'], 'ĠBattlefield': ['ĠBattlefield'], 'Ġ)': ['Ġ)'], 'Ġcommonly': ['Ġcommonly'], 'Ġreferred': ['Ġreferred'], 'Ġto': ['Ġto'], 'Ġas': ['Ġas'], 'Ġoutside': ['Ġoutside'], 'ĠJapan': ['ĠJapan'], 'Ġis': ['Ġis'], 'Ġa': ['Ġa'], 'Ġtactical': ['Ġtactical'], 'Ġrole': ['Ġrole'], 'Ġ@-@': ['Ġ@-@'], 'Ġplaying': ['Ġplaying'], 'Ġvideo': ['Ġvideo'], 'Ġgame': ['Ġgame'], 'Ġdeveloped': ['Ġdeveloped'], 'Ġby': ['Ġby'], 'ĠSega': ['ĠSega'], 'Ġand': ['Ġand'], 'ĠMedia': ['ĠMedia'], '.': ['.'], 'Vision': ['Vision'], 'Ġfor': ['Ġfor'], 'ĠPlayStation': ['ĠPlayStation'], 'ĠPor




In [29]:
darm_sent_5000 = tokenize(darmstadt_sent, merges_5000)
print(darm_sent_5000)
len(darm_sent_5000)

['D', 'ar', 'm', 'st', 'ad', 't', 'Ġhold', 's', 'Ġthe', 'Ġofficial', 'Ġtitle', 'ĠCity', 'Ġof', 'ĠScience', 'Ġ(', 'G', 'er', 'man', ':', 'ĠW', 'iss', 'ens', 'ch', 'a', 'f', 'ts', 'st', 'ad', 't', ')', 'Ġas', 'Ġit', 'Ġis', 'Ġa', 'Ġmajor', 'Ġcent', 're', 'Ġof', 'Ġs', 'ci', 'entific', 'Ġ', 'i', 'n', 's', 'tit', 'u', 'tions', ',', 'Ġun', 'iv', 'ers', 'ities', ',', 'Ġand', 'Ġhigh', '-', 't', 'ech', 'n', 'ology', 'Ġcompanies', '.']


63

In [30]:
count = 0

for k, w in new_merged_splits.items():
    if(k != w[0]):
        print("k:", k, "w:", w)
        count += 1
    if count > 5: break

k: Ġ= w: ['Ġ', '=']
k: ĠValkyria w: ['Ġ', 'V', 'al', 'k', 'y', 'ri', 'a']
k: ĠChronicles w: ['ĠC', 'h', 'r', 'on', 'ic', 'l', 'es']
k: ĠIII w: ['Ġ', 'I', 'I', 'I']
k: ĠĊ w: ['Ġ', 'Ċ']
k: ĠSenjÅį w: ['ĠS', 'en', 'j', 'Å', 'į']


We can notice that with 5000 token a lot of splits are merged to the word, as we expexted. In conseguence, in the vocabulary we got a lot of full word instead of a letters of a pair of letters.

In [44]:
len(darm_sent_1000)

82

In [43]:
len(darm_sent_5000)

63

# **Task 2: Getting to Know Foundation Models**

In this task, we want to load two different versions of GPT-2 and compare their performance.

### Task 2.1

After we imported transformers from Huggingface, your first task is to initialize the GPT2LMHead Model. This model should be a gpt2 model from huggingface under the variable model. Also define the GPT2 tokenizer.

While we have seen the transformers in the lectures, you should print a single GPT2Block to the console for the 10th layer. Then, count the parameters of the model.

In [63]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Initialize GPT2 model and tokenizer
model_small = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer_small = GPT2Tokenizer.from_pretrained('gpt2')

# Print a GPT2Block in the 10th layer
print("GPT2Block for 10th layer:\n\n", model_small.transformer.h[9])  # Layers are 0-indexed, so layer 10 is at index 9

# Function to count parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters()) # numel = number of elements of a tensor

# Count and print the number of trainable parameters
print(f"\nThe model has {count_parameters(model_small):,} trainable parameters")

GPT2Block for 10th layer:

 GPT2Block(
  (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (attn): GPT2Attention(
    (c_attn): Conv1D()
    (c_proj): Conv1D()
    (attn_dropout): Dropout(p=0.1, inplace=False)
    (resid_dropout): Dropout(p=0.1, inplace=False)
  )
  (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (mlp): GPT2MLP(
    (c_fc): Conv1D()
    (c_proj): Conv1D()
    (act): NewGELUActivation()
    (dropout): Dropout(p=0.1, inplace=False)
  )
)

The model has 124,439,808 trainable parameters


#### Tests

In [64]:
inputs = tokenizer("Hello, the DL4NLP is the best course you can attend in Darmstadt", return_tensors="pt")
inputs

{'input_ids': tensor([[15496,    11,   262, 23641,    19,    45, 19930,   318,   262,  1266,
          1781,   345,   460,  5262,   287,   360,  1670, 38863]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [65]:
outputs = model_small(**inputs, labels=inputs["input_ids"])
outputs

CausalLMOutputWithCrossAttentions(loss=tensor(4.7058, grad_fn=<NllLossBackward0>), logits=tensor([[[ -35.2362,  -35.3266,  -38.9753,  ...,  -44.4645,  -43.9974,
           -36.4580],
         [-112.6171, -114.5832, -116.5725,  ..., -119.0128, -118.8059,
          -111.6917],
         [ -81.5177,  -81.7825,  -84.0149,  ...,  -83.8951,  -86.2484,
           -81.4283],
         ...,
         [ -75.5244,  -77.2464,  -74.4660,  ...,  -87.6613,  -87.7032,
           -78.4784],
         [ -91.4077,  -93.2514,  -96.1139,  ..., -100.0912, -100.3428,
           -93.9917],
         [ -87.8294,  -91.6237,  -97.1091,  ..., -105.3598, -104.4135,
           -93.3228]]], grad_fn=<UnsafeViewBackward0>), past_key_values=((tensor([[[[-1.2526,  2.3200,  0.1722,  ..., -1.0076, -0.1897,  1.3219],
          [-1.6482,  3.0222,  1.2789,  ..., -0.9078, -1.7395,  2.4237],
          [-2.2444,  2.6332,  1.9227,  ..., -0.6722, -1.5328,  2.0305],
          ...,
          [-2.0138,  2.1441,  2.6814,  ..., -0.3243, -2

In [None]:
loss = outputs.loss
loss

tensor(4.7058, grad_fn=<NllLossBackward0>)

In [None]:
logits = outputs.logits
logits

tensor([[[ -35.2362,  -35.3266,  -38.9753,  ...,  -44.4645,  -43.9974,
           -36.4580],
         [-112.6171, -114.5832, -116.5725,  ..., -119.0128, -118.8059,
          -111.6917],
         [ -81.5177,  -81.7825,  -84.0149,  ...,  -83.8951,  -86.2484,
           -81.4283],
         ...,
         [ -75.5244,  -77.2464,  -74.4660,  ...,  -87.6613,  -87.7032,
           -78.4784],
         [ -91.4077,  -93.2514,  -96.1139,  ..., -100.0912, -100.3428,
           -93.9917],
         [ -87.8294,  -91.6237,  -97.1091,  ..., -105.3598, -104.4135,
           -93.3228]]], grad_fn=<UnsafeViewBackward0>)

### Task 2.2
Because we want to compare two versions of GPT, you now also load the gpt2-large from huggingface as model. As evaluation dataset you import the dataset "wikitext-2-raw-v1" from huggingface. (like in Task1) Make sure to use the 200 first entries of the test dataset and print an entry. Also print the parameters of the gpt2-large model

In [66]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from datasets import load_dataset
import random
import numpy as np

In [67]:
# Initialize gpt2-large model and tokenizer
model_large = GPT2LMHeadModel.from_pretrained('gpt2-large')
tokenizer_large = GPT2Tokenizer.from_pretrained('gpt2-large')

# Load the dataset
dataset = load_dataset(path="wikitext", name="wikitext-2-raw-v1")

Found cached dataset wikitext (/Users/paolobonicco/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)
100%|██████████| 3/3 [00:00<00:00, 67.36it/s]


**Note:** We use just 124 sentence after remove the empty strings in the dataset

In [71]:
# Use the 200 first entries of the test dataset
test_data = dataset['test']['text'][:200]

# Remove empty strings from the test dataset dict
test_data = [text for text in test_data if text != ""]

test_data_len = len(test_data)

# Used to print a random entry of the test dataset
rnd_n = random.randint(0, test_data_len)

print(">> Length of test_data from 'wikitext-2-raw-v1' -->", test_data_len, "\n")
    
print(test_data[rnd_n])

print(">> Length of test_data after empty strings removal:", len(test_data))

# Print trainable parameters of gpt2-large
print(f"\n>> The gpt2-large model has {count_parameters(model_large):,} trainable parameters")

>> Length of test_data from 'wikitext-2-raw-v1' --> 124 

 = = = Propulsion = = = 

>> Length of test_data after empty strings removal: 124

>> The gpt2-large model has 774,030,080 trainable parameters


### Task 2.3
As a next step, we define an evaluation metric to calculate predictions. Remember the perplexity funciton from the lecture. The function should take a model, tokenizer and a text and return the perplexity score for the given text.

Hint: you should encode the text, generate outputs for inputs and corresponding labels and then calculate perplexity.

#### Test the predictions

In [None]:
# Predict text using the model

def predict_text(model, tokenizer, text, n_words=50):
    # Encode the text
    input_ids = tokenizer(text, return_tensors="pt")["input_ids"]
    
    # Generate text
    output = model.generate(input_ids=input_ids, max_length=n_words, pad_token_id=tokenizer.eos_token_id)
    
    # Decode and print the output
    print(tokenizer.decode(output[0], skip_special_tokens=True))

In [None]:
predict_text(model_large, tokenizer_large, test_data["text"][1])

Input length of input_ids is 183, but `max_length` is set to 50. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


 Robert Boulter is an English film, television and theatre actor. He had a guest @-@ starring role on the television series The Bill in 2000. This was followed by a starring role in the play Herons written by Simon Stephens, which was performed in 2001 at the Royal Court Theatre. He had a guest role in the television series Judge John Deed in 2002. In 2004 Boulter landed a role as " Craig " in the episode " Teddy's Story " of the television series The Long Firm ; he starred alongside actors Mark Strong and Derek Jacobi. He was cast in the 2005 theatre productions of the Philip Ridley play Mercury Fur, which was performed at the Drum Theatre in Plymouth and the Menier Chocolate Factory in London. He was directed by John Tiffany and starred alongside Ben Whishaw, Shane Zaza, Harry Kent, Fraser Ayres, Sophie Stanton and Dominic Hall. 




In [None]:
def predict_text(model, tokenizer, text):
    # TODO: YOUR CODE HERE
    
    # Initialize perplexity
    perplexity = 0 
    
    # Iterate over the test dataset
    for sent in text: # Loop on sentences
        
        # Create a list of words from the sentence
        words = sent.split()
        part_sent = ""
        for idx, word in enumerate(words):
            part_sent += word + " "
            input_ids = tokenizer(part_sent, return_tensors="pt")["input_ids"] # Encode the text, "pt" is for returning PyTorch tensor
        
            # Generate text with the model (predict next word)
            predict_len = len(input_ids[0]) + 1
            
            output = model.generate(input_ids=input_ids, max_length=predict_len, pad_token_id=tokenizer.eos_token_id)
            
            ##? Uncomment to test the output
            
            # output_word = tokenizer.decode(output[0], skip_special_tokens=True)
            # input_words = tokenizer.decode(input_ids[0], skip_special_tokens=True)
            
            # print("\n>> Input:", input_words, "\n>> Output:", output_word, "\n>> Label:", words[:idx + 1])
    
    # Calculate perplexity

#### Evaluate perplexity

##### Aziz Method

In [None]:
import torch.nn.functional as F
def calculate_perplexity_2(model, tokenizer, text):
    # YOUR CODE HERE
    # Step 1: Encode the text
    input_ids = tokenizer.encode(text, add_special_tokens=True)
    input_ids = torch.tensor([input_ids])

    # Step 2: Generate outputs for inputs
    with torch.no_grad():
        outputs = model(input_ids=input_ids)

    logits = outputs.logits
    # Remove batch dimension and last token (prediction for the [CLS] token)
    logits = logits.squeeze(0)[:-1, :]

    # Step 3: Calculate cross-entropy loss
    target_ids = input_ids.squeeze(0)[1:]  # Labels are the input tokens shifted by 1
    loss = F.cross_entropy(logits, target_ids)

    # Step 4: Compute perplexity
    perplexity_score = torch.exp(loss)

    return perplexity_score.item()

##### Fake methods

In [None]:
def calculate_perplexity(model, tokenizer, text):
    model.eval()  # put model in evaluation mode

    total_log_prob = 0
    total_words = 0

    # Use tqdm to create a progress bar
    progress_bar = tqdm(text, desc="Calculating Perplexity")

    # Iterate over the test dataset
    for sent in progress_bar: # Loop on sentences
        # Create a list of words from the sentence
        words = sent.split()
        part_sent = ""

        for idx, word in enumerate(words):
            part_sent += word + " "
            input_ids = tokenizer.encode(part_sent, return_tensors="pt") 

            # The target is the next word
            # target = tokenizer.encode(words[idx+1], return_tensors="pt") if idx+1 < len(words) else tokenizer.encode(tokenizer.eos_token, return_tensors="pt")
            
            # Generate output from the model
            with torch.no_grad():
                outputs = model(input_ids)
            
            # Get the predicted next sub-word's id
            predicted_next_word_id = outputs.logits[0, -1].argmax(-1)
            predicted_next_word_prob = outputs.logits[0, -1, predicted_next_word_id]

            # add the log probability of the predicted word to the total log probability
            total_log_prob += predicted_next_word_prob.item()

            # increment the word count
            total_words += 1

    # Calculate and return the perplexity
    perplexity = torch.exp(torch.tensor(-total_log_prob / total_words))

    return perplexity.item()

In [None]:
def calculate_perplexity_test(model, tokenizer, text):
    model.eval()  # put model in evaluation mode

    total_log_prob = 0
    total_words = 0

    # Use tqdm to create a progress bar
    progress_bar = tqdm(text, desc="Calculating Perplexity")

    # Iterate over the test dataset
    for sent in progress_bar: # Loop on sentences
        # Create a list of words from the sentence
        words = sent.split()
        part_sent = ""

        for idx, word in enumerate(words):
            part_sent += word + " "
            input_ids = tokenizer.encode(part_sent, return_tensors="pt") 

            # The target is the next word
            # target = tokenizer.encode(words[idx+1], return_tensors="pt") if idx+1 < len(words) else tokenizer.encode(tokenizer.eos_token, return_tensors="pt")
            
            # Generate output from the model
            with torch.no_grad():
                output = model.generate(input_ids=input_ids, max_length=predict_len, pad_token_id=tokenizer.eos_token_id)
                
            print(tokenizer.decode(outputs[0], skip_special_tokens=True))
            
            # Get the predicted next sub-word's id
            predicted_next_word_id = outputs.logits[0, -1].argmax(-1)
            predicted_next_word_prob = outputs.logits[0, -1, predicted_next_word_id]

            # add the log probability of the predicted word to the total log probability
            total_log_prob += predicted_next_word_prob.item()

            # increment the word count
            total_words += 1

    # Calculate and return the perplexity
    perplexity = torch.exp(torch.tensor(-total_log_prob / total_words))

    return perplexity.item()

In [None]:
test_text = ["The dog was so happy.", "The cat was so happy.", "The dog was so sad.", "The cat was so sad."]

In [None]:
calculate_perplexity_test(model_large, tokenizer_large, test_text)

Calculating Perplexity:   0%|          | 0/4 [00:01<?, ?it/s]


TypeError: int() argument must be a string, a bytes-like object or a real number, not 'list'

In [None]:
data_perplexity = test_data["text"][3:7]
print(data_perplexity)

[' = = Career = = \n', ' = = = 2000 – 2005 = = = \n', ' In 2000 Boulter had a guest @-@ starring role on the television series The Bill ; he portrayed " Scott Parry " in the episode , " In Safe Hands " . Boulter starred as " Scott " in the play Herons written by Simon Stephens , which was performed in 2001 at the Royal Court Theatre . A review of Boulter \'s performance in The Independent on Sunday described him as " horribly menacing " in the role , and he received critical reviews in The Herald , and Evening Standard . He appeared in the television series Judge John Deed in 2002 as " Addem Armitage " in the episode " Political Expediency " , and had a role as a different character " Toby Steele " on The Bill . \n', ' He had a recurring role in 2003 on two episodes of The Bill , as character " Connor Price " . In 2004 Boulter landed a role as " Craig " in the episode " Teddy \'s Story " of the television series The Long Firm ; he starred alongside actors Mark Strong and Derek Jacobi .

In [None]:
calculate_perplexity(model, tokenizer_large, data_perplexity)

Calculating Perplexity: 100%|██████████| 4/4 [03:29<00:00, 52.39s/it]


9.813647581015158e+20

##### Results

Remove empty string from test_data

In [None]:
test_data_no_empty = list(filter(None, test_data))

In [None]:
perplexities_small = []
perplexities_large = []
data = test_data_no_empty
# Loop over the dataset, append scores to the lists
for sent in data:
  perplexities_large.append(calculate_perplexity_2(model=model_large, tokenizer=tokenizer_large, text=sent))
  
print("Perplexity evaluation for GPT2 large finished")

perplexities_small = [calculate_perplexity_2(model=model_small, tokenizer=tokenizer_small, text=sent) for sent in data]

print("Perplexity evaluation for GPT2 small finished")

# Calculate and print average perplexities
print("Average perplexity (GPT2 ): ", np.nanmean(perplexities_small))
print("Average perplexity (GPT2 large): ", np.nanmean(perplexities_large))

Perplexity evaluation for GPT2 large finished
Perplexity evaluation for GPT2 small finished
Average perplexity (GPT2 ):  509.7849573319958
Average perplexity (GPT2 large):  433.09401690575385


What do you notice? Please explain.

### Task 2.4 Masking attention
What is an attention mask? When and why is it usually used?

Your answer here:

Your task is now to run a prompt in regular inference, but with an attention mask. You can reuse the gpt2_small model

In [None]:
text_prompt = "The sun was setting behind"

# Step 1: Implement attention masking
def apply_attention_mask(input_ids, mask_start_idx, mask_end_idx):
    # -----------------------
    # YOUR CODE HERE
    # -----------------------
    
    attention_mask = torch.ones(input_ids.shape)
    
    # mask the given positions
    attention_mask[:, mask_start_idx : mask_end_idx + 1] = 0
    return attention_mask

# Step 2: Apply attention masks
mask_start = 3  # Start index of the region to mask
mask_end = 5  # End index of the region to mask
tokenizer_small = AutoTokenizer.from_pretrained("gpt2")
input_ids = tokenizer_small.encode(text_prompt, return_tensors='pt')
print(f"input_ids == {input_ids}")
attention_mask = apply_attention_mask(input_ids, mask_start, mask_end)

# Step 3: Generate output using modified attention mechanism
with torch.no_grad():
    outputs = model_small.generate(input_ids, attention_mask=attention_mask)
    print(f"Outputs == {outputs}")
    generated_text = tokenizer_small.decode(outputs[0], skip_special_tokens=True)

# Step 4: Compare and analyze the generated outputs
print("Text Prompt:", text_prompt)
print("Generated Text:", generated_text)

In [None]:
# Step 5: Run the same prompt without attention mask

# TODO: YOUR CODE HERE
with torch.no_grad():
    outputs = model_small.generate(input_ids)
    print(f"Outputs == {outputs[0]}")
    generated_text = tokenizer_small.decode(outputs[0], skip_special_tokens=True)

# Step 4: Compare and analyze the generated outputs
print("Text Prompt:", text_prompt)
print("Generated Text NO ATTENTION MASK:", generated_text)

Step 6: Compare the two outputs and describe it in one to two sentences

Your answer here: