# Exercise 3

In [7]:
import os
import numpy as np
import tqdm
import torch
from sklearn.datasets import load_svmlight_file
import re
import glob
import collections
import string

### Load Data

In [8]:
from pathlib import Path

# Define base path to the dataset
BASE_DATA_PATH = Path("../exercise_01/aclImdb") 

TRAIN_PATH = BASE_DATA_PATH / "train"
TEST_PATH = BASE_DATA_PATH / "test"

TRAIN_POS_PATH = TRAIN_PATH / "pos"
TRAIN_NEG_PATH = TRAIN_PATH / "neg"
TEST_POS_PATH = TEST_PATH / "pos"
TEST_NEG_PATH = TEST_PATH / "neg"

assert TRAIN_POS_PATH.exists() and TRAIN_NEG_PATH.exists() and TEST_POS_PATH.exists() and TEST_NEG_PATH.exists()

In [9]:
def load_imdb_data(data_path: Path) -> tuple[list[str], list[int]]:
    """
    Loads movie reviews and their sentiments from the specified path.

    """
    texts:list[str] = []
    labels:list[int] = [] 

    for _, folder_path in [("pos", data_path / "pos"), ("neg", data_path / "neg")]:
        if not folder_path.exists():
            raise FileNotFoundError(f"Warning: Path {folder_path} does not exist.")

        for file_path in folder_path.glob("*.txt"):
                # Extract score from filename, e.g., "7_123.txt" -> score 7
            score = int(file_path.name.split('_')[0])
            filename_parts = file_path.name.split('_')
            score_str = filename_parts[1].split('.')[0]
            score = int(score_str)

            labels.append(score)
            texts.append(file_path.read_text(encoding='utf-8'))
    return texts, labels

# Load training data
train_texts_raw, train_labels = load_imdb_data(TRAIN_PATH)
print(f"Loaded {len(train_texts_raw)} training reviews.")
print(f"Training labels distribution: Positive (1): {sum(train_labels)}, Negative (0): {len(train_labels) - sum(train_labels)}")

# Load test data
test_texts_raw, test_labels = load_imdb_data(TEST_PATH)
print(f"Loaded {len(test_texts_raw)} test reviews.")
print(f"Test labels distribution: Positive (1): {sum(test_labels)}, Negative (0): {len(test_labels) - sum(test_labels)}")

Loaded 25000 training reviews.
Training labels distribution: Positive (1): 136943, Negative (0): -111943
Loaded 25000 test reviews.
Test labels distribution: Positive (1): 137824, Negative (0): -112824


In [10]:
print("Train :")
for text, label in zip(train_texts_raw[:10], train_labels):
    print(f"{text[:50]}... Label : {label}")

print("\n\nTest :")
for text, label in zip(test_texts_raw[:10], test_labels):
    print(f"{text[:50]}... Label : {label}")

Train :
For a movie that gets no respect there sure are a ... Label : 9
Bizarre horror movie filled with famous faces but ... Label : 8
A solid, if unremarkable film. Matthau, as Einstei... Label : 7
It's a strange feeling to sit alone in a theater o... Label : 8
You probably all already know this by now, but 5 a... Label : 10
I saw the movie with two grown children. Although ... Label : 8
You're using the IMDb.<br /><br />You've given som... Label : 10
This was a good film with a powerful message of lo... Label : 10
Made after QUARTET was, TRIO continued the quality... Label : 10
For a mature man, to admit that he shed a tear ove... Label : 10


Test :
Based on an actual story, John Boorman shows the s... Label : 9
This is a gem. As a Film Four production - the ant... Label : 9
I really like this show. It has drama, romance, an... Label : 9
This is the best 3-D experience Disney has at thei... Label : 10
Of the Korean movies I've seen, only three had rea... Label : 10
this movie is fu

## Task 1: Pre-Processing

In [11]:
from preprocessing import preprocess

In [12]:
# apply the pre-processing pipeline
pre_processed_data = preprocess(train_texts_raw, to_lowercase=True, tokenize_on_punctuation=True, remove_punctuation_tokens=True, number_replacement_token="NUMTOKEN")
pre_processed_data_test = preprocess(test_texts_raw, to_lowercase=True, tokenize_on_punctuation=True, remove_punctuation_tokens=True, number_replacement_token="NUMTOKEN")

In [13]:
for tokens in pre_processed_data[:10]:
    print(f"{tokens[:10]}...")

['for', 'a', 'movie', 'that', 'gets', 'no', 'respect', 'there', 'sure', 'are']...
['bizarre', 'horror', 'movie', 'filled', 'with', 'famous', 'faces', 'but', 'stolen', 'by']...
['a', 'solid', 'if', 'unremarkable', 'film', 'matthau', 'as', 'einstein', 'was', 'wonderful']...
['it', 's', 'a', 'strange', 'feeling', 'to', 'sit', 'alone', 'in', 'a']...
['you', 'probably', 'all', 'already', 'know', 'this', 'by', 'now', 'but', 'NUMTOKEN']...
['i', 'saw', 'the', 'movie', 'with', 'two', 'grown', 'children', 'although', 'it']...
['you', 're', 'using', 'the', 'imdb', 'you', 've', 'given', 'some', 'hefty']...
['this', 'was', 'a', 'good', 'film', 'with', 'a', 'powerful', 'message', 'of']...
['made', 'after', 'quartet', 'was', 'trio', 'continued', 'the', 'quality', 'of', 'the']...
['for', 'a', 'mature', 'man', 'to', 'admit', 'that', 'he', 'shed', 'a']...


## Task 2: Byte-Pair Tokenizer

You can refer to this [Hugging Face tutorial](https://huggingface.co/learn/llm-course/en/chapter6/5) for a detailed explanation of the BPE algorithm.

In [14]:
unique_tokens = set(token for tokens in pre_processed_data for token in tokens )
sorted(list(unique_tokens))[:10]

['000s', '00am', '00pm', '00s', '01pm', '02i', '06th', '08th', '0f', '0ne']

In [19]:
from collections import defaultdict


followings = defaultdict(lambda:defaultdict(int))

for tokens in pre_processed_data:
    for token, next in zip(tokens[:-1], tokens[1:]):
        followings[token][next]+=1
    

In [26]:
sorted([(token, count) for token, count in followings["this"].items()], key=lambda x: x[1], reverse=True)[:10]

[('movie', 15710),
 ('film', 10904),
 ('is', 9467),
 ('one', 3132),
 ('was', 2273),
 ('show', 1095),
 ('time', 578),
 ('and', 506),
 ('a', 462),
 ('story', 445)]

We can see that "this" is most followed by "movie"

In [43]:
# pairs_frequency = (token1, token2) -> count

def get_pairs_frequency(data:list[list[str]])-> dict[tuple[str,str],int]:
    pairs_frequency = defaultdict(int)

    for tokens in data:
        for token, next in zip(tokens[:-1], tokens[1:]):
            pairs_frequency[(token, next)] +=1
    
    return pairs_frequency

pairs_frequency =get_pairs_frequency(pre_processed_data)

In [None]:
most_frequent_pairs = sorted([((token, next), count) for ((token, next), count) in pairs_frequency.items()], key=lambda x:x[1], reverse=True)
most_frequent_pairs[:10]

[(('of', 'the'), 39161),
 (('in', 'the'), 25226),
 (('it', 's'), 17221),
 (('this', 'movie'), 15710),
 (('the', 'film'), 13523),
 (('and', 'the'), 13328),
 (('is', 'a'), 13124),
 (('to', 'the'), 12004),
 (('to', 'be'), 11876),
 (('the', 'movie'), 11855)]

In [46]:
def get_more_frequent_pair(frequencies:dict[tuple[str,str], int] )-> tuple[str,str,int]:
    return sorted([(token, next,count) for ((token, next), count) in frequencies.items()], key=lambda x:x[2], reverse=True)[0]

get_more_frequent_pair(pairs_frequency)

('of', 'the', 39161)

In [35]:
merge = most_frequent_pairs[0][0]
merge

('of', 'the')

In [47]:
def merge_tokens(tokens:list[str], a:str, b:str)->list[str]:
    _tokens :list[str] = []
    did_merge = False
    for token, next in zip(tokens[:-1], tokens[1:]):
        if did_merge:
            did_merge = False
            continue
        if (token, next) == (a,b):
            _tokens.append(token + " " + next)
            did_merge = True
        else:
            _tokens.append(token)
    if not did_merge:
        _tokens.append(tokens[-1])

    return _tokens



assert merge_tokens(
    ["This", "is", "an", "example", "of", "a", "sentence"],
    "This",
    "is"
) == ['This is', 'an', 'example', 'of', 'a', 'sentence']


assert merge_tokens(
    ["This", "is", "an", "example", "of", "a", "sentence"],
    "of",
    "a"
) == ['This', 'is', 'an', 'example', 'of a', 'sentence']


assert merge_tokens(
    ["This", "is", "an", "example", "of", "a", "sentence"],
    "a",
    "sentence"
) == ['This', 'is', 'an', 'example', 'of', 'a sentence']

### In a loop

In [68]:
# data = pre_processed_data

# N_MERGES = 100
# merges : list[tuple[str,str]] = []

for _ in range(100):
    frequencies = get_pairs_frequency(data)
    a,b,count = get_more_frequent_pair(frequencies)
    print(f"Most frequent pair : ({a}, {b}) {count=}")
    data = [merge_tokens(tokens,a,b) for tokens in data]
    merges.append((a,b))

Most frequent pair : (like, the) count=2009
Most frequent pair : (he, was) count=2004
Most frequent pair : (in, NUMTOKEN) count=1988
Most frequent pair : (more, than) count=1984
Most frequent pair : (i, would) count=1980
Most frequent pair : (and, it) count=1970
Most frequent pair : (to, say) count=1957
Most frequent pair : (was, the) count=1924
Most frequent pair : (a, movie) count=1894
Most frequent pair : (some, of the) count=1894
Most frequent pair : (a, little) count=1886
Most frequent pair : (the, same) count=1882
Most frequent pair : (the, acting) count=1877
Most frequent pair : (is, that) count=1868
Most frequent pair : (at, all) count=1842
Most frequent pair : (the, best) count=1819
Most frequent pair : (by, a) count=1798
Most frequent pair : (and, then) count=1786
Most frequent pair : (you, can) count=1770
Most frequent pair : (a, great) count=1767
Most frequent pair : (i, saw) count=1758
Most frequent pair : (NUMTOKEN, minutes) count=1706
Most frequent pair : (over, the) cou

### Tokenization

In [69]:
tokens = pre_processed_data_test[15]

tokens

['in',
 'the',
 'future',
 'a',
 'disparate',
 'group',
 'of',
 'people',
 'asleep',
 'aboard',
 'a',
 'commercial',
 'spaceship',
 'is',
 'forced',
 'to',
 'improvise',
 'their',
 'survival',
 'when',
 'the',
 'spaceship',
 'crash',
 'lands',
 'on',
 'a',
 'remote',
 'barren',
 'planet',
 'they',
 'already',
 'have',
 'one',
 'problem',
 'in',
 'that',
 'one',
 'of',
 'the',
 'passengers',
 'is',
 'intense',
 'criminal',
 'richard',
 'riddick',
 'vin',
 'diesel',
 'in',
 'his',
 'first',
 'top',
 'billed',
 'role',
 'however',
 'they',
 'are',
 'soon',
 'preyed',
 'upon',
 'by',
 'a',
 'strange',
 'species',
 'of',
 'predator',
 'that',
 'thrives',
 'in',
 'the',
 'darkness',
 'and',
 'a',
 'rare',
 'solar',
 'eclipse',
 'is',
 'soon',
 'to',
 'take',
 'place',
 'while',
 'the',
 'script',
 'for',
 'this',
 'movie',
 'is',
 'ultimately',
 'on',
 'the',
 'routine',
 'side',
 'it',
 'is',
 'decently',
 'acted',
 'and',
 'it',
 'is',
 'especially',
 'well',
 'made',
 'technically',
 'loc

In [82]:
tokens:list[str] = ["i", "think","the", "film", "is", "very", "good"]

while True:
    did_merge = False
    last_merge = None
    new_tokens= []
    for i in range(len(tokens)):
        if did_merge:
            did_merge = False
            continue

        if i == len(tokens)-1:
            # only add the last token if we didn't merge
            assert not did_merge
            new_tokens.append(tokens[i])
        else:
            token, next = tokens[i], tokens[i+1]
            print((token,next))
            if (token,next) in merges:
                new_tokens.append(token + " " + next)
                did_merge = True
                last_merge = (token, next)
                print(f"Merged ({token}, {next})")
                print(new_tokens)
            else:
                did_merge = False
                new_tokens.append(token)
    tokens = new_tokens[::]
    if last_merge is None:
        print("Nothing to merge, exiting")
        print(tokens)
        break

('i', 'think')
Merged (i, think)
['i think']
('the', 'film')
Merged (the, film)
['i think', 'the film']
('is', 'very')
('very', 'good')
('i think', 'the film')
('the film', 'is')
Merged (the film, is)
['i think', 'the film is']
('very', 'good')
('i think', 'the film is')
('the film is', 'very')
('very', 'good')
Nothing to merge, exiting
['i think', 'the film is', 'very', 'good']


In [None]:
defaultdict[]

In [None]:
class BPETokenizer:
    def __init__(self, base_vocab: str, num_merges: int = 1000):
        self.base_vocab = base_vocab
        self.num_merges = num_merges
        self.vocab = None
        

    def train(self, texts: list[str]):
        pass # todo

    def tokenize(self, text: str) -> list[list[str]]:
        pass # todo


### 2 (a): Base Vocabulary = Characters from Reviews

In [None]:
# Extract all unique characters from the data
unique_chars = # todo

# Train the tokenizer
bpe_char_based = BPETokenizer(base_vocab=unique_chars, num_merges=1000)
bpe_char_based.train(pre_processed_data)

### 2 (b): Base Vocabulary = ASCII Characters

In [None]:
ascii_chars = list(string.printable)

bpe_ascii_based = BPETokenizer(base_vocab=ascii_chars, num_merges=1000)
bpe_ascii_based.train(pre_processed_data)

In [None]:
# Analyze the two vocabularies
print("Char-based initial_vocab sample:")
print(list(bpe_char_based.vocab.items())[:10])

print("ASCII-based initial_vocab sample:")
print(list(bpe_ascii_based.vocab.items())[:10])

In [None]:
# Compare Tokenization of a New/Unknown Word
unknown_word = "backpropagationlessness"

print("Char-based tokenization:", bpe_char_based.tokenize_word(unknown_word))
print("ASCII-based tokenization:", bpe_ascii_based.tokenize_word(unknown_word))

In [None]:
# more comparison ...

## Task 3: WordPiece Tokenizer

You can refer to this [Hugging Face tutorial](https://huggingface.co/learn/llm-course/en/chapter6/6) for a detailed explanation of the WordPiece algorithm.

In [None]:
import collections
import re

class WordPieceTokenizer:
    def __init__(self, vocab_size=1000, initial_vocab=None):
        self.vocab_size = vocab_size
        self.initial_vocab = set(initial_vocab) if initial_vocab else set()
        self.vocab = {}
        

    def train(self, texts: List[str]):
        pass # todo

    def tokenize(self, text: str) -> List[List[str]]:
        pass # todo


### 3 (a): Base Vocabulary = Characters from Reviews

In [None]:
corpus_chars = # todo
tokenizer_a = WordPieceTokenizer(vocab_size=1000, initial_vocab=corpus_chars)
vocab_a = tokenizer_a.train(pre_processed_data)
print(f"Vocabulary A (corpus chars): {sorted(vocab_a)}")

### 3 (b): Base Vocabulary = Characters from Reviews + ASCII Characters

In [None]:
ascii_chars = set(string.printable)
initial_vocab = sorted(corpus_chars.union(ascii_chars))
tokenizer_b = WordPieceTokenizer(vocab_size=1000, initial_vocab=initial_vocab)
vocab_b = tokenizer_b.train(pre_processed_data)

In [None]:
# Compare Tokenization of a New/Unknown Word
unknown_word = "backpropagationlessness"
print("WordPiece A tokenization:", tokenizer_a.tokenize_word(unknown_word))
print("WordPiece B tokenization:", tokenizer_b.tokenize_word(unknown_word))

In [None]:
# more comparison ...

## Task 4: Hugging Face Implementations

In [None]:
# Hugging Face Byte-Pair Encoder (BPE)
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace


In [None]:
# Hugging Face WordPiece Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer


In [None]:
# compare the different tokenizers