# Exercise 3

In [4]:
import os
import numpy as np
import tqdm
import torch
from sklearn.datasets import load_svmlight_file
import re
import glob
import collections
import string

### Load Data

In [10]:
from pathlib import Path

# Define base path to the dataset
BASE_DATA_PATH = Path("../exercise_01/aclImdb") 

TRAIN_PATH = BASE_DATA_PATH / "train"
TEST_PATH = BASE_DATA_PATH / "test"

TRAIN_POS_PATH = TRAIN_PATH / "pos"
TRAIN_NEG_PATH = TRAIN_PATH / "neg"
TEST_POS_PATH = TEST_PATH / "pos"
TEST_NEG_PATH = TEST_PATH / "neg"

assert TRAIN_POS_PATH.exists() and TRAIN_NEG_PATH.exists() and TEST_POS_PATH.exists() and TEST_NEG_PATH.exists()

In [11]:
def load_imdb_data(data_path: Path) -> tuple[list[str], list[int]]:
    """
    Loads movie reviews and their sentiments from the specified path.

    """
    texts:list[str] = []
    labels:list[int] = [] 

    for _, folder_path in [("pos", data_path / "pos"), ("neg", data_path / "neg")]:
        if not folder_path.exists():
            raise FileNotFoundError(f"Warning: Path {folder_path} does not exist.")

        for file_path in folder_path.glob("*.txt"):
                # Extract score from filename, e.g., "7_123.txt" -> score 7
            score = int(file_path.name.split('_')[0])
            filename_parts = file_path.name.split('_')
            score_str = filename_parts[1].split('.')[0]
            score = int(score_str)

            labels.append(score)
            texts.append(file_path.read_text(encoding='utf-8'))
    return texts, labels

# Load training data
train_texts_raw, train_labels = load_imdb_data(TRAIN_PATH)
print(f"Loaded {len(train_texts_raw)} training reviews.")
print(f"Training labels distribution: Positive (1): {sum(train_labels)}, Negative (0): {len(train_labels) - sum(train_labels)}")

# Load test data
test_texts_raw, test_labels = load_imdb_data(TEST_PATH)
print(f"Loaded {len(test_texts_raw)} test reviews.")
print(f"Test labels distribution: Positive (1): {sum(test_labels)}, Negative (0): {len(test_labels) - sum(test_labels)}")

Loaded 25000 training reviews.
Training labels distribution: Positive (1): 136943, Negative (0): -111943
Loaded 25000 test reviews.
Test labels distribution: Positive (1): 137824, Negative (0): -112824


In [12]:
print("Train :")
for text, label in zip(train_texts_raw[:10], train_labels):
    print(f"{text[:50]}... Label : {label}")

print("\n\nTest :")
for text, label in zip(test_texts_raw[:10], test_labels):
    print(f"{text[:50]}... Label : {label}")

Train :
For a movie that gets no respect there sure are a ... Label : 9
Bizarre horror movie filled with famous faces but ... Label : 8
A solid, if unremarkable film. Matthau, as Einstei... Label : 7
It's a strange feeling to sit alone in a theater o... Label : 8
You probably all already know this by now, but 5 a... Label : 10
I saw the movie with two grown children. Although ... Label : 8
You're using the IMDb.<br /><br />You've given som... Label : 10
This was a good film with a powerful message of lo... Label : 10
Made after QUARTET was, TRIO continued the quality... Label : 10
For a mature man, to admit that he shed a tear ove... Label : 10


Test :
Based on an actual story, John Boorman shows the s... Label : 9
This is a gem. As a Film Four production - the ant... Label : 9
I really like this show. It has drama, romance, an... Label : 9
This is the best 3-D experience Disney has at thei... Label : 10
Of the Korean movies I've seen, only three had rea... Label : 10
this movie is fu

## Task 1: Pre-Processing

In [6]:
from preprocessing import preprocess

In [27]:
# apply the pre-processing pipeline
pre_processed_data = preprocess(train_texts_raw, to_lowercase=True)
pre_processed_data_test = preprocess(test_texts_raw, to_lowercase=True)

In [26]:
for tokens in pre_processed_data[:10]:
    print(f"{tokens[:10]}...")

['for', 'a', 'movie', 'that', 'gets', 'no', 'respect', 'there', 'sure', 'are']...
['bizarre', 'horror', 'movie', 'filled', 'with', 'famous', 'faces', 'but', 'stolen', 'by']...
['a', 'solid,', 'if', 'unremarkable', 'film.', 'matthau,', 'as', 'einstein,', 'was', 'wonderful.']...
["it's", 'a', 'strange', 'feeling', 'to', 'sit', 'alone', 'in', 'a', 'theater']...
['you', 'probably', 'all', 'already', 'know', 'this', 'by', 'now,', 'but', 'NUMTOKEN']...
['i', 'saw', 'the', 'movie', 'with', 'two', 'grown', 'children.', 'although', 'it']...
["you're", 'using', 'the', "imdb.you've", 'given', 'some', 'hefty', 'votes', 'to', 'some']...
['this', 'was', 'a', 'good', 'film', 'with', 'a', 'powerful', 'message', 'of']...
['made', 'after', 'quartet', 'was,', 'trio', 'continued', 'the', 'quality', 'of', 'the']...
['for', 'a', 'mature', 'man,', 'to', 'admit', 'that', 'he', 'shed', 'a']...


## Task 2: Byte-Pair Tokenizer

You can refer to this [Hugging Face tutorial](https://huggingface.co/learn/llm-course/en/chapter6/5) for a detailed explanation of the BPE algorithm.

In [None]:
class BPETokenizer:
    def __init__(self, base_vocab: str, num_merges: int = 1000):
        self.base_vocab = base_vocab
        self.num_merges = num_merges
        self.vocab = None
        

    def train(self, texts: List[str]):
        pass # todo

    def tokenize(self, text: str) -> List[List[str]]:
        pass # todo


### 2 (a): Base Vocabulary = Characters from Reviews

In [None]:
# Extract all unique characters from the data
unique_chars = # todo

# Train the tokenizer
bpe_char_based = BPETokenizer(base_vocab=unique_chars, num_merges=1000)
bpe_char_based.train(pre_processed_data)

### 2 (b): Base Vocabulary = ASCII Characters

In [None]:
ascii_chars = list(string.printable)

bpe_ascii_based = BPETokenizer(base_vocab=ascii_chars, num_merges=1000)
bpe_ascii_based.train(pre_processed_data)

In [None]:
# Analyze the two vocabularies
print("Char-based initial_vocab sample:")
print(list(bpe_char_based.vocab.items())[:10])

print("ASCII-based initial_vocab sample:")
print(list(bpe_ascii_based.vocab.items())[:10])

In [None]:
# Compare Tokenization of a New/Unknown Word
unknown_word = "backpropagationlessness"

print("Char-based tokenization:", bpe_char_based.tokenize_word(unknown_word))
print("ASCII-based tokenization:", bpe_ascii_based.tokenize_word(unknown_word))

In [None]:
# more comparison ...

## Task 3: WordPiece Tokenizer

You can refer to this [Hugging Face tutorial](https://huggingface.co/learn/llm-course/en/chapter6/6) for a detailed explanation of the WordPiece algorithm.

In [None]:
import collections
import re

class WordPieceTokenizer:
    def __init__(self, vocab_size=1000, initial_vocab=None):
        self.vocab_size = vocab_size
        self.initial_vocab = set(initial_vocab) if initial_vocab else set()
        self.vocab = {}
        

    def train(self, texts: List[str]):
        pass # todo

    def tokenize(self, text: str) -> List[List[str]]:
        pass # todo


### 3 (a): Base Vocabulary = Characters from Reviews

In [None]:
corpus_chars = # todo
tokenizer_a = WordPieceTokenizer(vocab_size=1000, initial_vocab=corpus_chars)
vocab_a = tokenizer_a.train(pre_processed_data)
print(f"Vocabulary A (corpus chars): {sorted(vocab_a)}")

### 3 (b): Base Vocabulary = Characters from Reviews + ASCII Characters

In [None]:
ascii_chars = set(string.printable)
initial_vocab = sorted(corpus_chars.union(ascii_chars))
tokenizer_b = WordPieceTokenizer(vocab_size=1000, initial_vocab=initial_vocab)
vocab_b = tokenizer_b.train(pre_processed_data)

In [None]:
# Compare Tokenization of a New/Unknown Word
unknown_word = "backpropagationlessness"
print("WordPiece A tokenization:", tokenizer_a.tokenize_word(unknown_word))
print("WordPiece B tokenization:", tokenizer_b.tokenize_word(unknown_word))

In [None]:
# more comparison ...

## Task 4: Hugging Face Implementations

In [None]:
# Hugging Face Byte-Pair Encoder (BPE)
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace


In [None]:
# Hugging Face WordPiece Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer


In [None]:
# compare the different tokenizers