# Solutions: Task 8.4 - Tokenization Lab

This notebook contains solutions to the exercises from notebook 04.

---

In [None]:
import re
from collections import Counter
import matplotlib.pyplot as plt

# `transformers` is treated as an optional dependency: record whether the
# import succeeded in HAS_TRANSFORMERS so the exercise cells that check this
# flag can print a hint instead of failing on a missing name.
try:
    from transformers import AutoTokenizer
    HAS_TRANSFORMERS = True
except ImportError:
    HAS_TRANSFORMERS = False
    print("transformers not installed")

## Exercise 1: Multilingual Tokenization

**Task:** Compare how different tokenizers handle non-English text.

In [None]:
if HAS_TRANSFORMERS:
    # Baseline tokenizers compared in this exercise. `tokenizers` is kept as a
    # notebook-level name because the code-tokenization cell reuses "GPT-2".
    tokenizers = {
        "GPT-2": AutoTokenizer.from_pretrained("gpt2"),
        "BERT": AutoTokenizer.from_pretrained("bert-base-uncased"),
    }

    # The multilingual tokenizer is best-effort: if it cannot be loaded the
    # comparison still runs without it. Catch Exception (not a bare except,
    # which would also swallow KeyboardInterrupt/SystemExit) and say why it
    # was skipped instead of failing silently.
    try:
        tokenizers["mBERT"] = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
    except Exception as exc:
        print(f"Skipping mBERT (could not load tokenizer): {exc}")

    # (language label, sample sentence) pairs covering several scripts
    multilingual_texts = [
        ("English", "Hello, world!"),
        ("French", "Bonjour le monde!"),
        ("Spanish", "Hola, mundo!"),
        ("German", "Hallo, Welt!"),
        ("Russian", "Привет мир!"),
        ("Chinese", "你好世界!"),
        ("Japanese", "こんにちは世界!"),
    ]

    print("Multilingual Tokenization Comparison")
    print("=" * 70)

    # One row per language: {"Language", "Text", <tokenizer name>: token count}
    results = []
    for lang, text in multilingual_texts:
        print(f"\n{lang}: '{text}'")
        print("-" * 50)

        row = {"Language": lang, "Text": text}

        for name, tokenizer in tokenizers.items():
            tokens = tokenizer.tokenize(text)
            row[name] = len(tokens)
            print(f"  {name}: {len(tokens)} tokens")
            # Show at most the first 10 tokens to keep the output readable
            print(f"    {tokens[:10]}{'...' if len(tokens) > 10 else ''}")

        results.append(row)

    # Summary table: token count per language (rows) and tokenizer (columns)
    print("\n" + "=" * 70)
    print("\nToken counts:")
    print(f"{'Language':<10}" + "".join(f"{name:>8}" for name in tokenizers))
    for row in results:
        print(f"{row['Language']:<10}" + "".join(f"{row[name]:>8}" for name in tokenizers))

    print("\nAnalysis:")
    print("- GPT-2 is trained mostly on English, so non-Latin scripts get many tokens")
    # Only make the mBERT claim when mBERT actually took part in the comparison
    if "mBERT" in tokenizers:
        print("- mBERT handles multiple languages more efficiently")
    print("- Chinese/Japanese often tokenized character-by-character with English tokenizers")
else:
    print("Please install transformers to run this exercise")

## Exercise 2: Code Tokenization

**Task:** Analyze how different tokenizers handle programming code.

In [None]:
if HAS_TRANSFORMERS:
    # (label, source snippet) pairs covering common Python constructs
    code_samples = [
        ('print', 'print("Hello, World!")'),
        ('function', 'def factorial(n): return 1 if n <= 1 else n * factorial(n-1)'),
        ('class', 'class MyClass:\n    def __init__(self, value):\n        self.value = value'),
        ('list_comp', '[x**2 for x in range(10) if x % 2 == 0]'),
        ('lambda', 'sorted(items, key=lambda x: x.name.lower())'),
    ]

    # GPT-2's byte-level BPE spells whitespace bytes as visible marker
    # characters inside the token text: space -> U+0120 ("Ġ"),
    # newline -> U+010A ("Ċ"), tab -> U+0109 ("ĉ"). A plain str.strip() does
    # not remove them, so a token such as "Ġreturn" would never match
    # "return". Strip the markers explicitly before classifying a token.
    whitespace_markers = "\u0120\u010a\u0109"

    # Classification is by exact match on the (marker-stripped) token, so a
    # fused token like "):" is intentionally not counted as an operator.
    keyword_tokens = {'def', 'class', 'return', 'if', 'else', 'for', 'in', 'lambda'}
    operator_tokens = {'(', ')', '[', ']', ':', '=', '+', '-', '*', '**', '%'}

    print("Code Tokenization Analysis")
    print("=" * 70)

    # Reuses the GPT-2 tokenizer loaded by the multilingual exercise cell
    gpt2_tok = tokenizers["GPT-2"]

    for name, code in code_samples:
        tokens = gpt2_tok.tokenize(code)

        print(f"\n{name}:")
        print(f"  Code: {code[:60]}{'...' if len(code) > 60 else ''}")
        print(f"  Tokens ({len(tokens)}): {tokens}")

        # Analyze token types on the token text without whitespace markers
        bare_tokens = [t.strip(whitespace_markers).strip() for t in tokens]
        keywords = sum(1 for t in bare_tokens if t in keyword_tokens)
        operators = sum(1 for t in bare_tokens if t in operator_tokens)

        print(f"  Keywords: {keywords}, Operators/punctuation: {operators}")

    print("\n" + "=" * 70)
    print("\nObservations:")
    print("- GPT-2 often splits camelCase/snake_case identifiers")
    print("- Common keywords like 'def', 'class' are usually single tokens")
    print("- Indentation (spaces/tabs) becomes tokens too")
    print("- Special characters like '**' may be single or multiple tokens")
else:
    print("Please install transformers to run this exercise")

## Challenge: Implement WordPiece Tokenization

**Task:** Implement WordPiece (used by BERT), which differs from BPE in how it scores merges.

In [None]:
class SimpleWordPiece:
    """
    Simple WordPiece implementation for educational purposes.

    Unlike BPE which merges based on frequency, WordPiece
    merges based on likelihood improvement:

    score(a, b) = freq(ab) / (freq(a) * freq(b))

    Tokens that continue a word carry a "##" prefix, so "abc" starts out as
    ["a", "##b", "##c"] before any merge is applied.

    Attributes:
        vocab:  dict mapping token string -> integer id, filled by train().
        merges: dict mapping (left, right) token pair -> merged token. Its
                insertion order is the order in which merges were learned,
                which tokenize() relies on as the merge priority.
    """

    def __init__(self):
        self.vocab = {}
        self.merges = {}

    def _get_word_freqs(self, text):
        """Lower-case text, split it into \\w+ words and count each word."""
        words = re.findall(r"\w+", text.lower())
        return Counter(words)

    def _word_to_chars(self, word):
        """Split a word into characters, prefixing non-initial ones with ##.

        Returns an empty list for an empty word.
        """
        if not word:
            return []
        return [word[0]] + [f"##{c}" for c in word[1:]]

    def _get_pair_scores(self, word_freqs, word_tokens):
        """Compute the WordPiece score of every adjacent token pair.

        Args:
            word_freqs: mapping word -> number of occurrences.
            word_tokens: mapping word -> its current list of tokens.

        Returns:
            dict mapping (left, right) -> freq(pair) / (freq(left) * freq(right)),
            where all frequencies are weighted by the word counts.
        """
        token_freqs = Counter()
        pair_freqs = Counter()

        for word, freq in word_freqs.items():
            tokens = word_tokens[word]
            for t in tokens:
                token_freqs[t] += freq
            for pair in zip(tokens, tokens[1:]):
                pair_freqs[pair] += freq

        # Compute scores: freq(ab) / (freq(a) * freq(b))
        scores = {}
        for pair, freq in pair_freqs.items():
            a, b = pair
            # Guard against a zero denominator (e.g. words supplied with count 0)
            if token_freqs[a] > 0 and token_freqs[b] > 0:
                scores[pair] = freq / (token_freqs[a] * token_freqs[b])

        return scores

    @staticmethod
    def _apply_merge(tokens, pair, new_token):
        """Return a copy of tokens with each adjacent `pair` replaced by new_token.

        Scans left to right and skips past a merged pair, so overlapping
        candidates (e.g. x, x, x for pair (x, x)) are merged greedily from
        the left.
        """
        merged = []
        i = 0
        last = len(tokens) - 1
        while i < len(tokens):
            if i < last and (tokens[i], tokens[i + 1]) == pair:
                merged.append(new_token)
                i += 2
            else:
                merged.append(tokens[i])
                i += 1
        return merged

    def _merge_pair(self, word_tokens, pair):
        """Merge a pair in all words.

        Returns:
            (new_word_tokens, new_token): the updated word -> tokens mapping
            and the token string produced by the merge.
        """
        left, right = pair
        # Drop only the continuation prefix of the right-hand token; the
        # merged token keeps the left-hand token's own prefix (if any).
        new_token = left + (right[2:] if right.startswith("##") else right)

        new_word_tokens = {
            word: self._apply_merge(tokens, pair, new_token)
            for word, tokens in word_tokens.items()
        }
        return new_word_tokens, new_token

    def train(self, text, num_merges=100, verbose=True):
        """Train WordPiece tokenizer.

        Args:
            text: training corpus as a single string.
            num_merges: maximum number of merges to learn; training stops
                early when no adjacent pair is left.
            verbose: print vocabulary sizes and every 20th merge.

        Side effects:
            Replaces self.merges and self.vocab with the newly learned ones.
        """
        word_freqs = self._get_word_freqs(text)
        word_tokens = {word: self._word_to_chars(word) for word in word_freqs}

        # Start from a clean slate so that calling train() again does not
        # leave merges from a previous corpus mixed in with the new ones.
        self.merges = {}

        # Initial vocabulary: every character-level token seen in the corpus
        vocab = set()
        for tokens in word_tokens.values():
            vocab.update(tokens)

        if verbose:
            print(f"Initial vocabulary size: {len(vocab)}")

        for i in range(num_merges):
            scores = self._get_pair_scores(word_freqs, word_tokens)
            if not scores:
                # Every word is already a single token
                break

            # Find best pair (highest score); ties go to the first pair seen
            best_pair = max(scores, key=scores.get)
            best_score = scores[best_pair]

            word_tokens, new_token = self._merge_pair(word_tokens, best_pair)
            self.merges[best_pair] = new_token
            vocab.add(new_token)

            if verbose and (i + 1) % 20 == 0:
                print(f"Merge {i+1}: {best_pair} -> '{new_token}' (score: {best_score:.4f})")

        self.vocab = {token: i for i, token in enumerate(sorted(vocab))}

        if verbose:
            print(f"Final vocabulary size: {len(self.vocab)}")

    def tokenize(self, word):
        """Tokenize a single word by replaying the learned merges.

        The word is lower-cased, split into characters, and the merges are
        then applied one at a time in the order they were learned. Replaying
        in that order reproduces the segmentation the word would have had
        during training; applying whichever pair happens to come first in the
        word can pick a late merge over an earlier, higher-priority one.

        Args:
            word: the word to tokenize (the whole string is treated as one word).

        Returns:
            List of token strings; [] for an empty word.
        """
        tokens = self._word_to_chars(word.lower())

        for pair, merged_token in self.merges.items():
            if len(tokens) < 2:
                # Nothing left to merge
                break
            tokens = self._apply_merge(tokens, pair, merged_token)

        return tokens

# Train the toy WordPiece model on three sentences repeated 50 times,
# then show how a few words from that corpus are segmented.
training_text = "\n".join([
    "",
    "Machine learning is a subset of artificial intelligence.",
    "Deep learning uses neural networks with many layers.",
    "Transformers revolutionized natural language processing.",
    "",
]) * 50

wp = SimpleWordPiece()
wp.train(text=training_text, num_merges=50, verbose=True)

test_words = ["learning", "machine", "transformers", "neural"]
print("\nTest tokenization:")
for word in test_words:
    tokens = wp.tokenize(word)
    print("  '{}' -> {}".format(word, tokens))

---

## Key Takeaways

1. **Multilingual handling** varies greatly between tokenizers — use a multilingual tokenizer (e.g. mBERT) for non-English text
2. **Code tokenization** splits identifiers and operators differently than natural language
3. **WordPiece** differs from BPE by scoring candidate merges with a likelihood ratio, `freq(ab) / (freq(a) * freq(b))`, instead of raw pair frequency
4. **Token efficiency** matters for cost and context length in production

---