# Tokenization and Vocabulary Building for Large Language Models

## Reading and Loading a Text File
This code block reads a text file into Python, displaying the total number of characters and a preview of the text for context.


In [None]:
# Read the whole corpus into memory. The encoding is given explicitly so the
# result does not depend on the platform's default text encoding.
with open("verdict.txt", "r", encoding="utf-8") as file:
    raw_text = file.read()
print("Total number of characters:", len(raw_text))
# A slice end of 100 (not 99) is needed to actually show 100 characters.
print(raw_text[:100])  # Displaying the first 100 characters for context


## Tokenizing Text
Here, we split the input text into word tokens using a regular expression; whitespace and the punctuation marks `, . ; : ? !` act as delimiters and are discarded, preparing the text for further processing.


In [None]:
import re

# Tokenize the text using regular expressions: split on runs of whitespace
# and the punctuation characters , . ; : ? ! (the delimiters are discarded).
# re.split returns an empty string when the text starts or ends with a
# delimiter, so empty pieces are filtered out to keep "" out of the tokens.
tokens = [token for token in re.split(r'[\s,.;:?!]+', raw_text) if token]
print("Sample Tokens:", tokens[:10])  # Displaying the first 10 tokens


## Creating a Vocabulary
This step involves creating a vocabulary that maps unique tokens to numerical indices, enabling efficient text encoding.


In [None]:
# Build the vocabulary: collect the distinct tokens, sort them, and number
# them in that order so every token is paired with an integer index.
unique_tokens = list(set(tokens))
unique_tokens.sort()
vocab = dict(zip(unique_tokens, range(len(unique_tokens))))
print("Vocabulary Size:", len(vocab))
print("Sample Vocabulary Entries:", list(vocab.items())[:10])  # first 10 (token, index) pairs


## Mapping Tokens to Token IDs
This block maps each token in the text to its corresponding numerical ID based on the created vocabulary.


In [None]:
# Translate the token sequence into vocabulary IDs, in order. Tokens that
# have no entry in the vocabulary are skipped rather than raising KeyError.
token_ids = list(
    map(vocab.__getitem__, filter(vocab.__contains__, tokens))
)
print("Sample Token IDs:", token_ids[:10])  # the first 10 IDs only


## Implementing a Simple Tokenizer Class
This class-based tokenizer encodes text into token IDs and decodes token IDs back into text, showcasing bidirectional functionality.


In [None]:
class Tokenizer:
    """Word-level tokenizer backed by a token -> ID dictionary.

    Text is split on runs of whitespace and the punctuation characters
    ``, . ; : ? !``; the delimiters themselves are discarded.

    Args:
        vocab: Mapping from token string to integer ID. The dict is stored
            by reference, not copied.
    """

    def __init__(self, vocab):
        self.vocab = vocab
        # Inverse mapping (ID -> token) used by decode(). It is built once
        # here, so entries added to the vocabulary later are not in it.
        self.reverse_vocab = {v: k for k, v in vocab.items()}

    def encode(self, text):
        """Convert *text* into a list of token IDs.

        Tokens missing from the vocabulary map to the ID stored under
        "[UNK]" if the vocabulary has such an entry, otherwise to None.

        Args:
            text: The string to tokenize.

        Returns:
            A list with one entry per non-empty token, in text order.
        """
        # re.split yields "" when the text starts or ends with a delimiter
        # (e.g. a trailing "."); those pieces are not real tokens.
        tokens = [token for token in re.split(r'[\s,.;:?!]+', text) if token]
        unk_id = self.vocab.get("[UNK]")
        return [self.vocab.get(token, unk_id) for token in tokens]

    def decode(self, token_ids):
        """Convert token IDs back into a space-separated string.

        IDs without an entry in the reverse vocabulary are rendered as
        "[UNK]". Punctuation removed by encode() is not restored.

        Args:
            token_ids: Iterable of token IDs.

        Returns:
            The decoded tokens joined by single spaces.
        """
        tokens = [self.reverse_vocab.get(token_id, "[UNK]") for token_id in token_ids]
        return " ".join(tokens)

# Smoke-test the basic tokenizer: encode a short sentence, decode the
# resulting IDs back to text, and show both results.
sample_text = "This is a test."
tokenizer = Tokenizer(vocab)
encoded = tokenizer.encode(sample_text)
decoded = tokenizer.decode(encoded)
print("Encoded:", encoded)
print("Decoded:", decoded)


## Handling Out-of-Vocabulary (OOV) Tokens
This code demonstrates handling unknown words by replacing them with a special `[UNK]` token during encoding.


In [None]:
class TokenizerWithOOV(Tokenizer):
    """Tokenizer that maps out-of-vocabulary tokens to a "[UNK]" entry.

    Args:
        vocab: Mapping from token string to integer ID. It is copied, so the
            caller's dict is left unmodified; the extended vocabulary is
            available as ``self.vocab``.
    """

    def __init__(self, vocab):
        # Copy first: registering "[UNK]" on the caller's dict would leak
        # into every other object sharing it and would assign a fresh,
        # larger ID each time a tokenizer is constructed from that dict.
        super().__init__(dict(vocab))
        if "[UNK]" not in self.vocab:
            # default=-1 makes an empty vocabulary yield ID 0 instead of
            # max() raising ValueError.
            unk_id = max(self.vocab.values(), default=-1) + 1
            self.vocab["[UNK]"] = unk_id
            # The base class built reverse_vocab before "[UNK]" existed;
            # add the entry so ID -> token lookups stay complete.
            self.reverse_vocab[unk_id] = "[UNK]"

    def encode(self, text):
        """Convert *text* into token IDs, using the "[UNK]" ID for unknowns.

        Args:
            text: The string to tokenize.

        Returns:
            A list with one ID per non-empty token, in text order.
        """
        unk_id = self.vocab["[UNK]"]
        # re.split yields "" when the text starts or ends with a delimiter
        # (e.g. a trailing "."); skip those instead of encoding them as
        # unknown words.
        tokens = [token for token in re.split(r'[\s,.;:?!]+', text) if token]
        return [self.vocab.get(token, unk_id) for token in tokens]

# Exercise the OOV-aware tokenizer on a short sample sentence and show the
# encoded IDs together with the text decoded from them.
sample_text = "An unknown word test."
tokenizer_oov = TokenizerWithOOV(vocab)
encoded = tokenizer_oov.encode(sample_text)
decoded = tokenizer_oov.decode(encoded)
print("Encoded with OOV:", encoded)
print("Decoded with OOV:", decoded)


## Enhanced Tokenizer with Special Tokens
This block adds support for special tokens like `<|unk|>` and `<|endoftext|>`, improving the tokenizer's robustness and functionality.


In [None]:
class EnhancedTokenizer:
    """Word-level tokenizer with ``<|unk|>`` and ``<|endoftext|>`` tokens.

    Text is split on runs of whitespace and the punctuation characters
    ``, . ; : ? !``; the delimiters themselves are discarded.

    Args:
        vocab: Mapping from token string to integer ID. It is copied, so the
            caller's dict is left unmodified; the extended vocabulary is
            available as ``self.vocab``.
    """

    def __init__(self, vocab):
        # Copy so that registering the special tokens does not modify the
        # dict the caller passed in.
        self.vocab = dict(vocab)
        for special in ("<|unk|>", "<|endoftext|>"):
            if special not in self.vocab:
                # max() + 1 instead of len(): stays unique even when the
                # existing IDs are not the contiguous range 0..n-1.
                self.vocab[special] = max(self.vocab.values(), default=-1) + 1
        # Built after the special tokens are registered so that decode()
        # can map their IDs back to the token strings.
        self.reverse_vocab = {v: k for k, v in self.vocab.items()}

    def encode(self, text):
        """Convert *text* into token IDs, terminated by ``<|endoftext|>``.

        Tokens missing from the vocabulary are encoded as ``<|unk|>``.

        Args:
            text: The string to tokenize.

        Returns:
            A list of IDs, one per non-empty token, followed by the
            ``<|endoftext|>`` ID.
        """
        unk_id = self.vocab["<|unk|>"]
        # re.split yields "" when the text starts or ends with a delimiter
        # (e.g. a trailing "."); skip those instead of encoding them as
        # unknown words.
        tokens = [token for token in re.split(r'[\s,.;:?!]+', text) if token]
        tokens.append("<|endoftext|>")
        return [self.vocab.get(token, unk_id) for token in tokens]

    def decode(self, token_ids):
        """Convert token IDs back into a space-separated string.

        ``<|endoftext|>`` markers are dropped from the output; IDs without
        an entry in the reverse vocabulary are rendered as ``<|unk|>``.

        Args:
            token_ids: Iterable of token IDs.

        Returns:
            The decoded tokens joined by single spaces.
        """
        tokens = [self.reverse_vocab.get(token_id, "<|unk|>") for token_id in token_ids]
        # Filter the marker before joining so it is removed wherever it
        # occurs, including when it is the only token.
        return " ".join(token for token in tokens if token != "<|endoftext|>")

# Try the enhanced tokenizer on a short sample sentence and show the
# encoded IDs together with the text decoded from them.
sample_text = "A new text example."
enhanced_tokenizer = EnhancedTokenizer(vocab)
encoded = enhanced_tokenizer.encode(sample_text)
decoded = enhanced_tokenizer.decode(encoded)
print("Encoded with Special Tokens:", encoded)
print("Decoded with Special Tokens:", decoded)
