 Implement a Basic Word-Level Tokenizer
✅ Task
Create a WordTokenizer class with:

Python skeleton:
class WordTokenizer:
    def fit(self, texts): ...
    def tokenize(self, text): ...
    def convert_tokens_to_ids(self, tokens): ...
    def convert_ids_to_tokens(self, ids): ...
💡 Bonus
Add the special tokens <UNK> (unknown word) and <PAD> (padding), and support a fixed vocabulary size (e.g., keep only the 10,000 most frequent tokens).

In [None]:
class WordTokenizer:
    """Word-level tokenizer with a frequency-ranked, size-capped vocabulary.

    The vocabulary always starts with the special tokens ``<PAD>`` (id 0) and
    ``<UNK>`` (id 1). ``fit`` then appends the most frequent corpus tokens
    until ``vocab_size`` entries are reached.

    Args:
        vocab_size: Maximum number of vocabulary entries, special tokens
            included. The special tokens are always kept, so a value smaller
            than their count yields a vocabulary of special tokens only.
        min_freq: Minimum number of occurrences a token needs in the fitted
            corpus to be eligible for the vocabulary.
    """

    PAD_TOKEN = '<PAD>'
    UNK_TOKEN = '<UNK>'

    def __init__(self, vocab_size=10000, min_freq=1):
        self.vocab_size = vocab_size
        self.min_freq = min_freq
        self.special_tokens = [self.PAD_TOKEN, self.UNK_TOKEN]
        self.token_to_id = {}
        self.id_to_token = {}
        # Seed the mappings with the special tokens so that conversion on an
        # unfitted tokenizer maps everything to <UNK> instead of raising.
        self._set_vocab(self.special_tokens)

    def _set_vocab(self, vocab):
        """Rebuild both lookup tables from an ordered list of unique tokens."""
        self.token_to_id = {tok: idx for idx, tok in enumerate(vocab)}
        self.id_to_token = dict(enumerate(vocab))

    def fit(self, texts):
        """Build the vocabulary from an iterable of raw text strings.

        Tokens are ranked by descending frequency (ties keep first-seen
        order), filtered by ``min_freq``, and truncated so that the whole
        vocabulary, special tokens included, fits in ``vocab_size``.
        Calling ``fit`` again replaces the previous vocabulary.
        """
        counter = Counter()
        for text in texts:
            # Go through the method (not the module-level function) so that
            # fitting and later tokenization always agree.
            counter.update(self.tokenize(text))

        # A corpus may literally contain "<PAD>" or "<UNK>"; skipping them
        # here prevents duplicate entries and gaps in the ID range.
        reserved = set(self.special_tokens)
        ranked = [
            tok
            for tok, freq in counter.most_common()
            if freq >= self.min_freq and tok not in reserved
        ]

        # Clamp at zero: a negative slice bound would drop tokens from the
        # end instead of keeping none.
        budget = max(self.vocab_size - len(self.special_tokens), 0)
        self._set_vocab(self.special_tokens + ranked[:budget])

    def tokenize(self, text):
        """Split ``text`` into a list of word tokens.

        Delegates to the module-level ``tokenize`` function, which must be
        defined elsewhere in this file/notebook (not shown here).
        """
        return tokenize(text)

    def convert_tokens_to_ids(self, tokens):
        """Map tokens to vocabulary IDs; unknown tokens map to the <UNK> ID."""
        unk_id = self.token_to_id[self.UNK_TOKEN]
        return [self.token_to_id.get(tok, unk_id) for tok in tokens]

    def convert_ids_to_tokens(self, ids):
        """Map vocabulary IDs back to tokens; unknown IDs become ``'<UNK>'``."""
        return [self.id_to_token.get(i, self.UNK_TOKEN) for i in ids]

# Example: build a vocabulary from `corpus`, then round-trip one sentence
# (text -> tokens -> ids -> tokens) to check the mapping.
wtok = WordTokenizer()
wtok.fit(corpus)

tokens = wtok.tokenize("Deep learning is fun")
ids = wtok.convert_tokens_to_ids(tokens)

print("Tokens:", tokens)
print("IDs:", ids)

# Words missing from the fitted vocabulary come back as '<UNK>'.
recovered = wtok.convert_ids_to_tokens(ids)
print("Recovered:", recovered)
