In [1]:
# Import required libraries
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize, CharTokenize
import spacy
from transformers import BertTokenizer, ByteLevelBPETokenizer
from tokenizers import BPE
import re

# Download required NLTK data.
# 'punkt' is what older NLTK releases load for word_tokenize/sent_tokenize;
# newer releases look for 'punkt_tab' instead, so request both. On a release
# that does not know one of the names, download() reports the problem and
# returns False rather than raising.
nltk.download('punkt')
nltk.download('punkt_tab')

class TokenizationDemo:
    """Side-by-side demos of word, character, subword and sentence tokenization.

    Each public method takes a text and returns a dict that maps a short
    strategy label to the list of tokens that strategy produced.
    """

    def __init__(self):
        # Load both third-party models once; the methods below reuse them.
        self.nlp = spacy.load('en_core_web_sm')
        self.bert_tokenizer = BertTokenizer.from_pretrained(
            'bert-base-uncased'
        )

    def word_tokenization(self, text):
        """
        Split ``text`` into word tokens using four strategies.

        Returns a dict with the keys 'nltk', 'spacy', 'basic' and 'regex',
        in that order. The 'regex' strategy lowercases the text before
        matching; 'basic' is a plain whitespace split.
        """
        results = {}

        # NLTK word tokenizer
        results['nltk'] = word_tokenize(text)

        # SpaCy pipeline: one entry per token of the parsed document
        parsed = self.nlp(text)
        results['spacy'] = [tok.text for tok in parsed]

        # Whitespace split
        results['basic'] = text.split()

        # Runs of word characters in the lowercased text
        lowered = text.lower()
        results['regex'] = re.findall(r'\b\w+\b', lowered)

        return results

    def character_tokenization(self, text):
        """
        Split ``text`` into single characters in three variants.

        Returns a dict with the keys 'basic' (every character), 'no_space'
        (whitespace characters dropped) and 'alphanum' (only characters for
        which ``isalnum()`` is true), in that order.
        """
        every_char = [*text]
        without_whitespace = list(filter(lambda ch: not ch.isspace(), text))
        only_alnum = list(filter(lambda ch: ch.isalnum(), text))

        return dict(
            basic=every_char,
            no_space=without_whitespace,
            alphanum=only_alnum,
        )

    def subword_tokenization(self, text):
        """
        Split ``text`` into subword pieces with the BERT tokenizer.

        Returns a dict with the single key 'bert'.
        """
        return {'bert': self.bert_tokenizer.tokenize(text)}

    def sentence_tokenization(self, text):
        """
        Split ``text`` into sentences with NLTK and with SpaCy.

        Returns a dict with the keys 'nltk' and 'spacy', in that order.
        """
        by_method = {'nltk': sent_tokenize(text)}

        # SpaCy exposes sentences as spans of the parsed document.
        parsed = self.nlp(text)
        by_method['spacy'] = [span.text for span in parsed.sents]

        return by_method

# Usage Example
def main():
    """Run every tokenization demo on a sample text and print the results.

    Builds one TokenizationDemo, then for each family (word, character,
    subword, sentence) tokenizes its sample text and prints a heading
    followed by one "method: tokens" line per strategy.
    """
    tokenizer = TokenizationDemo()

    # Sample texts
    word_text = "Machine learning is fascinating! AI will change the world."
    char_text = "NLP"
    subword_text = "preprocessing untokenizable chatbots"
    # Built from adjacent literals rather than one triple-quoted string so the
    # source file's indentation and line breaks do not become part of the text
    # handed to the sentence tokenizers.
    paragraph = (
        "Natural Language Processing is fascinating. "
        "It helps computers understand human language. "
        "This is a multi-sentence text."
    )

    # (heading, tokenization method, sample text), in output order.
    demos = (
        ("\nWord Tokenization:", tokenizer.word_tokenization, word_text),
        ("\nCharacter Tokenization:", tokenizer.character_tokenization, char_text),
        ("\nSubword Tokenization:", tokenizer.subword_tokenization, subword_text),
        ("\nSentence Tokenization:", tokenizer.sentence_tokenization, paragraph),
    )

    for heading, tokenize, sample in demos:
        results = tokenize(sample)
        print(heading)
        for method, tokens in results.items():
            print(f"{method}: {tokens}")

# Run the demo only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()


ModuleNotFoundError: No module named 'nltk'