In [32]:
import string
import re

class CustomTokenizer:
    """Split a sentence into tokens (words, punctuation marks or n-grams).

    The sentence is tokenized once, at construction time; the result is
    stored in ``self.tokens``. Instances support ``len()``, can be iterated
    any number of times, and still accept ``next()`` directly.
    """

    def __init__(self, raw_sen: str, use_punctuation_as_tokens: bool = True,
                 punctuations=string.punctuation,
                 token_boundaries=None, token_delimeter: str = "<SPLIT>",
                 n_grams: int = 1):
        """Store the configuration and tokenize ``raw_sen`` immediately.

        Args:
            raw_sen: Sentence to tokenize.
            use_punctuation_as_tokens: When True, every punctuation mark
                becomes a token of its own. When False, punctuation marks
                are removed from the sentence (marks that are also token
                boundaries still split tokens).

                Keeping punctuation helps when it carries meaning
                (sentiment analysis, sarcasm detection, parsing, models
                trained on social-media text). Dropping it is preferable
                when meaning comes mainly from the words (topic modeling,
                bag-of-words) or when the larger vocabulary is a concern.
                When in doubt, evaluate both settings on the actual task
                and dataset; a pretrained model may need fine-tuning to
                handle punctuation tokens well.
            punctuations: Iterable of punctuation strings; a plain string
                is treated as one mark per character. Defaults to
                ``string.punctuation``. Pass a smaller set to keep only
                the marks that are frequent or relevant for the task.
            token_boundaries: Strings that separate tokens. Defaults to
                ``[" ", "-"]``. Languages that do not separate words with
                spaces need different boundaries.
            token_delimeter: Internal marker substituted for every token
                boundary before the sentence is split. It must be
                non-empty and should not be a string that can occur in
                the sentence.
            n_grams: Size of the n-grams to build. Values of 1 or less
                yield plain string tokens; larger values turn
                ``self.tokens`` into a list of tuples of ``n_grams``
                consecutive tokens.
        """
        self.raw = raw_sen
        self._punc = punctuations
        self.use_punc_as_tokens = use_punctuation_as_tokens
        # Copy into a fresh list per instance so that no default object is
        # shared between instances and the caller's list is not aliased.
        if token_boundaries is None:
            self._token_boundary = [" ", "-"]
        else:
            self._token_boundary = list(token_boundaries)
        self._delimiter = token_delimeter
        self.ngrams = n_grams
        self._index = 0  # cursor used only by direct next() calls
        self._tokenize()

    def __str__(self) -> str:
        return f"Tokenize --> {self.raw}"

    def _tokenize(self) -> None:
        """Populate ``self.tokens`` from ``self.raw``."""
        work_sent = self.raw

        # Punctuation is handled before boundaries are substituted, so the
        # delimiter marker (which may itself contain punctuation characters,
        # e.g. "<SPLIT>") is never touched by this step.
        for punc in self._punc:
            if self.use_punc_as_tokens:
                # Pad the mark with spaces so it detaches from its word.
                work_sent = work_sent.replace(punc, " " + punc + " ")
            elif punc not in self._token_boundary:
                # Drop the mark; marks that double as boundaries are left
                # in place so they still split tokens below.
                work_sent = work_sent.replace(punc, "")

        # Mark every token boundary with the delimiter.
        for boundary in self._token_boundary:
            work_sent = work_sent.replace(boundary, self._delimiter)

        # Strip first, then filter: a fragment made only of whitespace must
        # not survive as an empty token.
        pieces = (piece.strip() for piece in work_sent.split(self._delimiter))
        self.tokens = [piece for piece in pieces if piece]

        if self.ngrams > 1:
            size = self.ngrams
            # The range is empty when there are fewer tokens than ``size``.
            self.tokens = [
                tuple(self.tokens[start:start + size])
                for start in range(len(self.tokens) - size + 1)
            ]

    def __len__(self) -> int:
        """Return the number of tokens (0 when there are none)."""
        return len(self.tokens)

    def __iter__(self):
        """Return a fresh iterator so the tokens can be looped over repeatedly."""
        return iter(self.tokens)

    def __next__(self):
        """Return the next token for direct ``next(tokenizer)`` calls.

        Kept for backward compatibility; it advances an internal cursor that
        is independent of the iterators returned by ``__iter__``.
        """
        if self._index < len(self.tokens):
            result = self.tokens[self._index]
            self._index += 1
            return result
        raise StopIteration




In [33]:
# Demo: build bigrams from a hyphenated sentence and print one per line.
dtoken = CustomTokenizer(
    "This is a sentence-with-hyphens.",
    n_grams=2,
)
for bigram in dtoken:
    print(bigram)

('This', 'is')
('is', 'a')
('a', 'sentence')
('sentence', 'with')
('with', 'hyphens')
('hyphens', '.')


In [21]:
import unittest

class TestCustomTokenizer(unittest.TestCase):
    """Checks CustomTokenizer with its default settings."""

    def _assert_tokens(self, sentence, expected):
        # Shared check: both the token list and the reported length must
        # match the expectation.
        tokenizer = CustomTokenizer(sentence)
        self.assertEqual(tokenizer.tokens, expected)
        self.assertEqual(len(tokenizer), len(expected))

    def test_word_tokenization(self):
        # Space-separated words, with the final period as its own token.
        self._assert_tokens(
            "This is a simple sentence.",
            ["This", "is", "a", "simple", "sentence", "."],
        )

    def test_punctuation_tokenization(self):
        # A comma and an exclamation mark each become a separate token.
        self._assert_tokens(
            "This is a sentence, with punctuation!",
            ["This", "is", "a", "sentence", ",", "with", "punctuation", "!"],
        )

    def test_hyphen_tokenization(self):
        # Hyphens split the words and do not appear among the tokens.
        self._assert_tokens(
            "This-is-a-sentence-with-hyphens.",
            ["This", "is", "a", "sentence", "with", "hyphens", "."],
        )


# Run the tests in-process: the placeholder argv keeps unittest from parsing
# the host process's command line, and exit=False stops it calling sys.exit().
unittest.main(exit=False, argv=[''])

...
----------------------------------------------------------------------
Ran 3 tests in 0.002s

OK


<unittest.main.TestProgram at 0x7e16ba1ab310>