In [None]:
MAX_LEN = 70 # Maximum_length of words are 66
import re
class WordTokenizer:
    def __init__(self, corpora):
        self.punctuation_set = set("!?.:,;…()[]{}\"'`~@#$%^&*-+=/\\|<>")

        self.corpora_tokens = self.tokenize_corpora(corpora)

        
        self.word2idx = {ch:id for id, ch in enumerate(sorted(set(self.corpora_tokens)))}
        self.idx2word = {id:ch for id, ch in enumerate(sorted(set(self.corpora_tokens)))}

        print(f"Corpora Set :{self.corpora_tokens}")
    
    def tokenize_corpora(self, input):
        tokens = re.findall(r'\w+(?:-\w+)*|[^\w\s]+', input)
        spaced_tokens = []
        for i, token in enumerate(tokens):
            if all(char in self.punctuation_set for char in token):
                # Punctuation: no space before
                spaced_tokens.append(token)
            
            elif token == "UNK":
                spaced_tokens.append(token)

            else:
                # Word/hyphenated word: add space before
                spaced_tokens.append(' ' + token)
        
        return spaced_tokens
    
    def tokenize_text(self, input):
        tokens = re.findall(r'\w+(?:-\w+)*|[^\w\s]+', input)
        spaced_tokens = []
        for i, token in enumerate(tokens):
            if all(char in self.punctuation_set for char in token):
                # Punctuation: no space before
                spaced_tokens.append(token)
            
            elif token not in self.corpora_tokens:
                spaced_tokens.append("UNK")

            else:
                # Word/hyphenated word: add space before
                spaced_tokens.append(' ' + token)
        
        return spaced_tokens
    
    def get_input_ids(self, text):
        tokens = self.tokenize_text(text)
        return [self.word2idx[word] for word in tokens]
    
    def get_characters(self, ids):
        return ''.join([self.idx2word[id] for id in ids])

    def pad_and_create_mask(self, ids, max_length, pad_id=0):
        padded = ids[:max_length] + [pad_id] * max(0, max_length - len(ids))
        mask = [1 if i < len(ids) else 0 for i in range(max_length)]
        return padded, mask
    
    def get_masked_token(self, text, max_length = 128):
        input_ids = self.get_input_ids(text)
        input_ids, masks =  self.pad_and_create_mask(input_ids, max_length)
        return {"input_ids":input_ids,"mask": masks}

In [73]:
import re

class WordTokenizer:
    def __init__(self, corpora, max_len=70):
        self.MAX_LEN = max_len
        self.punctuation_set = set("!?.:,;…()[]{}\"'`~@#$%^&*-+=/\\|<>")

        # Build Vocabulary consisting of special tokens and tokens from corpora
        self.special_tokens = [" UNK", "PAD"]
        self.corpora_tokens = self.tokenize(corpora, for_corpus=True)
        unique_tokens = sorted(set(self.corpora_tokens).union(self.special_tokens).union(self.punctuation_set))

        self.word2idx = {token: idx for idx, token in enumerate(unique_tokens)}
        self.idx2word = {idx: token for token, idx in self.word2idx.items()}

        print(f"Corpora Tokens: {self.corpora_tokens}")
        print(f"Vocabulary Size: {len(self.word2idx)}")

    def tokenize(self, text, for_corpus=False):
        tokens = re.findall(r'\w+(?:-\w+)*|[^\w\s]+', text)
        spaced_tokens = []

        for token in tokens:
            if all(char in self.punctuation_set for char in token):
                # Punctuation: no preceding space
                spaced_tokens.append(token)
            
            else:
                spaced_tokens.append(' ' + token)

        return spaced_tokens

    def get_input_ids(self, text):
        tokens = self.tokenize(text)
        return [self.word2idx.get(token, self.word2idx[" UNK"]) for token in tokens]

    def get_characters(self, ids):
        return ''.join([self.idx2word.get(idx, " UNK") for idx in ids])

    def pad_and_create_mask(self, ids, max_length=None):
        if max_length is None:
            max_length = self.MAX_LEN

        pad_id = self.word2idx["PAD"]
        padded = ids[:max_length] + [pad_id] * max(0, max_length - len(ids))
        mask = [1 if i < len(ids) else 0 for i in range(max_length)]
        return padded, mask

    def get_masked_token(self, text, max_length=None):
        input_ids = self.get_input_ids(text)
        padded_ids, masks = self.pad_and_create_mask(input_ids, max_length)
        return {"input_ids": padded_ids, "mask": masks}

In [77]:
corpus = "this is the very large corpora"
tokenizer = WordTokenizer(corpus)

text = "The sun is hot".lower()
tokens = tokenizer.tokenize(text)
ids = tokenizer.get_input_ids(text)
chars = tokenizer.get_characters(ids)
masked = tokenizer.get_masked_token(text, max_length=10)

print("Tokens:", tokens)
print("Input IDs:", ids)
print("Decoded:", chars)
print("Masked:", masked)

Corpora Tokens: [' this', ' is', ' the', ' very', ' large', ' corpora']
Vocabulary Size: 40
Tokens: [' the', ' sun', ' is', ' hot']
Input IDs: [4, 0, 2, 0]
Decoded:  the UNK is UNK
Masked: {'input_ids': [4, 0, 2, 0, 29, 29, 29, 29, 29, 29], 'mask': [1, 1, 1, 1, 0, 0, 0, 0, 0, 0]}


In [48]:
corpora = "UNK In the very beginging of the world, we were humans"
text = "In the very nnc"
t = WordTokenizer(corpora)
ids = t.get_input_ids(text)
tokens = t.get_characters(ids)
print(ids, tokens)

Punchuation Set :['UNK', ' In', ' the', ' very', ' beginging', ' of', ' the', ' world', ',', ' we', ' were', ' humans']
[10, 10, 10, 10] UNKUNKUNKUNK


In [None]:
corpora = "<unk> In the very beginging of the world, we were humans"
text = "In the very nnc"
punctuation_set = set("!?.:,;…()[]{}\"'`~@#$%^&*-+=/\\|<>")
text = "Wait... what?! GPT-4 is amazing!!!"
tokens = re.findall(r'\w+(?:-\w+)*|[^\w\s]+', text)
spaced_tokens = [' ' + token for token in tokens if not all(char in punctuation_set for char in token)]
spaced_tokens

[' Wait', ' what', ' GPT-4', ' is', ' amazing']

In [28]:

text = "Wait... what?! GPT-4 is amazing!!!"

# Define punctuation set
punctuation_set = set("!?.:,;…()[]{}\"'`~@#$%^&*-+=/\\|<>")

# Tokenize into words (with hyphens) and grouped punctuation
tokens = re.findall(r'\w+(?:-\w+)*|[^\w\s]+', text)

# Add space before tokens, unless it's punctuation
spaced_tokens = []
for i, token in enumerate(tokens):
    if all(char in punctuation_set for char in token):
        # Punctuation: no space before
        spaced_tokens.append(token)
    else:
        # Word/hyphenated word: add space before
        spaced_tokens.append(' ' + token)

# Join into final result
result = ''.join(spaced_tokens)

print("Result:", spaced_tokens)

Result: [' Wait', '...', ' what', '?!', ' GPT-4', ' is', ' amazing', '!!!']


In [7]:
Wt = WordTokenizer(corpora)

In [10]:
ids = Wt.get_input_ids(text)
Wt.get_characters(ids)

'In the very beginging'

In [11]:
token = Wt.token(text)

In [13]:
print(token)

{'input_ids': [2, 0, 6, 0, 7, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'mask': [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
