In [27]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import string 
nltk.download("punkt")
nltk.download("stopwords")

def normalize_text(text):
    stop_words = set(stopwords.words("english"))  
    sentences = sent_tokenize(text)
    processed = []

    for sentence in sentences:
        words = word_tokenize(sentence, preserve_line=True)
        # words = nltk.tokenize.casual_tokenize(sentence)


        # Remove numbering at the start of the sentence
        if words and words[0].isdigit():
            words = words[1:]

        # Remove stopwords
        words = [word for word in words if word.lower() not in stop_words and word.lower() not in string.punctuation]
        print(words)

        # Reconstruct the sentence
        if words:
            cleaned_sentence = " ".join(words)
            processed.append(cleaned_sentence)

    # Join sentences into a structured paragraph
    normalized_text = " ".join(processed).strip().lower()

    return normalized_text

# Example usage
text = """  1.   This    is a   sample   text.\n 
           2. It  doesn't don't won't contains unnecessary spaces.   
   3.  Also,   numbering at the beginning of sentences.   """

print(normalize_text(text))


[]
['sample', 'text']
[]
['contains', 'unnecessary', 'spaces']
[]
['Also', 'numbering', 'beginning', 'sentences']
sample text contains unnecessary spaces also numbering beginning sentences


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [41]:
import nltk
from collections import Counter

def learn_bpe(text, num_merges=50):
    """Learns BPE merges from input text."""
    text = text.encode('utf-8', errors='replace').decode('utf-8')  # Normalize text
    tokens = list(text)  # Character-level tokenization
    merges = {}

    
    print('text', text)
    print("tokens", tokens)

    
    for _ in range(num_merges):
        bigrams = Counter(zip(tokens, tokens[1:]))
        if not bigrams:
            break
        most_common = max(bigrams, key=bigrams.get)
        new_token = ''.join(most_common)
        merges[most_common] = new_token
        tokens.append(new_token)  # Append new merged token
    print(merges)
    return merges

def encode(text, merges):
    """Encodes text using BPE merges."""
    text = text.encode('utf-8', errors='replace').decode('utf-8')  # Normalize again
    tokens = list(text)
    
    for bigram, new_token in merges.items():
        i = 0
        while i < len(tokens) - 1:
            if (tokens[i], tokens[i + 1]) == bigram:
                tokens[i] = new_token  # Merge into new token
                del tokens[i + 1]
            else:
                i += 1

    return [t.encode('utf-8') for t in tokens]  # Convert to byte representation

def decode(encoded_tokens, merges):
    """Decodes a sequence of byte tokens back to text."""
    tokens = [t.decode('utf-8') for t in encoded_tokens]  # Convert back to string
    rev_merges = {v: k for k, v in merges.items()}  # Reverse merges

    for new_token, bigram in rev_merges.items():
        i = 0
        while i < len(tokens):
            if tokens[i] == new_token:
                tokens[i:i+1] = bigram  # Expand back to original
            i += 1

    return ''.join(tokens)

# Example usage
corpus = "banana banana bandana band bandit"
merges = learn_bpe(corpus, num_merges=10)

encoded = encode("bandana", merges)
decoded = decode(encoded, merges)

print("Merges:", merges)
print("Encoded:", encoded)
print("Decoded:", decoded)


text banana banana bandana band bandit
tokens ['b', 'a', 'n', 'a', 'n', 'a', ' ', 'b', 'a', 'n', 'a', 'n', 'a', ' ', 'b', 'a', 'n', 'd', 'a', 'n', 'a', ' ', 'b', 'a', 'n', 'd', ' ', 'b', 'a', 'n', 'd', 'i', 't']
{('a', 'n'): 'an'}
Merges: {('a', 'n'): 'an'}
Encoded: [b'b', b'an', b'd', b'an', b'a']
Decoded: bandana


In [62]:
import nltk
from collections import Counter

def learn_bpe(text, num_merges=50):
    """Learns BPE merges directly on byte-level encoding."""
    text = text.encode('utf-8', errors='replace')  # Convert text to bytes
    tokens = list(text)  # Split into byte-level tokens
    merges = {}
    # print('text', text)
    # print("tokens", tokens
    new_token_val = 256
    for _ in range(num_merges):
        bigrams = Counter(zip(tokens, tokens[1:]))
        if not bigrams:
            break
        most_common = max(bigrams, key=bigrams.get)
        # new_token = max(tokens) + 1  # Assign new byte value
        new_token = new_token_val
        merges[most_common] = new_token
        i = 0
        while i < len(tokens)-1:
            if (tokens[i], tokens[i+1]) == most_common:
                tokens[i] = most_common
                del tokens[i+1]
            else:
                i+=1
        new_token_val+=1

    return merges

def encode(text, merges):
    """Encodes text using learned BPE merges."""
    text = text.encode('utf-8', errors='replace')  # Convert to byte form
    tokens = list(text)

    print('text', text)
    print("tokens", tokens)

    for bigram, new_token in merges.items():
        i = 0
        while i < len(tokens) - 1:
            if (tokens[i], tokens[i + 1]) == bigram:
                tokens[i] = new_token  # Replace bigram with new token
                del tokens[i + 1]
            else:
                i += 1

    return tokens  # Keep bytes as integers

def decode(encoded_tokens, merges):
    """Decodes byte-tokenized text back to its original form."""
    rev_merges = {v: k for k, v in merges.items()}  # Reverse merges

    i = 0
    while i < len(encoded_tokens):
        if encoded_tokens[i] in rev_merges:
            encoded_tokens[i:i+1] = rev_merges[encoded_tokens[i]]  # Expand back
        else:
            i += 1

    return bytes(encoded_tokens).decode('utf-8', errors='replace')  # Convert back to text

# Example usage
corpus = "banana banana bandana band bandit"
merges = learn_bpe(corpus, num_merges=10)

encoded = encode("bandana", merges)
decoded = decode(encoded, merges)

print("Merges:", merges)
print("Encoded:", encoded)  # Byte-level encoding
print("Decoded:", decoded)  # Should return 'bandana'


text b'banana banana bandana band bandit'
text b'bandana'
tokens [98, 97, 110, 100, 97, 110, 97]
Merges: {(97, 110): 256, (98, (97, 110)): 257, (32, (98, (97, 110))): 258, ((97, 110), 97): 259, (((97, 110), 97), (32, (98, (97, 110)))): 260, ((((97, 110), 97), (32, (98, (97, 110)))), 100): 261, ((98, (97, 110)), (((97, 110), 97), (32, (98, (97, 110))))): 262, (((98, (97, 110)), (((97, 110), 97), (32, (98, (97, 110))))), ((((97, 110), 97), (32, (98, (97, 110)))), 100)): 263, ((((98, (97, 110)), (((97, 110), 97), (32, (98, (97, 110))))), ((((97, 110), 97), (32, (98, (97, 110)))), 100)), ((((97, 110), 97), (32, (98, (97, 110)))), 100)): 264, (((((98, (97, 110)), (((97, 110), 97), (32, (98, (97, 110))))), ((((97, 110), 97), (32, (98, (97, 110)))), 100)), ((((97, 110), 97), (32, (98, (97, 110)))), 100)), (32, (98, (97, 110)))): 265}
Encoded: [98, 97, 110, 100, 97, 110, 97]
Decoded: bandana
