In [2]:
import re
import collections
from nltk.tokenize import word_tokenize, TreebankWordTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import gutenberg
import nltk

# Ensure necessary NLTK data is downloaded.
# "gutenberg" supplies the corpus read through `gutenberg.raw(...)` in this notebook.
# "punkt" is presumably required by `word_tokenize` (imported above) — TODO confirm,
# since the tokenization done in this notebook goes through TreebankWordTokenizer.
nltk.download("gutenberg")
nltk.download("punkt")

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\Hema\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hema\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Load the raw text of Macbeth ('shakespeare-macbeth.txt') from NLTK's
# Gutenberg corpus. Only this one play is read, not every Shakespeare file.
shakespeare_text = gutenberg.raw('shakespeare-macbeth.txt')

In [4]:
# Tokenize the text using the Penn Treebank tokenizer.
# `tokens` is the shared input for both the BPE step and the stemming step
# later in the notebook.
treebank_tokenizer = TreebankWordTokenizer()
tokens = treebank_tokenizer.tokenize(shakespeare_text)

In [5]:
# Define a Byte Pair Encoding (BPE) implementation
def byte_pair_encoding(tokens, num_merges):
    """Learn Byte Pair Encoding merges over space-separated symbol strings.

    Every element of ``tokens`` is one word written as whitespace-separated
    symbols (for example ``'l o w'``). Up to ``num_merges`` times, the most
    frequent adjacent symbol pair across the whole vocabulary is fused into
    a single symbol. Ties are resolved in favour of the pair seen first.

    Args:
        tokens: Iterable of words, each a string of symbols joined by spaces.
        num_merges: Maximum number of merge operations to perform. Merging
            stops early once no adjacent symbol pair remains.

    Returns:
        A mapping from each (partially merged) word to its frequency in
        ``tokens``. With zero merges this is the initial ``Counter``;
        otherwise it is a plain ``dict``.
    """
    vocab = collections.Counter(tokens)

    def get_stats(vocab):
        """Count how often each adjacent symbol pair occurs, weighted by word frequency."""
        pairs = collections.defaultdict(int)
        for word, freq in vocab.items():
            symbols = word.split()
            for left, right in zip(symbols, symbols[1:]):
                pairs[left, right] += freq
        return pairs

    def merge_vocab(pair, vocab):
        """Fuse every occurrence of ``pair`` as two whole adjacent symbols."""
        bigram = ' '.join(pair)
        replacement = ''.join(pair)
        # Anchor the match on symbol boundaries. A plain str.replace would
        # also hit the tail of a longer symbol, e.g. merging ('a', 'b') would
        # wrongly turn 'xa b' into 'xab'.
        pattern = re.compile(r'(?<!\S)' + re.escape(bigram) + r'(?!\S)')
        new_vocab = {}
        for word, freq in vocab.items():
            # A callable replacement keeps backslashes in symbols literal.
            new_word = pattern.sub(lambda _match: replacement, word)
            # Accumulate so that two keys collapsing to the same string
            # keep their combined frequency instead of overwriting it.
            new_vocab[new_word] = new_vocab.get(new_word, 0) + freq
        return new_vocab

    # Apply merges, stopping early when nothing is left to merge
    for _ in range(num_merges):
        pairs = get_stats(vocab)
        if not pairs:
            break
        best_pair = max(pairs, key=pairs.get)
        vocab = merge_vocab(best_pair, vocab)

    return vocab

# Spell each token out as space-separated characters: the starting symbols for BPE
bpe_tokens = list(map(' '.join, tokens))

# Run Byte Pair Encoding, allowing at most 10 merge operations
bpe_vocab = byte_pair_encoding(bpe_tokens, num_merges=10)

# Reduce every original token to its Porter stem
porter_stemmer = PorterStemmer()
stemmed_tokens = list(map(porter_stemmer.stem, tokens))

# Show the first ten entries of each result for a quick comparison
print("Original Tokens (Sample):", tokens[:10])
print("BPE Vocabulary (Sample):", list(bpe_vocab.items())[:10])
print("Stemmed Tokens (Sample):", stemmed_tokens[:10])

Original Tokens (Sample): ['[', 'The', 'Tragedie', 'of', 'Macbeth', 'by', 'William', 'Shakespeare', '1603', ']']
BPE Vocabulary (Sample): [('[', 4), ('T h e', 118), ('T r a g e d i e', 1), ('o f', 314), ('M a c b e th', 53), ('b y', 36), ('W i l l i a m', 1), ('S h a k es p e a re', 1), ('1 6 0 3', 1), (']', 4)]
Stemmed Tokens (Sample): ['[', 'the', 'tragedi', 'of', 'macbeth', 'by', 'william', 'shakespear', '1603', ']']
