In [1]:
import nltk
from nltk.tokenize import word_tokenize, TreebankWordTokenizer
from nltk.corpus import gutenberg
from nltk.stem import PorterStemmer
import re
from collections import Counter

In [2]:
# Download necessary NLTK data.
# 'gutenberg' supplies the corpus files read later through `gutenberg.raw(...)`.
nltk.download('gutenberg')
# 'punkt' holds NLTK's Punkt sentence-tokenizer data.
# NOTE(review): none of the visible cells call `word_tokenize`/`sent_tokenize`,
# so this download may be unnecessary here -- confirm before removing.
nltk.download('punkt')

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\Praveena\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Praveena\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# 1. Load the raw text of Hamlet from NLTK's Gutenberg corpus
#    (a single play, returned as one string)
shakespeare_text = gutenberg.raw('shakespeare-hamlet.txt')
print("Loaded Shakespeare Text Sample:")
print(shakespeare_text[:200])  # Print the first 200 characters

Loaded Shakespeare Text Sample:
[The Tragedie of Hamlet by William Shakespeare 1599]


Actus Primus. Scoena Prima.

Enter Barnardo and Francisco two Centinels.

  Barnardo. Who's there?
  Fran. Nay answer me: Stand & vnfold
your sel


In [4]:
# 2. Tokenize using the Penn Treebank tokenizer
# The whole play is passed to the tokenizer as a single string.
# NOTE(review): the recorded output below keeps sentence-final periods glued
# to words ('Primus.', 'Prima.', 'Centinels.'). NLTK documents this tokenizer
# as expecting input that is already split into sentences, so tokenizing
# sentence by sentence would likely separate those periods -- confirm, and
# re-run the stemming and BPE cells if this is changed.
print("\nTokenizing using the Penn Treebank tokenizer...")
tokenizer = TreebankWordTokenizer()
tokens = tokenizer.tokenize(shakespeare_text)
print(tokens[:20])  # Print first 20 tokens


Tokenizing using the Penn Treebank tokenizer...
['[', 'The', 'Tragedie', 'of', 'Hamlet', 'by', 'William', 'Shakespeare', '1599', ']', 'Actus', 'Primus.', 'Scoena', 'Prima.', 'Enter', 'Barnardo', 'and', 'Francisco', 'two', 'Centinels.']


In [5]:
# Function to perform Byte Pair Encoding (BPE)
def byte_pair_encoding(tokens, num_merges):
    """Learn up to ``num_merges`` BPE merge rules from ``tokens``.

    Args:
        tokens: Iterable of strings, each a word written as single-space
            separated symbols (for example ``'l o w'``). Repeated entries
            count towards that word's frequency.
        num_merges: Maximum number of merge steps to perform. Learning stops
            early once no adjacent symbol pair is left.

    Returns:
        A ``(vocab, bpe_codes)`` tuple:

        * ``vocab`` -- ``Counter`` mapping each word, in its merged symbol
          form, to its frequency.
        * ``bpe_codes`` -- dict mapping each merged ``(left, right)`` symbol
          pair to the zero-based index of the step that merged it.
    """
    vocab = Counter(tokens)
    bpe_codes = {}

    def get_stats(vocab):
        """Count adjacent symbol pairs, weighted by word frequency."""
        pairs = Counter()
        for word, freq in vocab.items():
            symbols = word.split()
            for left, right in zip(symbols, symbols[1:]):
                pairs[left, right] += freq
        return pairs

    def merge_vocab(pair, vocab):
        """Return a new vocabulary with every occurrence of ``pair`` fused."""
        merged = ''.join(pair)
        # The lookarounds make the pair match only on whole-symbol boundaries,
        # never inside a longer symbol.
        pattern = re.compile(
            r'(?<!\S)' + re.escape(' '.join(pair)) + r'(?!\S)'
        )
        new_vocab = Counter()
        for word, freq in vocab.items():
            # A callable replacement inserts `merged` literally; a plain
            # string would be parsed as a template, so a symbol containing a
            # backslash would be altered or raise re.error.
            w_out = pattern.sub(lambda _match: merged, word)
            # Accumulate instead of assigning so that entries collapsing to
            # the same key keep their combined frequency.
            new_vocab[w_out] += freq
        return new_vocab

    for i in range(num_merges):
        pairs = get_stats(vocab)
        if not pairs:
            break
        # Ties resolve to the pair that was counted first.
        best = max(pairs, key=pairs.get)
        vocab = merge_vocab(best, vocab)
        bpe_codes[best] = i

    return vocab, bpe_codes

In [6]:
# Example usage of BPE: spell every token as space-separated characters,
# then learn at most 100 merge rules from that character-level vocabulary.
token_pairs = [' '.join(token) for token in tokens]
bpe_vocab, bpe_codes = byte_pair_encoding(token_pairs, num_merges=100)

In [7]:
# Stem every token with the Porter stemmer, keeping the original token order.
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, tokens))

In [8]:
# Print results: a short preview of each stage.
# The raw and stemmed previews cover the same first 20 positions, so the two
# lists can be compared item by item.
print("First 20 tokens:", tokens[:20])
print("First 20 stemmed tokens:", stemmed_tokens[:20])
# First 10 (word, frequency) entries of the BPE vocabulary, in the
# vocabulary's own iteration order.
print("BPE Vocabulary Sample:", list(bpe_vocab.items())[:10])


First 20 tokens: ['[', 'The', 'Tragedie', 'of', 'Hamlet', 'by', 'William', 'Shakespeare', '1599', ']', 'Actus', 'Primus.', 'Scoena', 'Prima.', 'Enter', 'Barnardo', 'and', 'Francisco', 'two', 'Centinels.']
First 20 stemmed tokens: ['[', 'the', 'tragedi', 'of', 'hamlet', 'by', 'william', 'shakespear', '1599', ']', 'actu', 'primus.', 'scoena', 'prima.', 'enter', 'barnardo', 'and', 'francisco', 'two', 'centinels.']
BPE Vocabulary Sample: [('[', 6), ('The', 133), ('T ra g ed i e', 3), ('of', 572), ('Ham let', 78), ('b y', 90), ('W i ll i am', 1), ('S ha k es pe are', 1), ('1 5 9 9', 1), (']', 6)]
