### loading Libraries that are going to be used

In [1]:
import os
import re
from collections import defaultdict, Counter
import pickle

# Preprocessing DataSet 
### The preprocess_text function preprocesses the input text by converting it to lowercase and removing unnecessary punctuation.

In [2]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s.,!?]', '', text)
    return text

# BPE CLASS
The BPETokenizer class implements Byte Pair Encoding (BPE) for tokenizing text. It builds a vocabulary, merges frequent pairs of characters or tokens, and encodes/decodes text into token IDs.
## Methods
# __init__(self, vocab_size=1000)

Initializes the tokenizer with a specified vocabulary size.

Default: vocab_size = 1000.

build_vocab(self, text)

Builds the initial vocabulary from unique characters in the text.

Assigns IDs starting from 1.

Adds <UNK> token with ID 0.

# get_stats(self, tokens)

Counts the frequency of adjacent token pairs in the input tokens.

Returns a dictionary of pairs and their counts.

# merge_vocab(self, tokens, pair)

Merges the most frequent pair of tokens in the input tokens.

Returns updated tokens after merging.

# train(self, text)

Trains the BPE model on the input text.

Builds vocabulary, performs merges, and saves the model to bpe_model.pkl.

# encode(self, text)

Encodes the input text into token IDs using the trained BPE model.

Returns a list of token IDs.

# decode(self, ids)

Decodes a list of token IDs back into text.

Returns the decoded text.



In [3]:
class BPETokenizer:
    def __init__(self, vocab_size=1000):
        self.vocab_size = vocab_size
        self.vocab = {}
        self.merges = {}
        self.unk_token = "<UNK>"
        self.unk_id = 0

    def build_vocab(self, text):
        chars = sorted(list(set(text)))
        self.vocab = {char: idx + 1 for idx, char in enumerate(chars)}  
        self.vocab[self.unk_token] = self.unk_id 

    def get_stats(self, tokens):
        pairs = defaultdict(int)
        for word in tokens:
            symbols = word.split()
            for i in range(len(symbols) - 1):
                pairs[(symbols[i], symbols[i + 1])] += 1
        return pairs

    def merge_vocab(self, tokens, pair):
        new_tokens = []
        for word in tokens:
            new_word = []
            i = 0
            while i < len(word.split()):
                if i < len(word.split()) - 1 and (word.split()[i], word.split()[i + 1]) == pair:
                    new_word.append(pair[0] + pair[1])
                    i += 2
                else:
                    new_word.append(word.split()[i])
                    i += 1
            new_tokens.append(" ".join(new_word))
        return new_tokens

    def train(self, text):
        text = preprocess_text(text)
        self.build_vocab(text)
        words = text.split()
        tokens = [" ".join(word) for word in words]
        while len(self.vocab) < self.vocab_size:
            stats = self.get_stats(tokens)
            if not stats:
                break
            best_pair = max(stats, key=stats.get)
            tokens = self.merge_vocab(tokens, best_pair)
            new_token = best_pair[0] + best_pair[1]
            self.vocab[new_token] = len(self.vocab)
            self.merges[best_pair] = new_token
        with open("bpe_model.pkl", "wb") as f:
            pickle.dump({"vocab": self.vocab, "merges": self.merges}, f)

    def encode(self, text):
        text = preprocess_text(text)
        words = text.split()
        tokens = [" ".join(word) for word in words]
        for pair, merge in self.merges.items():
            tokens = self.merge_vocab(tokens, pair)
        ids = []
        for token in tokens:
            if token in self.vocab:
                ids.append(self.vocab[token])
            else:
                ids.append(self.unk_id)
        return ids

    def decode(self, ids):
        id_to_token = {v: k for k, v in self.vocab.items()}
        tokens = [id_to_token.get(id, self.unk_token) for id in ids]
        text = " ".join(tokens)
        return text

### Loading DataSet function


In [4]:
def load_dataset_from_folder(folder_path):
    text = ""
    for file_name in os.listdir(folder_path):
        if file_name.endswith(".txt"):
            with open(os.path.join(folder_path, file_name), "r", encoding="utf-8") as f:
                text += f.read() + " "
    return text

### Calculating Accuracy

In [5]:
def calculate_accuracy(bpe, test_text):
    tokens = bpe.encode(test_text)
    unk_count = tokens.count(bpe.unk_id)
    total_tokens = len(tokens)
    if total_tokens == 0:
        return 1.0 
    accuracy = (total_tokens - unk_count) / total_tokens
    return accuracy

# Main Function
## Load Dataset:
Loads all .txt files from the corpus_folder and combines their content into a single string.
## Train BPE Tokenizer:
Initializes a BPETokenizer with a vocabulary size of 1000.
Trains the tokenizer on the combined text.
## Evaluate on Test Files:
Iterates over all .txt files in the test_folder.
## For each test file:
Encodes the text using the trained BPE tokenizer.
Calculates accuracy by comparing the number of <UNK> tokens to the total number of tokens.
Computes the combined accuracy as the average accuracy across all test files.
## Output:
Prints the vocabulary size of the trained model.
Prints the combined accuracy as a percentage.



In [6]:
if __name__ == "__main__":

    corpus_folder = "D:\\NLP\\i21-0711_A2\\corpus"  
    text = load_dataset_from_folder(corpus_folder)
    bpe = BPETokenizer(vocab_size=1000)
    bpe.train(text)
    print(f"Vocabulary Size: {len(bpe.vocab)}")
    test_folder = "D:\\NLP\\i21-0711_A2\\test"  
    total_accuracy = 0.0
    total_files = 0
    for file_name in os.listdir(test_folder):
        if file_name.endswith(".txt"):
            test_file_path = os.path.join(test_folder, file_name)
            with open(test_file_path, "r", encoding="utf-8") as f:
                test_text = f.read()
            accuracy = calculate_accuracy(bpe, test_text)
            total_accuracy += accuracy
            total_files += 1
    if total_files > 0:
        combined_accuracy = total_accuracy / total_files *100
        print(f"Combined Accuracy: {combined_accuracy:.4f} %")
    else:
        print("No test files found.")

Vocabulary Size: 1000
Combined Accuracy: 65.5006 %
