### 21i-0466 Shahmeer Ali Akhtar
### LOADING LIBRARIES


In [6]:
import re
import os
import pickle
from collections import Counter


## BytePairEncoding Class

### Constructor
- `__init__(self, vocab_size=1000)`: Initializes the BPE encoder with a specified vocabulary size. The default size is set to 1000.

### Attributes
- `vocab_size`: The maximum size of the vocabulary for BPE.
- `bpe_vocab`: Dictionary to store the BPE vocabulary after training.
- `normalization_dict`: Dictionary for normalizing specific words during preprocessing.

### Methods

#### `preprocess_text(self, text)`
- Preprocesses the input text by removing numbers, converting to lowercase, removing non-alphabetic characters, and normalizing text based on the `normalization_dict`.

#### `tokenize(self, text)`
- Splits the preprocessed text into tokens based on whitespace.

#### `get_vocab(self, texts)`
- Generates a frequency dictionary of tokens from the list of input texts.

#### `get_stats(self, vocab)`
- Calculates the frequency of adjacent symbol pairs in the vocabulary.

#### `merge_vocab(self, pair, vocab)`
- Merges the most frequent pair of symbols in the vocabulary to create a new vocabulary with the merged symbol.

#### `train(self, texts)`
- Trains the BPE model using the provided texts by iteratively merging the most frequent pairs until the vocabulary size is reached or no more pairs can be merged.

#### `print_vocab_size(self)`
- Prints the total number of unique entries in the BPE vocabulary.

#### `encode(self, text)`
- Encodes a text into subwords based on the trained BPE vocabulary. Tokens not found in the vocabulary are marked as `<UNK>`.

#### `decode(self, encoded_text)`
- Decodes the list of encoded tokens back into a string.

#### `evaluate_bpe(self, test_texts)`
- Evaluates the BPE model on test texts to calculate the ratio of unknown tokens, the coverage ratio of known tokens, and the unique token coverage.

#### `save_model(self, file_path)`
- Saves the trained BPE vocabulary to a file using Python's `pickle` module.

#### `load_model(self, file_path)`
- Loads a BPE vocabulary from a file into the `bpe_vocab` attribute.


In [7]:
class BytePairEncoding:
    def __init__(self, vocab_size=1000):
        self.vocab_size = vocab_size
        self.bpe_vocab = {}
        self.normalization_dict = {
            "mujhe": "mujhay",
            "kya": "kia",
            "mei":"ma",
            "main":"ma",
            "mai":"ma"
        }

    def preprocess_text(self, text):
        text = re.sub(r'^\d+\.', '', text)
        text = text.lower()
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        for key, value in self.normalization_dict.items():
            text = text.replace(key, value)
        return text

    def tokenize(self, text):
        return text.split()

    def get_vocab(self, texts):
        vocab = Counter()
        for text in texts:
            tokens = self.tokenize(text)
            for token in tokens:
                vocab[token] += 1
        return vocab

    def get_stats(self, vocab):
        pairs = Counter()
        for word, freq in vocab.items():
            symbols = word.split()
            for i in range(len(symbols) - 1):
                pair = (symbols[i], symbols[i + 1])
                pairs[pair] += freq
        return pairs

    def merge_vocab(self, pair, vocab):
        replacement = ''.join(pair)
        new_vocab = {}
        bigram = ' '.join(pair)
        for word in vocab:
            new_word = re.sub(r'\b' + re.escape(bigram) + r'\b', replacement, word)
            new_vocab[new_word] = vocab[word]
        return new_vocab

    def train(self, texts):
        vocab = self.get_vocab(texts)
        self.bpe_vocab = vocab.copy()
        for i in range(self.vocab_size):
            pairs = self.get_stats(self.bpe_vocab)
            if not pairs:
                break
            most_frequent_pair = pairs.most_common(1)[0][0]
            self.bpe_vocab = self.merge_vocab(most_frequent_pair, self.bpe_vocab)

    def print_vocab_size(self):
        print("Total unique vocabulary size:", len(self.bpe_vocab))

    def encode(self, text):
        words = self.tokenize(text)
        encoded_text = []
        for word in words:
            if word in self.bpe_vocab:
                encoded_text.append(word)
            else:
                encoded_text.append('<UNK>')
        return encoded_text

    def decode(self, encoded_text):
        return ' '.join(encoded_text)

    def evaluate_bpe(self, test_texts):
        total_tokens = 0
        unk_tokens = 0
        covered_tokens = 0
        vocab_tokens = set(self.bpe_vocab.keys())

        test_vocab = Counter()
        for text in test_texts:
            tokens = self.tokenize(text)
            test_vocab.update(tokens)

        for token, freq in test_vocab.items():
            total_tokens += freq
            if token in vocab_tokens:
                covered_tokens += freq
            else:
                unk_tokens += freq

        unk_ratio = unk_tokens / total_tokens if total_tokens > 0 else 0
        coverage_ratio = covered_tokens / total_tokens if total_tokens > 0 else 0
        unique_coverage = len(vocab_tokens.intersection(test_vocab.keys())) / len(test_vocab.keys())

        print(f"Total tokens: {total_tokens}")
        print(f"Unknown token occurrences: {unk_tokens}")
        print(f"Unknown token ratio: {unk_ratio:.4f}")
        print(f"Coverage ratio: {coverage_ratio:.4f}")
        print(f"Unique token coverage: {unique_coverage:.4f}")

    def save_model(self, file_path):
        with open(file_path, 'wb') as file:
            pickle.dump(self.bpe_vocab, file)

    def load_model(self, file_path):
        with open(file_path, 'rb') as file:
            self.bpe_vocab = pickle.load(file)


### Loading the dairies from the folder
And displaying the total vocabulary size that the model has learned 


In [8]:
def load_texts_from_folder(folder_path):
    file_paths = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.txt')]
    texts = []
    for path in file_paths:
        try:
            with open(path, 'r', encoding='utf-8') as file:
                text = file.read()
                processed_text = bpe_processor.preprocess_text(text)
                texts.append(processed_text)
        except Exception as e:
            print(f"Error processing file {path}: {e}")
    return texts
folder_path = "D:\\NLP\\i21-0466_A2\\corpus" #D:\NLP\i21-0466_A2\corpus
bpe_processor = BytePairEncoding(vocab_size=1000)
processed_texts = load_texts_from_folder(folder_path)
bpe_processor.train(processed_texts)
bpe_processor.print_vocab_size()


Total unique vocabulary size: 2426


### Saving model and then loading it for further use


In [9]:
model_path = "bpe_model.pkl"
bpe_processor.save_model(model_path)
bpe_processor.load_model(model_path)


### Example Usage of the model

In [10]:
test_folder_path = 'D:\\NLP\\i21-0466_A2\\test'

test_files = [f for f in os.listdir(test_folder_path) if f.endswith('.txt')]

test_texts = []
for file_name in test_files:
    with open(os.path.join(test_folder_path, file_name), 'r', encoding='utf-8') as file:
        text = file.read()
        test_texts.append(text)

bpe_processor.evaluate_bpe(test_texts)

Total tokens: 1604
Unknown token occurrences: 418
Unknown token ratio: 0.2606
Coverage ratio: 0.7394
Unique token coverage: 0.5469


### Evaluation Criteria

The `evaluate_bpe` method assesses the performance of the BPE model using the following metrics:

- **Total Tokens**: Counts all tokens in the test texts. This metric indicates the size of the data being evaluated.

- **Unknown Token Occurrences**: Counts how many tokens from the test texts are not recognized by the model's vocabulary. A lower number suggests better model performance.

- **Unknown Token Ratio**: Measures the proportion of tokens that are unrecognized by the model compared to the total tokens. A lower ratio indicates better coverage of the test data by the model.

- **Coverage Ratio**: Indicates the percentage of total tokens that the model's vocabulary can recognize and represent. Higher coverage ratios reflect better model accuracy.

- **Unique Token Coverage**: Evaluates how many unique tokens from the test texts are covered by the model's vocabulary. Higher values show a more diverse and effective vocabulary.
