In [1]:
import requests
from bs4 import BeautifulSoup
import os
import re

# Directory that receives the downloaded Hindi corpus (saved as 'text.txt').
# Built with os.path.join so the path separator matches the host OS.
DATA_DIR = os.path.join('data', 'hindi')

def clean_text(text):
    """Normalize scraped text down to Devanagari plus minimal punctuation.

    Three regex passes run in order:
      1. anything that looks like an HTML tag becomes a space;
      2. every character outside U+0900-U+097F, whitespace and ``. , ? !``
         becomes a space;
      3. each run of whitespace collapses to a single space.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string with leading and trailing whitespace removed.
    """
    substitutions = (
        # HTML tags
        (r'<[^>]+>', ' '),
        # everything except the Hindi Unicode block and basic punctuation
        (r'[^\u0900-\u097F\s\.,\?!]', ' '),
        # runs of whitespace
        (r'\s+', ' '),
    )

    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

def download_hindi_text(urls=None, timeout=10):
    """Download Hindi Wikipedia articles and save their cleaned text.

    For every URL the page is fetched, the text of all ``<p>`` elements under
    the ``mw-content-text`` container is joined and passed through
    ``clean_text``. The per-page results are joined with single spaces and
    written to ``<DATA_DIR>/text.txt`` (UTF-8).

    Pages that fail to download are reported on stdout and skipped. If no
    page yields any text, nothing is written, so an existing corpus file is
    not overwritten with an empty one.

    Args:
        urls: Optional iterable of page URLs. Defaults to a built-in list of
            Hindi Wikipedia articles.
        timeout: Seconds to wait for each HTTP request. Without a timeout,
            ``requests`` can block indefinitely on a stalled connection.

    Returns:
        The combined cleaned text (an empty string if nothing was downloaded).
    """
    if urls is None:
        # URLs of some Hindi Wikipedia featured articles
        urls = [
            'https://hi.wikipedia.org/wiki/भारत',
            'https://hi.wikipedia.org/wiki/हिन्दी',
            'https://hi.wikipedia.org/wiki/दिल्ली',
            'https://hi.wikipedia.org/wiki/महात्मा_गांधी',
            'https://hi.wikipedia.org/wiki/योग',
            'https://hi.wikipedia.org/wiki/भारतीय_संविधान',
            'https://hi.wikipedia.org/wiki/भारतीय_राष्ट्रपति',
            'https://hi.wikipedia.org/wiki/भारतीय_संविधान_सभा',
            'https://hi.wikipedia.org/wiki/भारतीय_राष्ट्रपति_चुनाव',
            'https://hi.wikipedia.org/wiki/भारतीय_राष्ट्रपति_चुनाव_2017',
            'https://hi.wikipedia.org/wiki/भारतीय_राष्ट्रपति_चुनाव_2012',
            'https://hi.wikipedia.org/wiki/भारतीय_राष्ट्रपति_चुनाव_2007',
            'https://hi.wikipedia.org/wiki/भारतीय_राष्ट्रपति_चुनाव_2002',
            'https://hi.wikipedia.org/wiki/भारतीय_राष्ट्रपति_चुनाव_1997',
        ]

    all_text = []

    # A single session lets requests reuse the connection across pages.
    with requests.Session() as session:
        for url in urls:
            try:
                print(f"Downloading from {url}")
                response = session.get(url, timeout=timeout)
                response.raise_for_status()
            except requests.RequestException as e:
                # Best effort: a missing or unreachable page must not abort the run.
                print(f"Error downloading {url}: {e}")
                continue

            soup = BeautifulSoup(response.content, 'html.parser')

            # Get main content
            content = soup.find(id='mw-content-text')
            if not content:
                continue

            paragraphs = content.find_all('p')
            text = ' '.join(p.get_text() for p in paragraphs)
            cleaned_text = clean_text(text)
            if cleaned_text:
                all_text.append(cleaned_text)

    # Combine all text
    combined_text = ' '.join(all_text)

    if not combined_text:
        # Do not clobber a previously downloaded corpus with an empty file.
        print("\nNo text was downloaded; existing data file (if any) left untouched")
        return combined_text

    # Create directory if it doesn't exist
    os.makedirs(DATA_DIR, exist_ok=True)

    # Save to file
    with open(os.path.join(DATA_DIR, 'text.txt'), 'w', encoding='utf-8') as f:
        f.write(combined_text)

    print(f"\nDownloaded and saved {len(combined_text)} characters of Hindi text")
    return combined_text

# Run the download when this code is executed directly (script or notebook
# cell, where __name__ is "__main__"); skipped when imported as a module.
if __name__ == "__main__":
    download_hindi_text()

Downloading from https://hi.wikipedia.org/wiki/भारत
Downloading from https://hi.wikipedia.org/wiki/हिन्दी
Downloading from https://hi.wikipedia.org/wiki/दिल्ली
Downloading from https://hi.wikipedia.org/wiki/महात्मा_गांधी
Downloading from https://hi.wikipedia.org/wiki/योग
Downloading from https://hi.wikipedia.org/wiki/भारतीय_संविधान
Downloading from https://hi.wikipedia.org/wiki/भारतीय_राष्ट्रपति
Downloading from https://hi.wikipedia.org/wiki/भारतीय_संविधान_सभा
Downloading from https://hi.wikipedia.org/wiki/भारतीय_राष्ट्रपति_चुनाव
Error downloading https://hi.wikipedia.org/wiki/भारतीय_राष्ट्रपति_चुनाव: 404 Client Error: Not Found for url: https://hi.wikipedia.org/wiki/%E0%A4%AD%E0%A4%BE%E0%A4%B0%E0%A4%A4%E0%A5%80%E0%A4%AF_%E0%A4%B0%E0%A4%BE%E0%A4%B7%E0%A5%8D%E0%A4%9F%E0%A5%8D%E0%A4%B0%E0%A4%AA%E0%A4%A4%E0%A4%BF_%E0%A4%9A%E0%A5%81%E0%A4%A8%E0%A4%BE%E0%A4%B5
Downloading from https://hi.wikipedia.org/wiki/भारतीय_राष्ट्रपति_चुनाव_2017
Error downloading https://hi.wikipedia.org/wiki/भारतीय_र

In [5]:
import matplotlib.pyplot as plt
import os

class BPEVisualizer:
    """Renders BPE training statistics as PNG charts.

    All images are written to ``<stats_dir>/plots``; that directory is created
    when the visualizer is constructed.
    """

    def __init__(self, stats_dir: str):
        """Remember the output locations and make sure the plots folder exists.

        Args:
            stats_dir: Base statistics directory. Plots are saved in its
                ``plots`` subdirectory.
        """
        self.stats_dir = stats_dir
        self.plots_dir = os.path.join(stats_dir, 'plots')
        os.makedirs(self.plots_dir, exist_ok=True)

    @staticmethod
    def _draw_line(ax, x, y, title, ylabel, log_scale=False):
        """Draw one metric-versus-iteration line chart on the given axes."""
        ax.plot(x, y)
        ax.set_title(title)
        ax.set_xlabel('Iteration')
        ax.set_ylabel(ylabel)
        if log_scale:
            ax.set_yscale('log')
        ax.grid(True)

    def plot_training_stats(self, metrics_logger):
        """Generate all training statistics plots.

        Writes a combined 2x2 figure (``training_stats.png``) and then one
        file per line chart via ``_save_individual_plots``.

        Args:
            metrics_logger: Object exposing ``token_logs`` (dicts with
                ``iteration``, ``vocab_size`` and ``frequency`` keys) and
                ``compression_logs`` (dicts with ``iteration`` and
                ``compression_ratio`` keys).
        """
        token_logs = metrics_logger.token_logs
        compression_logs = metrics_logger.compression_logs

        iterations = [log['iteration'] for log in token_logs]
        vocab_sizes = [log['vocab_size'] for log in token_logs]
        token_freqs = [log['frequency'] for log in token_logs]
        # The compression series carries its own iteration numbers; using them
        # keeps x and y the same length even if the two logs ever differ.
        compression_iterations = [log['iteration'] for log in compression_logs]
        compression_ratios = [log['compression_ratio'] for log in compression_logs]

        fig = plt.figure(figsize=(20, 15))
        try:
            # 1. Vocabulary Size Growth
            self._draw_line(fig.add_subplot(221), iterations, vocab_sizes,
                            'Vocabulary Size Growth', 'Vocabulary Size')

            # 2. Compression Ratio Progress
            self._draw_line(fig.add_subplot(222), compression_iterations, compression_ratios,
                            'Compression Ratio Progress', 'Compression Ratio')

            # 3. Token Frequencies (Log Scale)
            self._draw_line(fig.add_subplot(223), iterations, token_freqs,
                            'Token Frequencies', 'Frequency', log_scale=True)

            # 4. Compression Ratio Distribution
            ax_hist = fig.add_subplot(224)
            ax_hist.hist(compression_ratios, bins=50)
            ax_hist.set_title('Compression Ratio Distribution')
            ax_hist.set_xlabel('Compression Ratio')
            ax_hist.set_ylabel('Count')
            ax_hist.grid(True)

            # Save combined plot
            fig.tight_layout()
            fig.savefig(os.path.join(self.plots_dir, 'training_stats.png'))
        finally:
            # Close this specific figure even if drawing or saving failed.
            plt.close(fig)

        # Save individual plots
        self._save_individual_plots(
            iterations, vocab_sizes, compression_ratios, token_freqs,
            compression_iterations=compression_iterations,
        )

    def _save_individual_plots(self, iterations, vocab_sizes, compression_ratios, token_freqs,
                               compression_iterations=None):
        """Save one PNG per line-chart metric.

        Args:
            iterations: X values for the vocabulary-size and frequency charts.
            vocab_sizes: Vocabulary size per iteration.
            compression_ratios: Compression ratio per iteration.
            token_freqs: Frequency of the merged pair per iteration.
            compression_iterations: X values for the compression chart;
                defaults to ``iterations`` when omitted.
        """
        if compression_iterations is None:
            compression_iterations = iterations

        # (file name, x, y, title, y label, log-scaled y axis)
        charts = (
            ('vocab_size.png', iterations, vocab_sizes,
             'Vocabulary Size Growth', 'Vocabulary Size', False),
            ('compression_ratio.png', compression_iterations, compression_ratios,
             'Compression Ratio Progress', 'Compression Ratio', False),
            ('token_frequencies.png', iterations, token_freqs,
             'Token Frequencies', 'Frequency', True),
        )

        for filename, x, y, title, ylabel, log_scale in charts:
            fig, ax = plt.subplots(figsize=(10, 6))
            try:
                self._draw_line(ax, x, y, title, ylabel, log_scale)
                fig.savefig(os.path.join(self.plots_dir, filename))
            finally:
                plt.close(fig)

In [6]:
from typing import List, Set, Dict, Tuple
from collections import Counter
import re

class BaseTokenizer:
    """Abstract tokenizer: holds the target vocabulary size and the vocabulary.

    Subclasses are expected to override ``tokenize`` and ``detokenize``; the
    versions here only raise ``NotImplementedError``.
    """

    def __init__(self, vocab_size: int):
        """Validate and store the vocabulary size; start with an empty vocab.

        Args:
            vocab_size: Desired vocabulary size.

        Raises:
            ValueError: If ``vocab_size`` is zero or negative.
        """
        if vocab_size <= 0:
            raise ValueError("Vocabulary size must be positive")

        self.vocab: Set[str] = set()
        self.vocab_size = vocab_size

    def tokenize(self, text: str) -> List[str]:
        """Split ``text`` into tokens (to be provided by subclasses)."""
        raise NotImplementedError()

    def detokenize(self, tokens: List[str]) -> str:
        """Rebuild text from ``tokens`` (to be provided by subclasses)."""
        raise NotImplementedError()

class CharacterTokenizer(BaseTokenizer):
    """Tokenizer whose tokens are the individual characters of the text."""

    def tokenize(self, text: str) -> List[str]:
        """Return every character of ``text`` as its own token, in order."""
        return [char for char in text]

    def detokenize(self, tokens: List[str]) -> str:
        """Concatenate ``tokens`` back into a single string."""
        return "".join(tokens)

In [7]:
from dataclasses import dataclass
from typing import List, Dict
import json

@dataclass
class TrainingMetrics:
    """Snapshot of BPE training state for a single iteration.

    Field meanings below describe how HindiBPE.fit in this notebook fills
    the record.
    """
    # Zero-based index of the merge step.
    iteration: int
    # Number of entries in the vocabulary after the merge.
    vocab_size: int
    # Total token count across all words after the merge.
    tokens: int
    # Token created by this step (the two merged symbols concatenated).
    new_token: str
    # How many times the merged pair occurred before merging.
    frequency: int
    # Initial token count divided by the current token count.
    compression_ratio: float

class MetricsLogger:
    """Collects per-iteration training metrics and can print or persist them.

    Two parallel lists are kept: ``token_logs`` (vocabulary/token details)
    and ``compression_logs`` (compression ratio), one entry each per logged
    iteration.
    """

    def __init__(self):
        self.token_logs: List[Dict] = []
        self.compression_logs: List[Dict] = []

    def log_iteration(self, metrics: TrainingMetrics):
        """Append one iteration's metrics to both log lists."""
        token_entry = {
            'iteration': metrics.iteration,
            'vocab_size': metrics.vocab_size,
            'tokens': metrics.tokens,
            'new_token': metrics.new_token,
            'frequency': metrics.frequency
        }
        compression_entry = {
            'iteration': metrics.iteration,
            'compression_ratio': metrics.compression_ratio
        }

        self.token_logs.append(token_entry)
        self.compression_logs.append(compression_entry)

    def print_progress(self, metrics: TrainingMetrics, force: bool = False):
        """Print a progress report every 500th iteration, or always if ``force``."""
        due = force or metrics.iteration % 500 == 0
        if not due:
            return

        report = (
            f"\nIteration {metrics.iteration:,}:",
            f"Vocab size: {metrics.vocab_size:,}",
            f"Compression ratio: {metrics.compression_ratio:.2f}",
            f"New token: {metrics.new_token} (freq: {metrics.frequency:,})",
            f"Current tokens: {metrics.tokens:,}",
            "-" * 50,
        )
        for line in report:
            print(line)

    def save(self, path: str):
        """Write both log lists to ``path`` as UTF-8 JSON (non-ASCII kept as-is)."""
        payload = {
            'token_logs': self.token_logs,
            'compression_logs': self.compression_logs
        }
        with open(path, 'w', encoding='utf-8') as out_file:
            json.dump(payload, out_file, ensure_ascii=False, indent=2)

In [10]:
from typing import List, Tuple, Dict, Set
from collections import Counter
import json
import re
import os

class HindiBPE(BaseTokenizer):
    """Byte-Pair Encoding tokenizer for whitespace-separated Hindi text.

    Training starts from the individual characters (Unicode code points) of
    each whitespace-delimited word and repeatedly merges the most frequent
    adjacent pair into a new token. Merges never cross word boundaries.

    Attributes:
        merges: Learned merges in the order they were learned, mapping a
            ``(left, right)`` pair to the merged token.
        metrics: Logger that receives one record per training iteration.
        vocab: Set of all symbols seen or created during training.
    """

    # Token that encode() places between words so decode() can restore spacing.
    WORD_SEPARATOR = ' '

    def __init__(self, vocab_size: int = 5000):
        """Create an untrained tokenizer.

        Args:
            vocab_size: Training stops once the vocabulary reaches this size.

        Raises:
            ValueError: If ``vocab_size`` is not positive (raised by the base class).
        """
        super().__init__(vocab_size)
        self.merges: Dict[Tuple[str, str], str] = {}
        self.metrics = MetricsLogger()

    def get_stats(self, words: List[List[str]]) -> Counter:
        """Count how often each adjacent token pair occurs.

        Args:
            words: Words, each given as a list of its current tokens.

        Returns:
            Counter mapping ``(left, right)`` pairs to their frequency.
        """
        pairs = Counter()
        for word in words:
            pairs.update(zip(word, word[1:]))
        return pairs

    def merge_vocab(self, words: List[List[str]], pair: Tuple[str, str], new_token: str) -> List[List[str]]:
        """Replace every occurrence of ``pair`` with ``new_token``.

        Each word is scanned left to right and matches do not overlap, so
        ``['a', 'a', 'a']`` with pair ``('a', 'a')`` becomes ``['aa', 'a']``.
        The scan works on the token lists directly rather than through a
        regex substitution, so tokens containing backslashes or other regex
        metacharacters are handled literally.

        Args:
            words: Words, each given as a list of tokens.
            pair: The two adjacent tokens to merge.
            new_token: Token that replaces each occurrence of the pair.

        Returns:
            A new list of words; the input lists are not modified.
        """
        left, right = pair
        merged_words = []
        for word in words:
            merged = []
            last = len(word) - 1
            i = 0
            while i <= last:
                if i < last and word[i] == left and word[i + 1] == right:
                    merged.append(new_token)
                    i += 2
                else:
                    merged.append(word[i])
                    i += 1
            merged_words.append(merged)
        return merged_words

    def fit(self, text: str, min_freq: int = 2):
        """Train BPE on input text.

        Any merges and metrics from an earlier ``fit`` call are discarded.
        Training stops when the vocabulary reaches ``vocab_size``, when no
        adjacent pairs remain, or when the most frequent pair occurs fewer
        than ``min_freq`` times. A final progress report is always printed.

        Args:
            text: Training corpus; words are split on whitespace.
            min_freq: Minimum frequency a pair needs in order to be merged.

        Raises:
            ValueError: If ``text`` is empty or only whitespace.
        """
        if not text or not text.strip():
            raise ValueError("Input text cannot be empty")

        # Start clean so a repeated fit() does not mix in stale merges or logs.
        self.merges = {}
        self.metrics = MetricsLogger()

        # Initialize with characters
        words = [list(word) for word in text.split()]
        self.vocab = {char for word in words for char in word}

        original_tokens = sum(len(word) for word in words)

        # Defaults for the final report when the loop performs no merge at all
        # (previously these names were unbound in that case).
        current_tokens = original_tokens
        new_token = ''
        count = 0

        iteration = 0
        while len(self.vocab) < self.vocab_size:
            pairs = self.get_stats(words)
            if not pairs:
                break

            pair, count_candidate = pairs.most_common(1)[0]
            if count_candidate < min_freq:
                break

            count = count_candidate
            new_token = ''.join(pair)
            self.merges[pair] = new_token
            self.vocab.add(new_token)

            words = self.merge_vocab(words, pair, new_token)

            # Calculate metrics
            current_tokens = sum(len(word) for word in words)
            metrics = TrainingMetrics(
                iteration=iteration,
                vocab_size=len(self.vocab),
                tokens=current_tokens,
                new_token=new_token,
                frequency=count,
                compression_ratio=original_tokens / current_tokens
            )

            # Log metrics
            self.metrics.log_iteration(metrics)
            self.metrics.print_progress(metrics)

            iteration += 1

        # Print final statistics
        final_metrics = TrainingMetrics(
            iteration=iteration,
            vocab_size=len(self.vocab),
            tokens=current_tokens,
            new_token=new_token,
            frequency=count,
            compression_ratio=original_tokens / current_tokens
        )
        self.metrics.print_progress(final_metrics, force=True)

    def encode(self, text: str) -> List[str]:
        """Encode text using learned BPE merges.

        The text is split on whitespace, each word is broken into characters,
        and the learned merges are applied in the order they were learned.
        A ``WORD_SEPARATOR`` token is emitted between consecutive words so
        that ``decode`` can restore word boundaries.

        Args:
            text: Text to encode.

        Returns:
            Flat list of tokens; empty if ``text`` contains no words.
        """
        words = [list(word) for word in text.split()]
        for pair, new_token in self.merges.items():
            words = self.merge_vocab(words, pair, new_token)

        tokens: List[str] = []
        for index, word in enumerate(words):
            if index:
                tokens.append(self.WORD_SEPARATOR)
            tokens.extend(word)
        return tokens

    def decode(self, tokens: List[str]) -> str:
        """Decode tokens back to text.

        Tokens are concatenated as-is, so for output of ``encode`` the result
        is the original text with every whitespace run collapsed to a single
        space and surrounding whitespace removed.
        """
        return ''.join(tokens)

    def tokenize(self, text: str) -> List[str]:
        """Tokenizer-interface alias for ``encode``."""
        return self.encode(text)

    def detokenize(self, tokens: List[str]) -> str:
        """Tokenizer-interface alias for ``decode``."""
        return self.decode(tokens)

    def save(self, model_path: str, stats_path: str = None):
        """Save BPE model to file.

        Args:
            model_path: Destination JSON file for merges and vocabulary. Each
                merge key is stored as the two tokens joined by one space.
            stats_path: If given, training metrics are saved there as well.
        """
        data = {
            'merges': {' '.join(k): v for k, v in self.merges.items()},
            'vocab': list(self.vocab)
        }
        with open(model_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

        # Save metrics separately
        if stats_path:
            self.metrics.save(stats_path)

    def load(self, model_path: str, stats_path: str = None):
        """Load BPE model from file.

        Args:
            model_path: JSON file previously written by ``save``.
            stats_path: Optional metrics file; loaded only if it exists.
        """
        with open(model_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        # JSON objects keep their key order, so the merge order is preserved.
        self.merges = {tuple(k.split()): v for k, v in data['merges'].items()}
        self.vocab = set(data['vocab'])

        # Load metrics if available
        if stats_path and os.path.exists(stats_path):
            with open(stats_path, 'r', encoding='utf-8') as f:
                metrics_data = json.load(f)
            self.metrics.token_logs = metrics_data['token_logs']
            self.metrics.compression_logs = metrics_data['compression_logs']



In [11]:
import os

def create_directory_structure():
    """Ensure the model, stats, plots and data folders exist.

    Paths are relative to the current working directory. Directories that
    are already present are left alone; one line is printed per directory
    either way.
    """
    layout = (
        ('models', 'hindi_bpe'),
        ('stats', 'hindi_bpe'),
        ('stats', 'hindi_bpe', 'plots'),
        ('data', 'hindi'),
    )

    for parts in layout:
        directory = os.path.join(*parts)
        os.makedirs(directory, exist_ok=True)
        print(f"Created directory: {directory}")

# Output/input locations used by main(), relative to the working directory:
#   MODEL_DIR - where the trained model ('model.json') is written
#   STATS_DIR - where 'metrics.json' and the 'plots' folder are written
#   DATA_DIR  - where the training corpus ('text.txt') is read from
#               (same value as the DATA_DIR defined in the download cell)
# Create directories if they don't exist
MODEL_DIR = os.path.join('models', 'hindi_bpe')
STATS_DIR = os.path.join('stats', 'hindi_bpe')
DATA_DIR = os.path.join('data', 'hindi')
create_directory_structure()

def main():
    """Train a Hindi BPE model end to end and run a small smoke test.

    Steps: read the corpus from ``DATA_DIR/text.txt``, train a ``HindiBPE``
    with a vocabulary size of 5500, save the model and metrics, render the
    training plots, then encode and decode a sample sentence and print the
    results.
    """
    # Load Hindi text data
    corpus_path = os.path.join(DATA_DIR, 'text.txt')
    with open(corpus_path, 'r', encoding='utf-8') as corpus_file:
        text = corpus_file.read()

    # Initialize and train BPE
    bpe = HindiBPE(vocab_size=5500)
    bpe.fit(text)

    # Save the model and metrics
    bpe.save(
        os.path.join(MODEL_DIR, 'model.json'),
        os.path.join(STATS_DIR, 'metrics.json'),
    )

    # Generate and save visualization plots
    BPEVisualizer(STATS_DIR).plot_training_stats(bpe.metrics)
    print("\nGenerated visualization plots in:", os.path.join(STATS_DIR, 'plots'))

    # Test encoding
    test_text = "आप कैसे हैं?"
    encoded = bpe.encode(test_text)
    decoded = bpe.decode(encoded)

    print(f"\nTest encoding/decoding:")
    for label, value in (("Original", test_text), ("Encoded", encoded), ("Decoded", decoded)):
        print(f"{label}: {value}")

# Run the training pipeline when this code is executed directly (script or
# notebook cell); skipped when the code is imported as a module.
if __name__ == "__main__":
    main()

Created directory: models/hindi_bpe
Created directory: stats/hindi_bpe
Created directory: stats/hindi_bpe/plots
Created directory: data/hindi

Iteration 0:
Vocab size: 78
Compression ratio: 1.01
New token: ्र (freq: 2,256)
Current tokens: 164,917
--------------------------------------------------

Iteration 500:
Vocab size: 578
Compression ratio: 2.13
New token: त्मक (freq: 35)
Current tokens: 78,381
--------------------------------------------------

Iteration 1,000:
Vocab size: 1,078
Compression ratio: 2.50
New token: सिंह, (freq: 16)
Current tokens: 66,835
--------------------------------------------------

Iteration 1,500:
Vocab size: 1,578
Compression ratio: 2.76
New token: अर्था (freq: 10)
Current tokens: 60,560
--------------------------------------------------

Iteration 2,000:
Vocab size: 2,078
Compression ratio: 2.96
New token: वल्लभ (freq: 7)
Current tokens: 56,541
--------------------------------------------------

Iteration 2,500:
Vocab size: 2,578
Compression ratio: 3.11
