In [116]:
from datasets import load_dataset, Dataset, DatasetDict
import sentencepiece as spm
from pathlib import Path
from typing import Dict

def extract_long_translations(
        min_length: int = 200,
        max_length: int = 2000,
        sp_model_path: str = "../attention_is_all_you_need/BPE/en-hi.model",
        max_samples: int | None = None
    ) -> Dict[str, str]:
    # ── 0.  SentencePiece initialisation ───────────────────────────────
    sp = spm.SentencePieceProcessor()
    if not Path(sp_model_path).exists():
        raise FileNotFoundError(
            f"SentencePiece model not found at {sp_model_path}"
        )
    sp.load(sp_model_path)                              # turn0search6

    def sp_len(txt: str) -> int:
        return len(sp.encode_as_ids(txt))

    # Helper to add (en,hi) pairs if they meet length and quota
    results: dict[str, str] = {}

    def maybe_add(en: str, hi: str) -> None:
        if  min_length <= sp_len(en) <= max_length and min_length <= sp_len(hi) <= max_length:
            if max_samples is None or len(results) < max_samples:
                results[en] = hi

    # Helper that walks through *any* HF dataset object
    def walk_dataset(ds, nested_translation: bool) -> None:
        if isinstance(ds, DatasetDict):
            splits = ds.values()
        else:                           # bare Dataset
            splits = [ds]
        for split in splits:
            for ex in split:
                if nested_translation:                  # translation column
                    maybe_add(ex["translation"]["en"],
                              ex["translation"]["hi"])
                else:                                   # src / tgt style
                    maybe_add(ex["src"], ex["tgt"])

    # ── 1.  OPUS-100 (≈ 55 M pairs, en-hi subset) ──────────────────────
    walk_dataset(
        load_dataset("opus100", "en-hi"),                # turn0search0
        nested_translation=True,
    )

    # ── 2.  IITB EN-HI corpus ──────────────────────────────────────────
    walk_dataset(
        load_dataset("cfilt/iitb-english-hindi"),        # turn0search1
        nested_translation=True,
    )

    # ── 3.  Samanantar (AI4Bharat) ─────────────────────────────────────
    walk_dataset(
        load_dataset("ai4bharat/samanantar", "hi"),      # turn0search2
        nested_translation=False,                       # uses src / tgt
    )

    # ── 4.  PMIndiaSum: align the *two* directions ─────────────────────
    #        (Hindi article + EN summary)  ↔  (English article + HI summary)
    ds_hi = load_dataset("PMIndiaData/PMIndiaSum", data_dir="hindi-english")
    ds_en = load_dataset("PMIndiaData/PMIndiaSum", data_dir="english-hindi")

    # 4-A.  Build a single lookup table from *every* English-side split
    en_by_url = {}
    for split_name, split in ds_en.items():             # train / validation / test
        for ex in split:
            en_by_url[ex["source_url"]] = ex            # keep last seen, fine for 1-1 mapping

    # 4-B.  Iterate over every Hindi-side split and align
    for split_name, split in ds_hi.items():
        for ex_hi in split:
            eng_rec = en_by_url.get(ex_hi["target_url"])
            if eng_rec:
                maybe_add(eng_rec["text"], ex_hi["text"])
                
    return results


In [117]:
sentences = extract_long_translations(50, 2000)

In [118]:
english_encoded = sp.Encode(list(sentences.keys()))
hindi_encoded = sp.Encode(list(sentences.values()))

In [119]:
def analyze_encoded_lengths(english_encoded, hindi_encoded):
    
    # Calculate lengths
    en_lengths = [len(seq) for seq in english_encoded]
    hi_lengths = [len(seq) for seq in hindi_encoded]
    combined_lengths = [en for en, hi in zip(en_lengths, hi_lengths)]
    
    # Print summary
    print(f"Total pairs: {len(combined_lengths):,}")
    print(f"Combined length - Min: {min(combined_lengths)}, Max: {max(combined_lengths)}")
    print(f"Average combined length: {sum(combined_lengths)/len(combined_lengths):.1f} tokens")
    
    # Count by length buckets
    buckets = [(0, 100), (100, 200), (200, 300), (300, 500), (500, 750), (750, 1000), (1000, 1500),  (2000, float('inf'))]
    for min_len, max_len in buckets:
        count = sum(1 for l in combined_lengths if min_len <= l < max_len)
        label = f"{min_len}-{max_len}" if max_len != float('inf') else f"{min_len}+"
        print(f"{label} tokens: {count:,} pairs ({count/len(combined_lengths)*100:.1f}%)")
    
# Use with your data:
lengths = analyze_encoded_lengths(english_encoded, hindi_encoded)
lengths

Total pairs: 816,270
Combined length - Min: 50, Max: 1997
Average combined length: 76.6 tokens
0-100 tokens: 715,935 pairs (87.7%)
100-200 tokens: 89,813 pairs (11.0%)
200-300 tokens: 6,615 pairs (0.8%)
300-500 tokens: 2,126 pairs (0.3%)
500-750 tokens: 760 pairs (0.1%)
750-1000 tokens: 412 pairs (0.1%)
1000-1500 tokens: 435 pairs (0.1%)
2000+ tokens: 0 pairs (0.0%)


In [29]:
import requests
import zipfile
import os
from urllib.parse import urlparse
import pandas as pd

def download_opus_books():
    """Download OPUS Books corpus (literature translations)"""
    
    urls = [
        "https://opus.nlpl.eu/download.php?f=Books/v1/moses/en-hi.txt.zip",
        "https://opus.nlpl.eu/download.php?f=QED/v2.0a/moses/en-hi.txt.zip",  # Educational content
        "https://opus.nlpl.eu/download.php?f=TED2020/v1/moses/en-hi.txt.zip"  # TED talks (longer speeches)
    ]
    
    for url in urls:
        filename = url.split('=')[-1]
        print(f"Downloading {filename}...")
        
        response = requests.get(url)
        if response.status_code == 200:
            with open(filename, 'wb') as f:
                f.write(response.content)
            
            # Extract
            with zipfile.ZipFile(filename, 'r') as zip_ref:
                zip_ref.extractall(f"extracted_{filename.replace('.zip', '')}")
            
            print(f"Extracted to extracted_{filename.replace('.zip', '')}/")
        else:
            print(f"Failed to download {filename}")

def download_gutenberg_literature():
    """Download specific Gutenberg texts with Hindi translations"""
    
    # These are known to have Hindi translations
    gutenberg_books = [
        {
            'title': 'Alice in Wonderland',
            'en_url': 'https://www.gutenberg.org/files/11/11-0.txt',
            'hi_info': 'Search Project Gutenberg for Hindi translation'
        },
        {
            'title': 'Aesop Fables', 
            'en_url': 'https://www.gutenberg.org/files/21/21-0.txt',
            'hi_info': 'Available in Hindi on Gutenberg'
        }
    ]
    
    for book in gutenberg_books:
        print(f"Book: {book['title']}")
        print(f"English: {book['en_url']}")
        print(f"Hindi: {book['hi_info']}\n")

def create_literature_corpus_from_files(directory_path):
    """Process downloaded literature files and create long-sequence corpus"""
    
    literature_pairs = []
    
    for root, dirs, files in os.walk(directory_path):
        for file in files:
            if file.endswith('.en'):  # English file
                en_file = os.path.join(root, file)
                hi_file = os.path.join(root, file.replace('.en', '.hi'))
                
                if os.path.exists(hi_file):
                    with open(en_file, 'r', encoding='utf-8') as ef:
                        with open(hi_file, 'r', encoding='utf-8') as hf:
                            en_lines = ef.readlines()
                            hi_lines = hf.readlines()
                            
                            # Combine multiple sentences into paragraphs for longer sequences
                            for i in range(0, len(en_lines), 3):  # Group every 3 sentences
                                if i + 2 < len(hi_lines):
                                    en_para = ' '.join([en_lines[j].strip() for j in range(i, min(i+3, len(en_lines)))])
                                    hi_para = ' '.join([hi_lines[j].strip() for j in range(i, min(i+3, len(hi_lines)))])
                                    
                                    if len(en_para.split()) + len(hi_para.split()) > 200:
                                        literature_pairs.append({
                                            'english': en_para,
                                            'hindi': hi_para,
                                            'source': file,
                                            'length': len(en_para.split()) + len(hi_para.split())
                                        })
    
    return literature_pairs

def download_additional_sources():
    """Download from additional academic/cultural sources"""
    
    print("Additional Hindi Literature Sources:")
    print("1. Digital Library of India: https://dli.iiit.ac.in/")
    print("2. Sanskrit Documents: https://sanskritdocuments.org/")
    print("3. Hindi Samay Magazine Archive: http://www.hindisamay.com/")
    print("4. Rekhta (Urdu-Hindi): https://rekhta.org/")
    print("5. Kavita Kosh: http://kavitakosh.org/")
    
    # These require manual download or API access
    academic_sources = [
        {
            'name': 'Indian Institute of Science Archive',
            'url': 'https://archive.org/details/iisc',
            'description': 'Historical texts with translations'
        },
        {
            'name': 'Sahitya Akademi Digital Archive', 
            'url': 'https://sahitya-akademi.gov.in/',
            'description': 'Official literature translations'
        }
    ]
    
    for source in academic_sources:
        print(f"\n{source['name']}: {source['url']}")
        print(f"Description: {source['description']}")

def test_sentencepiece_length(text_pairs, model_path=None):
    """Test how SentencePiece affects your sequence lengths"""
    
    try:
        import sentencepiece as spm
        
        if model_path and os.path.exists(model_path):
            sp = spm.SentencePieceProcessor(model_file=model_path)
        else:
            print("No SentencePiece model provided, using word-based analysis")
            return
            
        print("SentencePiece Length Analysis:")
        for i, pair in enumerate(text_pairs[:10]):  # Test first 10
            en_tokens = sp.encode(pair['english'])
            hi_tokens = sp.encode(pair['hindi']) 
            total_sp_length = len(en_tokens) + len(hi_tokens)
            word_length = len(pair['english'].split()) + len(pair['hindi'].split())
            
            print(f"Pair {i+1}: Words={word_length}, SentencePiece={total_sp_length}")
            
    except ImportError:
        print("SentencePiece not installed. Install with: pip install sentencepiece")

# Main execution
if __name__ == "__main__":
    print("=== DOWNLOADING HINDI LITERATURE DATASETS ===\n")
    
    # Download OPUS literature collections
    print("1. Downloading OPUS Books...")
    download_opus_books()
    
    # Show Gutenberg options
    print("\n2. Project Gutenberg Literature:")
    # download_gutenberg_literature()
    
    # Show additional sources
    print("\n3. Additional Academic Sources:")
    # download_additional_sources()
    
    print("\n=== MANUAL DOWNLOAD INSTRUCTIONS ===")
    print("For the richest literature datasets:")
    print("1. Visit https://opus.nlpl.eu/Books.php")
    print("2. Download EN-HI parallel books")
    print("3. Visit Digital Library of India for classical texts")
    print("4. Contact academic institutions for research datasets")
    


=== DOWNLOADING HINDI LITERATURE DATASETS ===

1. Downloading OPUS Books...
Downloading Books/v1/moses/en-hi.txt.zip...
Failed to download Books/v1/moses/en-hi.txt.zip
Downloading QED/v2.0a/moses/en-hi.txt.zip...
Failed to download QED/v2.0a/moses/en-hi.txt.zip
Downloading TED2020/v1/moses/en-hi.txt.zip...
Failed to download TED2020/v1/moses/en-hi.txt.zip

2. Project Gutenberg Literature:

3. Additional Academic Sources:

=== MANUAL DOWNLOAD INSTRUCTIONS ===
For the richest literature datasets:
1. Visit https://opus.nlpl.eu/Books.php
2. Download EN-HI parallel books
3. Visit Digital Library of India for classical texts
4. Contact academic institutions for research datasets
