In [117]:
sentences = extract_long_translations(50, 2000)

In [124]:
english_encoded = sp.Encode(list(sentences.keys()))
hindi_encoded = sp.Encode(list(sentences.values()))

In [6]:
a = {1: [1,2,3,4], 
     2: [5,6,7,8],
     3: [9,10,11,12],
     4: [13,14,15,16]}

import pandas as pd
pd.DataFrame(a)

Unnamed: 0,1,2,3,4
0,1,5,9,13
1,2,6,10,14
2,3,7,11,15
3,4,8,12,16


In [168]:
def analyze_encoded_lengths(english_encoded, hindi_encoded):
    
    # Calculate lengths
    en_lengths = [len(seq) for seq in english_encoded]
    hi_lengths = [len(seq) for seq in hindi_encoded]
    combined_lengths = [en for en, hi in zip(en_lengths, hi_lengths)]
    
    # Print summary
    print(f"Total pairs: {len(combined_lengths):,}")
    print(f"Combined length - Min: {min(combined_lengths)}, Max: {max(combined_lengths)}")
    print(f"Average combined length: {sum(combined_lengths)/len(combined_lengths):.1f} tokens")
    
    # Count by length buckets
    buckets = [(0, 140), (140, 200), (200, 300), (300, 500), (500, 750), (750, 1000), (1000, 1500),  (1500, float('inf'))]
    for min_len, max_len in buckets:
        count = sum(1 for l in combined_lengths if min_len <= l < max_len)
        label = f"{min_len}-{max_len}" if max_len != float('inf') else f"{min_len}+"
        print(f"{label} tokens: {count:,} pairs ({count/len(combined_lengths)*100:.1f}%)")
    
lengths = analyze_encoded_lengths(english_encoded, hindi_encoded)
lengths

Total pairs: 22,522
Combined length - Min: 50, Max: 1997
Average combined length: 228.1 tokens
0-140 tokens: 10,559 pairs (46.9%)
140-200 tokens: 1,441 pairs (6.4%)
200-300 tokens: 6,615 pairs (29.4%)
300-500 tokens: 2,126 pairs (9.4%)
500-750 tokens: 760 pairs (3.4%)
750-1000 tokens: 412 pairs (1.8%)
1000-1500 tokens: 435 pairs (1.9%)
1500+ tokens: 174 pairs (0.8%)


In [126]:
import random
import numpy as np

def sample_by_length_buckets(english_encoded, hindi_encoded, target_samples=6000, random_seed=42):
    """
    Sample data to limit the number of pairs in specific length buckets
    
    Args:
        english_encoded: List of encoded English sequences
        hindi_encoded: List of encoded Hindi sequences  
        target_samples: Target number of samples for 0-100 and 100-200 buckets
        random_seed: Random seed for reproducibility
    
    Returns:
        Tuple of (sampled_english, sampled_hindi, sampling_info)
    """
    random.seed(random_seed)
    np.random.seed(random_seed)
    
    # Calculate lengths for each pair
    en_lengths = [len(seq) for seq in english_encoded]
    hi_lengths = [len(seq) for seq in hindi_encoded]
    combined_lengths = [en for en, hi in zip(en_lengths, hi_lengths)]
    
    # Create buckets with indices
    buckets = {
        '0-100': [],
        '100-200': [], 
        '200-300': [],
        '300-500': [],
        '500-750': [],
        '750-1000': [],
        '1000-1500': [],
        '1500-2000': [],
        '2000+': []
    }
    
    # Assign each pair to appropriate bucket
    for i, length in enumerate(combined_lengths):
        if 0 <= length < 100:
            buckets['0-100'].append(i)
        elif 100 <= length < 200:
            buckets['100-200'].append(i)
        elif 200 <= length < 300:
            buckets['200-300'].append(i)
        elif 300 <= length < 500:
            buckets['300-500'].append(i)
        elif 500 <= length < 750:
            buckets['500-750'].append(i)
        elif 750 <= length < 1000:
            buckets['750-1000'].append(i)
        elif 1000 <= length < 1500:
            buckets['1000-1500'].append(i)
        elif 1500 <= length < 2000:
            buckets['1500-2000'].append(i)
        else:
            buckets['2000+'].append(i)
    
    # Sample indices to keep
    selected_indices = []
    sampling_info = {}
    
    for bucket_name, indices in buckets.items():
        original_count = len(indices)
        
        if bucket_name in ['0-100', '100-200']:
            # Sample target_samples from these buckets
            if original_count <= target_samples:
                sampled_indices = indices
                sampled_count = original_count
            else:
                sampled_indices = random.sample(indices, target_samples)
                sampled_count = target_samples
        else:
            # Keep all indices from other buckets
            sampled_indices = indices
            sampled_count = original_count
        
        selected_indices.extend(sampled_indices)
        sampling_info[bucket_name] = {
            'original': original_count,
            'sampled': sampled_count,
            'percentage_kept': (sampled_count / original_count * 100) if original_count > 0 else 0
        }
    
    # Sort indices to maintain some order
    selected_indices.sort()
    
    # Create sampled datasets
    sampled_english = [english_encoded[i] for i in selected_indices]
    sampled_hindi = [hindi_encoded[i] for i in selected_indices]
    
    return sampled_english, sampled_hindi, sampling_info

def print_sampling_summary(sampling_info):
    """Print a summary of the sampling results"""
    print("Sampling Summary:")
    print("=" * 50)
    
    total_original = sum(info['original'] for info in sampling_info.values())
    total_sampled = sum(info['sampled'] for info in sampling_info.values())
    
    for bucket_name, info in sampling_info.items():
        original = info['original']
        sampled = info['sampled']
        percentage = info['percentage_kept']
        
        if original > 0:
            print(f"{bucket_name:12}: {original:7,} -> {sampled:6,} ({percentage:5.1f}% kept)")
        else:
            print(f"{bucket_name:12}: {original:7,} -> {sampled:6,} (N/A)")
    
    print("-" * 50)
    print(f"{'Total':12}: {total_original:7,} -> {total_sampled:6,} ({total_sampled/total_original*100:.1f}% kept)")
sampled_english, sampled_hindi, sampling_info = sample_by_length_buckets(
    english_encoded, 
    hindi_encoded, 
    target_samples=6000,
    random_seed=42
)

print_sampling_summary(sampling_info)
english_encoded = sampled_english
hindi_encoded = sampled_hindi

Sampling Summary:
0-100       : 715,935 ->  6,000 (  0.8% kept)
100-200     :  89,813 ->  6,000 (  6.7% kept)
200-300     :   6,615 ->  6,615 (100.0% kept)
300-500     :   2,126 ->  2,126 (100.0% kept)
500-750     :     760 ->    760 (100.0% kept)
750-1000    :     412 ->    412 (100.0% kept)
1000-1500   :     435 ->    435 (100.0% kept)
1500-2000   :     174 ->    174 (100.0% kept)
2000+       :       0 ->      0 (N/A)
--------------------------------------------------
Total       : 816,270 -> 22,522 (2.8% kept)

Sampled Data Analysis:
Total pairs: 22,522
Combined length - Min: 100, Max: 3946
Average combined length: 435.2 tokens
0-100        tokens:      0 pairs (0.0%)
100-200      tokens:  7,134 pairs (31.7%)
200-300      tokens:  4,243 pairs (18.8%)
300-500      tokens:  6,010 pairs (26.7%)
500-750      tokens:  2,649 pairs (11.8%)
750-1000     tokens:    795 pairs (3.5%)
1000-1500    tokens:    722 pairs (3.2%)
1500-2000    tokens:    394 pairs (1.7%)
2000+        tokens:    575 p

In [169]:
analyze_encoded_lengths(english_encoded, hindi_encoded)

Total pairs: 22,522
Combined length - Min: 50, Max: 1997
Average combined length: 228.1 tokens
0-140 tokens: 10,559 pairs (46.9%)
140-200 tokens: 1,441 pairs (6.4%)
200-300 tokens: 6,615 pairs (29.4%)
300-500 tokens: 2,126 pairs (9.4%)
500-750 tokens: 760 pairs (3.4%)
750-1000 tokens: 412 pairs (1.8%)
1000-1500 tokens: 435 pairs (1.9%)
1500+ tokens: 174 pairs (0.8%)


In [163]:
bins = [(0, 140), (140, 200), (200, 300), (300, 500), (500, 750), (750, 1000), (1000, 1500), (1500, 2000)]

from papers.CommonTransformerComponents.train_sp import LanguageTranslationDataset

varying_lengths = {bin: [] for bin in bins}
for a,b in zip(english_encoded, hindi_encoded):
    if 0 <= len(a) <= 140:
        varying_lengths[(0, 140)].append((a, b))
    elif 140 < len(a) <= 200:
        varying_lengths[(140, 200)].append((a, b))
    elif 200 < len(a) <= 300:
        varying_lengths[(200, 300)].append((a, b))
    elif 300 < len(a) <= 500:
        varying_lengths[(300, 500)].append((a, b))
    elif 500 < len(a) <= 750:
        varying_lengths[(500, 750)].append((a, b))
    elif 750 < len(a) <= 1000:
        varying_lengths[(750, 1000)].append((a, b))
    elif 1000 < len(a) <= 1500:
        varying_lengths[(1000, 1500)].append((a, b))
    elif 1500 < len(a) <= 2000:
        varying_lengths[(1500, 2000)].append((a, b))


In [180]:
from torch.utils.data import DataLoader
vals = {}

for binned, pairs in varying_lengths.items():
    tmp_eng, tmp_hindi = [pair[0] for pair in pairs], [pair[1] for pair in pairs]
    full_data = LanguageTranslationDataset(seq_length=binned[-1], src_encodings=tmp_eng, tgt_encodings=tmp_hindi, 
                                           sos_token=sp.bos_id(), eos_token=sp.eos_id(), pad_token=sp.pad_id())
    train_dataloader = DataLoader(full_data, batch_size=16, shuffle=True, pin_memory=True, num_workers=4)
    vals[binned] = train_dataloader

In [1]:
import os
import sys
parent_dir = os.path.abspath("../../")
sys.path.append(parent_dir)
parent_dir = os.path.abspath("../../utils/")
sys.path.append(parent_dir)

In [2]:
from utils.long_sentence_test import extract_long_translations, sample_by_length_buckets

In [3]:
import sentencepiece as spm
sp = spm.SentencePieceProcessor(model_file="../attention_is_all_you_need/BPE/en-hi.model")
sentences = extract_long_translations(1000, 2000, sp)
info = sample_by_length_buckets(sentences, sp, target_samples=6000)

In [4]:
from papers.CommonTransformerComponents.train_sp import LanguageTranslationDataset
from datasets import load_dataset, Dataset, DatasetDict

In [14]:
vals = {}

for binned, pairs in info.items():
    if pairs == "2000+": 
        continue
    max_len = int(binned.split("-")[1])
    tmp_eng, tmp_hindi = pairs[0], pairs[1]
    full_data = LanguageTranslationDataset(seq_length=max_len, src_encodings=tmp_eng, tgt_encodings=tmp_hindi, 
                                           sos_token=sp.bos_id(), eos_token=sp.eos_id(), pad_token=sp.pad_id())
    train_dataloader = DataLoader(full_data, batch_size=16, shuffle=True, pin_memory=True, num_workers=4)
    vals[binned] = train_dataloader

NameError: name 'DataLoader' is not defined