In [None]:
!pip install voyageai tenacity

Collecting voyageai
  Downloading voyageai-0.3.5-py3-none-any.whl.metadata (2.9 kB)
Collecting aiolimiter (from voyageai)
  Downloading aiolimiter-1.2.1-py3-none-any.whl.metadata (4.5 kB)
Downloading voyageai-0.3.5-py3-none-any.whl (28 kB)
Downloading aiolimiter-1.2.1-py3-none-any.whl (6.7 kB)
Installing collected packages: aiolimiter, voyageai
Successfully installed aiolimiter-1.2.1 voyageai-0.3.5


In [None]:
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from google.colab import drive, files
import os

# # Mount Google Drive
# drive.mount('/content/drive')

# # Upload your chunk files
# print("Upload LLM_smallChunks JSON file:")
# uploaded_llm = files.upload()

# print("Upload Structural_Hierarchical JSON file:")
# uploaded_struct = files.upload()

# Load the chunk files produced by the two structural chunking strategies.
struct_balanced_filename = r'/content/Structural_Balanced.json'
struct_filename = r'/content/Structural_Hierarchical.json'

# The chunk text contains non-ASCII characters, so the encoding is stated
# explicitly instead of relying on the platform's default locale encoding.
with open(struct_balanced_filename, 'r', encoding='utf-8') as f:
    struct_bal_chunks = json.load(f)

with open(struct_filename, 'r', encoding='utf-8') as f:
    struct_chunks = json.load(f)

print(f"Structural Balanced chunks: {len(struct_bal_chunks)}")
print(f"Structural chunks: {len(struct_chunks)}")

# Dataset name -> loaded chunks; later cells iterate over this mapping.
datasets = {
    'Structural_Balanced': struct_bal_chunks,
    'Structural_Hierarchical': struct_chunks
}

Structural Balanced chunks: 368
Structural chunks: 380


In [None]:
from google.colab import userdata
from huggingface_hub import login
# Voyage AI API key, read from the Colab `userdata` entry named 'Voyageai'.
# Later cells pass it to voyageai.Client(api_key=v_api).
v_api = userdata.get('Voyageai')

# Log in to the Hugging Face Hub with the token stored under 'HF_TOKEN'.
login(token=userdata.get('HF_TOKEN'))

In [None]:
import voyageai
import time
import numpy as np
from tqdm import tqdm
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type

def setup_voyage():
    """Create and return a Voyage AI client using the notebook-level ``v_api`` key."""
    return voyageai.Client(api_key=v_api)

@retry(
    retry=retry_if_exception_type(Exception),  # every exception type is retried
    wait=wait_exponential(multiplier=1, min=25, max=60),
    stop=stop_after_attempt(6),
    reraise=True,
)
def embed_batch_with_retry(vo, batch_texts):
    """Embed one batch of documents with the ``voyage-3.5`` model.

    The call is wrapped by tenacity: up to 6 attempts, exponential wait bounded
    to the 25-60 second range, retrying on any exception; ``reraise=True`` is
    set so the underlying exception surfaces once the attempts are used up.

    Args:
        vo: Voyage AI client exposing ``embed``.
        batch_texts: Texts to embed in a single request.

    Returns:
        The ``embeddings`` attribute of the response.
    """
    response = vo.embed(batch_texts, model="voyage-3.5", input_type="document")
    return response.embeddings

def embed_with_voyage(texts, vo, batch_size=5):
    """
    Embed ``texts`` with Voyage AI, one batch at a time.

    Each batch goes through ``embed_batch_with_retry``. A one-second pause is
    inserted after every successful batch except the last. If a batch still
    fails, the failure is printed and the exception is re-raised, which stops
    the run.

    Args:
        texts: List of texts to embed
        vo: Voyage AI client instance
        batch_size: Number of texts sent per request

    Returns:
        numpy array built from the collected embeddings
    """
    collected = []
    total = len(texts)

    for start in tqdm(range(0, total, batch_size), desc="Voyage embedding"):
        current_batch = texts[start:start + batch_size]
        batch_number = start // batch_size + 1

        try:
            collected.extend(embed_batch_with_retry(vo, current_batch))

            # Short courtesy pause between requests; skipped after the final batch.
            has_more_batches = start + batch_size < total
            if has_more_batches:
                time.sleep(1)

        except Exception as e:
            print(f"Failed to embed batch {batch_number} after 6 attempts: {e}")
            # Abort the whole run rather than return a partial embedding matrix.
            raise e

    return np.array(collected)

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch.nn.functional as F

# Setup Qwen model
def setup_qwen():
    """Load the ``Qwen/Qwen3-0.6B`` tokenizer and model for inference.

    The model is switched to eval mode and moved to the GPU when CUDA is
    available.

    Returns:
        ``(model, tokenizer)`` tuple.
    """
    checkpoint = "Qwen/Qwen3-0.6B"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModel.from_pretrained(checkpoint)
    model.eval()

    use_gpu = torch.cuda.is_available()
    return (model.cuda() if use_gpu else model), tokenizer

def embed_with_qwen(texts, model, tokenizer, batch_size=8):
    """Embed texts by mean-pooling the model's last hidden state.

    Pooling is weighted by the tokenizer's attention mask, so padding positions
    added by ``padding=True`` do not contribute to the average. Without the
    mask, a text's embedding would change with the length of the other texts in
    its batch.

    Args:
        texts: List of strings to embed.
        model: Callable invoked as ``model(**inputs)`` whose result exposes
            ``last_hidden_state`` (presumably the model from ``setup_qwen``).
        tokenizer: Callable tokenizer returning tensors, including
            ``attention_mask`` when it produces one.
        batch_size: Number of texts per forward pass.

    Returns:
        numpy array with one embedding row per input text (un-normalized).
    """
    embeddings = []

    for i in tqdm(range(0, len(texts), batch_size), desc="Qwen embedding"):
        batch_texts = texts[i:i + batch_size]

        # Tokenize; every sequence is padded to the longest one in the batch.
        inputs = tokenizer(batch_texts, padding=True, truncation=True,
                           max_length=32768, return_tensors="pt")

        if torch.cuda.is_available():
            inputs = {k: v.cuda() for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)
            hidden = outputs.last_hidden_state
            attention_mask = inputs.get("attention_mask")

            if attention_mask is None:
                # No mask available: fall back to a plain mean over positions.
                batch_embeddings = hidden.mean(dim=1)
            else:
                # Masked mean: zero out padding positions, then divide by the
                # number of real tokens (clamped to avoid division by zero).
                mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
                token_counts = mask.sum(dim=1).clamp(min=1)
                batch_embeddings = (hidden * mask).sum(dim=1) / token_counts
            # batch_embeddings = F.normalize(batch_embeddings, p=2, dim=1)

        embeddings.extend(batch_embeddings.cpu().numpy())

    return np.array(embeddings)

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

def setup_gemma():
    """Load the ``google/embeddinggemma-300m`` tokenizer and model for inference.

    The model is loaded with ``trust_remote_code=True``, switched to eval mode
    and moved to the GPU when CUDA is available.

    Returns:
        ``(model, tokenizer)`` tuple.
    """
    checkpoint = "google/embeddinggemma-300m"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModel.from_pretrained(checkpoint, trust_remote_code=True)
    model.eval()

    use_gpu = torch.cuda.is_available()
    return (model.cuda() if use_gpu else model), tokenizer

def embed_with_gemma(texts, model, tokenizer, batch_size=8):
    """Embed texts with the Gemma embedding model.

    The model's ``pooler_output`` is used when it is present and not ``None``.
    Otherwise the last hidden state is mean-pooled, weighted by the attention
    mask so that padding positions added by ``padding=True`` do not leak into
    the average.

    Args:
        texts: List of strings to embed.
        model: Callable invoked as ``model(**inputs)`` (presumably the model
            from ``setup_gemma``).
        tokenizer: Callable tokenizer returning tensors, including
            ``attention_mask`` when it produces one.
        batch_size: Number of texts per forward pass.

    Returns:
        numpy array with one embedding row per input text (un-normalized).
    """
    embeddings = []

    for i in tqdm(range(0, len(texts), batch_size), desc="Gemma embedding"):
        batch_texts = texts[i:i + batch_size]

        # Tokenize; inputs longer than 2048 tokens are truncated.
        inputs = tokenizer(batch_texts, padding=True, truncation=True,
                           max_length=2048, return_tensors="pt")

        if torch.cuda.is_available():
            inputs = {k: v.cuda() for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)
            pooled = getattr(outputs, 'pooler_output', None)

            if pooled is not None:
                batch_embeddings = pooled
            else:
                hidden = outputs.last_hidden_state
                attention_mask = inputs.get("attention_mask")
                if attention_mask is None:
                    # No mask available: plain mean over all positions.
                    batch_embeddings = hidden.mean(dim=1)
                else:
                    # Masked mean over real tokens only; the count is clamped
                    # so an all-padding row cannot divide by zero.
                    mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
                    token_counts = mask.sum(dim=1).clamp(min=1)
                    batch_embeddings = (hidden * mask).sum(dim=1) / token_counts

            # batch_embeddings = F.normalize(batch_embeddings, p=2, dim=1)

        embeddings.extend(batch_embeddings.cpu().numpy())

    return np.array(embeddings)

In [None]:
def process_dataset(dataset_name, chunks, models_config):
    """
    Embed one dataset with every configured model.

    Chunks whose text (``'content'``, else ``'text'``) is missing, not a string,
    or whitespace-only are skipped. A model whose embedding call raises is
    reported and left out of the result instead of aborting the other models.

    Args:
        dataset_name: Name of the dataset being processed
        chunks: List of chunk dicts to embed
        models_config: Mapping of model name -> ``(model, embed_func)`` where
            ``embed_func(texts, model)`` returns a 2-D array of embeddings

    Returns:
        Dictionary mapping each successful model name to a dict with
        ``embeddings``, ``texts``, ``metadata`` and ``model_info``. Empty when
        the dataset has no usable text.
    """
    # Extract texts and metadata
    texts = []
    metadata = []

    for i, chunk in enumerate(chunks):
        text = chunk.get('content', chunk.get('text', ''))
        # A None/non-string payload is treated like an empty chunk, so one bad
        # record does not make the whole dataset fail.
        if not isinstance(text, str) or not text.strip():
            continue

        texts.append(text)
        metadata.append({
            'chunk_id': i,  # position in `chunks`, not the chunk's own id field
            'token_count': chunk.get('token_count', len(text.split())),
            # The chunk files shown in this notebook store the origin under
            # 'source_url'; use it when there is no 'source' key.
            'source': chunk.get('source', chunk.get('source_url', 'unknown')),
            'type': chunk.get('type', 'unknown'),
            'char_count': len(text)
        })

    print(f"\nProcessing {dataset_name}: {len(texts)} chunks")

    results = {}

    if not texts:
        # Nothing to embed; calling the models would only yield shape errors.
        print(f"  No non-empty chunks found in {dataset_name}; nothing to embed")
        return results

    # Process each model
    for model_name, (model, embed_func) in models_config.items():
        print(f"\nEmbedding with {model_name}...")

        try:
            embeddings = embed_func(texts, model)

            results[model_name] = {
                'embeddings': embeddings,
                'texts': texts,
                'metadata': metadata,
                'model_info': {
                    'model_name': model_name,
                    'embedding_dim': embeddings.shape[1],
                    'total_chunks': len(texts),
                    'dataset': dataset_name
                }
            }

            print(f"✓ Successfully generated {embeddings.shape[0]} embeddings of dim {embeddings.shape[1]}")

        except Exception as e:
            # Best effort: report the failing model and continue with the rest.
            print(f"✗ Error with {model_name}: {e}")
            print(f"  Skipping {model_name} for dataset {dataset_name}")
            continue

    return results

# Instantiate the local embedding models (the Voyage client is currently disabled).
print("Setting up models...")
try:
    qwen_model, qwen_tokenizer = setup_qwen()
    gemma_model, gemma_tokenizer = setup_gemma()
    # To re-enable Voyage: voyage_client = setup_voyage()
    print("✓ All clients initialized successfully")
except Exception as e:
    # Setup errors are reported rather than raised; only the models that were
    # actually created get registered below.
    print(f"✗ Error setting up client: {e}")
    # (with Voyage enabled: voyage_client = None)

# Model name -> (model, embed_func); process_dataset() calls embed_func(texts, model).
models_config = {}

if 'qwen_model' in locals():
    models_config.update(
        qwen=(qwen_model, lambda texts, model: embed_with_qwen(texts, model, qwen_tokenizer))
    )

if 'gemma_model' in locals():
    models_config.update(
        gemma=(gemma_model, lambda texts, model: embed_with_gemma(texts, model, gemma_tokenizer))
    )

# Voyage registration, kept for reference while the client is disabled:
# if voyage_client is not None:
#     models_config['voyage'] = (
#         voyage_client,
#         lambda texts, model: embed_with_voyage(texts, model, batch_size=3)
#     )

print(f"Active models: {list(models_config)}")

# Run every configured model over every dataset and collect the results.
all_results = {}

# `datasets` must come from an earlier cell and map dataset name -> chunks,
# e.g. datasets = {'dataset1': chunks1, 'dataset2': chunks2, ...}
if 'datasets' not in locals() and 'datasets' not in globals():
    print("⚠️ Warning: 'datasets' variable not found. Please define your datasets first.")
    print("Example: datasets = {'dataset1': chunks1, 'dataset2': chunks2}")

else:
    for dataset_name, chunks in datasets.items():
        print(f"\n{'='*50}")
        print(f"Starting processing for dataset: {dataset_name}")
        print(f"{'='*50}")

        try:
            dataset_results = process_dataset(dataset_name, chunks, models_config)
            all_results[dataset_name] = dataset_results

            # Per-dataset recap of what each model produced.
            print(f"\n📊 Summary for {dataset_name}:")
            for model_name, results in dataset_results.items():
                if not results:
                    continue
                print(f"  {model_name}: {results['model_info']['total_chunks']} chunks, "
                      f"dim {results['model_info']['embedding_dim']}")

        except Exception as e:
            # A failing dataset is reported and the loop moves on to the next one.
            print(f"✗ Failed to process dataset {dataset_name}: {e}")

    # Recap across all datasets that were processed.
    print(f"\n{'='*50}")
    print("FINAL SUMMARY")
    print(f"{'='*50}")
    print(f"Processed {len(all_results)} datasets")
    for dataset_name, dataset_results in all_results.items():
        print(f"\n{dataset_name}:")
        for model_name, results in dataset_results.items():
            if not results:
                print(f"  ✗ {model_name}: Failed")
                continue
            info = results['model_info']
            print(f"  ✓ {model_name}: {info['total_chunks']} embeddings, dim {info['embedding_dim']}")

Setting up models...


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.50G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.49k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.21G [00:00<?, ?B/s]

✓ All clients initialized successfully
Active models: ['qwen', 'gemma']

Starting processing for dataset: Structural_Balanced

Processing Structural_Balanced: 368 chunks

Embedding with qwen...


Qwen embedding: 100%|██████████| 46/46 [02:05<00:00,  2.73s/it]


✓ Successfully generated 368 embeddings of dim 1024

Embedding with gemma...


Gemma embedding: 100%|██████████| 46/46 [00:35<00:00,  1.29it/s]


✓ Successfully generated 368 embeddings of dim 768

📊 Summary for Structural_Balanced:
  qwen: 368 chunks, dim 1024
  gemma: 368 chunks, dim 768

Starting processing for dataset: Structural_Hierarchical

Processing Structural_Hierarchical: 380 chunks

Embedding with qwen...


Qwen embedding: 100%|██████████| 48/48 [01:57<00:00,  2.45s/it]


✓ Successfully generated 380 embeddings of dim 1024

Embedding with gemma...


Gemma embedding: 100%|██████████| 48/48 [00:34<00:00,  1.39it/s]

✓ Successfully generated 380 embeddings of dim 768

📊 Summary for Structural_Hierarchical:
  qwen: 380 chunks, dim 1024
  gemma: 380 chunks, dim 768

FINAL SUMMARY
Processed 2 datasets

Structural_Balanced:
  ✓ qwen: 368 embeddings, dim 1024
  ✓ gemma: 368 embeddings, dim 768

Structural_Hierarchical:
  ✓ qwen: 380 embeddings, dim 1024
  ✓ gemma: 380 embeddings, dim 768





In [None]:
import zipfile
import os

def save_and_download_results(all_results, base_path='/content/embedding_results2',
                              zip_path='/content/embedding_results.zip'):
    """Write all embedding results to disk, zip them, and start the download.

    For every dataset/model pair this writes a JSON file (model info, metadata,
    embeddings, first 10 texts) plus three ``.npy`` files (embeddings, texts,
    metadata). It then writes ``embedding_summary.json``, zips the output
    directory and passes the archive to ``files.download``.

    Args:
        all_results: ``{dataset_name: {model_name: result_dict}}`` where each
            result dict has ``embeddings`` (2-D numpy array), ``texts``,
            ``metadata`` and ``model_info``.
        base_path: Directory that receives the individual result files.
        zip_path: Path of the zip archive to create and download.
    """
    os.makedirs(base_path, exist_ok=True)

    for dataset_name, dataset_results in all_results.items():
        for model_name, results in dataset_results.items():

            # File stem shared by all artefacts of this dataset/model pair
            filename = f"{dataset_name}_{model_name}_embeddings"

            # Save full results as JSON (for metadata)
            json_data = {
                'model_info': results['model_info'],
                'metadata': results['metadata'],
                'embeddings': results['embeddings'].tolist(),
                'texts': results['texts'][:10]  # Save only first 10 texts as sample
            }

            json_path = f"{base_path}/{filename}.json"
            with open(json_path, 'w') as f:
                json.dump(json_data, f, indent=2)

            # Save embeddings as numpy array (more efficient)
            np.save(f"{base_path}/{filename}_embeddings.npy", results['embeddings'])
            np.save(f"{base_path}/{filename}_texts.npy", np.array(results['texts']))
            # Metadata is a list of dicts, so it is stored as an object array;
            # reading it back needs np.load(..., allow_pickle=True).
            np.save(f"{base_path}/{filename}_metadata.npy", np.array(results['metadata']))

            print(f"Saved {model_name} results for {dataset_name}")

    # Create summary
    summary = {}
    for dataset_name, dataset_results in all_results.items():
        summary[dataset_name] = {}
        for model_name, results in dataset_results.items():
            summary[dataset_name][model_name] = {
                'embedding_dim': results['embeddings'].shape[1],
                'num_chunks': results['embeddings'].shape[0],
                'avg_chunk_length': np.mean([len(t.split()) for t in results['texts']])
            }

    # Save summary
    with open(f"{base_path}/embedding_summary.json", 'w') as f:
        json.dump(summary, f, indent=2)

    print("Creating zip file...")

    # Archive names are relative to the parent of base_path, so the zip holds
    # a single top-level folder named after the output directory.
    archive_root = os.path.dirname(os.path.abspath(base_path))
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        # The loop variable must not be called `files`: that would shadow the
        # module-level `files` used for the download below and make
        # `files.download` fail with AttributeError on a list.
        for root, _dirs, filenames in os.walk(base_path):
            for file in filenames:
                file_path = os.path.join(root, file)
                arcname = os.path.relpath(file_path, archive_root)
                zipf.write(file_path, arcname)

    print(f"Zip file created: {zip_path}")
    print("Downloading zip file...")

    # Download the zip file
    files.download(zip_path)

    print("Download complete!")

# Write every result in `all_results` to disk, zip the output directory and
# hand the archive to Colab's files.download.
save_and_download_results(all_results)

Saved qwen results for Structural_Balanced
Saved gemma results for Structural_Balanced
Saved qwen results for Structural_Hierarchical
Saved gemma results for Structural_Hierarchical
Creating zip file...
Zip file created: /content/embedding_results.zip
Downloading zip file...


AttributeError: 'list' object has no attribute 'download'

In [None]:
# NOTE(review): `base_path` and `filename` are local variables of
# save_and_download_results(), so they do not exist at notebook scope and this
# cell raises NameError. It looks like a leftover scratch cell; confirm before keeping it.
np.save(f"{base_path}/{filename}_embeddings.npy", results['embeddings'])


NameError: name 'base_path' is not defined

In [None]:
# Load one saved embedding matrix from disk and display its shape.
# NOTE(review): this reads a 'voyage' file from '/content/embedding_results',
# while save_and_download_results() writes to '/content/embedding_results2' by
# default; presumably the file comes from an earlier run. Confirm the path.
emb = np.load('/content/embedding_results/Structural_Hierarchical_voyage_embeddings_embeddings.npy')

emb.shape

(380, 1024)

In [None]:
def inspect_chunk(chunks, chunk_index, context_range=2):
    """
    Print one chunk's text, its metadata, and a preview of its neighbours.

    Args:
        chunks: List of chunk dicts from your dataset
        chunk_index: Index of the chunk to inspect (0-based, non-negative)
        context_range: Number of chunks before/after to show for context

    Returns:
        None. Everything is printed; an out-of-range index only prints a
        message.
    """
    # Negative indices are rejected too: Python would silently wrap them to the
    # end of the list and the context window would no longer surround the chunk.
    if not 0 <= chunk_index < len(chunks):
        print(f"❌ Chunk index {chunk_index} is out of range. Dataset has {len(chunks)} chunks.")
        return

    chunk = chunks[chunk_index]

    print(f"🔍 INSPECTING CHUNK {chunk_index}")
    print("=" * 60)

    # Show chunk content (first 500 characters)
    text = chunk.get('content', chunk.get('text', ''))
    print(f"Content: {text[:500]}{'...' if len(text) > 500 else ''}")
    print(f"Full length: {len(text)} characters")

    # Show metadata: every key except the text payload itself
    print(f"\nMetadata:")
    for key, value in chunk.items():
        if key not in ['content', 'text']:
            print(f"  {key}: {value}")

    # Show context chunks
    print(f"\n📋 CONTEXT (±{context_range} chunks):")
    start_idx = max(0, chunk_index - context_range)
    end_idx = min(len(chunks), chunk_index + context_range + 1)

    for i in range(start_idx, end_idx):
        marker = "👉 " if i == chunk_index else "   "
        full_text = chunks[i].get('content', chunks[i].get('text', ''))
        context_text = full_text[:100]
        # The ellipsis is decided on the untruncated length, so a text of
        # exactly 100 characters is not marked as cut off.
        print(f"{marker}Chunk {i}: {context_text}{'...' if len(full_text) > 100 else ''}")

def calculate_token_usage(chunks, start_idx=0, end_idx=None):
    """
    Estimate token usage for a range of chunks and print a short report.

    Tokens are approximated per chunk as ``len(text) // 4`` (roughly four
    characters per token), where the text is the chunk's ``'content'`` or,
    failing that, ``'text'``.

    Args:
        chunks: List of chunk dicts
        start_idx: First index of the range (0-based)
        end_idx: End of the range (exclusive); None means through the last chunk

    Returns:
        Sum of the per-chunk token estimates over the range.
    """
    if end_idx is None:
        end_idx = len(chunks)

    # Effective exclusive upper bound, never past the end of the list.
    stop = min(end_idx, len(chunks))

    char_counts = [
        len(chunks[idx].get('content', chunks[idx].get('text', '')))
        for idx in range(start_idx, stop)
    ]
    total_chars = sum(char_counts)
    # Floor-divide per chunk (not on the grand total) to match the estimate rule.
    total_estimated_tokens = sum(count // 4 for count in char_counts)

    chunk_count = stop - start_idx

    print(f"📊 TOKEN USAGE ANALYSIS")
    print(f"Chunks {start_idx} to {stop - 1}:")
    print(f"  Total chunks: {chunk_count}")
    print(f"  Total characters: {total_chars:,}")
    print(f"  Estimated tokens: {total_estimated_tokens:,}")
    print(f"  Average tokens per chunk: {total_estimated_tokens // max(1, chunk_count)}")

    return total_estimated_tokens

def suggest_batch_strategy(chunks, rate_limit_tpm=10000, rate_limit_rpm=3):
    """
    Print a suggested batch size and pacing for the given rate limits.

    The batch size is capped at 5 chunks and at what the tokens-per-minute
    limit allows for an average chunk, but is never below 1. The time estimate
    counts a partial final batch as a full request.

    Args:
        chunks: List of chunk dicts
        rate_limit_tpm: Tokens per minute limit
        rate_limit_rpm: Requests per minute limit

    Returns:
        None. The recommendation is printed.
    """
    print(f"💡 RATE LIMIT STRATEGY")
    print(f"Current limits: {rate_limit_rpm} RPM, {rate_limit_tpm:,} TPM")

    if not chunks:
        # Avoids dividing by len(chunks) == 0 below.
        print("No chunks provided; nothing to batch.")
        return

    # Average estimated tokens per chunk, floored at 1 so that very short
    # chunks cannot cause a division by zero.
    total_tokens = calculate_token_usage(chunks, 0, len(chunks))
    avg_tokens_per_chunk = max(1, total_tokens // len(chunks))

    # Largest batch the TPM limit allows for an average chunk
    max_batch_size_tpm = rate_limit_tpm // avg_tokens_per_chunk

    # At least one chunk per request, even when an average chunk is larger than
    # the TPM limit (otherwise the estimate below would divide by zero).
    batch_size = max(1, min(5, max_batch_size_tpm))

    # With N requests per minute, one request may be sent every 60/N seconds
    seconds_between_requests = 60 / rate_limit_rpm

    # Ceiling division: a trailing partial batch still costs one request.
    num_batches = -(-len(chunks) // batch_size)

    print(f"\nRecommended strategy:")
    print(f"  Batch size: {batch_size} chunks")
    print(f"  Wait time between batches: {seconds_between_requests:.1f} seconds")
    print(f"  Estimated time for all chunks: {num_batches * seconds_between_requests / 60:.1f} minutes")

# Usage examples:

# 1. Inspect the 219th chunk (remember Python is 0-indexed, so chunk 219 is index 218)
print("🔍 Inspecting chunk that caused the failure...")
if 'datasets' in locals() and 'Structural_Balanced' in datasets:
    chunks = datasets['Structural_Balanced']
    inspect_chunk(chunks, 218)  # 219th chunk is at index 218

    print("\n" + "="*60)

    # Calculate how many tokens were processed before failure
    tokens_processed = calculate_token_usage(chunks, 0, 218)
    print(f"\n📈 Tokens processed before failure: {tokens_processed:,}")

    print("\n" + "="*60)

    # Suggest better batching strategy
    suggest_batch_strategy(chunks)

else:
    # The message names the dataset that the condition above actually checks.
    print("❌ Dataset 'Structural_Balanced' not found. Make sure your datasets variable is defined.")
    print("\nTo inspect chunk 219 manually:")
    print("inspect_chunk(your_chunks_list, 218)  # Remember: 0-indexed!")

🔍 Inspecting chunk that caused the failure...
🔍 INSPECTING CHUNK 218
Content: Mumbai, Maharashtra, India, 400051 Corporate Address : 2 nd Floor B Wing, Building 25, Dhirubhai Ambani Knowledge City,
Kopar Khairane, Navi Mumbai –400 710, India General About Us Help Center Investor Relations Complaint Resolution JioPay Business Partner Program Products JioPay for Business Payment Gateway Point of Sale UPI Hub Biller Centre JioPay Business App Legal Privacy Policy Terms & Conditions Grievance Redressal Policy Merchant Onboarding & KYC-AML Policy BillPay Terms & Conditions Jio...
Full length: 2357 characters

Metadata:
  chunk_id: https://www.jiopay.com/business/upi#structural_chunk_0
  source_url: https://www.jiopay.com/business/upi
  source_title: https://www.jiopay.com/business/upi
  chunk_index: 0
  token_count: 372
  char_count: 2357
  strategy: structural
  strategy_params: {'preserve_hierarchy': True, 'min_chunk_tokens': 150, 'max_chunk_tokens': 1500}
  structural_info: {'structure_t

In [None]:
import voyageai
import time

# Quick smoke test: embed a single short text with Voyage AI to confirm that
# the API key works. If the error message looks like a rate-limit error, wait
# 65 seconds and try once more.
try:
    vo = voyageai.Client(api_key=v_api)

    # Test with just one tiny text
    test_text = ["Hello world"]

    print("Testing Voyage AI...")
    response = vo.embed(
        test_text,
        model="voyage-3.5",
        input_type="document",
    )

    # The "shape" reported here is the length of the first embedding vector.
    print(f"SUCCESS! Got embedding of shape: {len(response.embeddings[0])}")
    print(f"First 5 values: {response.embeddings[0][:5]}")

except Exception as e:
    print(f"FAILED: {e}")

    # If rate limited, wait and try again
    # (detected by looking for "rate limit" / "RPM" in the error text)
    if "rate limit" in str(e).lower() or "RPM" in str(e):
        print("Rate limited. Waiting 65 seconds and trying again...")
        time.sleep(65)
        try:
            # NOTE(review): `vo` and `test_text` are only defined if the
            # statements in the outer try got that far; otherwise this retry
            # ends in the "Still failed" branch with a NameError message.
            response = vo.embed(test_text, model="voyage-3.5", input_type="document")
            print(f"SUCCESS after wait! Got embedding of shape: {len(response.embeddings[0])}")
        except Exception as e2:
            print(f"Still failed: {e2}")

Testing Voyage AI...
SUCCESS! Got embedding of shape: 1024
First 5 values: [0.0023732187692075968, 0.039719145745038986, -0.010803607292473316, 0.04575645551085472, 0.05687781423330307]
