In [3]:
!pip install -r requirements.txt

Looking in indexes: https://pypi.org/simple/, https://aws:****@quickstep-905418116080.d.codeartifact.eu-central-1.amazonaws.com/pypi/librarian/simple/
[0mCollecting torch>=2.0.0 (from -r requirements.txt (line 1))
  Downloading torch-2.7.1-cp311-none-macosx_11_0_arm64.whl.metadata (29 kB)
[0mCollecting transformers>=4.30.0 (from -r requirements.txt (line 2))
  Downloading transformers-4.53.2-py3-none-any.whl.metadata (40 kB)
[0mCollecting datasets>=2.12.0 (from -r requirements.txt (line 3))
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
[0mCollecting sentence-transformers>=2.2.0 (from -r requirements.txt (line 4))
  Downloading sentence_transformers-5.0.0-py3-none-any.whl.metadata (16 kB)
[0mCollecting scikit-learn>=1.3.0 (from -r requirements.txt (line 5))
  Downloading scikit_learn-1.7.0-cp311-cp311-macosx_12_0_arm64.whl.metadata (31 kB)
[0mCollecting numpy>=1.24.0 (from -r requirements.txt (line 6))
  Downloading numpy-2.3.1-cp311-cp311-macosx_14_0_arm64.whl.me

In [92]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
import numpy as np
from typing import List, Tuple, Dict, Optional, Union
import json
from tqdm import tqdm
import os
import platform
import requests
import gzip
import csv
from dataclasses import dataclass
import logging
from sklearn.metrics.pairwise import cosine_similarity
import random
from datasets import load_dataset, Dataset as HFDataset
import pickle
import warnings
from torch.serialization import safe_globals
warnings.filterwarnings("ignore")

In [81]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

In [82]:
@dataclass
class EmbeddingConfig:
    """Configuration for multilingual embedding model"""
    base_model: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
    embedding_dim: int = 768  # Good dimension for multilingual tasks
    max_length: int = 512
    batch_size: int = 16  # Optimized for Mac M1/M2 and Windows
    learning_rate: float = 2e-5
    num_epochs: int = 4
    warmup_steps: int = 1000
    temperature: float = 0.05
    margin: float = 0.3
    gradient_accumulation_steps: int = 2
    max_grad_norm: float = 1.0
    weight_decay: float = 0.01
    save_steps: int = 1000
    eval_steps: int = 500
    # Dataset configuration
    dataset_name: str = "sentence-transformers/all-nli"
    max_train_samples: int = 50000
    max_val_samples: int = 5000
    dataset_config_name: str = "pair-class"
    # Multilingual settings
    languages: List[str] = None  # Will be set to common languages
    
    def __post_init__(self):
        if self.languages is None:
            self.languages = ['en', 'de', 'fr', 'es', 'it', 'pt', 'nl', 'pl', 'ru', 'zh', 'ja', 'ar']

In [83]:
class MultilingualDatasetProcessor:
    """Process datasets for multilingual embedding training"""
    
    def __init__(self, config: EmbeddingConfig):
        self.config = config
        self.data_dir = "embedding_data"
        os.makedirs(self.data_dir, exist_ok=True)
        
    def download_and_process_allnli(self):
        """Download and process AllNLI dataset"""
        logger.info("Loading AllNLI dataset...")
        
        try:
            # Load the dataset
            dataset = load_dataset(self.config.dataset_name, self.config.dataset_config_name)
            print(f"Dataset loaded: {self.config.dataset_name}", dataset)
            # Process training data
            train_data = self._process_nli_split(dataset['train'], 'train')
            val_data = self._process_nli_split(dataset['dev'], 'validation')
            
            # Save processed data
            self._save_processed_data(train_data, 'train_data.json')
            self._save_processed_data(val_data, 'val_data.json')
            
            logger.info(f"Processed {len(train_data)} training samples and {len(val_data)} validation samples")
            return True
            
        except Exception as e:
            logger.error(f"Error processing AllNLI dataset: {e}")
            return False
    
    def download_and_process_msmarco(self):
        """Download and process MS MARCO dataset"""
        logger.info("Loading MS MARCO dataset...")
        
        try:
            # Load MS MARCO passage ranking dataset
            dataset = load_dataset("ms_marco", "v1.1")
            
            # Process the dataset
            train_data = self._process_msmarco_split(dataset['train'])
            val_data = self._process_msmarco_split(dataset['validation'])
            
            # Save processed data
            self._save_processed_data(train_data, 'train_data.json')
            self._save_processed_data(val_data, 'val_data.json')
            
            logger.info(f"Processed {len(train_data)} training samples and {len(val_data)} validation samples")
            return True
            
        except Exception as e:
            logger.error(f"Error processing MS MARCO dataset: {e}")
            return False
    
    def _process_nli_split(self, split_data, split_name):
        """Process NLI data split"""
        processed_data = []
        max_samples = self.config.max_train_samples if split_name == 'train' else self.config.max_val_samples
        
        # Sample data if too large
        if len(split_data) > max_samples:
            indices = random.sample(range(len(split_data)), max_samples)
            split_data = split_data.select(indices)
        
        for item in tqdm(split_data, desc=f"Processing {split_name} data"):
            if item['label'] == 0:  # Entailment - positive pair
                processed_data.append({
                    'query': item['premise'],
                    'positive': item['hypothesis'],
                    'negative': self._get_random_negative(split_data, item['premise'])
                })
            elif item['label'] == 2:  # Contradiction - can be used as negative
                # Find a neutral or entailment pair for positive
                positive_text = self._get_random_positive(split_data, item['premise'])
                if positive_text:
                    processed_data.append({
                        'query': item['premise'],
                        'positive': positive_text,
                        'negative': item['hypothesis']
                    })
        
        return processed_data
    
    def _process_msmarco_split(self, split_data):
        """Process MS MARCO data split"""
        processed_data = []
        
        for item in tqdm(split_data, desc="Processing MS MARCO data"):
            if 'passages' in item and 'is_selected' in item:
                query = item['query']
                positive_passages = [p['passage_text'] for p, selected in zip(item['passages'], item['is_selected']) if selected]
                negative_passages = [p['passage_text'] for p, selected in zip(item['passages'], item['is_selected']) if not selected]
                
                # Create training pairs
                for positive in positive_passages:
                    if negative_passages:
                        negative = random.choice(negative_passages)
                        processed_data.append({
                            'query': query,
                            'positive': positive,
                            'negative': negative
                        })
        
        return processed_data
    
    def _get_random_negative(self, dataset, query):
        """Get a random negative example"""
        max_attempts = 10
        for _ in range(max_attempts):
            random_item = random.choice(dataset)
            if random_item['premise'] != query:
                return random_item['hypothesis']
        return "This is a random negative example."
    
    def _get_random_positive(self, dataset, query):
        """Get a random positive example"""
        for item in dataset:
            if item['premise'] == query and item['label'] == 0:
                return item['hypothesis']
        return None
    
    def _save_processed_data(self, data, filename):
        """Save processed data to JSON file"""
        filepath = os.path.join(self.data_dir, filename)
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        logger.info(f"Saved {len(data)} samples to {filepath}")

In [84]:
class MultilingualEmbeddingDataset(Dataset):
    """Dataset for multilingual embedding training"""
    
    def __init__(self, data_path: str, tokenizer, max_length: int = 512):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.data = self.load_data(data_path)
        
    def load_data(self, data_path: str) -> List[Dict]:
        """Load training data from JSON file"""
        if not os.path.exists(data_path):
            raise FileNotFoundError(f"Data file not found: {data_path}")
        
        with open(data_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        
        logger.info(f"Loaded {len(data)} samples from {data_path}")
        return data
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        item = self.data[idx]
        
        # Tokenize query, positive, and negative examples
        query = self.tokenize_text(item['query'])
        positive = self.tokenize_text(item['positive'])
        negative = self.tokenize_text(item['negative'])
        
        return {
            'query': query,
            'positive': positive,
            'negative': negative
        }
    
    def tokenize_text(self, text: str) -> Dict[str, torch.Tensor]:
        """Tokenize text with proper padding and truncation"""
        if not isinstance(text, str):
            text = str(text)
        
        encoded = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoded['input_ids'].squeeze(),
            'attention_mask': encoded['attention_mask'].squeeze()
        }

In [85]:
class MultilingualEmbeddingModel(nn.Module):
    """Multilingual embedding model with advanced pooling"""
    
    def __init__(self, config: EmbeddingConfig):
        super().__init__()
        self.config = config
        self.base_model = AutoModel.from_pretrained(config.base_model)
        
        # Get the hidden size from base model
        self.hidden_size = self.base_model.config.hidden_size
        
        # Projection layers
        self.projection = nn.Sequential(
            nn.Linear(self.hidden_size, config.embedding_dim),
            nn.Tanh(),
            nn.Linear(config.embedding_dim, config.embedding_dim)
        )
        
        # Dropout for regularization
        self.dropout = nn.Dropout(0.1)
        
        # Layer normalization
        self.layer_norm = nn.LayerNorm(config.embedding_dim)
        
        # Initialize weights
        self._init_weights()
    
    def _init_weights(self):
        """Initialize projection layer weights"""
        for module in self.projection:
            if isinstance(module, nn.Linear):
                nn.init.normal_(module.weight, mean=0.0, std=0.02)
                nn.init.zeros_(module.bias)
    
    def mean_pooling(self, model_output, attention_mask):
        """Mean pooling with attention mask"""
        token_embeddings = model_output[0]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    
    def cls_pooling(self, model_output):
        """CLS token pooling"""
        return model_output[0][:, 0]
    
    def max_pooling(self, model_output, attention_mask):
        """Max pooling with attention mask"""
        token_embeddings = model_output[0]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        token_embeddings[input_mask_expanded == 0] = -1e9
        return torch.max(token_embeddings, 1)[0]
    
    def forward(self, input_ids, attention_mask, pooling_strategy='mean'):
        """Forward pass through the model"""
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        
        # Apply different pooling strategies
        if pooling_strategy == 'mean':
            pooled_output = self.mean_pooling(outputs, attention_mask)
        elif pooling_strategy == 'cls':
            pooled_output = self.cls_pooling(outputs)
        elif pooling_strategy == 'max':
            pooled_output = self.max_pooling(outputs, attention_mask)
        else:
            pooled_output = self.mean_pooling(outputs, attention_mask)
        
        # Apply projection
        embeddings = self.projection(pooled_output)
        
        # Apply dropout and layer norm
        embeddings = self.dropout(embeddings)
        embeddings = self.layer_norm(embeddings)
        
        # L2 normalize embeddings
        embeddings = F.normalize(embeddings, p=2, dim=1)
        
        return embeddings

In [86]:
class MultiScaleLoss(nn.Module):
    """Combined loss function for better training"""
    
    def __init__(self, margin: float = 0.3, temperature: float = 0.05, alpha: float = 0.5):
        super().__init__()
        self.margin = margin
        self.temperature = temperature
        self.alpha = alpha
    
    def triplet_loss(self, anchor, positive, negative):
        """Triplet loss"""
        pos_dist = F.pairwise_distance(anchor, positive, p=2)
        neg_dist = F.pairwise_distance(anchor, negative, p=2)
        loss = F.relu(pos_dist - neg_dist + self.margin)
        return loss.mean()
    
    def contrastive_loss(self, anchor, positive, negative):
        """Contrastive loss with temperature scaling"""
        pos_sim = F.cosine_similarity(anchor, positive, dim=1) / self.temperature
        neg_sim = F.cosine_similarity(anchor, negative, dim=1) / self.temperature
        
        logits = torch.cat([pos_sim.unsqueeze(1), neg_sim.unsqueeze(1)], dim=1)
        labels = torch.zeros(logits.size(0), dtype=torch.long, device=logits.device)
        
        loss = F.cross_entropy(logits, labels)
        return loss
    
    def forward(self, anchor, positive, negative):
        """Compute combined loss"""
        triplet_loss = self.triplet_loss(anchor, positive, negative)
        contrastive_loss = self.contrastive_loss(anchor, positive, negative)
        
        return self.alpha * triplet_loss + (1 - self.alpha) * contrastive_loss

In [87]:
class MultilingualEmbeddingTrainer:
    """Trainer for multilingual embedding models"""
    
    def __init__(self, model: MultilingualEmbeddingModel, config: EmbeddingConfig):
        self.model = model
        self.config = config
        
        # Setup device (Mac M1/M2 and Windows compatible)
        if torch.backends.mps.is_available():
            self.device = torch.device('mps')  # Mac M1/M2
            logger.info("Using MPS (Apple Silicon) device")
        elif torch.cuda.is_available():
            self.device = torch.device('cuda')  # Windows/Linux with CUDA
            logger.info(f"Using CUDA device: {torch.cuda.get_device_name()}")
        else:
            self.device = torch.device('cpu')
            logger.info("Using CPU device")
        
        self.model.to(self.device)
        
        # Initialize loss
        self.criterion = MultiScaleLoss(
            margin=config.margin,
            temperature=config.temperature,
            alpha=0.5
        )
        
        # Initialize optimizer
        self.optimizer = torch.optim.AdamW(
            self.model.parameters(),
            lr=config.learning_rate,
            weight_decay=config.weight_decay
        )
        
        # Initialize metrics
        self.train_losses = []
        self.val_losses = []
        self.global_step = 0
        
    def train_epoch(self, dataloader: DataLoader, epoch: int):
        """Train for one epoch"""
        self.model.train()
        total_loss = 0
        num_batches = 0
        
        progress_bar = tqdm(dataloader, desc=f'Epoch {epoch+1}/{self.config.num_epochs}')
        
        for batch_idx, batch in enumerate(progress_bar):
            try:
                # Move batch to device
                query = {k: v.to(self.device) for k, v in batch['query'].items()}
                positive = {k: v.to(self.device) for k, v in batch['positive'].items()}
                negative = {k: v.to(self.device) for k, v in batch['negative'].items()}
                
                # Forward pass
                query_emb = self.model(**query)
                positive_emb = self.model(**positive)
                negative_emb = self.model(**negative)
                
                # Compute loss
                loss = self.criterion(query_emb, positive_emb, negative_emb)
                
                # Scale loss for gradient accumulation
                loss = loss / self.config.gradient_accumulation_steps
                
                # Backward pass
                loss.backward()
                
                # Gradient accumulation
                if (batch_idx + 1) % self.config.gradient_accumulation_steps == 0:
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config.max_grad_norm)
                    self.optimizer.step()
                    self.optimizer.zero_grad()
                    self.global_step += 1
                
                total_loss += loss.item() * self.config.gradient_accumulation_steps
                num_batches += 1
                
                # Update progress bar
                progress_bar.set_postfix({
                    'loss': f'{loss.item() * self.config.gradient_accumulation_steps:.4f}',
                    'avg_loss': f'{total_loss / num_batches:.4f}'
                })
                
                # Save checkpoint
                if self.global_step % self.config.save_steps == 0:
                    self.save_checkpoint(f'checkpoint_step_{self.global_step}.pt')
                
            except Exception as e:
                logger.error(f"Error in training batch {batch_idx}: {e}")
                continue
        
        return total_loss / num_batches if num_batches > 0 else 0
    
    def validate(self, dataloader: DataLoader):
        """Validate the model"""
        self.model.eval()
        total_loss = 0
        num_batches = 0
        
        with torch.no_grad():
            for batch in tqdm(dataloader, desc='Validating'):
                try:
                    query = {k: v.to(self.device) for k, v in batch['query'].items()}
                    positive = {k: v.to(self.device) for k, v in batch['positive'].items()}
                    negative = {k: v.to(self.device) for k, v in batch['negative'].items()}
                    
                    query_emb = self.model(**query)
                    positive_emb = self.model(**positive)
                    negative_emb = self.model(**negative)
                    
                    loss = self.criterion(query_emb, positive_emb, negative_emb)
                    total_loss += loss.item()
                    num_batches += 1
                    
                except Exception as e:
                    logger.error(f"Error in validation batch: {e}")
                    continue
        
        return total_loss / num_batches if num_batches > 0 else float('inf')
    
    def train(self, train_dataloader: DataLoader, val_dataloader: Optional[DataLoader] = None):
        """Full training loop"""
        # Initialize scheduler
        total_steps = len(train_dataloader) * self.config.num_epochs // self.config.gradient_accumulation_steps
        scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=self.config.warmup_steps,
            num_training_steps=total_steps
        )
        
        best_val_loss = float('inf')
        
        logger.info(f"Starting training for {self.config.num_epochs} epochs...")
        logger.info(f"Total training steps: {total_steps}")
        
        for epoch in range(self.config.num_epochs):
            # Train
            train_loss = self.train_epoch(train_dataloader, epoch)
            self.train_losses.append(train_loss)
            
            logger.info(f'Epoch {epoch+1}/{self.config.num_epochs} - Train Loss: {train_loss:.4f}')
            
            # Validate
            if val_dataloader:
                val_loss = self.validate(val_dataloader)
                self.val_losses.append(val_loss)
                logger.info(f'Epoch {epoch+1}/{self.config.num_epochs} - Val Loss: {val_loss:.4f}')
                
                # Save best model
                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    self.save_model(f'best_multilingual_embedding_model.pt')
                    logger.info(f'New best model saved with val loss: {val_loss:.4f}')
            
            # Update scheduler
            scheduler.step()
        
        # Save final model
        self.save_model('final_multilingual_embedding_model.pt')
        logger.info('Training completed!')
    
    def save_model(self, path: str):
        """Save model checkpoint"""
        save_dir = "model_checkpoints"
        os.makedirs(save_dir, exist_ok=True)
        full_path = os.path.join(save_dir, path)
        
        torch.save({
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'config': self.config,
            'train_losses': self.train_losses,
            'val_losses': self.val_losses,
            'global_step': self.global_step
        }, full_path)
        
        logger.info(f'Model saved to {full_path}')
    
    def save_checkpoint(self, path: str):
        """Save training checkpoint"""
        save_dir = "model_checkpoints"
        os.makedirs(save_dir, exist_ok=True)
        full_path = os.path.join(save_dir, path)
        
        torch.save({
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'config': self.config,
            'train_losses': self.train_losses,
            'val_losses': self.val_losses,
            'global_step': self.global_step
        }, full_path)

In [93]:
class MultilingualEmbeddingInference:
    """Inference class for multilingual embedding model"""
    
    def __init__(self, model_path: str, config: Optional[EmbeddingConfig] = None):
        # Setup device
        if torch.backends.mps.is_available():
            self.device = torch.device('mps')
        elif torch.cuda.is_available():
            self.device = torch.device('cuda')
        else:
            self.device = torch.device('cpu')
        
        with safe_globals({"__main__.EmbeddingConfig": EmbeddingConfig}):
            checkpoint = torch.load(model_path, map_location=self.device, weights_only=False)
        # Load checkpoint
        # checkpoint = torch.load(model_path, map_location=self.device)
        self.config = config or checkpoint['config']
        
        # Load tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(self.config.base_model)
        
        # Load model
        self.model = MultilingualEmbeddingModel(self.config)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.model.to(self.device)
        self.model.eval()
        
        logger.info(f"Model loaded from {model_path}")
        logger.info(f"Using device: {self.device}")
    
    def encode(self, texts: Union[str, List[str]], batch_size: int = 32, 
               show_progress: bool = True, normalize: bool = True) -> np.ndarray:
        """Encode texts to embeddings"""
        if isinstance(texts, str):
            texts = [texts]
        
        all_embeddings = []
        
        iterator = range(0, len(texts), batch_size)
        if show_progress:
            iterator = tqdm(iterator, desc="Encoding")
        
        for i in iterator:
            batch_texts = texts[i:i + batch_size]
            
            # Tokenize
            try:
                encoded = self.tokenizer(
                    batch_texts,
                    max_length=self.config.max_length,
                    padding=True,
                    truncation=True,
                    return_tensors='pt'
                )
                
                # Move to device
                input_ids = encoded['input_ids'].to(self.device)
                attention_mask = encoded['attention_mask'].to(self.device)
                
                # Generate embeddings
                with torch.no_grad():
                    embeddings = self.model(input_ids, attention_mask)
                    all_embeddings.append(embeddings.cpu().numpy())
                    
            except Exception as e:
                logger.error(f"Error encoding batch: {e}")
                # Return zero embeddings for failed batch
                zero_embeddings = np.zeros((len(batch_texts), self.config.embedding_dim))
                all_embeddings.append(zero_embeddings)
        
        result = np.vstack(all_embeddings)
        
        if normalize:
            result = result / (np.linalg.norm(result, axis=1, keepdims=True) + 1e-8)
        
        return result
    
    def similarity(self, text1: str, text2: str) -> float:
        """Compute cosine similarity between two texts"""
        embeddings = self.encode([text1, text2], show_progress=False)
        return float(cosine_similarity([embeddings[0]], [embeddings[1]])[0][0])
    
    def find_most_similar(self, query: str, texts: List[str], top_k: int = 5) -> List[Tuple[str, float]]:
        """Find most similar texts to query"""
        query_embedding = self.encode([query], show_progress=False)
        text_embeddings = self.encode(texts, show_progress=True)
        
        similarities = cosine_similarity(query_embedding, text_embeddings)[0]
        
        # Get top k most similar
        top_indices = np.argsort(similarities)[::-1][:top_k]
        
        results = []
        for idx in top_indices:
            results.append((texts[idx], float(similarities[idx])))
        
        return results

In [89]:
def setup_environment():
    """Setup environment for cross-platform compatibility"""
    system = platform.system()
    logger.info(f"Running on {system}")
    
    if system == "Darwin":  # macOS
        logger.info("Mac detected - optimizing for Apple Silicon")
        if torch.backends.mps.is_available():
            logger.info("MPS backend available")
        else:
            logger.info("MPS backend not available, using CPU")
    elif system == "Windows":
        logger.info("Windows detected")
        if torch.cuda.is_available():
            logger.info(f"CUDA available: {torch.cuda.get_device_name()}")
        else:
            logger.info("CUDA not available, using CPU")
    
    # Set environment variables for better performance
    os.environ['TOKENIZERS_PARALLELISM'] = 'false'
    
    # Set random seeds for reproducibility
    torch.manual_seed(42)
    np.random.seed(42)
    random.seed(42)
    
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(42)

In [90]:
def main():
    """Main training and inference function"""
    # Setup environment
    setup_environment()
    
    # Configuration
    config = EmbeddingConfig(
        base_model="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
        embedding_dim=768,
        max_length=512,
        batch_size=8,  # Reduced for better compatibility
        learning_rate=2e-5,
        num_epochs=3,
        warmup_steps=500,
        temperature=0.05,
        margin=0.3,
        gradient_accumulation_steps=4,
        max_train_samples=20000,
        max_val_samples=2000,
        dataset_config_name = "pair-class"
    )
    
    logger.info("Starting multilingual embedding model training...")
    
    # Initialize dataset processor
    processor = MultilingualDatasetProcessor(config)
    
    # Download and process dataset
    logger.info("Processing dataset...")
    if not processor.download_and_process_allnli():
        logger.error("Failed to process dataset")
        return
    
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(config.base_model)
    
    # Create datasets
    train_dataset = MultilingualEmbeddingDataset(
        os.path.join(processor.data_dir, 'train_data.json'),
        tokenizer,
        config.max_length
    )
    
    val_dataset = MultilingualEmbeddingDataset(
        os.path.join(processor.data_dir, 'val_data.json'),
        tokenizer,
        config.max_length
    )
    
    # Create data loaders
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=config.batch_size,
        shuffle=True,
        num_workers=0,  # Set to 0 for Mac compatibility
        pin_memory=False
    )
    
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=config.batch_size,
        shuffle=False,
        num_workers=0,
        pin_memory=False
    )
    
    # Initialize model
    model = MultilingualEmbeddingModel(config)
    
    # Initialize trainer
    trainer = MultilingualEmbeddingTrainer(model, config)
    
    # Train model
    trainer.train(train_dataloader, val_dataloader)
    
    # Test inference
    logger.info("Testing inference...")
    try:
        inference = MultilingualEmbeddingInference(
            'model_checkpoints/best_multilingual_embedding_model.pt',
            config
        )
        
        # Test multilingual similarity
        test_cases = [
            ("What is machine learning?", "Machine learning is a subset of AI"),
            ("Was ist maschinelles Lernen?", "Machine learning is a subset of AI"),  # German
            ("Qu'est-ce que l'apprentissage automatique?", "Machine learning is a subset of AI"),  # French
            ("¿Qué es el aprendizaje automático?", "Machine learning is a subset of AI"),  # Spanish
        ]
        
        for query, text in test_cases:
            similarity = inference.similarity(query, text)
            logger.info(f"Similarity between '{query}' and '{text}': {similarity:.4f}")
        
        # Test embedding generation
        test_texts = [
            "Hello world",
            "Hola mundo",
            "Bonjour le monde",
            "Hallo Welt",
            "Ciao mondo"
        ]
        
        embeddings = inference.encode(test_texts)
        logger.info(f"Generated embeddings shape: {embeddings.shape}")
        
        # Test cross-lingual retrieval
        corpus = [
            "Machine learning is a method of data analysis that automates analytical model building.",
            "Maschinelles Lernen ist eine Methode der Datenanalyse, die den Aufbau analytischer Modelle automatisiert.",
            "L'apprentissage automatique est une méthode d'analyse de données qui automatise la construction de modèles analytiques.",
            "El aprendizaje automático es un método de análisis de datos que automatiza la construcción de modelos analíticos.",
            "I love cooking pasta with tomatoes.",
            "Sports are great for physical fitness.",
            "Music helps me relax after work."
        ]
        
        # Query in German, should find ML-related sentences
        german_query = "Was ist maschinelles Lernen?"
        results = inference.find_most_similar(german_query, corpus, top_k=3)
        
        logger.info(f"\nTop 3 results for German query: '{german_query}'")
        for i, (text, score) in enumerate(results):
            logger.info(f"{i+1}. Score: {score:.4f} - Text: {text[:100]}...")
        
        # Query in English, should find ML-related sentences
        english_query = "What is machine learning?"
        results = inference.find_most_similar(english_query, corpus, top_k=3)
        
        logger.info(f"\nTop 3 results for English query: '{english_query}'")
        for i, (text, score) in enumerate(results):
            logger.info(f"{i+1}. Score: {score:.4f} - Text: {text[:100]}...")
        
    except Exception as e:
        logger.error(f"Error in inference testing: {e}")
        logger.info("You can still use the trained model for inference later.")

In [91]:
main()

2025-07-11 19:57:19,537 - INFO - Running on Darwin
2025-07-11 19:57:19,538 - INFO - Mac detected - optimizing for Apple Silicon
2025-07-11 19:57:19,539 - INFO - MPS backend available
2025-07-11 19:57:19,542 - INFO - Starting multilingual embedding model training...
2025-07-11 19:57:19,543 - INFO - Processing dataset...
2025-07-11 19:57:19,543 - INFO - Loading AllNLI dataset...
Generating train split: 100%|██████████| 942069/942069 [00:00<00:00, 6473324.54 examples/s]
Generating dev split: 100%|██████████| 19657/19657 [00:00<00:00, 4884905.42 examples/s]
Generating test split: 100%|██████████| 19656/19656 [00:00<00:00, 5464883.96 examples/s]


Dataset loaded: sentence-transformers/all-nli DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 942069
    })
    dev: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 19657
    })
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 19656
    })
})


Processing train data: 100%|██████████| 20000/20000 [40:32<00:00,  8.22it/s] 
Processing validation data: 100%|██████████| 2000/2000 [00:21<00:00, 93.30it/s] 
2025-07-11 20:38:36,985 - INFO - Saved 6886 samples to embedding_data/train_data.json
2025-07-11 20:38:36,988 - INFO - Saved 779 samples to embedding_data/val_data.json
2025-07-11 20:38:36,988 - INFO - Processed 6886 training samples and 779 validation samples
2025-07-11 20:38:38,194 - INFO - Loaded 6886 samples from embedding_data/train_data.json
2025-07-11 20:38:38,196 - INFO - Loaded 779 samples from embedding_data/val_data.json
2025-07-11 20:39:44,621 - INFO - Using MPS (Apple Silicon) device
2025-07-11 20:39:45,712 - INFO - Starting training for 3 epochs...
2025-07-11 20:39:45,713 - INFO - Total training steps: 645
Epoch 1/3: 100%|██████████| 861/861 [31:46<00:00,  2.21s/it, loss=0.0946, avg_loss=0.0111] 
2025-07-11 21:11:32,376 - INFO - Epoch 1/3 - Train Loss: 0.0111
Validating: 100%|██████████| 98/98 [01:03<00:00,  1.55it/

In [94]:
config = EmbeddingConfig(
        base_model="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
        embedding_dim=768,
        max_length=512,
        batch_size=8,  # Reduced for better compatibility
        learning_rate=2e-5,
        num_epochs=3,
        warmup_steps=500,
        temperature=0.05,
        margin=0.3,
        gradient_accumulation_steps=4,
        max_train_samples=20000,
        max_val_samples=2000,
        dataset_config_name = "pair-class"
    )

In [102]:
inference = MultilingualEmbeddingInference(
            'model_checkpoints/final_multilingual_embedding_model.pt',
            config
        )

2025-07-11 22:38:50,953 - INFO - Model loaded from model_checkpoints/final_multilingual_embedding_model.pt
2025-07-11 22:38:50,954 - INFO - Using device: mps


In [122]:
# Test multilingual similarity
# test_cases = [
#     ("What is machine learning?", "Machine learning is a subset of AI"),
#     ("i love dog", "i like cat"),
#     ("Was ist maschinelles Lernen?", "Machine learning is a subset of AI"),  # German
#     ("Qu'est-ce que l'apprentissage automatique?", "Machine learning is a subset of AI"),  # French
#     ("¿Qué es el aprendizaje automático?", "Machine learning is a subset of AI"),  # Spanish
# ]

# for query, text in test_cases:
#     similarity = inference.similarity(query, text)
#     logger.info(f"Similarity between '{query}' and '{text}': {similarity:.4f}")

# # Test embedding generation
# test_texts = [
#     "Hello world",
#     "Hola mundo",
#     "Bonjour le monde",
#     "Hallo Welt",
#     "Ciao mondo"
# ]

# embeddings = inference.encode(test_texts)
# logger.info(f"Generated embeddings shape: {embeddings.shape}")

# Test cross-lingual retrieval
# corpus = [
#     "Machine learning is a method of data analysis that automates analytical model building.",
#     "Maschinelles Lernen ist eine Methode der Datenanalyse, die den Aufbau analytischer Modelle automatisiert.",
#     "L'apprentissage automatique est une méthode d'analyse de données qui automatise la construction de modèles analytiques.",
#     "El aprendizaje automático es un método de análisis de datos que automatiza la construcción de modelos analíticos.",
#     "I love cooking pasta with tomatoes.",
#     "Sports are great for physical fitness.",
#     "Music helps me relax after work."
# ]
corpus = ['This page from The EUROCALL Review (Volume 25, No. 2, September 2017) presents a data table analyzing mobile device usage for language learning among second-year B.A. students. The table captures key metrics including device types (predominantly smartphones with some tablet usage), duration of mobile learning experience (ranging 2-5 years), and self-assessed competency levels. The document includes both qualitative research questions and quantitative analysis methodology, with findings organized into usage patterns, study performance, and mobile learning encounters.\n\n',
 'This image shows both a pictorial and schematic representation of a basic electrical circuit diagram, featuring key measurement components for financial analysis of electrical systems. The circuit documentation includes parallel voltage/current measurement capabilities using an ammeter (1-4A range) and voltmeter across a 3-ohm resistor powered by a 12V battery source. The dual diagram format enables cross-referencing between practical component layout and theoretical circuit analysis, supporting both operational and financial performance calculations.\n\n',
 'This page contains a customer review section and vehicle rental interface for Europcar Italy, featuring a negative review about service issues in Florence alongside a cookie consent notice. The document includes a vehicle category selection panel displaying four rental segments: City car, Electric, Premium, and Van & Truck options, each with representative vehicle images and "See more" navigation links. A feature list section discusses MVP app development and user experience considerations, followed by information about name change fees and booking policies.\n\n',
 'Europcar counter manual section detailing specialized pick-up procedures for Accor Bienvenue Card (I&O) rentals in Germany, version 06-2023, page 11/15. Document outlines identification requirements, reservation protocols, and payment procedures for rentals under €2,500.00, with specific focus on military procurement orders and tax-free rental processing. Contains detailed instructions for handling Bienvenue Card validation, rental agreement documentation, and proper form distribution to military units. Notable for its specialized procedures regarding military procurement thresholds and mandatory unit order requirements for rentals exceeding €2,500.00.\n\n',
 'Rental policy document section detailing discount terms and restrictions for Greenway location, with specific focus on billing procedures and loyalty program exclusions. Contains procedural requirements for Location Manager approval of billing period modifications, including mandatory documentation in Rental Notes (F7) system. Notable for explicit exclusion of Lufthansa Miles&More program and prohibition on earning reward miles/points, while covering comprehensive fee structure including CDW, TW, and airport fees.\n\n',
 'This academic research data table and analysis from The EUROCALL Review (Volume 25, No. 2, September 2017) presents mobile device usage patterns among language learners. The document contains a detailed participant matrix showing device types, experience levels, and usage duration, followed by qualitative analysis sections on mobile device adoption rationales and learning resource preferences. The findings highlight student utilization of online dictionaries (including diki and ColorDict), translation tools (Google Translate, Duolingo), and language learning apps (Fiszkoteka), with supplementary data on multimedia resource usage patterns through platforms like Vscreen, WhatsApp, TED and online newspapers.\n\n',
 "Comprehensive service matrix detailing Europcar's mobility solutions across six distinct categories, with availability specifications ranging from worldwide to select locations. This tabular overview presents core rental offerings including standard vehicles, specialty transport, and sustainable options, with particular emphasis on their growing electric vehicle fleet and business-focused solutions. The document structure enables analysis of service coverage, market penetration, and operational capabilities across different business segments, supported by detailed service descriptions and geographic availability indicators.\n\n",
 'Europcar counter manual section detailing NATO troop vehicle rental procedures and documentation requirements. The document outlines specific pick-up protocols requiring written orders from NATO troops and business account verification through MOP BHPO checkout system. Contains detailed instructions for processing forms during vehicle return, including requirements for official service stamps and handling of multiple rentals on a single form, with annotated example showing key form fields for payment amount, payment method, and troop unit stamp placement.\n\n',
 'Two variable DC power supply circuit diagrams are shown - a 0-30V/10A design using LM317 and dual display, and a simpler 0-30V/2A version with transformer input. The first circuit features digital voltage/current monitoring, dual 2N3055 power transistors, and potentiometer controls for voltage/current adjustment. The second diagram shows a basic transformer-based design using 1N4007 rectifiers, filter capacitors, and BC548 transistors for regulation. Both circuits provide adjustable DC output with voltage/current limiting capabilities, suitable for bench power supply applications requiring variable voltage control and overload protection.\n\n',
 "Europcar workforce analysis document showing detailed gender distribution metrics across management and pay quartiles for 2020-2022. Management-level data presented via bar and pie charts demonstrates distribution ratios, while four separate pie charts break down gender composition across pay quartiles from lowest to highest paid segments. The document enables analysis of gender representation trends, pay equity patterns, and vertical segregation within the organization's hierarchy. Key metrics include management-level proportions and quartile-specific gender ratios, allowing for comprehensive diversity and inclusion assessment.\n\n",
 'Europcar manufacturer restrictions matrix document detailing vehicle-specific rental limitations for passenger cars (PKW) and light commercial vehicles (LKW). The table provides maximum mileage allowances, rental duration limits (ranging from 84-180 days), and minimum power requirements for various vehicle brands including premium (Audi, BMW), mainstream (VW, Toyota), and commercial vehicles. Comprehensive reference guide for rental agents showing manufacturer-mandated constraints across the full vehicle fleet, with separate sections for passenger and commercial vehicle restrictions including specific LFZ/LZC classifications.\n\n',
 'Document formatting specifications page containing detailed layout and style guidelines for academic paper preparation. Key sections include Abstract formatting requirements (200-275 words in Times New Roman), Index Terms guidelines, and comprehensive Page Layout instructions with two-column format specifications. Contains precise measurements for margins, spacing, and indentation, along with detailed formatting rules for section headings, bullets, and references. Notable for its specific requirements around paper size (8.5"x11"), portrait orientation, and column width (3.4") specifications. Includes both technical document structure requirements and stylistic conventions for academic publishing.\n\n',
 "Europcar services matrix detailing premium mobility offerings and membership benefits, with emphasis on the Europcar Privilege loyalty program and corporate mobility solutions. Document includes comprehensive coverage of rental add-ons, from insurance protection to equipment upgrades, structured in a tabular format with three columns detailing service categories, descriptions, and availability/eligibility. Supplemental sections outline Europcar's sustainability initiatives focusing on electric vehicle fleet expansion and their customer experience framework highlighting 24/7 support and digital booking capabilities.\n\n",
 'This appears to be a quick start guide for a Flip video camera, featuring comprehensive setup and usage instructions across multiple sections. The document includes detailed diagrams showing device features, connectivity options, and operational controls, with clear step-by-step guidance for computer connection and TV viewing. Notable sections cover camcorder features, software capabilities, and recording/playback functions, presented in an accessible format with numbered steps and annotated illustrations. The guide effectively combines technical specifications with user-friendly visual aids to facilitate device operation and content sharing.',
 'IEEE document formatting guidelines page showing detailed specifications for figures, tables, and equations. Contains a reference table of point sizes and type styles for different document elements, with entries ranging from 8 to 24 points. Features the IEEE logo and demonstrates proper equation formatting with a sample differential equation. Includes specific instructions for graphics resolution requirements (600 dpi monochrome, 300 dpi grayscale/color) and proper insertion methods. Document emphasizes formatting rules for headings, captions, and table text using Times New Roman font in various styles (UPPERCASE, Small Caps, Bold). Notable for its technical documentation structure and precise typographical specifications for academic/engineering publications.\n\n']

# Query in German, should find ML-related sentences
# german_query = "Kundenbewertungsbereich und Schnittstelle zur Fahrzeugvermietung für Europcar"
# results = inference.find_most_similar(german_query, corpus, top_k=3)

# logger.info(f"\nTop 3 results for German query: '{german_query}'")
# for i, (text, score) in enumerate(results):
#     logger.info(f"{i+1}. Score: {score:.4f} - Text: {text[:100]}...")

# Query in English, should find ML-related sentences
english_query = "customer review section and vehicle rental interface for Europcar?"
results = inference.find_most_similar(english_query, corpus, top_k=3)

logger.info(f"\nTop 3 results for English query: '{english_query}'")
for i, (text, score) in enumerate(results):
    logger.info(f"{i+1}. Score: {score:.4f} - Text: {text[:100]}...")

Encoding: 100%|██████████| 1/1 [00:00<00:00,  6.51it/s]
2025-07-11 22:59:35,653 - INFO - 
Top 3 results for English query: 'customer review section and vehicle rental interface for Europcar?'
2025-07-11 22:59:35,653 - INFO - 1. Score: 0.8464 - Text: This page contains a customer review section and vehicle rental interface for Europcar Italy, featur...
2025-07-11 22:59:35,653 - INFO - 2. Score: 0.7609 - Text: Europcar services matrix detailing premium mobility offerings and membership benefits, with emphasis...
2025-07-11 22:59:35,653 - INFO - 3. Score: 0.7373 - Text: Comprehensive service matrix detailing Europcar's mobility solutions across six distinct categories,...


In [112]:
import random

final_data = []
for i in dataaa['results']:
    final_data.append(i['chunk_data'])

# Randomly shuffle the list in place
random.shuffle(final_data)

In [113]:
final_data

['This page from The EUROCALL Review (Volume 25, No. 2, September 2017) presents a data table analyzing mobile device usage for language learning among second-year B.A. students. The table captures key metrics including device types (predominantly smartphones with some tablet usage), duration of mobile learning experience (ranging 2-5 years), and self-assessed competency levels. The document includes both qualitative research questions and quantitative analysis methodology, with findings organized into usage patterns, study performance, and mobile learning encounters.\n\n',
 'This image shows both a pictorial and schematic representation of a basic electrical circuit diagram, featuring key measurement components for financial analysis of electrical systems. The circuit documentation includes parallel voltage/current measurement capabilities using an ammeter (1-4A range) and voltmeter across a 3-ohm resistor powered by a 12V battery source. The dual diagram format enables cross-referenc