# Language Detection with XLM-RoBERTa (HF, transformers) over WER-binned sample sentences

**⚠️ Note:** This model is for **LANGUAGE DETECTION**, not sentiment analysis!

XLM-RoBERTa fine-tuned to detect 20 languages with 99.6% accuracy. Useful for:
- Identifying language of transcriptions
- Preprocessing before language-specific sentiment analysis
- Detecting code-switching or multilingual content

**Model Details:**
- Model: [papluca/xlm-roberta-base-language-detection](https://huggingface.co/papluca/xlm-roberta-base-language-detection)
- Base: XLM-RoBERTa-base (~278M parameters)
- Accuracy: 99.6% on language identification
- License: MIT
- Supported Languages: ar, bg, de, el, en, es, fr, hi, it, ja, nl, pl, pt, ru, sw, th, tr, ur, vi, zh


### Setup


In [None]:
import os
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch
from typing import List, Dict
import logging
import random
from datetime import datetime
import re
logging.basicConfig(level=logging.INFO)


###### Model Configuration

XLM-RoBERTa fine-tuned for language detection across 20 languages.
This is a direct classification model (not zero-shot) - very fast and accurate.


In [None]:
# Hugging Face checkpoint: XLM-RoBERTa fine-tuned for language detection
model_id = "papluca/xlm-roberta-base-language-detection"

# Query CUDA availability once and reuse the answer below
_cuda_available = torch.cuda.is_available()
device = "cuda" if _cuda_available else "cpu"
print("Using:", device)

# Report the GPU name and its total memory (in GiB) when a GPU is present
if _cuda_available:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    _total_gpu_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"Total GPU Memory: {_total_gpu_bytes / 1024**3:.2f} GB")

# Load the tokenizer, then load the classifier and place it on the chosen device
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id).to(device)

# Text-classification pipeline used for every detection call in this notebook.
# The pipeline takes a device index: 0 for the first GPU, -1 for CPU.
_pipeline_device = 0 if _cuda_available else -1
lang_pipeline = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=_pipeline_device,
)

# Maps the model's language-code labels to human-readable language names
LANGUAGE_NAMES = {
    'ar': 'Arabic',
    'bg': 'Bulgarian',
    'de': 'German',
    'el': 'Greek',
    'en': 'English',
    'es': 'Spanish',
    'fr': 'French',
    'hi': 'Hindi',
    'it': 'Italian',
    'ja': 'Japanese',
    'nl': 'Dutch',
    'pl': 'Polish',
    'pt': 'Portuguese',
    'ru': 'Russian',
    'sw': 'Swahili',
    'th': 'Thai',
    'tr': 'Turkish',
    'ur': 'Urdu',
    'vi': 'Vietnamese',
    'zh': 'Chinese',
}

# Summary banner (parameter count and accuracy are fixed, documented figures)
print("✓ XLM-RoBERTa Language Detection model loaded successfully")
print(f"  Model: {model_id}")
print("  Parameters: ~278M")
print("  Accuracy: 99.6%")
print(f"  Supported languages: {len(LANGUAGE_NAMES)}")


###### Test Pipeline (Optional - Run this to verify setup)


In [None]:
# Smoke test: classify one short sentence per language and print the top prediction
print("Testing XLM-RoBERTa language detection pipeline...")

test_sentences = [
    "Hello, how are you today?",  # English
    "Bonjour, comment allez-vous?",  # French
    "Hola, ¿cómo estás?",  # Spanish
    "こんにちは、元気ですか？",  # Japanese
    "مرحبا، كيف حالك؟",  # Arabic
]

for sentence in test_sentences:
    # With top_k=1 the pipeline still returns a list; take its single entry
    best = lang_pipeline(sentence, top_k=1, truncation=True)[0]
    code = best['label']
    confidence = best['score']
    # Fall back to the raw code if the label is not in the name table
    name = LANGUAGE_NAMES.get(code, code)
    print(f"  '{sentence[:30]}...' -> {name} ({code}) [{confidence:.4f}]")

print("\n✓ Test successful! Language detection is working correctly.")


### Data Loading and Processing


###### Load Data from XLSX


In [None]:
def load_data_from_xlsx(xlsx_path: str, sheet_index: int = 1) -> pd.DataFrame:
    """
    Load one sheet of an XLSX workbook into a DataFrame.

    The workbook is opened once and closed again before returning, even when
    the requested sheet does not exist or parsing fails.

    Expected columns:
    - Machine Transcription (ground truth)
    - Human Transcription (hypothesis)
    - Sentiment Label
    - WER
    - CER
    - List of AAE features
    - List of transcription errors
    - Interview

    Args:
        xlsx_path: Path to the XLSX file
        sheet_index: Zero-based position of the sheet to load. Defaults to 1
            (the second sheet). Negative values count from the last sheet.

    Returns:
        DataFrame holding the rows of the selected sheet.

    Raises:
        ValueError: If sheet_index does not refer to an existing sheet.
    """
    # The context manager releases the file handle on every exit path
    with pd.ExcelFile(xlsx_path) as xl_file:
        sheet_names = xl_file.sheet_names
        sheet_count = len(sheet_names)
        if not -sheet_count <= sheet_index < sheet_count:
            raise ValueError(f"Sheet index {sheet_index} not available. Available sheets: {sheet_names}")

        # Parse from the already-open workbook instead of reading the file a second time
        df = xl_file.parse(sheet_name=sheet_index)

    logging.info(f"Loaded {len(df)} rows from sheet '{sheet_names[sheet_index]}'")
    return df


In [None]:
def split_data_into_sets(df: pd.DataFrame, test_ratio: float = 0.5, random_seed: int = None) -> tuple:
    """
    Shuffle a dataframe and split it into a test set and an example set.

    This function is kept for API consistency with other model notebooks.

    Args:
        df: Data to split. It is not modified; both outputs are copies.
        test_ratio: Fraction of rows (0 to 1) placed in the test set. The test
            set size is rounded down, so the default of 0.5 yields two halves
            with any odd row going to the example set.
        random_seed: Seed for the shuffle. None gives a different shuffle on
            each call.

    Returns:
        (test_df, example_df), each with a fresh 0..n-1 index.

    Raises:
        ValueError: If test_ratio is not within [0, 1].
    """
    if not 0.0 <= test_ratio <= 1.0:
        raise ValueError(f"test_ratio must be between 0 and 1, got {test_ratio}")

    if random_seed is not None:
        # Also seeds Python's global RNG, as this function has always done
        random.seed(random_seed)

    # Shuffle the dataframe
    df_shuffled = df.sample(frac=1, random_state=random_seed).reset_index(drop=True)

    # Round down; the tiny epsilon keeps products such as 100 * 0.29
    # (28.999999999999996 in floating point) from losing a row.
    split_idx = int(len(df_shuffled) * test_ratio + 1e-9)
    test_df = df_shuffled.iloc[:split_idx].copy()
    example_df = df_shuffled.iloc[split_idx:].copy()

    logging.info(f"Split data: {len(test_df)} test samples, {len(example_df)} example samples")
    return test_df, example_df


### Detect Language with Model


###### Language Detection Function

This model performs direct classification into one of 20 languages.
It's very fast (similar to DistilBERT sentiment) as it's a fine-tuned classifier.


In [None]:
def detect_language(sentence: str) -> tuple:
    """
    Classify the language of one sentence with the XLM-RoBERTa pipeline.

    Returns:
        (raw_output, language_label, reason)
        - raw_output: Bracketed summary of the best prediction (name, code, score)
        - language_label: Full name of the best-scoring language, or the raw
          label when it has no entry in LANGUAGE_NAMES
        - reason: The top predictions (up to three) with their scores as percentages
    """
    # Ask the pipeline for the three best-scoring labels
    with torch.no_grad():
        predictions = lang_pipeline(sentence, top_k=3, truncation=True)

    # The first entry is treated as the winning prediction
    best = predictions[0]
    code = best['label']
    confidence = best['score']
    name = LANGUAGE_NAMES.get(code, code)

    # One "Name: score%" fragment per returned candidate
    ranked = []
    for candidate in predictions[:3]:
        label = candidate['label']
        ranked.append(f"{LANGUAGE_NAMES.get(label, label)}: {candidate['score']:.2%}")
    reason = f"Top predictions: {', '.join(ranked)}"

    # Same bracketed format the other model notebooks use for their raw output
    raw_output = f"[Language: {name} ({code}), Score: {confidence:.4f}]"

    return raw_output, name, reason


### Main Workflow


###### Process All Samples

Detect the language of all transcriptions in the dataset.


In [None]:
def process_all_samples(xlsx_path: str):
    """
    Detect the language of every row in the workbook, one sentence at a time.

    The human-transcription column is passed to the model and stored as
    'hypothesis'; the machine-transcription column is carried through as
    'ground_truth'. Per-row output is printed, the results are written to a
    dated CSV under data/model_outputs/xlm_roberta_language_detection, and a
    language distribution summary is printed at the end.

    Args:
        xlsx_path: Path to the XLSX file

    Returns:
        DataFrame with columns ground_truth, hypothesis, detected_language and
        reason. Rows whose detection raised are kept with detected_language
        set to "ERROR" and the exception text as the reason.

    Raises:
        ValueError: If the transcription columns cannot be identified by name
            and the sheet has fewer than two columns to fall back on.
    """
    # Load data
    df = load_data_from_xlsx(xlsx_path, sheet_index=1)

    logging.info(f"Processing all {len(df)} samples for language detection")

    # Resolve the transcription columns once; they are the same for every row.
    # Headers are stringified because spreadsheet headers are not always text.
    # If several headers match, the last one wins.
    machine_col = None
    human_col = None
    for col in df.columns:
        header = str(col).lower()
        if 'transcription' not in header:
            continue
        if 'machine' in header:
            machine_col = col
        elif 'human' in header:
            human_col = col

    if machine_col is None or human_col is None:
        # Fallback to first two columns
        if len(df.columns) < 2:
            raise ValueError(
                "Could not identify the machine/human transcription columns: "
                f"sheet has columns {list(df.columns)}"
            )
        machine_col = df.columns[0] if machine_col is None else machine_col
        human_col = df.columns[1] if human_col is None else human_col

    result_columns = ['ground_truth', 'hypothesis', 'detected_language', 'reason']
    total = len(df)
    separator = '=' * 80
    results = []

    # Process each sample
    for idx, (_, row) in enumerate(df.iterrows(), 1):
        print(f"\n{separator}")
        print(f"Processing sentence {idx}/{total}")
        print(f"{separator}")

        ground_truth = row[machine_col]
        hypothesis = row[human_col]

        print(f"\nGround Truth: {ground_truth}")
        print(f"Hypothesis: {hypothesis}")

        # Detect language. A failing row (for example an empty cell) is recorded
        # as "ERROR" so one bad row does not abort the whole run.
        try:
            raw_output, language_label, reason = detect_language(hypothesis)
        except Exception as e:
            print(f"\nERROR: {e}")
            language_label = "ERROR"
            reason = str(e)
        else:
            print(f"\nModel Output: {raw_output}")
            print(f"Detected Language: {language_label}")
            print(f"Reason: {reason}")

        # Store results
        results.append({
            'ground_truth': ground_truth,
            'hypothesis': hypothesis,
            'detected_language': language_label,
            'reason': reason
        })

    # Explicit columns keep the frame (and the CSV header) well-formed even
    # when the sheet has no rows.
    results_df = pd.DataFrame(results, columns=result_columns)

    # Save to CSV
    today = datetime.now().strftime("%Y%m%d")
    output_filename = f"xlm_roberta_language_detection_all_samples_{today}.csv"

    # Create output directory if it doesn't exist
    output_dir = r"data/model_outputs/xlm_roberta_language_detection"
    os.makedirs(output_dir, exist_ok=True)

    output_path = os.path.join(output_dir, output_filename)
    results_df.to_csv(output_path, index=False)
    print(f"\n{separator}")
    print(f"Results saved to: {output_path}")
    print(f"Total samples processed: {len(results_df)}")
    print(f"{separator}")

    # Print language distribution
    print("\nLanguage Distribution:")
    lang_counts = results_df['detected_language'].value_counts()
    for lang, count in lang_counts.items():
        print(f"  {lang}: {count} ({count/len(results_df):.1%})")

    return results_df


In [None]:
def process_language_detection(xlsx_path: str, random_seed: int = 42):
    """
    Detect the language of the test half of the workbook, one sentence at a time.

    The sheet is shuffled and split with split_data_into_sets; only the test
    portion is processed. The human-transcription column is passed to the model
    and stored as 'hypothesis'; the machine-transcription column is carried
    through as 'ground_truth'. Results are written to a dated CSV under
    data/model_outputs/xlm_roberta_language_detection.

    Args:
        xlsx_path: Path to the XLSX file
        random_seed: Random seed for data splitting

    Returns:
        DataFrame with columns ground_truth, hypothesis, detected_language and
        reason. Rows whose detection raised are kept with detected_language
        set to "ERROR" and the exception text as the reason.

    Raises:
        ValueError: If the transcription columns cannot be identified by name
            and the sheet has fewer than two columns to fall back on.
    """
    # Load data
    df = load_data_from_xlsx(xlsx_path, sheet_index=1)

    # Split data (for consistency with other notebooks)
    test_df, _ = split_data_into_sets(df, test_ratio=0.5, random_seed=random_seed)

    logging.info(f"Processing {len(test_df)} samples for language detection (split mode)")

    # Resolve the transcription columns once; they are the same for every row.
    # Headers are stringified because spreadsheet headers are not always text.
    # If several headers match, the last one wins.
    machine_col = None
    human_col = None
    for col in test_df.columns:
        header = str(col).lower()
        if 'transcription' not in header:
            continue
        if 'machine' in header:
            machine_col = col
        elif 'human' in header:
            human_col = col

    if machine_col is None or human_col is None:
        # Fallback to first two columns
        if len(test_df.columns) < 2:
            raise ValueError(
                "Could not identify the machine/human transcription columns: "
                f"sheet has columns {list(test_df.columns)}"
            )
        machine_col = test_df.columns[0] if machine_col is None else machine_col
        human_col = test_df.columns[1] if human_col is None else human_col

    result_columns = ['ground_truth', 'hypothesis', 'detected_language', 'reason']
    total = len(test_df)
    separator = '=' * 80
    results = []

    # Process each test sentence
    for idx, (_, row) in enumerate(test_df.iterrows(), 1):
        print(f"\n{separator}")
        print(f"Processing sentence {idx}/{total}")
        print(f"{separator}")

        ground_truth = row[machine_col]
        hypothesis = row[human_col]

        print(f"\nGround Truth: {ground_truth}")
        print(f"Hypothesis: {hypothesis}")

        # Detect language. A failing row (for example an empty cell) is recorded
        # as "ERROR" so one bad row does not abort the whole run.
        try:
            raw_output, language_label, reason = detect_language(hypothesis)
        except Exception as e:
            print(f"\nERROR: {e}")
            language_label = "ERROR"
            reason = str(e)
        else:
            print(f"\nModel Output: {raw_output}")
            print(f"Detected Language: {language_label}")
            print(f"Reason: {reason}")

        # Store results
        results.append({
            'ground_truth': ground_truth,
            'hypothesis': hypothesis,
            'detected_language': language_label,
            'reason': reason
        })

    # Explicit columns keep the frame (and the CSV header) well-formed even
    # when the test split is empty.
    results_df = pd.DataFrame(results, columns=result_columns)

    # Save to CSV
    today = datetime.now().strftime("%Y%m%d")
    output_filename = f"xlm_roberta_language_detection_split_{today}.csv"

    output_dir = r"data/model_outputs/xlm_roberta_language_detection"
    os.makedirs(output_dir, exist_ok=True)

    output_path = os.path.join(output_dir, output_filename)
    results_df.to_csv(output_path, index=False)
    print(f"\n{separator}")
    print(f"Results saved to: {output_path}")
    print(f"{separator}")

    return results_df


### Batch Processing (Faster)

Language detection is very fast - batch processing provides significant speedup.


In [None]:
def process_all_samples_batch(xlsx_path: str, batch_size: int = 32):
    """
    Detect the language of every row in the workbook using batched inference.

    The human-transcription column is passed to the model and stored as
    'hypothesis'; the machine-transcription column is carried through as
    'ground_truth'. Results are written to a dated CSV under
    data/model_outputs/xlm_roberta_language_detection and a language
    distribution summary is printed.

    Args:
        xlsx_path: Path to the XLSX file
        batch_size: Number of sentences to process at once (must be >= 1)

    Returns:
        DataFrame with columns ground_truth, hypothesis, detected_language and
        reason, one row per input row. Rows with a missing hypothesis cell are
        not sent to the model and get detected_language == "ERROR".

    Raises:
        ValueError: If batch_size is smaller than 1, or if the transcription
            columns cannot be identified by name and the sheet has fewer than
            two columns to fall back on.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    # Load data
    df = load_data_from_xlsx(xlsx_path, sheet_index=1)

    logging.info(f"Processing all {len(df)} samples (batch mode, batch_size={batch_size})")

    # Resolve the transcription columns from the header, which also works for
    # a sheet with no rows. Headers are stringified because spreadsheet headers
    # are not always text. If several headers match, the last one wins.
    machine_col = None
    human_col = None
    for col in df.columns:
        header = str(col).lower()
        if 'transcription' not in header:
            continue
        if 'machine' in header:
            machine_col = col
        elif 'human' in header:
            human_col = col

    if machine_col is None or human_col is None:
        # Fallback to first two columns
        if len(df.columns) < 2:
            raise ValueError(
                "Could not identify the machine/human transcription columns: "
                f"sheet has columns {list(df.columns)}"
            )
        machine_col = df.columns[0] if machine_col is None else machine_col
        human_col = df.columns[1] if human_col is None else human_col

    # Extract sentences
    ground_truths = df[machine_col].tolist()
    hypotheses = df[human_col].tolist()

    # Missing cells (NaN/None) would otherwise be stringified to "nan" and
    # classified as if they were text; keep them out of the model input.
    valid_positions = [
        pos for pos, text in enumerate(hypotheses)
        if isinstance(text, str) or not pd.isna(text)
    ]
    # Other non-string cells (e.g. numbers) are converted to text
    texts = [
        hypotheses[pos] if isinstance(hypotheses[pos], str) else str(hypotheses[pos])
        for pos in valid_positions
    ]

    # Process in batches
    print(f"Processing {len(hypotheses)} sentences in batches of {batch_size}...")
    skipped = len(hypotheses) - len(texts)
    if skipped:
        print(f"  Skipping {skipped} rows with a missing transcription")

    predictions = {}  # row position -> top-3 predictions for that row
    for start in range(0, len(texts), batch_size):
        stop = start + batch_size
        # batch_size must be forwarded: without it the pipeline uses its own
        # default batch size rather than batching the chunk.
        chunk_results = lang_pipeline(
            texts[start:stop], top_k=3, truncation=True, batch_size=batch_size
        )
        predictions.update(zip(valid_positions[start:stop], chunk_results))
        print(f"  Processed {min(stop, len(texts))}/{len(texts)} sentences")

    # Build results dataframe
    results = []
    for pos, (gt, hyp) in enumerate(zip(ground_truths, hypotheses)):
        top3 = predictions.get(pos)
        if top3 is None:
            # Same label the one-by-one workflow records for rows it cannot classify
            lang_name = "ERROR"
            reason = "Missing transcription"
        else:
            lang_code = top3[0]['label']
            lang_name = LANGUAGE_NAMES.get(lang_code, lang_code)
            top3_details = ", ".join([
                f"{LANGUAGE_NAMES.get(r['label'], r['label'])}: {r['score']:.2%}"
                for r in top3[:3]
            ])
            reason = f"Top predictions: {top3_details}"

        results.append({
            'ground_truth': gt,
            'hypothesis': hyp,
            'detected_language': lang_name,
            'reason': reason
        })

    # Explicit columns keep the frame (and the CSV header) well-formed even
    # when the sheet has no rows.
    results_df = pd.DataFrame(
        results,
        columns=['ground_truth', 'hypothesis', 'detected_language', 'reason'],
    )

    # Save to CSV
    today = datetime.now().strftime("%Y%m%d")
    output_filename = f"xlm_roberta_language_detection_batch_all_samples_{today}.csv"

    output_dir = r"data/model_outputs/xlm_roberta_language_detection"
    os.makedirs(output_dir, exist_ok=True)

    output_path = os.path.join(output_dir, output_filename)
    results_df.to_csv(output_path, index=False)

    print(f"\n{'='*80}")
    print(f"Results saved to: {output_path}")
    print(f"Total samples processed: {len(results_df)}")
    print(f"{'='*80}")

    # Print language distribution
    print("\nLanguage Distribution:")
    lang_counts = results_df['detected_language'].value_counts()
    for lang, count in lang_counts.items():
        print(f"  {lang}: {count} ({count/len(results_df):.1%})")

    return results_df


### Usage


In [None]:
# Specify the path to your XLSX file.
# This value is passed unchanged to the process_* functions below, which read
# the sheet at index 1 (the second sheet) of the workbook.
xlsx_file_path = r"C:\Users\pryce\OneDrive\Desktop\Lost in Transcription\Text Inputs\Samples.xlsx"  # Update this path

# Set random seed for reproducibility.
# Only the split workflow (Option 1) uses it, to shuffle rows before splitting.
RANDOM_SEED = 33


###### Option 1: Process with Split (50% of data)


In [None]:
# Run language detection on 50% of data (split mode).
# The rows are shuffled with RANDOM_SEED and only the test half is processed;
# the results are returned as a DataFrame and also written to a dated CSV under
# data/model_outputs/xlm_roberta_language_detection.
split_results = process_language_detection(
    xlsx_file_path, 
    random_seed=RANDOM_SEED
)


###### Option 2: Process All Samples (One by One)


In [None]:
# Run language detection on ALL samples (verbose, one by one).
# Prints the transcriptions and the prediction for every row, then a language
# distribution summary; the results are also written to a dated CSV.
all_results = process_all_samples(xlsx_file_path)


###### Option 3: Batch Processing (Recommended - Fastest)


In [None]:
# Run language detection on ALL samples using batch processing (fastest)
# Language detection is fast, so batch_size=32 works well
# Prints progress per batch rather than per row; the results are also written
# to a dated CSV.
batch_results = process_all_samples_batch(xlsx_file_path, batch_size=32)
