## 1. Setup

In [14]:
import os
import json
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime
from typing import List, Tuple
import warnings
warnings.filterwarnings('ignore')

from sentence_transformers import SentenceTransformer
import torch
from tqdm.auto import tqdm

print("‚úì Imports complete")

‚úì Imports complete


In [15]:
# Configuration
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Device: {device}")

if device == 'cuda':
    BATCH_SIZE = 64
else:
    BATCH_SIZE = 16

# Paths
DATA_DIR = Path('data_new')
DATA_DIR.mkdir(exist_ok=True)

# Training pair parameters (ENHANCED for better quality)
NUM_POSITIVE_PAIRS = 5000  # Increased from 3000 for more robust training
NUM_NEGATIVE_PAIRS = 5000  # Increased from 3000 for more robust training
POSITIVE_SIMILARITY_THRESHOLD = 0.35  # Increased from 0.3 for higher quality positives
NEGATIVE_SIMILARITY_THRESHOLD = 0.45  # Decreased from 0.5 for clearer negatives
RANDOM_STATE = 42

# Quality requirements
MIN_SEPARABILITY = 0.15  # Minimum acceptable separability
MAX_OVERLAP_PCT = 10.0  # Maximum acceptable overlap percentage

print(f"\nConfiguration:")
print(f"  Batch size: {BATCH_SIZE}")
print(f"  Positive pairs target: {NUM_POSITIVE_PAIRS}")
print(f"  Negative pairs target: {NUM_NEGATIVE_PAIRS}")
print(f"  Positive similarity threshold: ‚â•{POSITIVE_SIMILARITY_THRESHOLD}")
print(f"  Negative similarity threshold: <{NEGATIVE_SIMILARITY_THRESHOLD}")
print(f"\nQuality Requirements:")
print(f"  Minimum separability: {MIN_SEPARABILITY}")
print(f"  Maximum overlap: {MAX_OVERLAP_PCT}%")

Device: cpu

Configuration:
  Batch size: 16
  Positive pairs target: 5000
  Negative pairs target: 5000
  Positive similarity threshold: ‚â•0.35
  Negative similarity threshold: <0.45

Quality Requirements:
  Minimum separability: 0.15
  Maximum overlap: 10.0%


## 2. Load Data

In [16]:
# Load ServiceNow incident data
data_path = DATA_DIR / 'SNow_incident_ticket_data.csv'
df = pd.read_csv(data_path)

print(f"Loaded {len(df)} ServiceNow incidents")
print(f"\nColumns: {df.columns.tolist()}")

Loaded 10633 ServiceNow incidents

Columns: ['Number', 'Description', 'Opened by', 'Company', 'ITSM Department', 'Created', 'Urgency', 'Impact', 'Priority', 'Assignment group', 'Assigned to', 'State', 'Service', 'Service offering', 'Closed', 'Closed by', 'Category', 'Subcategory', 'Resolution code', 'Resolution notes', 'User input', 'Comments and Work notes', 'Manday Effort (hrs)', 'Ticket Type', 'AMS Domain', 'AMS System Type', 'AMS Category Type', 'AMS Service Type', 'AMS Business Related', 'AMS IT Related']


In [17]:
# Combine text fields
def create_combined_text(row):
    """Combine available text fields with proper handling of NaN"""
    text_parts = []
    
    for col in ['Number', 'Description', 'User input', 'Resolution notes']:
        if col in row.index:
            value = str(row.get(col, '')).strip() if pd.notna(row.get(col)) else ''
            if value and value.lower() != 'nan':
                text_parts.append(value)
    
    return ' '.join(text_parts) if text_parts else ''

df['combined_text'] = df.apply(create_combined_text, axis=1)
df['combined_text'] = df['combined_text'].astype(str)
df = df[df['combined_text'].str.len() > 10].reset_index(drop=True)

print(f"After filtering: {len(df)} valid incidents")
print(f"\nSample text: {df['combined_text'].iloc[0][:200]}...")

After filtering: 10633 valid incidents

Sample text: INC0010171 GRPT not working as expected. ZMMM_PO_REV is not generating correct dates as per maintained in GRPT table. 
E.g. P/O# 100024066
Vendor Ship mode is 03. 
As per GRPT route days are 12 day...


## 3. Load Baseline Model for Validation

In [18]:
# Load baseline model for semantic validation
print("Loading baseline model: sentence-transformers/all-mpnet-base-v2")
baseline_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', device=device)
print("‚úì Baseline model loaded")

Loading baseline model: sentence-transformers/all-mpnet-base-v2
‚úì Baseline model loaded


## 4. Test Set Consistency Check

In [19]:
# Load existing test pairs to ensure training consistency
test_pairs_path = DATA_DIR / 'fixed_test_pairs.json'

if test_pairs_path.exists():
    print("Loading existing test pairs for consistency check...")
    with open(test_pairs_path, 'r') as f:
        test_data = json.load(f)
    
    test_metadata = test_data.get('metadata', {})
    
    print(f"\n{'='*80}")
    print("TEST SET REFERENCE METRICS")
    print(f"{'='*80}")
    print(f"Test pairs: {test_metadata.get('num_pairs', 'N/A')}")
    print(f"Test separability: {test_metadata.get('baseline_separability', 'N/A'):.4f}")
    print(f"Test pos threshold: {test_metadata.get('positive_similarity_threshold', 'N/A')}")
    print(f"Test neg threshold: {test_metadata.get('negative_similarity_threshold', 'N/A')}")
    
    # Compare with current settings
    test_sep = test_metadata.get('baseline_separability', 0)
    print(f"\n{'='*80}")
    print("TRAINING vs TEST CONFIGURATION")
    print(f"{'='*80}")
    print(f"Positive threshold: Train={POSITIVE_SIMILARITY_THRESHOLD} vs Test={test_metadata.get('positive_similarity_threshold', 'N/A')}")
    print(f"Negative threshold: Train={NEGATIVE_SIMILARITY_THRESHOLD} vs Test={test_metadata.get('negative_similarity_threshold', 'N/A')}")
    print(f"Target separability: Train ‚â•{MIN_SEPARABILITY} vs Test={test_sep:.4f}")
    
    if test_sep > 0 and test_sep < MIN_SEPARABILITY:
        print(f"\n‚ö†Ô∏è  WARNING: Training target separability ({MIN_SEPARABILITY}) exceeds test separability ({test_sep:.4f})")
        print(f"   Consider lowering MIN_SEPARABILITY to match test quality")
    elif test_sep > MIN_SEPARABILITY + 0.05:
        print(f"\n‚ö†Ô∏è  WARNING: Test separability ({test_sep:.4f}) significantly exceeds training minimum ({MIN_SEPARABILITY})")
        print(f"   Consider raising MIN_SEPARABILITY to {test_sep:.4f} for consistency")
    else:
        print(f"\n‚úì Training and test quality targets are consistent")
else:
    print(f"‚ö†Ô∏è  Test pairs file not found at {test_pairs_path}")
    print("Proceeding without consistency check...")

Loading existing test pairs for consistency check...

TEST SET REFERENCE METRICS
Test pairs: 1000
Test separability: 0.1865
Test pos threshold: 0.3
Test neg threshold: 0.5

TRAINING vs TEST CONFIGURATION
Positive threshold: Train=0.35 vs Test=0.3
Negative threshold: Train=0.45 vs Test=0.5
Target separability: Train ‚â•0.15 vs Test=0.1865

‚úì Training and test quality targets are consistent


## 5. Generate Candidate Pairs (Category-Based)

In [20]:
def generate_candidate_pairs(df: pd.DataFrame, 
                            num_positives: int,
                            num_negatives: int,
                            random_state: int = 42) -> Tuple[List[str], List[str], List[int]]:
    """
    Generate candidate pairs based on categories.
    These will be filtered with semantic validation.
    
    Generate MORE than needed since filtering will remove noisy pairs.
    """
    np.random.seed(random_state)
    
    texts1, texts2, labels = [], [], []
    
    # Check if we have category information
    has_categories = 'category' in df.columns
    
    if has_categories:
        categories = df['category'].dropna().unique()
        print(f"Found {len(categories)} categories")
        
        # Generate 2x more candidates than needed (will filter later)
        target_pos = num_positives * 2
        target_neg = num_negatives * 2
        
        print(f"\nGenerating {target_pos} candidate positive pairs...")
        # Positive pairs - same category
        for _ in tqdm(range(target_pos)):
            cat = np.random.choice(categories)
            cat_incidents = df[df['category'] == cat]
            if len(cat_incidents) >= 2:
                idx1, idx2 = np.random.choice(cat_incidents.index, size=2, replace=False)
                texts1.append(df.loc[idx1, 'combined_text'])
                texts2.append(df.loc[idx2, 'combined_text'])
                labels.append(1)
        
        print(f"\nGenerating {target_neg} candidate negative pairs...")
        # Negative pairs - different categories
        for _ in tqdm(range(target_neg)):
            cat1, cat2 = np.random.choice(categories, size=2, replace=False)
            incidents1 = df[df['category'] == cat1]
            incidents2 = df[df['category'] == cat2]
            if len(incidents1) > 0 and len(incidents2) > 0:
                idx1 = np.random.choice(incidents1.index)
                idx2 = np.random.choice(incidents2.index)
                texts1.append(df.loc[idx1, 'combined_text'])
                texts2.append(df.loc[idx2, 'combined_text'])
                labels.append(0)
    else:
        print("No category column found - using random pairs")
        # Random pairs as fallback
        for _ in range(num_positives * 2):
            idx1, idx2 = np.random.choice(len(df), size=2, replace=False)
            texts1.append(df.loc[idx1, 'combined_text'])
            texts2.append(df.loc[idx2, 'combined_text'])
            labels.append(1)
        
        for _ in range(num_negatives * 2):
            idx1, idx2 = np.random.choice(len(df), size=2, replace=False)
            texts1.append(df.loc[idx1, 'combined_text'])
            texts2.append(df.loc[idx2, 'combined_text'])
            labels.append(0)
    
    print(f"\n‚úì Generated {len(labels)} candidate pairs")
    print(f"  Positive: {sum(labels)}")
    print(f"  Negative: {len(labels) - sum(labels)}")
    
    return texts1, texts2, labels

# Generate candidates
candidate_texts1, candidate_texts2, candidate_labels = generate_candidate_pairs(
    df,
    num_positives=NUM_POSITIVE_PAIRS,
    num_negatives=NUM_NEGATIVE_PAIRS,
    random_state=RANDOM_STATE
)

No category column found - using random pairs

‚úì Generated 20000 candidate pairs
  Positive: 10000
  Negative: 10000


## 6. Semantic Validation with Baseline Model

In [None]:
def compute_similarities(texts1: List[str], 
                        texts2: List[str],
                        model: SentenceTransformer,
                        batch_size: int = 16) -> np.ndarray:
    """
    Compute cosine similarities between text pairs.
    """
    print(f"\nComputing embeddings for {len(texts1)} text pairs...")
    
    # Encode texts
    embeddings1 = model.encode(
        texts1,
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True
    )
    
    embeddings2 = model.encode(
        texts2,
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True
    )
    
    # Compute cosine similarity
    similarities = np.sum(embeddings1 * embeddings2, axis=1) / (
        np.linalg.norm(embeddings1, axis=1) * np.linalg.norm(embeddings2, axis=1)
    )
    
    return similarities

# Compute similarities for all candidate pairs
similarities = compute_similarities(
    candidate_texts1,
    candidate_texts2,
    baseline_model,
    batch_size=BATCH_SIZE
)

print(f"\n‚úì Computed {len(similarities)} similarities")


Computing embeddings for 20000 text pairs...


Batches:   0%|          | 0/1250 [00:00<?, ?it/s]

Batches:   0%|          | 0/1250 [00:00<?, ?it/s]

## 7. Filter Pairs with Semantic Thresholds

In [None]:
def filter_validated_pairs(texts1: List[str],
                          texts2: List[str],
                          labels: List[int],
                          similarities: np.ndarray,
                          positive_threshold: float,
                          negative_threshold: float,
                          target_positives: int,
                          target_negatives: int) -> Tuple[List[str], List[str], List[int], np.ndarray]:
    """
    Filter pairs based on semantic similarity thresholds.
    
    Positive pairs: similarity >= positive_threshold
    Negative pairs: similarity < negative_threshold
    """
    labels_array = np.array(labels)
    
    # Filter positive pairs
    positive_mask = (labels_array == 1) & (similarities >= positive_threshold)
    valid_positive_indices = np.where(positive_mask)[0]
    
    print(f"\nPositive pairs:")
    print(f"  Candidates: {sum(labels_array == 1)}")
    print(f"  Valid (similarity ‚â•{positive_threshold}): {len(valid_positive_indices)}")
    print(f"  Rejection rate: {(1 - len(valid_positive_indices)/sum(labels_array == 1))*100:.1f}%")
    
    # Filter negative pairs
    negative_mask = (labels_array == 0) & (similarities < negative_threshold)
    valid_negative_indices = np.where(negative_mask)[0]
    
    print(f"\nNegative pairs:")
    print(f"  Candidates: {sum(labels_array == 0)}")
    print(f"  Valid (similarity <{negative_threshold}): {len(valid_negative_indices)}")
    print(f"  Rejection rate: {(1 - len(valid_negative_indices)/sum(labels_array == 0))*100:.1f}%")
    
    # Sample to target sizes (if we have enough)
    if len(valid_positive_indices) >= target_positives:
        selected_positive_indices = np.random.choice(
            valid_positive_indices,
            size=target_positives,
            replace=False
        )
    else:
        selected_positive_indices = valid_positive_indices
        print(f"\n‚ö†Ô∏è  Warning: Only {len(selected_positive_indices)} valid positive pairs (target: {target_positives})")
    
    if len(valid_negative_indices) >= target_negatives:
        selected_negative_indices = np.random.choice(
            valid_negative_indices,
            size=target_negatives,
            replace=False
        )
    else:
        selected_negative_indices = valid_negative_indices
        print(f"\n‚ö†Ô∏è  Warning: Only {len(selected_negative_indices)} valid negative pairs (target: {target_negatives})")
    
    # Combine selected indices
    selected_indices = np.concatenate([selected_positive_indices, selected_negative_indices])
    
    # Extract selected pairs
    validated_texts1 = [texts1[i] for i in selected_indices]
    validated_texts2 = [texts2[i] for i in selected_indices]
    validated_labels = [labels[i] for i in selected_indices]
    validated_similarities = similarities[selected_indices]
    
    return validated_texts1, validated_texts2, validated_labels, validated_similarities

# Filter pairs
train_texts1, train_texts2, train_labels, train_similarities = filter_validated_pairs(
    candidate_texts1,
    candidate_texts2,
    candidate_labels,
    similarities,
    positive_threshold=POSITIVE_SIMILARITY_THRESHOLD,
    negative_threshold=NEGATIVE_SIMILARITY_THRESHOLD,
    target_positives=NUM_POSITIVE_PAIRS,
    target_negatives=NUM_NEGATIVE_PAIRS
)

print(f"\n{'='*80}")
print(f"FINAL TRAINING SET")
print(f"{'='*80}")
print(f"Total pairs: {len(train_labels)}")
print(f"  Positive: {sum(train_labels)} ({sum(train_labels)/len(train_labels)*100:.1f}%)")
print(f"  Negative: {len(train_labels)-sum(train_labels)} ({(len(train_labels)-sum(train_labels))/len(train_labels)*100:.1f}%)")

## 8. Comparison: Category-Only vs Semantic Filtering

In [None]:
# Compare quality metrics between category-only and semantic filtering approaches

print(f"\n{'='*80}")
print("COMPARISON: CATEGORY-ONLY vs SEMANTIC FILTERING")
print(f"{'='*80}")

# Category-only metrics (before filtering)
candidate_labels_array = np.array(candidate_labels)
candidate_pos_sims = similarities[candidate_labels_array == 1]
candidate_neg_sims = similarities[candidate_labels_array == 0]

print("\nüìä CATEGORY-ONLY METHOD (Before Filtering):")
print(f"  Positive pairs: {len(candidate_pos_sims)}")
print(f"    Mean similarity: {candidate_pos_sims.mean():.4f}")
print(f"    Below threshold (<{POSITIVE_SIMILARITY_THRESHOLD}): {np.sum(candidate_pos_sims < POSITIVE_SIMILARITY_THRESHOLD)} ({np.sum(candidate_pos_sims < POSITIVE_SIMILARITY_THRESHOLD)/len(candidate_pos_sims)*100:.1f}%)")
print(f"  Negative pairs: {len(candidate_neg_sims)}")
print(f"    Mean similarity: {candidate_neg_sims.mean():.4f}")
print(f"    Above threshold (‚â•{NEGATIVE_SIMILARITY_THRESHOLD}): {np.sum(candidate_neg_sims >= NEGATIVE_SIMILARITY_THRESHOLD)} ({np.sum(candidate_neg_sims >= NEGATIVE_SIMILARITY_THRESHOLD)/len(candidate_neg_sims)*100:.1f}%)")

candidate_separability = candidate_pos_sims.mean() - candidate_neg_sims.mean()
print(f"  Separability: {candidate_separability:.4f}")

# Semantic filtering metrics (after filtering)
train_labels_array = np.array(train_labels)
pos_similarities = train_similarities[train_labels_array == 1]
neg_similarities = train_similarities[train_labels_array == 0]

print("\n‚ú® SEMANTIC FILTERING METHOD (After Filtering):")
print(f"  Positive pairs: {len(pos_similarities)}")
print(f"    Mean similarity: {pos_similarities.mean():.4f}")
print(f"  Negative pairs: {len(neg_similarities)}")
print(f"    Mean similarity: {neg_similarities.mean():.4f}")

separability = pos_similarities.mean() - neg_similarities.mean()
print(f"  Separability: {separability:.4f}")

# Show improvement
print(f"\n{'='*80}")
print("IMPROVEMENT FROM SEMANTIC FILTERING")
print(f"{'='*80}")
pos_rejected = len(candidate_pos_sims) - len(pos_similarities)
neg_rejected = len(candidate_neg_sims) - len(neg_similarities)
print(f"Rejected noisy positives: {pos_rejected} ({pos_rejected/len(candidate_pos_sims)*100:.1f}%)")
print(f"Rejected ambiguous negatives: {neg_rejected} ({neg_rejected/len(candidate_neg_sims)*100:.1f}%)")
print(f"Separability improvement: {separability - candidate_separability:+.4f} ({(separability - candidate_separability)/candidate_separability*100:+.1f}%)")

if separability > candidate_separability * 1.2:
    print("\n‚úì SIGNIFICANT IMPROVEMENT: Semantic filtering dramatically improved quality")
elif separability > candidate_separability:
    print("\n‚úì IMPROVED: Semantic filtering enhanced quality")
else:
    print("\n‚ö†Ô∏è  LIMITED IMPROVEMENT: Consider adjusting thresholds")


## 9. Quality Analysis & Pre-Flight Validation

In [None]:
# Detailed quality analysis and validation checks

print(f"\n{'='*80}")
print("DETAILED QUALITY ANALYSIS")
print(f"{'='*80}")

print(f"\nüìà Positive Pairs (label=1):")
print(f"  Count: {len(pos_similarities)}")
print(f"  Mean similarity: {pos_similarities.mean():.4f}")
print(f"  Median similarity: {np.median(pos_similarities):.4f}")
print(f"  Std dev: {pos_similarities.std():.4f}")
print(f"  Range: [{pos_similarities.min():.4f}, {pos_similarities.max():.4f}]")

print(f"\nüìâ Negative Pairs (label=0):")
print(f"  Count: {len(neg_similarities)}")
print(f"  Mean similarity: {neg_similarities.mean():.4f}")
print(f"  Median similarity: {np.median(neg_similarities):.4f}")
print(f"  Std dev: {neg_similarities.std():.4f}")
print(f"  Range: [{neg_similarities.min():.4f}, {neg_similarities.max():.4f}]")

print(f"\nüìä Separability Analysis:")
print(f"  Separability (Pos - Neg): {separability:.4f}")
print(f"  Required minimum: {MIN_SEPARABILITY:.4f}")

if separability >= MIN_SEPARABILITY:
    print(f"  ‚úì PASS: Meets minimum requirement")
else:
    print(f"  ‚úó FAIL: Below minimum requirement by {MIN_SEPARABILITY - separability:.4f}")

# Check overlap
overlap_count = np.sum(
    (pos_similarities[:, None] <= neg_similarities[None, :]).any(axis=1)
)
overlap_pct = overlap_count / len(pos_similarities) * 100

print(f"\nüîÑ Overlap Analysis:")
print(f"  Overlap percentage: {overlap_pct:.1f}%")
print(f"  Maximum allowed: {MAX_OVERLAP_PCT:.1f}%")

if overlap_pct <= MAX_OVERLAP_PCT:
    print(f"  ‚úì PASS: Within acceptable range")
else:
    print(f"  ‚úó FAIL: Exceeds maximum by {overlap_pct - MAX_OVERLAP_PCT:.1f}%")

# Count risky negative pairs (too similar)
risky_negatives = np.sum(neg_similarities > POSITIVE_SIMILARITY_THRESHOLD)
risky_neg_pct = risky_negatives / len(neg_similarities) * 100

print(f"\n‚ö†Ô∏è  Risk Assessment:")
print(f"  Risky negatives (>{POSITIVE_SIMILARITY_THRESHOLD}): {risky_negatives} ({risky_neg_pct:.1f}%)")
if risky_neg_pct < 5:
    print(f"  ‚úì LOW RISK: Very few ambiguous negatives")
elif risky_neg_pct < 15:
    print(f"  ‚ö†Ô∏è  MODERATE RISK: Some ambiguous negatives")
else:
    print(f"  ‚úó HIGH RISK: Many ambiguous negatives (tighten NEGATIVE_SIMILARITY_THRESHOLD)")

# Overall quality gate
print(f"\n{'='*80}")
print("PRE-FLIGHT VALIDATION")
print(f"{'='*80}")

validation_passed = True
issues = []

if separability < MIN_SEPARABILITY:
    validation_passed = False
    issues.append(f"Separability {separability:.4f} < {MIN_SEPARABILITY:.4f}")

if overlap_pct > MAX_OVERLAP_PCT:
    validation_passed = False
    issues.append(f"Overlap {overlap_pct:.1f}% > {MAX_OVERLAP_PCT:.1f}%")

if risky_neg_pct > 15:
    validation_passed = False
    issues.append(f"Risky negatives {risky_neg_pct:.1f}% > 15%")

if len(pos_similarities) < NUM_POSITIVE_PAIRS * 0.9:
    validation_passed = False
    issues.append(f"Insufficient positive pairs: {len(pos_similarities)} < {NUM_POSITIVE_PAIRS * 0.9:.0f}")

if len(neg_similarities) < NUM_NEGATIVE_PAIRS * 0.9:
    validation_passed = False
    issues.append(f"Insufficient negative pairs: {len(neg_similarities)} < {NUM_NEGATIVE_PAIRS * 0.9:.0f}")

if validation_passed:
    print("‚úì ALL CHECKS PASSED")
    print("\nTraining data quality is GOOD. Safe to proceed with saving.")
else:
    print("‚úó VALIDATION FAILED")
    print("\nIssues detected:")
    for i, issue in enumerate(issues, 1):
        print(f"  {i}. {issue}")
    print("\n‚ö†Ô∏è  WARNING: Training with this data may produce poor models!")
    print("Recommended actions:")
    print("  - Increase POSITIVE_SIMILARITY_THRESHOLD (currently {POSITIVE_SIMILARITY_THRESHOLD})")
    print("  - Decrease NEGATIVE_SIMILARITY_THRESHOLD (currently {NEGATIVE_SIMILARITY_THRESHOLD})")
    print("  - Generate more candidate pairs (increase 2x multiplier)")

# Store validation result for later use
VALIDATION_PASSED = validation_passed

## 10. Visualize Distribution

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 6))

plt.hist(neg_similarities, bins=50, alpha=0.6, label='Negative (label=0)', color='red', edgecolor='black')
plt.hist(pos_similarities, bins=50, alpha=0.6, label='Positive (label=1)', color='green', edgecolor='black')

plt.axvline(POSITIVE_SIMILARITY_THRESHOLD, color='green', linestyle='--', linewidth=2, label=f'Pos threshold={POSITIVE_SIMILARITY_THRESHOLD}')
plt.axvline(NEGATIVE_SIMILARITY_THRESHOLD, color='red', linestyle='--', linewidth=2, label=f'Neg threshold={NEGATIVE_SIMILARITY_THRESHOLD}')

plt.xlabel('Cosine Similarity', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.title('Training Pair Similarity Distribution (Validated)', fontsize=14, fontweight='bold')
plt.legend(fontsize=10)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print(f"\n‚úì Distribution looks {'GOOD' if separability > 0.1 else 'NEEDS IMPROVEMENT'}")

## 11. Save Validated Training Pairs (with Quality Gate)

In [None]:
# Quality gate check before saving
print(f"\n{'='*80}")
print("QUALITY GATE CHECK")
print(f"{'='*80}")

if not VALIDATION_PASSED:
    print("‚úó QUALITY GATE: FAILED")
    print("\nData quality is below acceptable standards.")
    print("Refusing to save training pairs to prevent training poor models.")
    print("\nPlease adjust configuration and regenerate:")
    print(f"  - Current separability: {separability:.4f} (required: ‚â•{MIN_SEPARABILITY})")
    print(f"  - Current overlap: {overlap_pct:.1f}% (required: ‚â§{MAX_OVERLAP_PCT}%)")
    print("\nRecommendations:")
    print(f"  1. Increase POSITIVE_SIMILARITY_THRESHOLD from {POSITIVE_SIMILARITY_THRESHOLD} to {POSITIVE_SIMILARITY_THRESHOLD + 0.05}")
    print(f"  2. Decrease NEGATIVE_SIMILARITY_THRESHOLD from {NEGATIVE_SIMILARITY_THRESHOLD} to {NEGATIVE_SIMILARITY_THRESHOLD - 0.05}")
    print(f"  3. Rerun notebook from configuration cell")
    
    raise ValueError("Quality gate failed: Data quality below minimum standards")

print("‚úì QUALITY GATE: PASSED")
print(f"  Separability: {separability:.4f} ‚â• {MIN_SEPARABILITY} ‚úì")
print(f"  Overlap: {overlap_pct:.1f}% ‚â§ {MAX_OVERLAP_PCT}% ‚úì")
print("\nProceeding with save...")

# Save to JSON
output_file = DATA_DIR / 'fixed_training_pairs.json'

training_data = {
    'texts1': train_texts1,
    'texts2': train_texts2,
    'labels': train_labels,
    'metadata': {
        'generated_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'num_pairs': len(train_labels),
        'num_positive': sum(train_labels),
        'num_negative': len(train_labels) - sum(train_labels),
        'positive_similarity_threshold': POSITIVE_SIMILARITY_THRESHOLD,
        'negative_similarity_threshold': NEGATIVE_SIMILARITY_THRESHOLD,
        'min_separability_requirement': MIN_SEPARABILITY,
        'max_overlap_requirement': MAX_OVERLAP_PCT,
        'baseline_model': 'sentence-transformers/all-mpnet-base-v2',
        'baseline_separability': float(separability),
        'baseline_overlap': float(overlap_pct / 100),
        'positive_mean_similarity': float(pos_similarities.mean()),
        'negative_mean_similarity': float(neg_similarities.mean()),
        'quality_status': 'EXCELLENT' if separability > 0.15 else 'GOOD',
        'validation_passed': True
    }
}

with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(training_data, f, indent=2)

print(f"\n{'='*80}")
print(f"‚úì TRAINING PAIRS SAVED")
print(f"{'='*80}")
print(f"File: {output_file}")
print(f"Size: {output_file.stat().st_size / 1024 / 1024:.2f} MB")
print(f"Quality: {training_data['metadata']['quality_status']}")
print(f"\nYou can now use these validated pairs to train your models!")
print(f"\nNext steps:")
print(f"  1. Use these pairs in your training notebook")
print(f"  2. Retrain all models with clean data")
print(f"  3. Re-evaluate with fixed test pairs")
print(f"  4. Expect significant performance improvements!")

## 12. Sample Pairs for Manual Inspection

In [None]:
# Show sample positive pairs
print("\n" + "="*80)
print("SAMPLE POSITIVE PAIRS (should be semantically similar)")
print("="*80)

pos_indices = np.where(train_labels_array == 1)[0]
sample_pos = np.random.choice(pos_indices, size=min(3, len(pos_indices)), replace=False)

for i, idx in enumerate(sample_pos, 1):
    print(f"\nPair {i} (similarity: {train_similarities[idx]:.3f}):")
    print(f"  Text 1: {train_texts1[idx][:150]}...")
    print(f"  Text 2: {train_texts2[idx][:150]}...")

# Show sample negative pairs
print("\n" + "="*80)
print("SAMPLE NEGATIVE PAIRS (should be semantically different)")
print("="*80)

neg_indices = np.where(train_labels_array == 0)[0]
sample_neg = np.random.choice(neg_indices, size=min(3, len(neg_indices)), replace=False)

for i, idx in enumerate(sample_neg, 1):
    print(f"\nPair {i} (similarity: {train_similarities[idx]:.3f}):")
    print(f"  Text 1: {train_texts1[idx][:150]}...")
    print(f"  Text 2: {train_texts2[idx][:150]}...")

## 13. Save Sample Inspection Output

In [None]:
# Save sample pairs to a text file for manual review
from datetime import datetime

sample_output_file = DATA_DIR / f'sample_pairs_inspection_{datetime.now().strftime("%Y%m%d_%H%M%S")}.txt'

with open(sample_output_file, 'w', encoding='utf-8') as f:
    f.write("="*80 + "\n")
    f.write("TRAINING PAIRS QUALITY INSPECTION\n")
    f.write("="*80 + "\n\n")
    
    # Write summary
    f.write(f"Generated at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
    f.write(f"Total pairs: {len(train_labels)}\n")
    f.write(f"Positive pairs: {sum(train_labels)} ({sum(train_labels)/len(train_labels)*100:.1f}%)\n")
    f.write(f"Negative pairs: {len(train_labels) - sum(train_labels)} ({(len(train_labels)-sum(train_labels))/len(train_labels)*100:.1f}%)\n")
    f.write(f"Separability: {separability:.4f}\n\n")
    
    # Write sample positive pairs
    f.write("="*80 + "\n")
    f.write("SAMPLE POSITIVE PAIRS (should be semantically similar)\n")
    f.write("="*80 + "\n\n")
    
    for i, idx in enumerate(sample_pos, 1):
        f.write(f"Pair {i} (similarity: {train_similarities[idx]:.3f}):\n")
        f.write(f"  Text 1: {train_texts1[idx][:150]}...\n")
        f.write(f"  Text 2: {train_texts2[idx][:150]}...\n\n")
    
    # Write sample negative pairs
    f.write("="*80 + "\n")
    f.write("SAMPLE NEGATIVE PAIRS (should be semantically different)\n")
    f.write("="*80 + "\n\n")
    
    for i, idx in enumerate(sample_neg, 1):
        f.write(f"Pair {i} (similarity: {train_similarities[idx]:.3f}):\n")
        f.write(f"  Text 1: {train_texts1[idx][:150]}...\n")
        f.write(f"  Text 2: {train_texts2[idx][:150]}...\n\n")

print(f"‚úì Sample pairs inspection saved to: {sample_output_file}")
print(f"File size: {sample_output_file.stat().st_size / 1024:.2f} KB")