## 1. Setup

In [1]:
import os
import json
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime
from typing import List, Tuple
import warnings
warnings.filterwarnings('ignore')

from sentence_transformers import SentenceTransformer
import torch
from tqdm.auto import tqdm

print("✓ Imports complete")

✓ Imports complete


In [2]:
# Configuration
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Device: {device}")

if device == 'cuda':
    BATCH_SIZE = 64
else:
    BATCH_SIZE = 16

# Paths
DATA_DIR = Path('data_new')
DATA_DIR.mkdir(exist_ok=True)

# Training pair parameters
NUM_POSITIVE_PAIRS = 3000  # Increased for better training
NUM_NEGATIVE_PAIRS = 3000
POSITIVE_SIMILARITY_THRESHOLD = 0.3  # Minimum similarity for positive pairs
NEGATIVE_SIMILARITY_THRESHOLD = 0.5  # Maximum similarity for negative pairs
RANDOM_STATE = 42

print(f"\nConfiguration:")
print(f"  Batch size: {BATCH_SIZE}")
print(f"  Positive pairs target: {NUM_POSITIVE_PAIRS}")
print(f"  Negative pairs target: {NUM_NEGATIVE_PAIRS}")
print(f"  Positive similarity threshold: ≥{POSITIVE_SIMILARITY_THRESHOLD}")
print(f"  Negative similarity threshold: <{NEGATIVE_SIMILARITY_THRESHOLD}")

Device: cpu

Configuration:
  Batch size: 16
  Positive pairs target: 3000
  Negative pairs target: 3000
  Positive similarity threshold: ≥0.3
  Negative similarity threshold: <0.5


## 2. Load Data

In [3]:
# Load ServiceNow incident data
data_path = DATA_DIR / 'SNow_incident_ticket_data.csv'
df = pd.read_csv(data_path)

print(f"Loaded {len(df)} ServiceNow incidents")
print(f"\nColumns: {df.columns.tolist()}")

Loaded 10633 ServiceNow incidents

Columns: ['Number', 'Description', 'Opened by', 'Company', 'ITSM Department', 'Created', 'Urgency', 'Impact', 'Priority', 'Assignment group', 'Assigned to', 'State', 'Service', 'Service offering', 'Closed', 'Closed by', 'Category', 'Subcategory', 'Resolution code', 'Resolution notes', 'User input', 'Comments and Work notes', 'Manday Effort (hrs)', 'Ticket Type', 'AMS Domain', 'AMS System Type', 'AMS Category Type', 'AMS Service Type', 'AMS Business Related', 'AMS IT Related']


In [4]:
# Combine text fields
def create_combined_text(row):
    """Combine available text fields with proper handling of NaN"""
    text_parts = []
    
    for col in ['Number', 'Description', 'User input', 'Resolution notes']:
        if col in row.index:
            value = str(row.get(col, '')).strip() if pd.notna(row.get(col)) else ''
            if value and value.lower() != 'nan':
                text_parts.append(value)
    
    return ' '.join(text_parts) if text_parts else ''

df['combined_text'] = df.apply(create_combined_text, axis=1)
df['combined_text'] = df['combined_text'].astype(str)
df = df[df['combined_text'].str.len() > 10].reset_index(drop=True)

print(f"After filtering: {len(df)} valid incidents")
print(f"\nSample text: {df['combined_text'].iloc[0][:200]}...")

After filtering: 10633 valid incidents

Sample text: INC0010171 GRPT not working as expected. ZMMM_PO_REV is not generating correct dates as per maintained in GRPT table. 
E.g. P/O# 100024066
Vendor Ship mode is 03. 
As per GRPT route days are 12 day...


## 3. Load Baseline Model for Validation

In [5]:
# Load baseline model for semantic validation
print("Loading baseline model: sentence-transformers/all-mpnet-base-v2")
baseline_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', device=device)
print("✓ Baseline model loaded")

Loading baseline model: sentence-transformers/all-mpnet-base-v2
✓ Baseline model loaded


## 4. Generate Candidate Pairs (Category-Based)

In [6]:
def generate_candidate_pairs(df: pd.DataFrame, 
                            num_positives: int,
                            num_negatives: int,
                            random_state: int = 42) -> Tuple[List[str], List[str], List[int]]:
    """
    Generate candidate pairs based on categories.
    These will be filtered with semantic validation.
    
    Generate MORE than needed since filtering will remove noisy pairs.
    """
    np.random.seed(random_state)
    
    texts1, texts2, labels = [], [], []
    
    # Check if we have category information
    has_categories = 'category' in df.columns
    
    if has_categories:
        categories = df['category'].dropna().unique()
        print(f"Found {len(categories)} categories")
        
        # Generate 2x more candidates than needed (will filter later)
        target_pos = num_positives * 2
        target_neg = num_negatives * 2
        
        print(f"\nGenerating {target_pos} candidate positive pairs...")
        # Positive pairs - same category
        for _ in tqdm(range(target_pos)):
            cat = np.random.choice(categories)
            cat_incidents = df[df['category'] == cat]
            if len(cat_incidents) >= 2:
                idx1, idx2 = np.random.choice(cat_incidents.index, size=2, replace=False)
                texts1.append(df.loc[idx1, 'combined_text'])
                texts2.append(df.loc[idx2, 'combined_text'])
                labels.append(1)
        
        print(f"\nGenerating {target_neg} candidate negative pairs...")
        # Negative pairs - different categories
        for _ in tqdm(range(target_neg)):
            cat1, cat2 = np.random.choice(categories, size=2, replace=False)
            incidents1 = df[df['category'] == cat1]
            incidents2 = df[df['category'] == cat2]
            if len(incidents1) > 0 and len(incidents2) > 0:
                idx1 = np.random.choice(incidents1.index)
                idx2 = np.random.choice(incidents2.index)
                texts1.append(df.loc[idx1, 'combined_text'])
                texts2.append(df.loc[idx2, 'combined_text'])
                labels.append(0)
    else:
        print("No category column found - using random pairs")
        # Random pairs as fallback
        for _ in range(num_positives * 2):
            idx1, idx2 = np.random.choice(len(df), size=2, replace=False)
            texts1.append(df.loc[idx1, 'combined_text'])
            texts2.append(df.loc[idx2, 'combined_text'])
            labels.append(1)
        
        for _ in range(num_negatives * 2):
            idx1, idx2 = np.random.choice(len(df), size=2, replace=False)
            texts1.append(df.loc[idx1, 'combined_text'])
            texts2.append(df.loc[idx2, 'combined_text'])
            labels.append(0)
    
    print(f"\n✓ Generated {len(labels)} candidate pairs")
    print(f"  Positive: {sum(labels)}")
    print(f"  Negative: {len(labels) - sum(labels)}")
    
    return texts1, texts2, labels

# Generate candidates
candidate_texts1, candidate_texts2, candidate_labels = generate_candidate_pairs(
    df,
    num_positives=NUM_POSITIVE_PAIRS,
    num_negatives=NUM_NEGATIVE_PAIRS,
    random_state=RANDOM_STATE
)

No category column found - using random pairs

✓ Generated 12000 candidate pairs
  Positive: 6000
  Negative: 6000


## 5. Semantic Validation with Baseline Model

In [None]:
def compute_similarities(texts1: List[str], 
                        texts2: List[str],
                        model: SentenceTransformer,
                        batch_size: int = 16) -> np.ndarray:
    """
    Compute cosine similarities between text pairs.
    """
    print(f"\nComputing embeddings for {len(texts1)} text pairs...")
    
    # Encode texts
    embeddings1 = model.encode(
        texts1,
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True
    )
    
    embeddings2 = model.encode(
        texts2,
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True
    )
    
    # Compute cosine similarity
    similarities = np.sum(embeddings1 * embeddings2, axis=1) / (
        np.linalg.norm(embeddings1, axis=1) * np.linalg.norm(embeddings2, axis=1)
    )
    
    return similarities

# Compute similarities for all candidate pairs
similarities = compute_similarities(
    candidate_texts1,
    candidate_texts2,
    baseline_model,
    batch_size=BATCH_SIZE
)

print(f"\n✓ Computed {len(similarities)} similarities")


Computing embeddings for 12000 text pairs...


Batches:   0%|          | 0/750 [00:00<?, ?it/s]

## 6. Filter Pairs with Semantic Thresholds

In [None]:
def filter_validated_pairs(texts1: List[str],
                          texts2: List[str],
                          labels: List[int],
                          similarities: np.ndarray,
                          positive_threshold: float,
                          negative_threshold: float,
                          target_positives: int,
                          target_negatives: int) -> Tuple[List[str], List[str], List[int], np.ndarray]:
    """
    Filter pairs based on semantic similarity thresholds.
    
    Positive pairs: similarity >= positive_threshold
    Negative pairs: similarity < negative_threshold
    """
    labels_array = np.array(labels)
    
    # Filter positive pairs
    positive_mask = (labels_array == 1) & (similarities >= positive_threshold)
    valid_positive_indices = np.where(positive_mask)[0]
    
    print(f"\nPositive pairs:")
    print(f"  Candidates: {sum(labels_array == 1)}")
    print(f"  Valid (similarity ≥{positive_threshold}): {len(valid_positive_indices)}")
    print(f"  Rejection rate: {(1 - len(valid_positive_indices)/sum(labels_array == 1))*100:.1f}%")
    
    # Filter negative pairs
    negative_mask = (labels_array == 0) & (similarities < negative_threshold)
    valid_negative_indices = np.where(negative_mask)[0]
    
    print(f"\nNegative pairs:")
    print(f"  Candidates: {sum(labels_array == 0)}")
    print(f"  Valid (similarity <{negative_threshold}): {len(valid_negative_indices)}")
    print(f"  Rejection rate: {(1 - len(valid_negative_indices)/sum(labels_array == 0))*100:.1f}%")
    
    # Sample to target sizes (if we have enough)
    if len(valid_positive_indices) >= target_positives:
        selected_positive_indices = np.random.choice(
            valid_positive_indices,
            size=target_positives,
            replace=False
        )
    else:
        selected_positive_indices = valid_positive_indices
        print(f"\n⚠️  Warning: Only {len(selected_positive_indices)} valid positive pairs (target: {target_positives})")
    
    if len(valid_negative_indices) >= target_negatives:
        selected_negative_indices = np.random.choice(
            valid_negative_indices,
            size=target_negatives,
            replace=False
        )
    else:
        selected_negative_indices = valid_negative_indices
        print(f"\n⚠️  Warning: Only {len(selected_negative_indices)} valid negative pairs (target: {target_negatives})")
    
    # Combine selected indices
    selected_indices = np.concatenate([selected_positive_indices, selected_negative_indices])
    
    # Extract selected pairs
    validated_texts1 = [texts1[i] for i in selected_indices]
    validated_texts2 = [texts2[i] for i in selected_indices]
    validated_labels = [labels[i] for i in selected_indices]
    validated_similarities = similarities[selected_indices]
    
    return validated_texts1, validated_texts2, validated_labels, validated_similarities

# Filter pairs
train_texts1, train_texts2, train_labels, train_similarities = filter_validated_pairs(
    candidate_texts1,
    candidate_texts2,
    candidate_labels,
    similarities,
    positive_threshold=POSITIVE_SIMILARITY_THRESHOLD,
    negative_threshold=NEGATIVE_SIMILARITY_THRESHOLD,
    target_positives=NUM_POSITIVE_PAIRS,
    target_negatives=NUM_NEGATIVE_PAIRS
)

print(f"\n{'='*80}")
print(f"FINAL TRAINING SET")
print(f"{'='*80}")
print(f"Total pairs: {len(train_labels)}")
print(f"  Positive: {sum(train_labels)} ({sum(train_labels)/len(train_labels)*100:.1f}%)")
print(f"  Negative: {len(train_labels)-sum(train_labels)} ({(len(train_labels)-sum(train_labels))/len(train_labels)*100:.1f}%)")

## 7. Quality Analysis

In [None]:
# Analyze quality metrics
train_labels_array = np.array(train_labels)
pos_similarities = train_similarities[train_labels_array == 1]
neg_similarities = train_similarities[train_labels_array == 0]

print(f"\n{'='*80}")
print(f"QUALITY METRICS")
print(f"{'='*80}")

print(f"\nPositive Pairs (label=1):")
print(f"  Mean similarity: {pos_similarities.mean():.4f}")
print(f"  Median similarity: {np.median(pos_similarities):.4f}")
print(f"  Range: [{pos_similarities.min():.4f}, {pos_similarities.max():.4f}]")

print(f"\nNegative Pairs (label=0):")
print(f"  Mean similarity: {neg_similarities.mean():.4f}")
print(f"  Median similarity: {np.median(neg_similarities):.4f}")
print(f"  Range: [{neg_similarities.min():.4f}, {neg_similarities.max():.4f}]")

separability = pos_similarities.mean() - neg_similarities.mean()
print(f"\nSeparability (Pos - Neg): {separability:.4f}")

if separability > 0.15:
    print("  ✓ EXCELLENT: Clear separation")
elif separability > 0.1:
    print("  ✓ GOOD: Reasonable separation")
elif separability > 0.05:
    print("  ⚠️  OK: Moderate separation")
else:
    print("  ✗ POOR: Weak separation (increase thresholds)")

# Check overlap
overlap_count = np.sum(
    (pos_similarities[:, None] <= neg_similarities[None, :]).any(axis=1)
)
overlap_pct = overlap_count / len(pos_similarities) * 100

print(f"\nScore Overlap: {overlap_pct:.1f}%")
if overlap_pct < 30:
    print("  ✓ GOOD: Low overlap")
elif overlap_pct < 50:
    print("  ⚠️  OK: Moderate overlap")
else:
    print("  ✗ POOR: High overlap (adjust thresholds)")

## 8. Visualize Distribution

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 6))

plt.hist(neg_similarities, bins=50, alpha=0.6, label='Negative (label=0)', color='red', edgecolor='black')
plt.hist(pos_similarities, bins=50, alpha=0.6, label='Positive (label=1)', color='green', edgecolor='black')

plt.axvline(POSITIVE_SIMILARITY_THRESHOLD, color='green', linestyle='--', linewidth=2, label=f'Pos threshold={POSITIVE_SIMILARITY_THRESHOLD}')
plt.axvline(NEGATIVE_SIMILARITY_THRESHOLD, color='red', linestyle='--', linewidth=2, label=f'Neg threshold={NEGATIVE_SIMILARITY_THRESHOLD}')

plt.xlabel('Cosine Similarity', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.title('Training Pair Similarity Distribution (Validated)', fontsize=14, fontweight='bold')
plt.legend(fontsize=10)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print(f"\n✓ Distribution looks {'GOOD' if separability > 0.1 else 'NEEDS IMPROVEMENT'}")

## 9. Save Validated Training Pairs

In [None]:
# Save to JSON
output_file = DATA_DIR / 'fixed_training_pairs.json'

training_data = {
    'texts1': train_texts1,
    'texts2': train_texts2,
    'labels': train_labels,
    'metadata': {
        'generated_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'num_pairs': len(train_labels),
        'num_positive': sum(train_labels),
        'num_negative': len(train_labels) - sum(train_labels),
        'positive_similarity_threshold': POSITIVE_SIMILARITY_THRESHOLD,
        'negative_similarity_threshold': NEGATIVE_SIMILARITY_THRESHOLD,
        'baseline_model': 'sentence-transformers/all-mpnet-base-v2',
        'baseline_separability': float(separability),
        'baseline_overlap': float(overlap_pct / 100),
        'positive_mean_similarity': float(pos_similarities.mean()),
        'negative_mean_similarity': float(neg_similarities.mean()),
        'quality_status': 'GOOD' if separability > 0.1 else 'NEEDS_IMPROVEMENT'
    }
}

with open(output_file, 'w') as f:
    json.dump(training_data, f, indent=2)

print(f"\n{'='*80}")
print(f"✓ TRAINING PAIRS SAVED")
print(f"{'='*80}")
print(f"File: {output_file}")
print(f"Size: {output_file.stat().st_size / 1024 / 1024:.2f} MB")
print(f"\nYou can now use these validated pairs to train your models!")
print(f"\nNext steps:")
print(f"  1. Use these pairs in your training notebook")
print(f"  2. Retrain all models with clean data")
print(f"  3. Re-evaluate with fixed test pairs")
print(f"  4. Expect significant performance improvements!")

## 10. Sample Pairs for Manual Inspection

In [None]:
# Show sample positive pairs
print("\n" + "="*80)
print("SAMPLE POSITIVE PAIRS (should be semantically similar)")
print("="*80)

pos_indices = np.where(train_labels_array == 1)[0]
sample_pos = np.random.choice(pos_indices, size=min(3, len(pos_indices)), replace=False)

for i, idx in enumerate(sample_pos, 1):
    print(f"\nPair {i} (similarity: {train_similarities[idx]:.3f}):")
    print(f"  Text 1: {train_texts1[idx][:150]}...")
    print(f"  Text 2: {train_texts2[idx][:150]}...")

# Show sample negative pairs
print("\n" + "="*80)
print("SAMPLE NEGATIVE PAIRS (should be semantically different)")
print("="*80)

neg_indices = np.where(train_labels_array == 0)[0]
sample_neg = np.random.choice(neg_indices, size=min(3, len(neg_indices)), replace=False)

for i, idx in enumerate(sample_neg, 1):
    print(f"\nPair {i} (similarity: {train_similarities[idx]:.3f}):")
    print(f"  Text 1: {train_texts1[idx][:150]}...")
    print(f"  Text 2: {train_texts2[idx][:150]}...")