# Fine-tune SentenceTransformer Models for ITSM Tickets (v6)

This notebook represents the `v6` iteration of the ITSM similarity model pipeline. It builds upon the robust pipeline of `v5` but introduces critical feature engineering improvements.
### Key Enhancements from v5
1. **Contextual Prefixing**: Instead of training on just `Short Description` + `Description`, we now embed structured metadata (`Service`, `Category`, `Group`) directly into the text string. This helps the model distinguish between technically distinct but linguistically similar tickets (e.g., "Login failed" in SAP vs. VPN).
2. **Data Leakage Prevention**: Explicitly **EXCLUDING** `Resolution notes` from the training text to ensure the model learns to match problems based on symptoms, not retrospective solutions.
3. **Advanced Evaluation**: Continued real-time tracking of Spearman correlation, ROC AUC, and F1 scores.
## Overview (carried over from v5)
The foundations listed below were introduced in the `v5` iteration of the ITSM similarity model pipeline, which moved it from a functional script to a robust, configurable machine learning pipeline. The `v6` notebook retains all of them.

### Key Enhancements
1. **Comprehensive Configuration**: Centralized `CONFIG` for all hyperparameters.
2. **Smart Data Generation**: TF-IDF based filtering for high-quality positives and dynamic hard negative mining.
3. **Advanced Evaluation**: Real-time tracking of Spearman correlation, ROC AUC, and F1 scores.
4. **Reproducibility**: Full seeding of Random, NumPy, and PyTorch.
5. **Relationship Classifier**: Integrated training of the secondary classifier.


In [None]:
# [Setup] Install dependencies and download NLTK data (Run this first!)
# This cell ensures your environment has all required libraries and data.
# NOTE: the `%pip` lines are IPython magics, so this cell only runs inside a
# notebook kernel, not as a plain Python script.

# 1. Install essential libraries
# NOTE(review): imbalanced-learn is already part of the first install line, so
# the second `%pip install` looks redundant — confirm before removing.
%pip install --upgrade sentence-transformers imbalanced-learn "protobuf<=3.20.1" --quiet
%pip install imbalanced-learn --quiet

# 2. Fix TensorFlow/Protobuf conflicts (common in Kaggle)
# The environment variable is set before any later cell imports TensorFlow-backed code.
import os
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"

# 3. Download NLTK data
# The same resources are requested again by the import cell, so a failure
# here is retried later.
import nltk
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)  # Often needed for wordnet
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True) # Newer nltk versions might need this

print("‚úÖ Setup complete. Dependencies installed and NLTK data downloaded.")

In [None]:

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # Suppress CUDA warnings
import sys
import json
import math
import random
import logging
import warnings
import pickle
import collections
import re
from pathlib import Path
from typing import List, Dict, Tuple, Union
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score, f1_score,
    roc_auc_score, average_precision_score, precision_recall_curve, roc_curve
)
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression
from scipy.stats import pearsonr, spearmanr
import joblib


# --- Robust NLTK Setup ---
import nltk
# Download essential NLTK data immediately to avoid LookupErrors
# Each resource is attempted independently: a failure is reported on stdout
# and the loop moves on, so one unavailable resource never aborts the cell.
for res in ['wordnet', 'omw-1.4', 'stopwords', 'punkt', 'punkt_tab']:
    try:
        nltk.download(res, quiet=True)
    except Exception as e:
        # Best-effort by design: only warn, do not re-raise.
        print(f"Warning: Failed to download NLTK resource {res}: {e}")
# -------------------------

from sentence_transformers import SentenceTransformer, InputExample, losses, models
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SentenceEvaluator
from sentence_transformers.util import cos_sim

# --- Setup & Reproducibility ---

def find_workspace_root(marker='nexustism'):
    """Return the current working directory truncated at ``marker``.

    The path is cut right after the first path component that equals
    ``marker``. When no component matches, the current working directory is
    returned unchanged.
    """
    here = Path.cwd()
    components = here.parts
    try:
        cut = components.index(marker)
    except ValueError:
        # Marker is not one of the path components: nothing to truncate.
        return here
    return Path(*components[:cut + 1])


def resolve_log_path():
    """Decide where ``training_v6.log`` is written.

    Environment variables are consulted in priority order; the first one that
    is set to a non-empty value wins, and its directory is created if needed:

    1. ``GEMINI_TEMP_DIR`` (agent temp dir, highest priority for agent runs)
    2. ``LOG_DIR`` (explicit user override)

    With neither set, the log goes to the current working directory, i.e. the
    folder the notebook is running from.
    """
    for env_name in ("GEMINI_TEMP_DIR", "LOG_DIR"):
        configured = os.environ.get(env_name)
        if not configured:
            continue
        log_dir = Path(configured).expanduser()
        log_dir.mkdir(parents=True, exist_ok=True)
        return log_dir / "training_v6.log"

    return Path.cwd() / "training_v6.log"
# Resolve the log file location once; later cells reference LOG_PATH.
LOG_PATH = resolve_log_path()

# Root logger writes every INFO+ record both to the log file and to the
# default stream handler.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(LOG_PATH),
        logging.StreamHandler()
    ],
    force=True  # ensure handler resets even if logger was configured earlier
)
# Module-level logger used by log_step and the rest of the notebook.
logger = logging.getLogger(__name__)

def log_step(msg: str):
    """Record ``msg`` at INFO level on the module logger, then echo it with ``print``.

    The message is emitted through both channels on purpose so it shows up in
    the log handlers and in the cell's stdout.
    """
    logger.info("%s", msg)
    print(msg)

# Silence every UserWarning for the rest of the session.
warnings.filterwarnings("ignore", category=UserWarning)
# Set before the tokenizer-backed model is loaded in a later cell.
# NOTE(review): presumably meant to disable HF tokenizers' internal parallelism — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# NLTK Downloads

def ensure_nltk_resource(resource_path, download_name=None):
    """Make sure an NLTK resource is available, downloading it if necessary.

    Args:
        resource_path: Path understood by ``nltk.data.find`` (e.g.
            ``'corpora/stopwords'``).
        download_name: Package name passed to ``nltk.download``. Defaults to
            the last ``/``-separated segment of ``resource_path``.

    Returns:
        True when the resource can be located (before or after a download
        attempt), False when the download or the follow-up lookup fails.
    """
    try:
        nltk.data.find(resource_path)
    except LookupError:
        pass  # Not installed yet: fall through to the download attempt.
    else:
        return True

    try:
        package = download_name or resource_path.split('/')[-1]
        nltk.download(package, quiet=True)
        nltk.data.find(resource_path)
    except Exception as exc:
        # Best-effort: callers use the boolean to pick a fallback.
        logger.info(f"NLTK resource '{resource_path}' unavailable ({exc}); falling back when possible.")
        return False
    return True

# Availability flags for optional NLTK resources. TextSimilarityCalculator
# reads HAS_STOPWORDS and HAS_WORDNET to decide whether to fall back.
# NOTE(review): HAS_PUNKT is not read by any cell visible here — confirm it is still needed.
HAS_STOPWORDS = ensure_nltk_resource('corpora/stopwords', 'stopwords')
HAS_WORDNET = ensure_nltk_resource('corpora/wordnet', 'wordnet')
HAS_PUNKT = ensure_nltk_resource('tokenizers/punkt', 'punkt')

def set_seeds(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    When CUDA is available, all CUDA devices are seeded as well and cuDNN is
    switched to deterministic mode with benchmarking disabled.

    Args:
        seed: Integer seed applied to every generator.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
        cudnn = torch.backends.cudnn
        cudnn.deterministic = True
        cudnn.benchmark = False

    log_step(f"‚úÖ Seeds set to {seed}")

set_seeds(42)
log_step(f"Logging to {LOG_PATH}")

# Device info (SentenceTransformer handles placement, but we log visibility)
# `torch.backends.mps` does not exist on older PyTorch builds, so it is looked
# up defensively instead of raising AttributeError on CPU-only machines.
if torch.cuda.is_available():
    log_step(f"CUDA available: {torch.cuda.get_device_name(0)}")
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    log_step("MPS available: using Apple Silicon accelerator")
else:
    log_step("No GPU detected; training will run on CPU")

# Check Imbalanced-Learn for Relationship Classifier
# IMBLEARN_AVAILABLE gates the optional relationship-classifier cell.
try:
    from imblearn.over_sampling import SMOTE
    IMBLEARN_AVAILABLE = True
except ImportError:
    log_step('‚ö†Ô∏è imbalanced-learn not installed. Relationship classifier will skip SMOTE.')
    IMBLEARN_AVAILABLE = False



## 2. Configuration


In [None]:
# Central configuration dictionary read by the cells below.
# NOTE(review): 'max_learning_rate', 'early_stopping_patience', 'min_delta' and
# the three '*_loss_weight' keys are not referenced by any cell visible in this
# notebook — confirm whether they are consumed elsewhere.
CONFIG = {
    # Paths
    'base_model': 'sentence-transformers/all-mpnet-base-v2',
    'output_dir': 'models/all-mpnet-finetuned-v6',
    'source_data': 'data/dummy_data_promax.csv',
    'relationship_data': 'data/relationship_pairs.json',
    
    # Training Hyperparameters
    'epochs': 20,
    'batch_size': 64,
    'learning_rate': 2e-5,
    'max_learning_rate': 5e-5,
    'weight_decay': 0.01,
    'warmup_ratio': 0.1,
    'max_seq_length': 384,  # Increased for detailed ticket descriptions
    
    # Data Generation Strategy
    'target_pairs': 50000,  # Total training pairs to generate
    'positive_ratio': 0.4,  # 40% positives, 60% negatives
    'augmentation_ratio': 0.2,
    'eval_split': 0.15,
    
    # Quality Filtering Thresholds
    'quality_threshold': 0.3,      # Minimum TF-IDF similarity for 'good' positive pairs
    'hard_negative_min': 0.15,     # Min similarity for hard negatives
    'hard_negative_max': 0.45,     # Max similarity for hard negatives (confusing zone)
    
    # Early Stopping
    'early_stopping_patience': 7,
    'min_delta': 0.005,
    'eval_steps': 100,
    
    # Loss Weights (Future use for Multi-Loss)
    'mnr_loss_weight': 1.0,
    'triplet_loss_weight': 0.5,
    'cosine_loss_weight': 0.2}

# Create Output Directory
# NOTE(review): the training cell saves to f"{CONFIG['output_dir']}_{timestamp}",
# not to this directory, so this folder may stay empty — confirm intent.
output_path = os.path.join(os.getcwd(), CONFIG['output_dir'])
os.makedirs(output_path, exist_ok=True)
print(f"üìÇ Output directory set to: {output_path}")


## 3. Data Loading and Preprocessing (Enhanced for v6)
Includes **Contextual Prefixing** using `Service`, `Service offering`, `Category`, `Subcategory`, and `Assignment group`.


In [None]:


# Resolve data paths for local/Kaggle
# Reuses the v5 Kaggle-friendly resolver to avoid FileNotFound errors
# Progress marker so the log shows when this cell started.
log_step("[Cell 5] Resolving data paths and loading data...")

def resolve_data_path(filepath):
    """
    Resolve a data file path to an existing path on disk.

    Locations are tried in this order and the first existing one is returned:

    1. ``filepath`` itself (an existing absolute path is returned as given; an
       existing relative path is returned as an absolute path).
    2. Only when the working directory contains ``/kaggle/``: every dataset
       directory under ``/kaggle/input`` (in sorted order, so the result is
       deterministic), first with ``filepath`` as-is and then with a leading
       ``data/`` stripped, followed by ``/kaggle/working``.
    3. Only when the working directory contains a ``nexustism`` component:
       ``filepath`` relative to that workspace root.

    Args:
        filepath: Absolute or relative path to the data file.

    Returns:
        Path (str) of the first existing candidate.

    Raises:
        FileNotFoundError: No candidate exists. The message lists the working
            directory, up to ten attempted paths and environment-specific hints.
    """
    cwd = os.getcwd()
    tried_paths = []

    if os.path.isabs(filepath):
        if os.path.exists(filepath):
            return filepath
        tried_paths.append(filepath)
    else:
        if os.path.exists(filepath):
            return os.path.abspath(filepath)
        tried_paths.append(os.path.abspath(filepath))

    on_kaggle = '/kaggle/' in cwd

    # Kaggle paths
    if on_kaggle:
        kaggle_input_base = '/kaggle/input'
        data_prefix = 'data/'
        if os.path.exists(kaggle_input_base):
            for dataset_dir in sorted(os.listdir(kaggle_input_base)):
                dataset_path = os.path.join(kaggle_input_base, dataset_dir)
                if not os.path.isdir(dataset_path):
                    continue
                candidates = [os.path.join(dataset_path, filepath)]
                if filepath.startswith(data_prefix):
                    # Strip only the leading "data/"; str.replace would also
                    # remove later occurrences inside the path.
                    candidates.append(os.path.join(dataset_path, filepath[len(data_prefix):]))
                for potential_path in candidates:
                    if os.path.exists(potential_path):
                        return os.path.abspath(potential_path)
                    tried_paths.append(potential_path)
        kaggle_working_path = os.path.join('/kaggle/working', filepath)
        if os.path.exists(kaggle_working_path):
            return os.path.abspath(kaggle_working_path)
        tried_paths.append(kaggle_working_path)

    # Workspace root (nexustism)
    parts = cwd.split(os.sep)
    if 'nexustism' in parts:
        idx = parts.index('nexustism')
        workspace_root = os.sep.join(parts[:idx + 1])
        potential_path = os.path.join(workspace_root, filepath)
        if os.path.exists(potential_path):
            return os.path.abspath(potential_path)
        tried_paths.append(potential_path)

    # Build helpful error
    message_parts = [
        f"Data file not found: {filepath}\n",
        f"Current working directory: {cwd}\n\n",
        "Tried the following paths:\n",
    ]
    message_parts.extend(f"  - {p}\n" for p in tried_paths[:10])

    if on_kaggle:
        message_parts.append(
            "\nüìù Kaggle Environment Detected:\n"
            "   - Upload the dataset to Kaggle and add it to the notebook.\n"
            "   - Check available datasets with os.listdir('/kaggle/input').\n"
            "   - Update CONFIG['source_data'] if the path differs.\n"
        )
    else:
        message_parts.append(
            "\nüìù Local Environment:\n"
            "   - Ensure the file exists under data/ or update CONFIG['source_data'].\n"
        )
    raise FileNotFoundError("".join(message_parts))

# Resolve configured paths
# The incident CSV is mandatory: a missing file is logged and re-raised.
try:
    CONFIG['source_data'] = resolve_data_path(CONFIG['source_data'])
    log_step(f"‚úÖ Data path resolved to: {CONFIG['source_data']}")
except FileNotFoundError as e:
    log_step(f"‚ùå Error: {e}")
    raise

# The relationship pairs only feed the optional relationship classifier, which
# skips itself when the file does not exist. A missing file therefore must not
# abort the notebook: keep the configured (unresolved) path and carry on.
if CONFIG.get('relationship_data'):
    try:
        CONFIG['relationship_data'] = resolve_data_path(CONFIG['relationship_data'])
    except FileNotFoundError:
        log_step(
            f"Relationship data not found ({CONFIG['relationship_data']}); "
            "the optional relationship classifier will be skipped."
        )


def load_and_clean_data(filepath, min_length=10):
    """
    Load the incident CSV, normalise its fields and build the training text.

    Processing steps:

    1. Validate that all required columns are present.
    2. Normalise every text column: NaN -> "", whitespace collapsed, and
       placeholder values ("nan", "n/a", "unknown", ...) mapped to "".
    3. Lower-case the structured context columns to reduce duplicates.
    4. Build ``text`` as
       ``[Service | Service offering] [Category | Subcategory] Group: <group>. <Short Description>. <Description>``,
       leaving out every component that is empty.
    5. Drop rows that have no Short Description and no Description, and rows
       whose ``text`` is shorter than ``min_length``.
    6. Add ``category_id``, one integer per (Category, Subcategory) pair.

    Args:
        filepath: Path of the CSV file to read.
        min_length: Minimum number of characters ``text`` must have.

    Returns:
        DataFrame holding the surviving rows plus ``text`` and ``category_id``.

    Raises:
        FileNotFoundError: ``filepath`` does not exist.
        ValueError: A required column is missing.
    """
    log_step(f"Loading data from {filepath}")
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"Data file not found: {filepath}")

    df = pd.read_csv(filepath)
    required_cols = ["Number", "Short Description", "Description", "Category", "Subcategory",
                     "Service", "Service offering", "Assignment group"]
    missing = [c for c in required_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Missing columns: {missing}")

    # Fill NA and Clean Text
    # We ensure all context fields are strings
    placeholders = {"", "nan", "none", "null", "unknown", "n/a", "na"}

    def normalize_field(val) -> str:
        s = re.sub(r"\s+", " ", str(val).strip())
        return "" if s.lower() in placeholders else s

    for col in [c for c in required_cols if c != "Number"]:
        df[col] = df[col].fillna("").apply(normalize_field)

    # Normalize casing for structured context fields to reduce duplicates
    context_cols = ["Service", "Service offering", "Category", "Subcategory", "Assignment group"]
    for col in context_cols:
        df[col] = df[col].str.lower()

    # Construct Rich Text Representation
    # Format: [Service | Service offering] [Category | Subcategory] Group: Assignment group. Short Description. Description
    def build_bracketed(parts):
        clean_parts = [p for p in parts if p]
        return f"[{' | '.join(clean_parts)}] " if clean_parts else ""

    # Plain zip loops instead of row-wise DataFrame.apply: same result,
    # without building a Series object per row.
    prefixes = [
        build_bracketed([service, offering])
        + build_bracketed([category, subcategory])
        + (f"Group: {group}. " if group else "")
        for service, offering, category, subcategory, group in zip(
            df['Service'], df['Service offering'], df['Category'],
            df['Subcategory'], df['Assignment group'],
        )
    ]
    # Joining only the non-empty parts avoids the stray ". ." left behind when
    # the short description is empty.
    bodies = [
        ". ".join(part for part in (short, desc) if part)
        for short, desc in zip(df['Short Description'], df['Description'])
    ]

    df['text'] = pd.Series(
        [prefix + body for prefix, body in zip(prefixes, bodies)],
        index=df.index,
        dtype=object,
    ).str.replace(r"\s+", " ", regex=True).str.strip()

    # Filter empty or too short
    # The metadata prefix alone can exceed min_length, so a ticket with no
    # description content is rejected explicitly rather than by length.
    has_content = pd.Series([bool(body) for body in bodies], index=df.index, dtype=bool)
    initial_count = len(df)
    df = df[has_content & (df['text'].str.len() >= min_length)].copy()
    dropped = initial_count - len(df)

    # Clean up artifacts (e.g., multiple spaces, empty group/service components)
    df['text'] = df['text'].str.replace(r"\s+\.", ".", regex=True).str.replace(r"\s+", " ", regex=True).str.strip()

    df['category_id'] = df.groupby(['Category', 'Subcategory']).ngroup()

    log_step(f"‚úÖ Loaded {len(df)} incidents (dropped {dropped} short/empty)")
    log_step(f"   Unique Categories: {df['Category'].nunique()}")
    log_step(f"   Unique Subcategories: {df['Subcategory'].nunique()}")

    return df

# Load Data
df_incidents = load_and_clean_data(CONFIG['source_data'])

# Split incidents first to avoid leakage between train/eval pairs
# A stratified split needs at least two incidents per category_id and an eval
# split large enough to hold every class; scikit-learn raises ValueError
# otherwise. Fall back to a plain random split instead of aborting.
try:
    train_df, eval_df = train_test_split(
        df_incidents,
        test_size=CONFIG['eval_split'],
        random_state=42,
        stratify=df_incidents['category_id']
    )
except ValueError as exc:
    log_step(f"Warning: stratified split failed ({exc}); falling back to a random split.")
    train_df, eval_df = train_test_split(
        df_incidents,
        test_size=CONFIG['eval_split'],
        random_state=42
    )

# The pair budget is divided in the same proportion as the incidents, with a
# floor of 500 evaluation pairs.
train_target_pairs = int(CONFIG['target_pairs'] * (1 - CONFIG['eval_split']))
eval_target_pairs = max(500, CONFIG['target_pairs'] - train_target_pairs)

log_step(f"Train incidents: {len(train_df)}, Eval incidents: {len(eval_df)}")
log_step(f"Target pairs -> Train: {train_target_pairs}, Eval: {eval_target_pairs}")

log_step("Preview of Contextual Embeddings (train split):")
for t in train_df['text'].head(3).tolist():
    print(f" - {t[:200]}...") # Truncate for display

train_df.head(3)



## 4. Text Similarity Utilities
We use TF-IDF similarity to find "quality" pairs (related but not identical) and "hard negatives" (different category but lexically similar).


In [None]:

class TextSimilarityCalculator:
    """TF-IDF based lexical similarity between texts of one fitted corpus.

    Fit once with ``fit_tfidf``; afterwards ``get_tfidf_similarity`` compares
    any two rows of the fitted matrix by their integer position.
    """

    def __init__(self):
        # Optional NLTK helpers; each falls back when its resource is missing.
        if HAS_WORDNET:
            self.lemmatizer = WordNetLemmatizer()
        else:
            self.lemmatizer = None

        if HAS_STOPWORDS:
            self.stop_words = set(stopwords.words('english'))
        else:
            self.stop_words = ENGLISH_STOP_WORDS

        # The vectorizer uses its own built-in English stop word list.
        self.vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
        self.tfidf_matrix = None

    def fit_tfidf(self, texts):
        """Fit the vectorizer on ``texts`` and keep the resulting TF-IDF matrix."""
        n_texts = len(texts)
        log_step(f"Fitting TF-IDF vectorizer on {n_texts} texts...")
        self.tfidf_matrix = self.vectorizer.fit_transform(texts)
        fitted_shape = self.tfidf_matrix.shape
        log_step(f"TF-IDF fit complete. Matrix shape: {fitted_shape}")

    def get_tfidf_similarity(self, idx1, idx2):
        """Return the dot product of TF-IDF rows ``idx1`` and ``idx2``.

        The indices are integer row positions in the fitted matrix.
        NOTE(review): this equals cosine similarity only if the rows are
        L2-normalised, which is assumed from TfidfVectorizer's defaults — confirm.

        Raises:
            ValueError: ``fit_tfidf`` has not been called yet.
        """
        matrix = self.tfidf_matrix
        if matrix is None:
            raise ValueError("Run fit_tfidf first")
        row_a = matrix[idx1]
        row_b = matrix[idx2]
        # Sparse row times transposed sparse row yields a 1x1 sparse product.
        product = row_a * row_b.T
        return product.toarray()[0][0]



## 5. Smart Pair Generation
Generating positive pairs (same category/subcategory group + sufficient TF-IDF lexical overlap) and hard negatives (different category + moderate TF-IDF lexical overlap).


In [None]:

def generate_smart_pairs(df, target_count, pos_ratio=0.4, sim_calculator=None):
    """
    Sample labelled incident pairs for similarity fine-tuning.

    Positives are two distinct incidents sharing a ``category_id`` whose
    TF-IDF similarity exceeds ``CONFIG['quality_threshold']``. If too few are
    found, the remainder is filled with unfiltered same-group pairs.

    Negatives are two incidents with different ``Category`` values whose
    similarity lies strictly between ``CONFIG['hard_negative_min']`` and
    ``CONFIG['hard_negative_max']``. After ``2 * negative_target`` attempts any
    cross-category pair is accepted, and any remaining shortfall is filled
    with random cross-category pairs.

    Args:
        df: Incident frame with ``category_id`` and ``Category`` columns.
        target_count: Total number of pairs to aim for.
        pos_ratio: Fraction of ``target_count`` that should be positives.
        sim_calculator: Object exposing ``get_tfidf_similarity(pos1, pos2)``.
            Assumed to be fitted on the rows of ``df`` in the same order,
            because it is queried by row position.

    Returns:
        ``(positives, negatives)``: two lists of
        ``(index_label_1, index_label_2, tfidf_similarity)`` tuples. Either
        list may be shorter than its target when the data cannot supply
        enough valid pairs.

    Raises:
        ValueError: ``sim_calculator`` is None.
    """
    if sim_calculator is None:
        raise ValueError("sim_calculator must be provided (pre-fitted TF-IDF)")

    positive_target = int(target_count * pos_ratio)
    negative_target = target_count - positive_target

    positives = []
    negatives = []

    # Loop-invariant lookups, built once instead of on every sampling attempt.
    all_indices = df.index.tolist()
    position_of = {label: pos for pos, label in enumerate(all_indices)}
    category_of = df['Category'].to_dict()
    # Only groups with at least two incidents can yield a positive pair.
    pairable_groups = [
        members.index.tolist()
        for _, members in df.groupby('category_id')
        if len(members) >= 2
    ]
    # A negative needs two rows that belong to different categories.
    can_pair_across = len(all_indices) >= 2 and len(set(category_of.values())) >= 2

    def tfidf_sim(i1, i2):
        # Convert DataFrame index labels to row positions for the TF-IDF matrix.
        return sim_calculator.get_tfidf_similarity(position_of[i1], position_of[i2])

    log_step(f"Generating {positive_target} positive and {negative_target} negative pairs...")

    # --- 1. Positive Pairs ---
    # Strategy: sample pairs from the same group and keep those whose TF-IDF
    # score clears the quality threshold.
    if positive_target > 0 and not pairable_groups:
        log_step("Warning: no category group has at least two incidents; no positive pairs can be generated.")

    pbar = tqdm(total=positive_target, desc="Positives")
    attempts = 0
    max_attempts = positive_target * 5
    while pairable_groups and len(positives) < positive_target and attempts < max_attempts:
        attempts += 1
        i1, i2 = random.sample(random.choice(pairable_groups), 2)
        sim = tfidf_sim(i1, i2)
        if sim > CONFIG['quality_threshold']:
            positives.append((i1, i2, sim))
            pbar.update(1)

    if pairable_groups and len(positives) < positive_target:
        remaining = positive_target - len(positives)
        log_step(f"‚ö†Ô∏è Only found {len(positives)} positives with quality threshold; filling remaining {remaining} with random in-group samples.")
        # pairable_groups is non-empty here, so every iteration adds a pair.
        for _ in range(remaining):
            i1, i2 = random.sample(random.choice(pairable_groups), 2)
            positives.append((i1, i2, tfidf_sim(i1, i2)))
            pbar.update(1)

    pbar.close()

    # --- 2. Hard Negative Pairs ---
    # Strategy: different categories but noticeable TF-IDF overlap (confusing examples).
    if negative_target > 0 and not can_pair_across:
        log_step("Warning: fewer than two incidents or categories available; no negative pairs can be generated.")

    pbar = tqdm(total=negative_target, desc="Negatives")
    attempts = 0
    max_attempts = negative_target * 5  # Prevent runaway loops when hard negatives are scarce
    fallback_after = negative_target * 2
    while can_pair_across and len(negatives) < negative_target and attempts < max_attempts:
        attempts += 1

        i1, i2 = random.sample(all_indices, 2)

        # Must be different categories
        if category_of[i1] == category_of[i2]:
            continue

        sim = tfidf_sim(i1, i2)
        is_hard = CONFIG['hard_negative_min'] < sim < CONFIG['hard_negative_max']

        # Accept hard negatives, or anything cross-category once many attempts
        # have shown hard ones to be scarce.
        if is_hard or attempts > fallback_after:
            negatives.append((i1, i2, sim))
            pbar.update(1)

    if can_pair_across and len(negatives) < negative_target:
        remaining = negative_target - len(negatives)
        log_step(f"‚ö†Ô∏è Only found {len(negatives)} hard negatives after {attempts} attempts; filling remaining {remaining} with random cross-category pairs.")
        max_tries = 10
        skipped = 0
        for _ in range(remaining):
            pair = None
            for _ in range(max_tries):
                i1, i2 = random.sample(all_indices, 2)
                if category_of[i1] != category_of[i2]:
                    pair = (i1, i2)
                    break
            if pair is None:
                # Never label a same-category pair as a negative: that would
                # add label noise. Skip the slot instead.
                skipped += 1
                continue
            negatives.append((pair[0], pair[1], tfidf_sim(*pair)))
            pbar.update(1)
        if skipped:
            log_step(f"Warning: skipped {skipped} filler negatives because no cross-category pair was found within {max_tries} tries.")

    pbar.close()

    log_step(f"‚úÖ Pair generation complete: {len(positives)} positives, {len(negatives)} negatives")
    return positives, negatives

# Fit TF-IDF on splits to avoid leakage
# Each split gets its own calculator, fitted on that split's texts in row
# order; generate_smart_pairs queries the calculator by row position.
train_sim_calculator = TextSimilarityCalculator()
train_sim_calculator.fit_tfidf(train_df['text'].tolist())

eval_sim_calculator = TextSimilarityCalculator()
eval_sim_calculator.fit_tfidf(eval_df['text'].tolist())

# Generate pairs per split
# Each result is a list of (index_label_1, index_label_2, tfidf_similarity) tuples.
train_pos_idxs, train_neg_idxs = generate_smart_pairs(train_df, train_target_pairs, CONFIG['positive_ratio'], train_sim_calculator)
eval_pos_idxs, eval_neg_idxs = generate_smart_pairs(eval_df, eval_target_pairs, CONFIG['positive_ratio'], eval_sim_calculator)



## 6. Data Augmentation


In [None]:

# Progress marker so the log shows when this cell started.
log_step("[Cell 11] Converting pairs to SentenceTransformer examples (with augmentation)...")

def simple_augment(text):
    """Return a lightly perturbed copy of ``text``.

    Texts with fewer than five whitespace-separated words are returned
    untouched. Otherwise one edit is applied with roughly equal probability:
    two adjacent words are swapped, or a single word is removed. The result
    is re-joined with single spaces.
    """
    tokens = text.split()
    if len(tokens) < 5:
        return text

    swap_neighbours = random.random() > 0.5
    if swap_neighbours:
        # Reverse a two-word window starting at a random position.
        pos = random.randint(0, len(tokens) - 2)
        tokens[pos:pos + 2] = reversed(tokens[pos:pos + 2])
    else:
        # Drop one randomly chosen word.
        del tokens[random.randint(0, len(tokens) - 1)]
    return " ".join(tokens)


def pairs_to_examples(df, pos_pairs, neg_pairs, augment=False):
    """Turn index pairs into labelled ``InputExample`` objects.

    Args:
        df: Frame whose ``text`` column is looked up by index label.
        pos_pairs: Iterable of ``(label_1, label_2, score)``; emitted with label 1.0.
        neg_pairs: Iterable of ``(label_1, label_2, score)``; emitted with label 0.0.
        augment: When True, each positive pair additionally yields, with
            probability ``CONFIG['augmentation_ratio']``, a copy whose first
            text went through ``simple_augment``.

    Returns:
        List of examples: all positives (each optionally followed by its
        augmented copy) first, then all negatives. The pair scores are not used.
    """
    def text_at(label):
        return df.at[label, 'text']

    examples = []
    for left_label, right_label, _score in pos_pairs:
        left_text = text_at(left_label)
        right_text = text_at(right_label)

        # Standard pair
        examples.append(InputExample(texts=[left_text, right_text], label=1.0))

        # Augmented copy, for a random subset only
        if not augment:
            continue
        if random.random() < CONFIG['augmentation_ratio']:
            examples.append(InputExample(texts=[simple_augment(left_text), right_text], label=1.0))

    for left_label, right_label, _score in neg_pairs:
        left_text = text_at(left_label)
        right_text = text_at(right_label)
        examples.append(InputExample(texts=[left_text, right_text], label=0.0))

    return examples

# Augmentation is applied to the training pairs only; evaluation pairs stay
# unmodified.
train_examples = pairs_to_examples(train_df, train_pos_idxs, train_neg_idxs, augment=True)
eval_data = pairs_to_examples(eval_df, eval_pos_idxs, eval_neg_idxs, augment=False)

# Shuffle train examples only
# (pairs_to_examples returns all positives before all negatives.)
random.shuffle(train_examples)
train_data = train_examples

log_step(f"Training Samples: {len(train_data)}")
log_step(f"Evaluation Samples: {len(eval_data)}")



## 7. Model Setup


In [None]:

log_step("[Cell 13] Loading base SentenceTransformer model...")
# Load the pretrained base model named in CONFIG and override its sequence-length limit.
model = SentenceTransformer(CONFIG['base_model'])
model.max_seq_length = CONFIG['max_seq_length']

# --- Loss Function ---
# Using CosineSimilarityLoss to leverage labeled positive/negative pairs directly
train_loss = losses.CosineSimilarityLoss(model)

# --- DataLoader ---
# Batches the InputExample list; it is handed to model.fit in the training cell.
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=CONFIG['batch_size'])

log_step(f"Model loaded: {CONFIG['base_model']}")
log_step(f"Max Seq Length: {model.max_seq_length}")
log_step(f"Training batches per epoch: {len(train_dataloader)}")



## 8. Evaluation Setup & Custom Callbacks


In [None]:

class ComprehensiveEvaluator(SentenceEvaluator):
    """Evaluator that tracks Spearman, Pearson, ROC AUC and PR AUC on labelled pairs.

    Each call encodes both sides of every pair, scores them by cosine
    similarity and compares the scores with the gold labels. The Spearman
    correlation is returned as the model-selection score.
    """

    def __init__(self, examples, batch_size=32, name='', show_progress_bar=False):
        """
        Args:
            examples: Objects exposing ``texts`` (two strings) and ``label``.
            batch_size: Batch size used when encoding.
            name: Free-form evaluator name (stored, not used in the metrics).
            show_progress_bar: Forwarded to ``model.encode``.
        """
        super().__init__()
        self.examples = examples
        self.batch_size = batch_size
        self.name = name
        self.show_progress_bar = show_progress_bar

        self.texts1 = [ex.texts[0] for ex in examples]
        self.texts2 = [ex.texts[1] for ex in examples]
        self.labels = [ex.label for ex in examples]

    def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
        """Evaluate ``model`` and return the Spearman correlation.

        Args:
            model: Model exposing ``eval()`` and ``encode(...)``.
            output_path: When given, one row of metrics is appended to
                ``<output_path>/eval_metrics.csv`` (a header is written first
                if the file is new).
            epoch: Epoch number, used for logging and the CSV row.
            steps: Step number, used for logging and the CSV row.

        Returns:
            Spearman correlation between the labels and the cosine scores.
        """
        model.eval()
        log_step(f"[Eval] Running evaluator at epoch={epoch}, step={steps} on {len(self.labels)} pairs...")

        # Encode
        emb1 = model.encode(self.texts1, batch_size=self.batch_size, show_progress_bar=self.show_progress_bar, convert_to_numpy=True)
        emb2 = model.encode(self.texts2, batch_size=self.batch_size, show_progress_bar=self.show_progress_bar, convert_to_numpy=True)

        # Cosine Similarity
        # A zero-norm embedding would turn the plain division into NaN/inf and
        # poison every metric; such pairs get a similarity of 0 instead.
        dot = np.sum(emb1 * emb2, axis=1)
        norms = np.linalg.norm(emb1, axis=1) * np.linalg.norm(emb2, axis=1)
        cosine_scores = np.divide(dot, norms, out=np.zeros_like(dot), where=norms > 0)

        # Metrics
        eval_pearson, _ = pearsonr(self.labels, cosine_scores)
        eval_spearman, _ = spearmanr(self.labels, cosine_scores)

        # Classification metrics; these raise ValueError when only one class
        # is present in the labels, in which case 0.0 is reported.
        try:
            roc_auc = roc_auc_score(self.labels, cosine_scores)
            pr_auc = average_precision_score(self.labels, cosine_scores)
        except ValueError:
            roc_auc = 0.0
            pr_auc = 0.0

        logger.info(f"Epoch {epoch} Steps {steps}: Spearman={eval_spearman:.4f}, Pearson={eval_pearson:.4f}, ROC_AUC={roc_auc:.4f}, PR_AUC={pr_auc:.4f}")
        print()
        print(f"üìä Eval @ epoch {epoch}, step {steps}: Spearman={eval_spearman:.4f}, Pearson={eval_pearson:.4f}, ROC_AUC={roc_auc:.4f}, PR_AUC={pr_auc:.4f}")

        # Save detailed metrics to CSV
        if output_path:
            os.makedirs(output_path, exist_ok=True)
            csv_path = os.path.join(output_path, 'eval_metrics.csv')
            file_exists = os.path.isfile(csv_path)
            with open(csv_path, mode='a', newline='') as f:
                if not file_exists:
                    f.write('epoch,steps,spearman,pearson,roc_auc,pr_auc\n')
                f.write(f"{epoch},{steps},{eval_spearman:.4f},{eval_pearson:.4f},{roc_auc:.4f},{pr_auc:.4f}\n")

        return eval_spearman

# Initialize Evaluator
# Built on the held-out evaluation pairs; passed to model.fit in the training cell.
log_step("[Cell 15] Initializing evaluator...")
evaluator = ComprehensiveEvaluator(eval_data, batch_size=CONFIG['batch_size'], name='dev')
log_step(f"Evaluator ready on {len(eval_data)} pairs")



## 9. Training


In [None]:


# Define Output Path
# A minute-resolution timestamp is appended so each run saves to its own folder.
stamp = datetime.now().strftime('%Y%m%d_%H%M')
save_path = f"{CONFIG['output_dir']}_{stamp}"

start_msg = f"üöÄ Starting training... Saving to {save_path}"
log_step(start_msg)
log_step(f"Training config: epochs={CONFIG['epochs']}, batch_size={CONFIG['batch_size']}, eval_steps={CONFIG['eval_steps']}, warmup_ratio={CONFIG['warmup_ratio']}")
logger.info(f"Train batches per epoch: {len(train_dataloader)}")

# Warmup length is warmup_ratio of the total number of training steps
# (batches per epoch * epochs). The evaluator runs every CONFIG['eval_steps']
# steps and its return value (Spearman) is the score used with save_best_model.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    epochs=CONFIG['epochs'],
    warmup_steps=int(len(train_dataloader) * CONFIG['epochs'] * CONFIG['warmup_ratio']),
    optimizer_params={'lr': CONFIG['learning_rate'], 'weight_decay': CONFIG['weight_decay']},
    output_path=save_path,
    evaluation_steps=CONFIG['eval_steps'],
    save_best_model=True,
    show_progress_bar=True
)

logger.info('‚úÖ Training complete.')
print('‚úÖ Training complete.')



## 10. Final Evaluation & Visualization


In [None]:


# Reload Best Model
# save_path is the directory the training cell passed to model.fit as output_path.
log_step(f"[Cell 19] Reloading best model from {save_path} for final evaluation...")
best_model = SentenceTransformer(save_path)

# Encode Eval Data
log_step(f"Encoding {len(eval_data)} evaluation pairs for final metrics...")
eval_texts1 = [ex.texts[0] for ex in eval_data]
eval_texts2 = [ex.texts[1] for ex in eval_data]
eval_labels = np.array([ex.label for ex in eval_data])

logger.info(f"Encoding {len(eval_texts1)} evaluation pairs...")
embeddings1 = best_model.encode(eval_texts1, batch_size=CONFIG['batch_size'], show_progress_bar=True)
embeddings2 = best_model.encode(eval_texts2, batch_size=CONFIG['batch_size'], show_progress_bar=True)

# Proper cosine similarity (normalized)
# Row-wise dot product divided by the product of the two row norms.
cosine_scores = np.sum(embeddings1 * embeddings2, axis=1) / (np.linalg.norm(embeddings1, axis=1) * np.linalg.norm(embeddings2, axis=1))

# Final metrics (match v5-style reporting)
final_spearman, _ = spearmanr(eval_labels, cosine_scores)
final_pearson, _ = pearsonr(eval_labels, cosine_scores)
final_roc_auc = roc_auc_score(eval_labels, cosine_scores)
final_pr_auc = average_precision_score(eval_labels, cosine_scores)

log_step("üìä Final Evaluation Metrics:")
log_step(f"   Spearman Correlation: {final_spearman:.4f}")
log_step(f"   Pearson Correlation: {final_pearson:.4f}")
log_step(f"   ROC AUC: {final_roc_auc:.4f}")
log_step(f"   PR AUC: {final_pr_auc:.4f}")
logger.info(f"Final metrics - Spearman: {final_spearman:.4f}, Pearson: {final_pearson:.4f}, ROC AUC: {final_roc_auc:.4f}, PR AUC: {final_pr_auc:.4f}")

# --- Plots ---
# Three panels side by side: score distribution, ROC curve, precision-recall curve.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# 1. Distribution
# Labels are the 1.0 / 0.0 floats assigned by pairs_to_examples.
sns.histplot(cosine_scores[eval_labels==1], color='green', label='Positive', kde=True, ax=axes[0])
sns.histplot(cosine_scores[eval_labels==0], color='red', label='Negative', kde=True, ax=axes[0])
axes[0].set_title('Cosine Similarity Distribution')
axes[0].legend()

# 2. ROC Curve
# roc_auc repeats the computation of final_roc_auc above on the same inputs.
fpr, tpr, _ = roc_curve(eval_labels, cosine_scores)
roc_auc = roc_auc_score(eval_labels, cosine_scores)
axes[1].plot(fpr, tpr, label=f'AUC = {roc_auc:.3f}')
axes[1].plot([0, 1], [0, 1], 'k--')
axes[1].set_title('ROC Curve')
axes[1].legend()

# 3. Precision-Recall
# pr_auc repeats the computation of final_pr_auc above on the same inputs.
precision, recall, _ = precision_recall_curve(eval_labels, cosine_scores)
pr_auc = average_precision_score(eval_labels, cosine_scores)
axes[2].plot(recall, precision, label=f'PR AUC = {pr_auc:.3f}')
axes[2].set_title('Precision-Recall Curve')
axes[2].legend()

plt.tight_layout()
plt.show()



## 11. Relationship Classifier (Optional)
Trains a secondary classifier to predict the relationship type of a ticket pair (`duplicate`, `causal`, `related`, or `none`).


In [None]:

# Optional secondary classifier: predicts the relationship type of a ticket
# pair from the fine-tuned embeddings. Runs only when imbalanced-learn is
# installed and the relationship data file exists.
if IMBLEARN_AVAILABLE and os.path.exists(CONFIG['relationship_data']):
    log_step('üß† Training Relationship Classifier...')
    with open(CONFIG['relationship_data'], 'r', encoding='utf-8') as f:
        rel_data = json.load(f)

    rel_df = pd.DataFrame(rel_data)
    # Filter valid labels
    valid_labels = ['duplicate', 'causal', 'related', 'none']
    rel_df = rel_df[rel_df['label'].isin(valid_labels)]
    log_step(f"Relationship samples after filtering: {len(rel_df)}")

    if rel_df['label'].nunique() < 2:
        # A classifier needs at least two classes; this also covers an empty file.
        log_step('Skipping relationship classifier: fewer than two relationship labels present after filtering.')
    else:
        # Encode features using fine-tuned model
        text_a = rel_df['text_a'].tolist()
        text_b = rel_df['text_b'].tolist()

        emb_a = best_model.encode(text_a)
        emb_b = best_model.encode(text_b)

        # Feature Engineering: (u, v, |u-v|, u*v)
        X = np.hstack([emb_a, emb_b, np.abs(emb_a - emb_b), emb_a * emb_b])
        y = rel_df['label'].to_numpy()

        # Split BEFORE oversampling. SMOTE interpolates between neighbouring
        # samples, so resampling first would put synthetic near-copies of test
        # rows into the training set and inflate the reported scores.
        # Stratification needs at least two samples per class.
        label_counts = pd.Series(y).value_counts()
        stratify_labels = y if int(label_counts.min()) >= 2 else None
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=stratify_labels
        )

        # SMOTE Balancing (training split only)
        train_counts = pd.Series(y_train).value_counts()
        smallest_class = int(train_counts.min())
        if len(train_counts) > 1 and smallest_class > 1:
            # k_neighbors must be smaller than the smallest class size.
            smote = SMOTE(k_neighbors=min(2, smallest_class - 1), random_state=42)
            X_train, y_train = smote.fit_resample(X_train, y_train)
            log_step(f"After SMOTE: {len(X_train)} samples")
        else:
            log_step('Skipping SMOTE: the training split has a single class or a class with only one sample.')

        # Train Classifier
        # The deprecated `multi_class` argument is dropped; the default solver
        # already fits a multinomial model for multi-class targets.
        clf = LogisticRegression(max_iter=1000)
        clf.fit(X_train, y_train)

        # Evaluation on untouched (non-synthetic) held-out samples
        y_pred = clf.predict(X_test)
        print(classification_report(y_test, y_pred))

        # Save
        joblib.dump(clf, os.path.join(save_path, 'relationship_classifier.joblib'))
        log_step('‚úÖ Relationship classifier saved.')
else:
    log_step('‚ö†Ô∏è Skipping relationship classifier (missing data or imbalanced-learn).')

