# Fine-tune SentenceTransformer Models for ITSM Tickets (v5)

## Overview
This notebook represents the `v5` iteration of the ITSM similarity model pipeline. It transitions from a functional script to a robust, configurable machine learning pipeline.

### Key Enhancements
1. **Comprehensive Configuration**: Centralized `CONFIG` for all hyperparameters.
2. **Smart Data Generation**: TF-IDF based filtering for high-quality positives and dynamic hard negative mining.
3. **Advanced Evaluation**: Real-time tracking of Spearman and Pearson correlation, ROC AUC, and PR AUC (average precision).
4. **Reproducibility**: Full seeding of Random, NumPy, and PyTorch.
5. **Relationship Classifier**: Integrated training of the secondary classifier.


In [None]:
import os
import sys
import json
import math
import random
import logging
import warnings
import pickle
import collections
from typing import List, Dict, Tuple, Union
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score, f1_score,
    roc_auc_score, average_precision_score, precision_recall_curve, roc_curve
)
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression
from scipy.stats import pearsonr, spearmanr
import joblib

from sentence_transformers import SentenceTransformer, InputExample, losses, models
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SentenceEvaluator
from sentence_transformers.util import cos_sim

# --- Setup & Reproducibility ---

# NLTK Downloads
# Each resource is probed on its own. Checking only 'punkt' would skip every
# other download whenever punkt happened to be installed already, and the
# first use of e.g. the stopword corpus would then fail with LookupError.
_NLTK_RESOURCES = {
    'tokenizers/punkt': 'punkt',
    'corpora/stopwords': 'stopwords',
    'corpora/wordnet': 'wordnet',
    'taggers/averaged_perceptron_tagger': 'averaged_perceptron_tagger',
    'corpora/omw-1.4': 'omw-1.4',
}
for _resource_path, _package_name in _NLTK_RESOURCES.items():
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package_name)

# Logging: INFO-and-above records go to both a log file and the console.
_log_handlers = [
    logging.FileHandler("training_v5.log"),
    logging.StreamHandler(),
]
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=_log_handlers,
)
logger = logging.getLogger(__name__)

# Silence UserWarning noise and turn off tokenizer-level parallelism.
warnings.filterwarnings("ignore", category=UserWarning)
os.environ["TOKENIZERS_PARALLELISM"] = "false"

def set_seeds(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    When CUDA is available, all GPU generators are seeded as well and cuDNN
    is switched to deterministic mode with autotuning (benchmark) disabled.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
    print(f"✅ Seeds set to {seed}")


set_seeds(42)

# Check Imbalanced-Learn for Relationship Classifier
# SMOTE is optional. When the package is missing the name is still bound
# (to None) so later references fail on the flag check, not with NameError.
try:
    from imblearn.over_sampling import SMOTE
except ImportError:
    SMOTE = None
    IMBLEARN_AVAILABLE = False
    print('⚠️ imbalanced-learn not installed. Relationship classifier will skip SMOTE.')
else:
    IMBLEARN_AVAILABLE = True


## 2. Configuration


In [None]:
# Central settings dictionary read by the later cells of this notebook.
# Keys marked "not referenced" are defined here but not read by any cell
# visible in this file.
CONFIG = {
    # Paths
    'base_model': 'sentence-transformers/all-mpnet-base-v2',  # passed to SentenceTransformer(...)
    'output_dir': 'models/all-mpnet-finetuned-v5',  # created on disk; also the prefix of the timestamped save path
    'source_data': 'data/dummy_data_promax.csv',  # incident CSV read by load_and_clean_data
    'relationship_data': 'data/relationship_pairs.json',  # optional input for the relationship classifier
    
    # Training Hyperparameters
    'epochs': 20,
    'batch_size': 64,  # used for the training DataLoader and for evaluator encoding
    'learning_rate': 2e-5,  # optimizer 'lr'
    'max_learning_rate': 5e-5,  # not referenced
    'weight_decay': 0.01,
    'warmup_ratio': 0.1,  # share of (steps per epoch * epochs) used as warm-up steps
    'max_seq_length': 384,  # Increased for detailed ticket descriptions
    
    # Data Generation Strategy
    'target_pairs': 50000,  # Total training pairs to generate
    'positive_ratio': 0.4,  # 40% positives, 60% negatives
    'augmentation_ratio': 0.2,  # probability that a positive pair also gets an augmented copy
    'eval_split': 0.15,  # fraction of the shuffled pairs held out for evaluation
    
    # Quality Filtering Thresholds
    'quality_threshold': 0.3,      # Minimum TF-IDF similarity for 'good' positive pairs
    'hard_negative_min': 0.15,     # Min similarity for hard negatives
    'hard_negative_max': 0.45,     # Max similarity for hard negatives (confusing zone)
    
    # Early Stopping
    'early_stopping_patience': 7,  # not referenced
    'min_delta': 0.005,  # not referenced
    'eval_steps': 100,  # passed to model.fit as evaluation_steps
    
    # Loss Weights (Future use for Multi-Loss)
    # None of the three weights below is referenced; training uses a single loss.
    'mnr_loss_weight': 1.0,
    'triplet_loss_weight': 0.5,
    'cosine_loss_weight': 0.2}

# Create Output Directory
# NOTE(review): the training cell saves to "<output_dir>_<timestamp>", a
# sibling of this directory, so this one may stay empty — confirm intent.
output_path = os.path.join(os.getcwd(), CONFIG['output_dir'])
os.makedirs(output_path, exist_ok=True)
print(f"📂 Output directory set to: {output_path}")


## 3. Data Loading and Preprocessing


In [None]:
def load_and_clean_data(filepath: str, min_length: int = 10) -> pd.DataFrame:
    """Load the incident CSV, normalise its text columns and drop short rows.

    A combined ``text`` column is built from "Short Description" and
    "Description" with runs of whitespace collapsed to single spaces, and a
    ``category_id`` column numbers each (Category, Subcategory) combination.
    The original row labels are kept (the index is not reset).

    Args:
        filepath: Path of the CSV file to read.
        min_length: Minimum number of characters ``text`` must have for the
            row to be kept.

    Returns:
        The cleaned DataFrame with the extra ``text`` and ``category_id``
        columns.

    Raises:
        FileNotFoundError: If ``filepath`` does not exist.
        ValueError: If any required column is missing from the CSV.
    """
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"Data file not found: {filepath}")

    df = pd.read_csv(filepath)
    required_cols = ["Number", "Short Description", "Description", "Category", "Subcategory"]
    missing = [c for c in required_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Missing columns: {missing}")

    # Missing free text becomes "", missing category labels become "Unknown".
    for col in ["Short Description", "Description"]:
        df[col] = df[col].fillna("").astype(str).str.strip()

    for col in ["Category", "Subcategory"]:
        df[col] = df[col].fillna("Unknown").astype(str).str.strip()

    # Combined text, with internal whitespace (including newlines) collapsed.
    df["text"] = (df["Short Description"] + " " + df["Description"]).str.strip()
    df["text"] = df["text"].str.replace(r'\s+', ' ', regex=True)

    # Drop rows whose combined text is empty or too short to be informative.
    initial_count = len(df)
    df = df[df["text"].str.len() >= min_length].copy()
    dropped = initial_count - len(df)

    # One integer id per (Category, Subcategory) combination.
    df['category_id'] = df.groupby(['Category', 'Subcategory']).ngroup()

    print(f"✅ Loaded {len(df)} incidents (dropped {dropped} short/empty)")
    print(f"   Unique Categories: {df['Category'].nunique()}")
    print(f"   Unique Subcategories: {df['Subcategory'].nunique()}")

    return df

# Load Data
# Reads the CSV named by CONFIG['source_data'] with the default min_length.
df_incidents = load_and_clean_data(CONFIG['source_data'])
# Bare trailing expression: presumably shown as the notebook cell output.
df_incidents.head(3)


## 4. Text Similarity Utilities
We use TF-IDF similarity to find "quality" pairs (related but not identical) and "hard negatives" (different category but lexically similar).


In [None]:
class TextSimilarityCalculator:
    """Fit a TF-IDF model on a corpus and score pairs of its documents.

    Documents are addressed by their 0-based position in the list passed to
    ``fit_tfidf``.
    """

    def __init__(self):
        # NOTE(review): lemmatizer and stop_words are created here but never
        # read by the methods of this class; kept for external callers.
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
        self.vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
        self.tfidf_matrix = None  # sparse document-term matrix, set by fit_tfidf

    def fit_tfidf(self, texts):
        """Fit the vectorizer on ``texts`` and store the resulting matrix."""
        print("Fitting TF-IDF vectorizer...")
        self.tfidf_matrix = self.vectorizer.fit_transform(texts)
        print("TF-IDF fit complete.")

    def get_tfidf_similarity(self, idx1, idx2):
        """Return the dot product of the TF-IDF rows at ``idx1`` and ``idx2``.

        The dot product equals cosine similarity as long as the rows are
        L2-normalised, which is the TfidfVectorizer default (norm='l2').

        Args:
            idx1: Row position of the first document.
            idx2: Row position of the second document.

        Returns:
            The similarity as a float.

        Raises:
            ValueError: If ``fit_tfidf`` has not been called yet.
        """
        if self.tfidf_matrix is None:
            raise ValueError("Run fit_tfidf first")
        row_a = self.tfidf_matrix[idx1]
        row_b = self.tfidf_matrix[idx2]
        # Element-wise multiply + sum is a dot product for both SciPy sparse
        # matrices and sparse arrays; `*` only means that for matrices. It
        # also avoids densifying a 1x1 result.
        return float(row_a.multiply(row_b).sum())

# Module-level calculator used by the pair-generation cell. It is fitted on
# the `text` column in DataFrame row order, so TF-IDF row k corresponds to
# the k-th row (by position) of df_incidents.
sim_calculator = TextSimilarityCalculator()
sim_calculator.fit_tfidf(df_incidents['text'].tolist())


## 5. Smart Pair Generation
Generating positive pairs (same category/subcategory group + TF-IDF lexical overlap) and hard negatives (different category + lexical overlap).


In [None]:
def generate_smart_pairs(df, target_count, pos_ratio=0.4):
    """Sample positive and negative ticket pairs guided by TF-IDF similarity.

    Positives are two rows from the same ``category_id`` group whose TF-IDF
    similarity exceeds ``CONFIG['quality_threshold']``. Negatives are two rows
    with different ``Category`` values; "hard" ones, whose similarity lies
    strictly between ``CONFIG['hard_negative_min']`` and
    ``CONFIG['hard_negative_max']``, are preferred. Once more than
    ``10 * negative_target`` attempts have been made, any cross-category pair
    is accepted so the loop can finish.

    Uses the module-level ``sim_calculator`` and ``CONFIG``. The calculator is
    assumed to have been fitted on ``df['text']`` in row order.

    Args:
        df: Incident DataFrame with ``category_id`` and ``Category`` columns.
        target_count: Total number of pairs wanted.
        pos_ratio: Fraction of ``target_count`` that should be positives.

    Returns:
        ``(positives, negatives)``: two lists of ``(label1, label2, sim)``
        tuples, where the labels are ``df`` index labels. The positive list
        may be shorter than requested if too few pairs pass the threshold
        within ``5 * positive_target`` attempts.

    Raises:
        ValueError: If negatives are requested but ``df`` has fewer than two
            distinct ``Category`` values, in which case no negative pair can
            ever be formed.
    """
    positive_target = int(target_count * pos_ratio)
    negative_target = target_count - positive_target

    positives = []
    negatives = []

    all_indices = df.index.tolist()
    # Index label -> 0-based row position, which is how TF-IDF rows are addressed.
    position_of = {label: pos for pos, label in enumerate(all_indices)}
    category_of = dict(zip(all_indices, df['Category'].tolist()))
    # Only groups with at least two members can yield a positive pair; built
    # once here instead of re-listing the group keys on every attempt.
    pairable_groups = [
        members.index.tolist()
        for _, members in df.groupby('category_id')
        if len(members) >= 2
    ]

    if negative_target > 0 and len(set(category_of.values())) < 2:
        # Without this guard the negative loop below could never terminate.
        raise ValueError(
            "Cannot sample negative pairs: need at least two distinct 'Category' values"
        )

    print(f"Generating {positive_target} positive and {negative_target} negative pairs...")

    # --- 1. Positive Pairs ---
    # Same group, with enough lexical overlap that the pair is not too vague.
    # Exact duplicates (similarity 1.0) are not filtered out.
    pbar = tqdm(total=positive_target, desc="Positives")
    max_positive_attempts = positive_target * 5
    attempts = 0
    while pairable_groups and len(positives) < positive_target and attempts < max_positive_attempts:
        attempts += 1
        i1, i2 = random.sample(random.choice(pairable_groups), 2)
        sim = sim_calculator.get_tfidf_similarity(position_of[i1], position_of[i2])
        if sim > CONFIG['quality_threshold']:
            positives.append((i1, i2, sim))
            pbar.update(1)
    pbar.close()

    if len(positives) < positive_target:
        print(f"Only {len(positives)} of {positive_target} positive pairs passed the quality threshold")

    # --- 2. Hard Negative Pairs ---
    # Different categories but enough TF-IDF overlap to be confusing.
    pbar = tqdm(total=negative_target, desc="Negatives")
    fallback_after = negative_target * 10
    attempts = 0
    while len(negatives) < negative_target:
        attempts += 1

        i1, i2 = random.sample(all_indices, 2)
        if category_of[i1] == category_of[i2]:
            continue

        sim = sim_calculator.get_tfidf_similarity(position_of[i1], position_of[i2])
        is_hard = CONFIG['hard_negative_min'] < sim < CONFIG['hard_negative_max']

        # The attempt counter is never reset: once the budget for hard
        # negatives is spent, every remaining slot is filled with the next
        # cross-category pair instead of paying the full budget again for
        # each single fallback pair.
        if is_hard or attempts > fallback_after:
            negatives.append((i1, i2, sim))
            pbar.update(1)
    pbar.close()

    return positives, negatives

pos_pairs_idxs, neg_pairs_idxs = generate_smart_pairs(df_incidents, CONFIG['target_pairs'], CONFIG['positive_ratio'])


## 6. Data Augmentation


In [None]:
def simple_augment(text):
    """Return ``text`` with one random word-level perturbation.

    Texts with fewer than five whitespace-separated words are returned
    unchanged. Otherwise either two adjacent words are swapped or a single
    word is removed, and the words are re-joined with single spaces.
    """
    tokens = text.split()
    if len(tokens) < 5:
        return text

    swap_neighbours = random.random() > 0.5
    if swap_neighbours:
        # Exchange the word at `left` with its right-hand neighbour.
        left = random.randint(0, len(tokens) - 2)
        tokens[left:left + 2] = [tokens[left + 1], tokens[left]]
    else:
        # Drop one word chosen uniformly at random.
        victim = random.randint(0, len(tokens) - 1)
        del tokens[victim]
    return " ".join(tokens)

# Base pair examples: label 1.0 = same-group positive, 0.0 = cross-category negative.
train_examples = []

# --- Positives to InputExamples ---
for i1, i2, _score in pos_pairs_idxs:
    t1 = df_incidents.at[i1, 'text']
    t2 = df_incidents.at[i2, 'text']
    train_examples.append(InputExample(texts=[t1, t2], label=1.0))

# --- Negatives to InputExamples ---
for i1, i2, _score in neg_pairs_idxs:
    t1 = df_incidents.at[i1, 'text']
    t2 = df_incidents.at[i2, 'text']
    train_examples.append(InputExample(texts=[t1, t2], label=0.0))

# Shuffle
random.shuffle(train_examples)

# Split Train/Eval BEFORE augmenting. Augmenting first let a lightly
# perturbed copy of a training pair land in the eval set (or the reverse),
# which leaks training data into evaluation and inflates the metrics.
train_size = int(len(train_examples) * (1 - CONFIG['eval_split']))
train_data = train_examples[:train_size]
eval_data = train_examples[train_size:]

# Augmentation: a random subset of the *training* positives gets a second
# copy whose first text is perturbed.
augmented_examples = [
    InputExample(texts=[simple_augment(ex.texts[0]), ex.texts[1]], label=1.0)
    for ex in train_data
    if ex.label == 1.0 and random.random() < CONFIG['augmentation_ratio']
]
train_data.extend(augmented_examples)
random.shuffle(train_data)

print(f"Training Samples: {len(train_data)}")
print(f"Evaluation Samples: {len(eval_data)}")


## 7. Model Setup


In [None]:
model = SentenceTransformer(CONFIG['base_model'])
model.max_seq_length = CONFIG['max_seq_length']

# --- Loss Function ---
# MultipleNegativesRankingLoss is standard for semantic search. It treats
# every input pair as (anchor, positive), uses the other samples in the
# batch as negatives, and IGNORES the example labels. Feeding it the
# label=0.0 pairs would therefore train the model to pull hard negatives
# together, so only the positive pairs are passed to it.
# NOTE(review): the mined hard negatives are consequently used for
# evaluation only. To train on them as well, switch to a label-aware loss
# or restructure the data into (anchor, positive, negative) triplets.
mnrl_train_data = [ex for ex in train_data if ex.label == 1.0]
if not mnrl_train_data:
    raise ValueError("No positive pairs available for MultipleNegativesRankingLoss training")
train_loss = losses.MultipleNegativesRankingLoss(model)

# --- DataLoader ---
train_dataloader = DataLoader(mnrl_train_data, shuffle=True, batch_size=CONFIG['batch_size'])

print(f"Model loaded: {CONFIG['base_model']}")
print(f"Max Seq Length: {model.max_seq_length}")
print(f"Positive pairs used for training: {len(mnrl_train_data)} of {len(train_data)} examples")


## 8. Evaluation Setup & Custom Callbacks


In [None]:
class ComprehensiveEvaluator(SentenceEvaluator):
    """Evaluator that scores labelled text pairs by embedding cosine similarity.

    On each call it computes Pearson and Spearman correlation between the
    labels and the cosine scores, plus ROC AUC and PR AUC (average precision).
    When an output path is given, one row per call is appended to
    ``eval_metrics.csv`` in that directory. Spearman is the returned score.
    """

    def __init__(self, examples, batch_size=32, name='', show_progress_bar=False):
        """Store the evaluation pairs.

        Args:
            examples: Objects exposing ``texts`` (two strings) and ``label``.
            batch_size: Batch size used when encoding.
            name: Stored on the instance; not used by this class.
            show_progress_bar: Forwarded to ``model.encode``.
        """
        # Lets the base class set up whatever state it defines.
        super().__init__()
        self.examples = examples
        self.batch_size = batch_size
        self.name = name
        self.show_progress_bar = show_progress_bar

        self.texts1 = [ex.texts[0] for ex in examples]
        self.texts2 = [ex.texts[1] for ex in examples]
        self.labels = [ex.label for ex in examples]

    def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
        """Evaluate ``model`` and return the Spearman correlation.

        Args:
            model: Model exposing ``eval()`` and ``encode(...)``.
            output_path: Directory for ``eval_metrics.csv``; nothing is
                written when falsy.
            epoch: Epoch number recorded in the log line and CSV row.
            steps: Step number recorded in the CSV row.

        Returns:
            Spearman correlation between labels and cosine scores.
        """
        model.eval()

        # Encode
        emb1 = model.encode(self.texts1, batch_size=self.batch_size, show_progress_bar=self.show_progress_bar, convert_to_numpy=True)
        emb2 = model.encode(self.texts2, batch_size=self.batch_size, show_progress_bar=self.show_progress_bar, convert_to_numpy=True)

        # Cosine Similarity (row-wise). The floor on the denominator only
        # matters for an all-zero embedding, where it avoids division by zero.
        norms = np.linalg.norm(emb1, axis=1) * np.linalg.norm(emb2, axis=1)
        cosine_scores = np.sum(emb1 * emb2, axis=1) / np.maximum(norms, 1e-12)

        # Correlation metrics
        eval_pearson, _ = pearsonr(self.labels, cosine_scores)
        eval_spearman, _ = spearmanr(self.labels, cosine_scores)

        # Ranking metrics. Both raise ValueError when they cannot be computed
        # (e.g. only one class present); 0.0 is recorded in that case.
        try:
            roc_auc = roc_auc_score(self.labels, cosine_scores)
            pr_auc = average_precision_score(self.labels, cosine_scores)
        except ValueError:
            roc_auc = 0.0
            pr_auc = 0.0

        logger.info(f'Epoch {epoch}: Spearman={eval_spearman:.4f}, ROC_AUC={roc_auc:.4f}')

        # Save detailed metrics to CSV (header written once, rows appended)
        if output_path:
            os.makedirs(output_path, exist_ok=True)
            csv_path = os.path.join(output_path, 'eval_metrics.csv')
            file_exists = os.path.isfile(csv_path)
            with open(csv_path, mode='a', newline='') as f:
                if not file_exists:
                    f.write('epoch,steps,spearman,pearson,roc_auc,pr_auc\n')
                f.write(f'{epoch},{steps},{eval_spearman:.4f},{eval_pearson:.4f},{roc_auc:.4f},{pr_auc:.4f}\n')

        return float(eval_spearman)

# Initialize Evaluator
evaluator = ComprehensiveEvaluator(eval_data, batch_size=CONFIG['batch_size'], name='dev')


## 9. Training


In [None]:
# Define Output Path: "<output_dir>_<YYYYmmdd_HHMM>", so runs do not overwrite each other
timestamp = datetime.now().strftime('%Y%m%d_%H%M')
save_path = f"{CONFIG['output_dir']}_{timestamp}"

# Warm up over the first `warmup_ratio` share of all optimizer steps.
total_training_steps = len(train_dataloader) * CONFIG['epochs']
warmup_steps = int(total_training_steps * CONFIG['warmup_ratio'])

# Train
print(f'🚀 Starting training... Saving to {save_path}')
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    epochs=CONFIG['epochs'],
    warmup_steps=warmup_steps,
    optimizer_params={'lr': CONFIG['learning_rate'], 'weight_decay': CONFIG['weight_decay']},
    output_path=save_path,
    evaluation_steps=CONFIG['eval_steps'],
    save_best_model=True,
    show_progress_bar=True
)

print('✅ Training complete.')


## 10. Final Evaluation & Visualization


In [None]:
# Reload Best Model
best_model = SentenceTransformer(save_path)

# Encode Eval Data
eval_texts1 = [ex.texts[0] for ex in eval_data]
eval_texts2 = [ex.texts[1] for ex in eval_data]
eval_labels = [ex.label for ex in eval_data]
eval_label_array = np.array(eval_labels)

embeddings1 = best_model.encode(eval_texts1)
embeddings2 = best_model.encode(eval_texts2)

# True cosine similarity, computed the same way as in ComprehensiveEvaluator.
# A bare dot product only equals cosine when the model emits unit-length
# embeddings, which is not guaranteed for every base model.
pair_norms = np.linalg.norm(embeddings1, axis=1) * np.linalg.norm(embeddings2, axis=1)
cosine_scores = np.sum(embeddings1 * embeddings2, axis=1) / np.maximum(pair_norms, 1e-12)

# --- Plots ---
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# 1. Distribution of scores for positive vs. negative pairs
sns.histplot(cosine_scores[eval_label_array == 1], color='green', label='Positive', kde=True, ax=axes[0])
sns.histplot(cosine_scores[eval_label_array == 0], color='red', label='Negative', kde=True, ax=axes[0])
axes[0].set_title('Cosine Similarity Distribution')
axes[0].legend()

# 2. ROC Curve (dashed diagonal = chance level)
fpr, tpr, _ = roc_curve(eval_labels, cosine_scores)
roc_auc = roc_auc_score(eval_labels, cosine_scores)
axes[1].plot(fpr, tpr, label=f'AUC = {roc_auc:.3f}')
axes[1].plot([0, 1], [0, 1], 'k--')
axes[1].set_title('ROC Curve')
axes[1].legend()

# 3. Precision-Recall
precision, recall, _ = precision_recall_curve(eval_labels, cosine_scores)
pr_auc = average_precision_score(eval_labels, cosine_scores)
axes[2].plot(recall, precision, label=f'PR AUC = {pr_auc:.3f}')
axes[2].set_title('Precision-Recall Curve')
axes[2].legend()

plt.tight_layout()
plt.show()


## 11. Relationship Classifier (Optional)
Trains a secondary classifier to predict relationship types (duplicate, causal, related).


In [None]:
# The classifier needs only the relationship data. SMOTE balancing is an
# optional extra applied when imbalanced-learn is installed.
if os.path.exists(CONFIG['relationship_data']):
    print('🧠 Training Relationship Classifier...')
    with open(CONFIG['relationship_data'], 'r', encoding='utf-8') as f:
        rel_data = json.load(f)

    # Expects records with 'text_a', 'text_b' and 'label' fields.
    rel_df = pd.DataFrame(rel_data)
    # Filter valid labels
    valid_labels = ['duplicate', 'causal', 'related', 'none']
    rel_df = rel_df[rel_df['label'].isin(valid_labels)]

    if rel_df.empty:
        print('⚠️ Skipping relationship classifier (no rows with a valid label).')
    else:
        # Encode features using fine-tuned model
        text_a = rel_df['text_a'].tolist()
        text_b = rel_df['text_b'].tolist()

        emb_a = best_model.encode(text_a)
        emb_b = best_model.encode(text_b)

        # Feature Engineering: (u, v, |u-v|, u*v)
        X = np.hstack([emb_a, emb_b, np.abs(emb_a - emb_b), emb_a * emb_b])
        y = rel_df['label'].to_numpy()

        # Split BEFORE resampling. Running SMOTE first puts synthetic points,
        # interpolated from training rows, into the test set and inflates the
        # reported scores. Stratify only when every class has >= 2 rows.
        class_counts = pd.Series(y).value_counts()
        stratify_labels = y if class_counts.min() >= 2 else None
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=stratify_labels
        )

        # SMOTE Balancing (training split only)
        if IMBLEARN_AVAILABLE:
            # SMOTE needs more samples in the smallest class than k_neighbors.
            smallest_class = int(pd.Series(y_train).value_counts().min())
            k_neighbors = min(2, smallest_class - 1)
            if k_neighbors >= 1:
                smote = SMOTE(k_neighbors=k_neighbors, random_state=42)
                X_train, y_train = smote.fit_resample(X_train, y_train)
            else:
                print('⚠️ Smallest class has a single training sample; skipping SMOTE.')

        # Train Classifier. `multi_class` is deprecated in recent scikit-learn;
        # the default already fits a multinomial model for 3+ classes.
        clf = LogisticRegression(max_iter=1000)
        clf.fit(X_train, y_train)

        # Evaluation on untouched (never resampled) test rows
        y_pred = clf.predict(X_test)
        print(classification_report(y_test, y_pred))

        # Save
        os.makedirs(save_path, exist_ok=True)
        joblib.dump(clf, os.path.join(save_path, 'relationship_classifier.joblib'))
        print('✅ Relationship classifier saved.')
else:
    print('⚠️ Skipping relationship classifier (relationship data file not found).')
