# ITSM Ticket Similarity - Model Fine-tuning (v6 Refactored)

**Version 6 Refactored** improves upon the original v6 by:
1. **Robust Environment Setup:** Automatically handles NLTK data and library dependencies (Kaggle/Local).
2. **Improved Pipeline:** Cleaner data loading and preprocessing.
3. **Contextual Embeddings:** Retains the structured input format `[Service] [Category] Description`.
4. **Reliable Logging:** Auto-detects the best location for logs.


In [1]:
import os, sys, subprocess, pkg_resources
from pathlib import Path

def ensure_packages(pkgs):
    missing = []
    for name, spec in pkgs.items():
        try:
            pkg_resources.get_distribution(name)
        except pkg_resources.DistributionNotFound:
            missing.append(spec)
    if missing:
        print("üì¶ Installing:", ", ".join(missing))
        subprocess.check_call([sys.executable, "-m", "pip", "install", "--quiet", *missing])

def ensure_nltk(resources=("wordnet","omw-1.4","stopwords","punkt")):
    import nltk
    nltk_data = Path.home() / "nltk_data"
    nltk_data.mkdir(exist_ok=True)
    if str(nltk_data) not in nltk.data.path:
        nltk.data.path.append(str(nltk_data))
    for res in resources:
        try:
            nltk.data.find(f"corpora/{res}")
        except LookupError:
            try:
                nltk.data.find(f"tokenizers/{res}")
            except LookupError:
                print(f"‚¨áÔ∏è Downloading NLTK: {res}")
                nltk.download(res, quiet=True, download_dir=str(nltk_data))

def run_setup():
    os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

    pkgs = {
        "sentence-transformers": "sentence-transformers",
        "transformers": "transformers",
        "torch": "torch",               # use Kaggle‚Äôs unless missing
        "torchvision": "torchvision",
        "torchaudio": "torchaudio",
        "scikit-learn": "scikit-learn",
        "scipy": "scipy",
        "numpy": "numpy",
        "pandas": "pandas",
        "tqdm": "tqdm",
        "imbalanced-learn": "imbalanced-learn",
        "datasets": "datasets",
        "joblib": "joblib",
        "protobuf": "protobuf<=3.20.1",
        "requests": "requests",
        "python-dotenv": "python-dotenv",
        "openai": "openai",
        "seaborn": "seaborn",
        "matplotlib": "matplotlib",
        "pytorch-lightning": "pytorch-lightning",
    }
    ensure_packages(pkgs)
    ensure_nltk()

run_setup()

  import os, sys, subprocess, pkg_resources


‚¨áÔ∏è Downloading NLTK: wordnet
‚¨áÔ∏è Downloading NLTK: omw-1.4


In [2]:
# [SETUP] Run this cell FIRST to ensure environment stability.
import os
import sys
import subprocess
import pkg_resources
from pathlib import Path

def run_setup():
    print("‚öôÔ∏è Checking environment...")
    
    # 1. Fix Protobuf/TensorFlow conflict (common in Kaggle)
    os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
    
    # 2. Install critical missing packages
    required_packages = {
        'sentence-transformers': 'sentence-transformers', 
        'imblearn': 'imbalanced-learn',
        'protobuf': 'protobuf<=3.20.1', # Specific version for compatibility
        'scipy': 'scipy' # Added as it's a core dependency for many things
    }
    
    # Helper to check if a package is installed
    def is_installed(package_name):
        try:
            pkg_resources.get_distribution(package_name)
            return True
        except pkg_resources.DistributionNotFound:
            return False

    to_install = []
    for key, install_name in required_packages.items():
        if not is_installed(key): # Check if the base package name is installed
            to_install.append(install_name)
            
    if to_install:
        print(f"üì¶ Installing missing packages: {', '.join(to_install)}...")
        try:
            subprocess.check_call([sys.executable, "-m", "pip", "install"] + to_install + ["--quiet"])
            # Re-initialize pkg_resources to reflect new installs
            import importlib
            importlib.reload(pkg_resources)
        except subprocess.CalledProcessError as e:
            print(f"‚ùå Failed to install packages: {e}")
            print("Please try installing manually or check network connection.")
            sys.exit(1)
    else:
        print("‚úÖ All required Python packages are installed.")
        
    # 3. Download NLTK Data
    print("üìö Checking NLTK data...")
    import nltk
    # Ensure NLTK data directory exists and is discoverable
    nltk_data_path = Path.home() / 'nltk_data'
    if not nltk_data_path.exists():
        nltk_data_path.mkdir(parents=True, exist_ok=True)
    if str(nltk_data_path) not in nltk.data.path:
        nltk.data.path.append(str(nltk_data_path))
        
    resources = ['wordnet', 'omw-1.4', 'stopwords', 'punkt'] # omw-1.4 for wordnet, punkt for tokenizers
    downloaded_all_nltk = True
    for res in resources:
        try:
            nltk.data.find(f"corpora/{res}") # Check for corporas
        except LookupError:
            try:
                nltk.data.find(f"tokenizers/{res}") # Check for tokenizers
            except LookupError:
                print(f"‚¨áÔ∏è Downloading NLTK resource: {res}...")
                try:
                    nltk.download(res, quiet=True, download_dir=str(nltk_data_path)) # Download to user's home dir
                except Exception as e:
                    print(f"‚ùå Failed to download NLTK resource '{res}': {e}")
                    downloaded_all_nltk = False
                 
    if downloaded_all_nltk:
        print("‚úÖ All NLTK Data Check Complete.")
    else:
        print("‚ö†Ô∏è Some NLTK data could not be downloaded. This might affect text processing.")

run_setup()


‚öôÔ∏è Checking environment...
üì¶ Installing missing packages: imbalanced-learn...
üìö Checking NLTK data...
‚¨áÔ∏è Downloading NLTK resource: wordnet...
‚¨áÔ∏è Downloading NLTK resource: omw-1.4...
‚úÖ All NLTK Data Check Complete.


In [3]:
# Core Python
import os
import random
import logging
import re
import json
from pathlib import Path
from datetime import datetime
import warnings

# Data Handling
import numpy as np
import pandas as pd

# Progress Bar
from tqdm.auto import tqdm

# ML Frameworks
import torch
from torch.utils.data import DataLoader

# NLP & Metrics
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score
from scipy.stats import pearsonr, spearmanr
from imblearn.over_sampling import SMOTE # Ensure SMOTE is imported for classifier

# Sentence Transformers
from sentence_transformers import SentenceTransformer, InputExample, losses, models
from sentence_transformers.evaluation import SentenceEvaluator

# Filter warnings
warnings.filterwarnings("ignore", category=UserWarning)
os.environ["TOKENIZERS_PARALLELISM"] = "false" # Suppress tokenizers warning

# --- Logging Setup ---
def setup_logger():
    # Determine log path: Prefer local current dir, allow agent override
    log_filename = "training_v6_refactored.log"
    log_path = Path.cwd() / log_filename
    
    # If running in Agent env, use agent's temp dir
    if os.environ.get("GEMINI_TEMP_DIR"):
        log_path = Path(os.environ.get("GEMINI_TEMP_DIR")) / log_filename

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(log_path, mode='w'), # Overwrite for fresh run
            logging.StreamHandler()
        ],
        force=True
    )
    return logging.getLogger(__name__), log_path

logger, LOG_FILE = setup_logger()

def log(msg, level=logging.INFO):
    if level == logging.INFO:
        logger.info(msg)
    elif level == logging.WARNING:
        logger.warning(msg)
    else:
        logger.debug(msg)

log(f"üìù Logging to: {LOG_FILE}")


E0000 00:00:1764528396.894900     127 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764528396.901188     127 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

2025-11-30 18:46:41,951 - INFO - üìù Logging to: /kaggle/working/training_v6_refactored.log


In [4]:
# --- CONFIGURATION ---
CONFIG = {
    'model_name': 'sentence-transformers/all-mpnet-base-v2',
    'output_dir': 'models/v6_refactored_finetuned', # New output directory
    'source_data': 'data/dummy_data_promax.csv', # Will try to resolve this
    'relationship_data': 'data/relationship_pairs.json', # For relationship classifier
    
    # Hyperparameters
    'epochs': 15,
    'batch_size': 32, # Lower batch size for stability
    'lr': 2e-5,
    'max_seq_length': 384,
    
    # Data Strategy
    'num_pairs': 50000, # Number of pairs for training/validation
    'pos_ratio': 0.4,   # 40% Positive, 60% Negative
    'neg_mining_range': (0.2, 0.5), # TF-IDF score range for "Hard Negatives"
    'eval_split': 0.15, # % of data for validation
    
    # Seed
    'seed': 42
}

# Set Seeds
random.seed(CONFIG['seed'])
np.random.seed(CONFIG['seed'])
torch.manual_seed(CONFIG['seed'])
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(CONFIG['seed'])
    log(f"üöÄ CUDA Detected: {torch.cuda.get_device_name(0)}")
else:
    log("‚ö†Ô∏è CUDA Not Detected. Running on CPU.")


2025-11-30 18:46:42,028 - INFO - üöÄ CUDA Detected: Tesla P100-PCIE-16GB


In [5]:
def resolve_data_path(path_str):
    """Smart path resolver for Local/Kaggle/Colab/Agent envs."""
    # 1. As-is
    p = Path(path_str)
    if p.exists(): return p.resolve()
    
    # 2. Relative to current script location (for agent or local execution)
    # Using Path.cwd() as a robust base for notebooks
    script_dir = Path.cwd() 
    if (script_dir / path_str).exists(): return (script_dir / path_str).resolve()

    # 3. Common Kaggle/Colab input paths
    # Assuming path_str might be like 'data/file.csv'
    base_filename = Path(path_str).name
    
    kaggle_input_dir = Path("/kaggle/input")
    if kaggle_input_dir.exists():
        for dataset_dir in kaggle_input_dir.iterdir():
            if (dataset_dir / base_filename).exists():
                return (dataset_dir / base_filename).resolve()
            if (dataset_dir / path_str).exists(): # if path_str includes subdir like 'data/'
                return (dataset_dir / path_str).resolve()
                
    colab_dir = Path("/content")
    if (colab_dir / path_str).exists(): return (colab_dir / path_str).resolve()

    raise FileNotFoundError(f"Could not find {path_str} in any common locations (cwd, relative, Kaggle, Colab).")

def load_and_preprocess_data(config):
    source_path = resolve_data_path(config['source_data'])
    log(f"üìÇ Loading incident data from: {source_path}")
    df = pd.read_csv(source_path)
    
    # Required columns for contextual embedding
    required_cols = ["Number", "Short Description", "Description", "Category", "Subcategory", 
                     "Service", "Service offering", "Assignment group"]
    for col in required_cols:
        if col not in df.columns:
            raise ValueError(f"Missing required column: {col}")

    # Fill NA and Clean Text
    # We ensure all context fields are strings
    placeholders = {"", "nan", "none", "null", "unknown", "n/a", "na"}

    def normalize_field(val: str) -> str:
        s = str(val).strip()
        s = re.sub(r"\s+", " ", s) # Replace multiple spaces with single
        if s.lower() in placeholders:
            return ""
        return s

    for col in [c for c in required_cols if c != "Number"]:
        df[col] = df[col].fillna("").apply(normalize_field)

    # Normalize casing for structured context fields to reduce duplicates
    context_cols = ["Service", "Service offering", "Category", "Subcategory", "Assignment group"]
    for col in context_cols:
        df[col] = df[col].str.lower()

    # Construct Rich Text Representation (Contextual Prefixing)
    # Format: [Service | Service offering] [Category | Subcategory] Group: Assignment group. Short Description. Description
    def build_bracketed(parts):
        clean_parts = [p for p in parts if p]
        return f"[{ ' | '.join(clean_parts) }] " if clean_parts else "" 

    df['context_service'] = df.apply(lambda row: build_bracketed([row['Service'], row['Service offering']]), axis=1)
    df['context_category'] = df.apply(lambda row: build_bracketed([row['Category'], row['Subcategory']]), axis=1)
    df['context_group'] = df.apply(lambda row: f"Group: {row['Assignment group']}. " if row['Assignment group'] else "", axis=1)

    df['text'] = (
        df['context_service'] +
        df['context_category'] +
        df['context_group'] +
        df['Short Description'].str.strip() + ". " +
        df['Description'].str.strip()
    ).str.replace(r"\\s+\\.", ".", regex=True) # Remove space before period
    df['text'] = df['text'].str.replace(r"\\s+", " ", regex=True).str.strip() # Clean up excess spaces

    # Filter empty or too short
    initial_count = len(df)
    min_length = 10 # Configurable if needed
    df = df[df['text'].str.len() >= min_length].copy()
    dropped = initial_count - len(df)
    if dropped > 0:
        log(f"‚ö†Ô∏è Dropped {dropped} incidents due to short/empty text after preprocessing.")
    
    # Create unique group ID for stratified splitting (Category-Subcategory)
    df['category_id'] = df.groupby(['Category', 'Subcategory']).ngroup()
    
    log(f"‚úÖ Preprocessed {len(df)} incidents.")
    log(f"Sample preprocessed text: '{df['text'].iloc[0]}'")
    df = df.reset_index(drop=True)
    log(f'‚úÖ Index reset. Range: {df.index.min()} to {df.index.max()}')
    return df

df_incidents = load_and_preprocess_data(CONFIG)


2025-11-30 18:46:42,113 - INFO - üìÇ Loading incident data from: /kaggle/input/itsm-dataset/dummy_data_promax.csv
2025-11-30 18:46:42,791 - INFO - ‚úÖ Preprocessed 10000 incidents.
2025-11-30 18:46:42,792 - INFO - Sample preprocessed text: '[crm (d365, salesforce, genesis, pcube, hussmann services) | bc - basis] [configuration | program bug] Group: piscap l2 workflow. Request: Adjust Configuration/Program bug configuration in CRM (D365, SalesForce, Genesis, PCube, HussMann Services). I encountered an issue where Request: Adjust Configuration/Program bug configuration in CRM (D365, SalesForce, Genesis, PCube, HussMann Services). I'd like assistance to investigate and resolve it.'
2025-11-30 18:46:42,796 - INFO - ‚úÖ Index reset. Range: 0 to 9999


In [None]:
class TextSimilarityCalculator:
    def __init__(self, texts):
        self.lemmatizer = WordNetLemmatizer() if 'wordnet' in nltk.data.path else None
        self.stop_words = set(stopwords.words('english')) if 'stopwords' in nltk.data.path else ENGLISH_STOP_WORDS
        self.vectorizer = TfidfVectorizer(stop_words=list(self.stop_words), max_features=10000)
        
        log("‚è≥ Fitting TF-IDF for similarity mining...")
        self.tfidf = self.vectorizer.fit_transform(texts)
        log(f"‚úÖ TF-IDF fit complete. Matrix shape: {self.tfidf.shape}")

    def get_tfidf_similarity(self, idx1, idx2):
        if idx1 >= self.tfidf.shape[0] or idx2 >= self.tfidf.shape[0]:
            return 0.0 
        return (self.tfidf[idx1] @ self.tfidf[idx2].T).toarray()[0][0]

def generate_smart_pairs(df, target_count, config):
    """Generates positive and hard negative pairs based on TF-IDF similarity."""
    # Ensure index is reset for direct iloc/loc correspondence
    df = df.reset_index(drop=True)
    
    sim_calculator = TextSimilarityCalculator(df['text'].tolist())

    positive_target = int(target_count * config['pos_ratio'])
    negative_target = target_count - positive_target

    pairs = []
    
    # Group by Category/Subcategory
    # groups indices are now reliable 0..N integers because of reset_index
    groups = df.groupby('category_id').indices 
    valid_groups = {k: list(v) for k, v in groups.items() if len(v) >= 2}
    all_indices = list(df.index)

    log(f"üîé Generating {positive_target} positive and {negative_target} hard negative pairs...")

    # --- 1. Positive Pairs ---
    pbar_pos = tqdm(total=positive_target, desc="Generating Positives")
    attempts = 0
    while len(pairs) < positive_target and attempts < positive_target * 5:
        attempts += 1
        if not valid_groups: break
        
        gid = random.choice(list(valid_groups.keys()))
        g_idxs = valid_groups[gid] # already a list
        
        if len(g_idxs) < 2: continue
        
        i1, i2 = random.sample(g_idxs, 2)
        
        # i1, i2 are integer positions. Since we reset index, they are also labels.
        # Using iloc is safest for 'text' column access if we mix things up, 
        # but here loc==iloc. We use simple integer indexing for tfidf.
        
        sim = sim_calculator.get_tfidf_similarity(i1, i2)
        
        if sim > 0.3:
            pairs.append(InputExample(texts=[df.at[i1, 'text'], df.at[i2, 'text']], label=1.0))
            pbar_pos.update(1)
            
    # Fill remaining positives
    if len(pairs) < positive_target:
        log(f"‚ö†Ô∏è Filling {positive_target - len(pairs)} remaining positives with random in-group pairs.")
        while len(pairs) < positive_target:
            if not valid_groups: break
            gid = random.choice(list(valid_groups.keys()))
            g_idxs = valid_groups[gid]
            if len(g_idxs) < 2: continue
            i1, i2 = random.sample(g_idxs, 2)
            pairs.append(InputExample(texts=[df.at[i1, 'text'], df.at[i2, 'text']], label=1.0))
            pbar_pos.update(1)
            
    pbar_pos.close()

    # --- 2. Hard Negative Pairs ---
    current_pos_count = len(pairs)
    pbar_neg = tqdm(total=negative_target, desc="Generating Negatives")
    attempts = 0
    max_attempts = negative_target * 10 
    
    while (len(pairs) - current_pos_count) < negative_target and attempts < max_attempts:
        attempts += 1
        
        i1, i2 = random.sample(all_indices, 2)
        
        if df.at[i1, 'category_id'] == df.at[i2, 'category_id']:
            continue
            
        sim = sim_calculator.get_tfidf_similarity(i1, i2)
        
        min_sim, max_sim = config['neg_mining_range']
        if min_sim <= sim <= max_sim:
            pairs.append(InputExample(texts=[df.at[i1, 'text'], df.at[i2, 'text']], label=0.0))
            pbar_neg.update(1)
            
    # Fill remaining negatives
    neg_generated = len(pairs) - current_pos_count
    if neg_generated < negative_target:
         log(f"‚ö†Ô∏è Filling {negative_target - neg_generated} remaining negatives with random cross-category pairs.")
         while (len(pairs) - current_pos_count) < negative_target:
            i1, i2 = random.sample(all_indices, 2)
            if df.at[i1, 'category_id'] != df.at[i2, 'category_id']:
                pairs.append(InputExample(texts=[df.at[i1, 'text'], df.at[i2, 'text']], label=0.0))
                pbar_neg.update(1)
                
    pbar_neg.close()
    
    log(f"‚úÖ Generated {len(pairs)} training pairs.")
    return pairs

# Split incidents into train/eval sets for pair generation
# Note: we split BEFORE pair generation, so we must reset index on the splits individually
train_incidents_df, eval_incidents_df = train_test_split(
    df_incidents,
    test_size=CONFIG['eval_split'],
    stratify=df_incidents['category_id'],
    random_state=CONFIG['seed']
)

log(f"Split Incidents: Train={len(train_incidents_df)}, Eval={len(eval_incidents_df)}")

# Calculate target pairs for each split
train_num_pairs = int(CONFIG['num_pairs'] * (1 - CONFIG['eval_split']))
eval_num_pairs = CONFIG['num_pairs'] - train_num_pairs

train_examples = generate_smart_pairs(train_incidents_df, train_num_pairs, CONFIG)
eval_examples = generate_smart_pairs(eval_incidents_df, eval_num_pairs, CONFIG)

log(f"Final Samples: Training={len(train_examples)}, Evaluation={len(eval_examples)}")



2025-11-30 18:46:42,888 - INFO - Split Incidents: Train=8500, Eval=1500
2025-11-30 18:46:42,892 - INFO - ‚è≥ Fitting TF-IDF for similarity mining...
2025-11-30 18:46:43,209 - INFO - ‚úÖ TF-IDF fit complete. Matrix shape: (8500, 110)
2025-11-30 18:46:43,212 - INFO - üîé Generating 17000 positive and 25500 hard negative pairs...


Generating Positives:   0%|          | 0/17000 [00:00<?, ?it/s]

Generating Negatives:   0%|          | 0/25500 [00:00<?, ?it/s]

In [None]:
# --- Model Evaluation Class ---
class ITSMEvaluator(SentenceEvaluator):
    def __init__(self, examples: list[InputExample], batch_size: int = 16, name: str = ''):
        self.examples = examples
        self.batch_size = batch_size
        self.name = name

        self.texts1 = [ex.texts[0] for ex in examples]
        self.texts2 = [ex.texts[1] for ex in examples]
        self.labels = np.array([ex.label for ex in examples])

        self.csv_file = f"{name}_eval_results.csv"
        self.csv_headers = ["epoch", "steps", "spearman", "pearson", "roc_auc", "pr_auc"]

    def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
        model.eval()
        log(f"üìä Running evaluation at epoch={epoch}, step={steps}...", level=logging.DEBUG)

        # Encode all texts
        embeddings1 = model.encode(self.texts1, batch_size=self.batch_size, show_progress_bar=False, convert_to_numpy=True)
        embeddings2 = model.encode(self.texts2, batch_size=self.batch_size, show_progress_bar=False, convert_to_numpy=True)

        # Calculate cosine similarities
        cosine_scores = np.sum(embeddings1 * embeddings2, axis=1) / (np.linalg.norm(embeddings1, axis=1) * np.linalg.norm(embeddings2, axis=1))

        # Calculate metrics
        eval_pearson, _ = pearsonr(self.labels, cosine_scores)
        eval_spearman, _ = spearmanr(self.labels, cosine_scores)
        
        try:
            roc_auc = roc_auc_score(self.labels, cosine_scores)
            pr_auc = average_precision_score(self.labels, cosine_scores)
        except ValueError: # Happens if only one class is present in labels
            roc_auc = 0.0
            pr_auc = 0.0
            log("‚ö†Ô∏è ROC/PR AUC cannot be calculated due to single class in evaluation labels.", level=logging.WARNING)


        log_msg = (
                   f"Epoch {epoch if epoch != -1 else 'N/A'} Steps {steps if steps != -1 else 'N/A'}: "
                   f"Spearman={eval_spearman:.4f}, Pearson={eval_pearson:.4f}, "
                   f"ROC_AUC={roc_auc:.4f}, PR_AUC={pr_auc:.4f}")
        log(log_msg)

        if output_path is not None:
            csv_path = Path(output_path) / self.csv_file
            output_data = [epoch, steps, eval_spearman, eval_pearson, roc_auc, pr_auc]
            
            if not csv_path.is_file():
                with open(csv_path, 'w', encoding='utf-8') as f:
                    f.write(",".join(self.csv_headers) + "\n")
            
            with open(csv_path, 'a', encoding='utf-8') as f:
                f.write(",".join(map(str, output_data)) + "\n")

        return eval_spearman # Return spearman as the main score for model selection

# --- Model Setup ---
model = SentenceTransformer(CONFIG['model_name'])
model.max_seq_length = CONFIG['max_seq_length']

train_loss = losses.CosineSimilarityLoss(model)
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=CONFIG['batch_size'])

evaluator = ITSMEvaluator(eval_examples, batch_size=CONFIG['batch_size'], name='validation')

# --- Training Execution ---
timestamp = datetime.now().strftime("%Y%m%d_%H%M")
save_path = Path(CONFIG['output_dir']) / f"{Path(CONFIG['output_dir']).name}_{timestamp}"
save_path.mkdir(parents=True, exist_ok=True) # Ensure output directory exists

log(f"üöÄ Starting training... Model will be saved to: {save_path}")

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    epochs=CONFIG['epochs'],
    warmup_steps=int(len(train_dataloader) * CONFIG['epochs'] * 0.1), # 10% warmup
    optimizer_params={'lr': CONFIG['lr']},
    output_path=str(save_path), # SentenceTransformer expects string path
    evaluation_steps=int(len(train_dataloader) * 0.1), # Evaluate every 10% of an epoch
    save_best_model=True,
    show_progress_bar=True
)

log("‚úÖ Training complete.")

# --- Final Evaluation ---
log("‚ú® Reloading best model for final evaluation...")
best_model = SentenceTransformer(str(save_path)) # Reload the best saved model

final_evaluator = ITSMEvaluator(eval_examples, batch_size=CONFIG['batch_size'], name='final_evaluation')
final_spearman = final_evaluator(best_model, output_path=str(save_path), epoch='final', steps='final')

log(f"Final Model (best) saved to: {save_path}")


In [None]:
# --- Relationship Classifier (Optional) ---
# This part is optional and only runs if imbalanced-learn is available and data exists.

try:
    from imblearn.over_sampling import SMOTE
    IMBLEARN_AVAILABLE = True
except ImportError:
    IMBLEARN_AVAILABLE = False

if IMBLEARN_AVAILABLE:
    rel_data_path = None
    try:
        rel_data_path = resolve_data_path(CONFIG['relationship_data'])
    except FileNotFoundError:
        log(f"‚ö†Ô∏è Relationship data not found at {CONFIG['relationship_data']}. Skipping classifier training.", level=logging.WARNING)

    if rel_data_path and rel_data_path.exists():
        log('üß† Training Relationship Classifier...')
        with open(rel_data_path, 'r') as f:
            rel_data = json.load(f)

        rel_df = pd.DataFrame(rel_data)
        # Filter valid labels (adjust as per your dataset)
        valid_labels = ['duplicate', 'causal', 'related', 'none']
        rel_df = rel_df[rel_df['label'].isin(valid_labels)]
        log(f"Relationship samples after filtering: {len(rel_df)}")

        if len(rel_df) > 0:
            # Encode features using fine-tuned model
            text_a = rel_df['text_a'].tolist()
            text_b = rel_df['text_b'].tolist()

            log("‚è≥ Encoding relationship data with the best model...")
            emb_a = best_model.encode(text_a, batch_size=CONFIG['batch_size'], show_progress_bar=False)
            emb_b = best_model.encode(text_b, batch_size=CONFIG['batch_size'], show_progress_bar=False)

            # Feature Engineering: (u, v, |u-v|, u*v)
            X = np.hstack([emb_a, emb_b, np.abs(emb_a - emb_b), emb_a * emb_b])
            y = rel_df['label']

            from sklearn.linear_model import LogisticRegression
            from sklearn.model_selection import train_test_split
            from sklearn.metrics import classification_report
            
            # Only apply SMOTE if there are enough samples and multiple classes
            if len(np.unique(y)) > 1 and len(X) > 1 and len(np.unique(y)) < len(X): # Ensure SMOTE doesn't crash
                smote = SMOTE(k_neighbors=min(2, len(X) - 1), random_state=CONFIG['seed']) # k_neighbors must be <= n_samples-1
                X_res, y_res = smote.fit_resample(X, y)
                log(f"After SMOTE: {len(X_res)} samples")
            else:
                X_res, y_res = X, y
                log("‚ö†Ô∏è Skipping SMOTE due to insufficient samples or single class after filtering.", level=logging.WARNING)

            # Train Classifier
            clf = LogisticRegression(max_iter=1000, multi_class='multinomial', random_state=CONFIG['seed'], solver='lbfgs')
            X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=CONFIG['seed'], stratify=y_res)

            log("‚è≥ Training Logistic Regression classifier...")
            clf.fit(X_train, y_train)

            # Evaluation
            y_pred = clf.predict(X_test)
            log("‚úÖ Relationship Classifier Report:")
            log(f"\n{classification_report(y_test, y_pred)}")

            # Save Classifier
            import joblib
            classifier_save_path = save_path / "relationship_classifier.joblib"
            joblib.dump(clf, classifier_save_path)
            log(f"‚úÖ Relationship classifier saved to: {classifier_save_path}")
        else:
            log("‚ö†Ô∏è No valid relationship samples to train classifier. Skipping.", level=logging.WARNING)
    else:
        log(f"‚ö†Ô∏è Relationship data not found at resolved path '{rel_data_path}'. Skipping classifier training.", level=logging.WARNING)
else:
    log('‚ö†Ô∏è imbalanced-learn not installed. Skipping relationship classifier.', level=logging.WARNING)
