# Model Training and Comparison
## ServiceNow Incident Auto-Assignment

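This notebook trains several traditional ML models (logistic regression, random forest, and naive Bayes) on TF-IDF text features and compares their performance on a held-out test set.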

In [None]:
# Import libraries
import sys
# Add the parent directory to the module search path so the `src.*` imports
# below can resolve (assumes the notebook is run from a subdirectory of the
# project root — TODO confirm).
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import time

from src.preprocessing import TextCleaner, DataLoader, remove_duplicates, handle_null_values
from src.features import FeatureExtractor, TextVectorizer
from src.models import TraditionalMLModels
from src.evaluation import ModelEvaluator
from src.utils import load_config, format_time

import warnings
# NOTE(review): this installs a blanket 'ignore' filter, so every warning
# raised for the rest of the session is hidden, including any emitted by the
# libraries used below. Consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Read the project configuration from the YAML file one level above the notebook.
config_path = '../config.yaml'
config = load_config(config_path)
print("Configuration loaded!")

## 1. Load and Preprocess Data

In [None]:
# Load the incident data: prefer the CSV named in the config, otherwise
# fall back to load_sample_data().
data_loader = DataLoader('../config.yaml')
raw_data_dir = Path('../data/raw/')
csv_path = raw_data_dir / config['data']['incident_csv']

if not csv_path.exists():
    # The configured CSV is not on disk, so use the sample loader instead.
    print("Using sample data...")
    from src.preprocessing import load_sample_data
    df = load_sample_data()
else:
    df = data_loader.load_csv(str(csv_path))

print(f"Initial shape: {df.shape}")

In [None]:
# Preprocess data: handle nulls first (strategy='fill'), then drop duplicates.
df = remove_duplicates(handle_null_values(df, strategy='fill'))

# Clean whichever of the known free-text columns are present in the frame.
text_cleaner = TextCleaner(config.get('preprocessing', {}))
text_columns = []
for candidate in ('short_description', 'description'):
    if candidate in df.columns:
        text_columns.append(candidate)

# Skip cleaning entirely when none of the text columns exist.
if len(text_columns) > 0:
    df = text_cleaner.clean_dataframe(df, text_columns)

print(f"After preprocessing: {df.shape}")

## 2. Feature Engineering

In [None]:
# Split the data 80/10/10 into train/validation/test with a fixed seed.
# The target column comes from the config, defaulting to 'assignment_group'.
data_section = config.get('data', {})
target_col = data_section.get('target_column', 'assignment_group')

split_options = dict(train_size=0.8, val_size=0.1, test_size=0.1, random_state=42)
# NOTE(review): val_df is not referenced by the later cells visible in this
# notebook — confirm whether it should be used for model selection.
train_df, val_df, test_df = data_loader.split_data(df, target_col, **split_options)

In [None]:
# Vectorize text using TF-IDF.
# The vectorizer is fitted on the training texts only and then applied to
# both the training and the test texts.
vectorizer = TextVectorizer(config)

train_texts = train_df['cleaned_text'].tolist()
test_texts = test_df['cleaned_text'].tolist()

vectorizer.fit_tfidf_vectorizer(train_texts)
X_train = vectorizer.transform_tfidf_vectorizer(train_texts)
X_test = vectorizer.transform_tfidf_vectorizer(test_texts)

# Labels for the two splits, taken from the configured target column.
y_train = train_df[target_col].values
y_test = test_df[target_col].values

print(f"Training features shape: {X_train.shape}")
print(f"Test features shape: {X_test.shape}")

## 3. Train Models

In [None]:
# Build the model wrapper and the evaluator from the shared configuration.
ml_models = TraditionalMLModels(config)
evaluator = ModelEvaluator(config)

# Names of the models to train, in training order.
models_to_train = [
    'logistic_regression',
    'random_forest',
    'naive_bayes',
]

In [None]:
# Train and evaluate each model.
# For every name in models_to_train: fit on the training split, record how
# long fitting took, predict on the test split, and register the metrics
# with the evaluator under the model's name.
training_times = {}
banner = '=' * 60

for model_name in models_to_train:
    print(f"\n{banner}")
    print(f"Training {model_name}...")
    print(f"{banner}")

    # perf_counter() is monotonic and high-resolution, so the measured
    # duration cannot be distorted by system clock adjustments the way a
    # time.time() difference can.
    start_time = time.perf_counter()

    # Train
    model = ml_models.train_model(model_name, X_train, y_train)

    training_time = time.perf_counter() - start_time
    training_times[model_name] = training_time

    # Predict; probabilities are requested only when the trained model
    # exposes predict_proba, otherwise None is passed to the evaluator.
    y_pred = ml_models.predict(model_name, X_test)
    y_pred_proba = ml_models.predict_proba(model_name, X_test) if hasattr(model, 'predict_proba') else None

    # Evaluate
    metrics = evaluator.calculate_all_metrics(y_test, y_pred, y_pred_proba, model_name)
    evaluator.print_metrics(model_name)

    print(f"Training time: {format_time(training_time)}")

## 4. Model Comparison

In [None]:
# Build the comparison table and attach each model's training time,
# looked up by the table's index labels.
comparison_df = evaluator.compare_models()
time_per_model = comparison_df.index.map(training_times)
comparison_df['training_time'] = time_per_model
comparison_df

In [None]:
# Plot the four headline metrics for all evaluated models.
comparison_metrics = ['accuracy', 'precision', 'recall', 'f1_score']
evaluator.plot_model_comparison(metrics=comparison_metrics, figsize=(16, 5))

## 5. Best Model Analysis

In [None]:
# Pick the model whose row has the highest f1_score in the comparison table.
f1_scores = comparison_df['f1_score']
best_model_name = f1_scores.idxmax()
best_f1 = comparison_df.loc[best_model_name, 'f1_score']

print(f"Best Model: {best_model_name}")
print(f"F1 Score: {best_f1:.4f}")

In [None]:
# Re-predict on the test split with the best model and plot its confusion matrix.
y_pred_best = ml_models.predict(best_model_name, X_test)

confusion_title = f'Confusion Matrix - {best_model_name}'
evaluator.plot_confusion_matrix(y_test, y_pred_best, title=confusion_title, figsize=(10, 8))

In [None]:
# Print the classification report for the best model's test-set predictions.
evaluator.print_classification_report(
    y_test,
    y_pred_best,
    model_name=best_model_name,
)

In [None]:
# Save best model
best_model_path = Path('../models/saved_models/best_model.pkl')

# Create the target directory up front so the save does not depend on it
# already existing (e.g. on a fresh checkout). This is a no-op when the
# directory is present or when save_model creates it itself.
best_model_path.parent.mkdir(parents=True, exist_ok=True)

ml_models.save_model(best_model_name, str(best_model_path))
print("Best model saved!")

## 6. Summary

This notebook trained and compared multiple ML models for incident auto-assignment. The best-performing model, selected by F1 score on the test set, has been saved to `../models/saved_models/best_model.pkl` and can be used for predictions.