# TIE MCP Server - Model Retraining and Data Experimentation

This notebook provides an interactive environment for:
- Creating custom datasets for TIE model training
- Experimenting with different model configurations
- Retraining models with new data
- Evaluating model performance
- Data preprocessing and augmentation

## Table of Contents
1. [Setup and Configuration](#setup)
2. [Data Loading and Exploration](#data-loading)
3. [Dataset Creation and Preprocessing](#dataset-creation)
4. [Model Training Experiments](#model-training)
5. [Model Evaluation and Visualization](#model-evaluation)
6. [Advanced Data Analysis](#advanced-data-framing)
7. [Model Deployment Preparation](#model-deployment)

## 1. Setup and Configuration {#setup}

In [None]:
import sys
import os
import asyncio
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Any, Optional
import warnings
warnings.filterwarnings('ignore')

# Add project root to path
project_root = Path.cwd().parent
sys.path.append(str(project_root / "src"))

# TIE MCP imports
from tie_mcp.config.settings import settings
from tie_mcp.core.engine_manager import TIEEngineManager
from tie_mcp.models.model_manager import ModelManager
from tie_mcp.storage.database import DatabaseManager
from tie_mcp.monitoring.metrics import MetricsCollector
from tie_mcp.utils.logging import setup_logging, get_logger

# Original TIE imports
from tie_mcp.core.tie.engine import TechniqueInferenceEngine
from tie_mcp.core.tie.matrix_builder import ReportTechniqueMatrixBuilder
from tie_mcp.core.tie.constants import PredictionMethod
from tie_mcp.core.tie.recommender import (
    WalsRecommender,
    BPRRecommender,
    TopItemsRecommender,
)
from tie_mcp.core.tie.utils import get_mitre_technique_ids_to_names

# Configure matplotlib for interactive plots
%matplotlib inline
# Plot styling shared by every figure below: matplotlib's "default" style
# sheet combined with seaborn's "husl" colour palette.
plt.style.use('default')
sns.set_palette("husl")

# Configure logging through the project's helper first, then create the
# logger used by this notebook.
setup_logging()
logger = get_logger(__name__)

# Echo the resolved project root and the configured environment so a wrong
# working directory or settings profile is noticed before any data is loaded.
print("Setup complete!")
print(f"Project root: {project_root}")
print(f"Current environment: {settings.environment}")

## 2. Data Loading and Exploration {#data-loading}

In [None]:
# Load default dataset
def load_tie_dataset(dataset_path: str) -> Dict[str, Any]:
    """Load a TIE dataset from a JSON file.

    Args:
        dataset_path: Location of the JSON file. Anything ``open()`` accepts
            works, so ``pathlib.Path`` objects are fine as well.

    Returns:
        The decoded JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's
    # locale encoding when decoding it.
    with open(dataset_path, encoding="utf-8") as f:
        return json.load(f)

# Read the combined dataset that ships under data/datasets
dataset_path = project_root.joinpath("data", "datasets", "combined_dataset_full_frequency.json")
dataset = load_tie_dataset(dataset_path)
reports = dataset['reports']

print(f"Dataset loaded: {len(reports)} reports")
print(f"First report keys: {list(reports[0].keys())}")

# Walk the reports once, collecting the technique vocabulary and the number
# of techniques attached to each report.
all_techniques = set()
report_technique_counts = []
for report in reports:
    techniques = list(report['mitre_techniques'].keys())
    report_technique_counts.append(len(techniques))
    all_techniques |= set(techniques)

# Summary statistics over the per-report technique counts
per_report = np.asarray(report_technique_counts)
print("\nDataset Statistics:")
print(f"Total reports: {len(reports)}")
print(f"Unique techniques: {len(all_techniques)}")
print(f"Average techniques per report: {per_report.mean():.2f}")
print(f"Median techniques per report: {np.median(per_report):.2f}")
print(f"Min techniques per report: {per_report.min()}")
print(f"Max techniques per report: {per_report.max()}")

In [None]:
# Visualize dataset characteristics in a 2x2 grid of plots
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Distribution of techniques per report
axes[0, 0].hist(report_technique_counts, bins=30, alpha=0.7, edgecolor='black')
axes[0, 0].set_xlabel('Number of Techniques per Report')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].set_title('Distribution of Techniques per Report')
axes[0, 0].grid(True, alpha=0.3)

# Technique frequency analysis: number of reports each technique appears in
technique_counts = {}
for report in reports:
    for technique in report['mitre_techniques']:
        technique_counts[technique] = technique_counts.get(technique, 0) + 1

# Top 20 most frequent techniques
top_techniques = sorted(technique_counts.items(), key=lambda x: x[1], reverse=True)[:20]
techniques, counts = zip(*top_techniques)

axes[0, 1].barh(range(len(techniques)), counts)
axes[0, 1].set_yticks(range(len(techniques)))
axes[0, 1].set_yticklabels(techniques, fontsize=8)
axes[0, 1].set_xlabel('Frequency')
axes[0, 1].set_title('Top 20 Most Frequent Techniques')
axes[0, 1].grid(True, alpha=0.3)

# Technique frequency distribution (log scale on the y axis)
all_counts = list(technique_counts.values())
axes[1, 0].hist(all_counts, bins=50, alpha=0.7, edgecolor='black')
axes[1, 0].set_xlabel('Technique Frequency')
axes[1, 0].set_ylabel('Number of Techniques')
axes[1, 0].set_title('Distribution of Technique Frequencies')
axes[1, 0].set_yscale('log')
axes[1, 0].grid(True, alpha=0.3)

# Cumulative distribution of technique coverage.
# cumulative_pct[k] is the share of all occurrences covered by the k + 1
# most frequent techniques.
sorted_counts = sorted(all_counts, reverse=True)
cumulative_pct = np.cumsum(sorted_counts) / np.sum(sorted_counts) * 100
axes[1, 1].plot(range(1, len(cumulative_pct) + 1), cumulative_pct)
axes[1, 1].set_xlabel('Number of Techniques (ranked by frequency)')
axes[1, 1].set_ylabel('Cumulative Coverage (%)')
axes[1, 1].set_title('Cumulative Technique Coverage')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Print some insights
single_occurrence = sum(1 for c in all_counts if c == 1)
# The top 10% is the first len // 10 techniques, i.e. index len // 10 - 1.
# With fewer than ten techniques, fall back to the single most frequent one.
top_decile_size = max(1, len(cumulative_pct) // 10)
print("\nDataset Insights:")
print(f"Top 10% of techniques cover {cumulative_pct[top_decile_size - 1]:.1f}% of all occurrences")
print(f"Techniques appearing only once: {single_occurrence} ({single_occurrence/len(all_counts)*100:.1f}%)")
print(f"Most frequent technique: {top_techniques[0][0]} ({top_techniques[0][1]} occurrences)")

## 3. Dataset Creation and Preprocessing {#dataset-creation}

In [None]:
class DatasetBuilder:
    """Helper class for building custom datasets.

    Works on dataset dicts of the form ``{'reports': [...]}`` in which every
    report carries a ``'mitre_techniques'`` mapping keyed by technique ID.
    Both public methods return a new dataset dict and leave the dataset they
    were given unmodified.
    """

    def __init__(self, enterprise_attack_path: str):
        """Remember the ATT&CK file path and load its technique lookup.

        Args:
            enterprise_attack_path: Path handed to
                ``get_mitre_technique_ids_to_names``.
        """
        self.enterprise_attack_path = enterprise_attack_path
        # NOTE(review): presumably a technique-ID -> name mapping, going by
        # the helper's name — confirm against its implementation.
        self.attack_techniques = get_mitre_technique_ids_to_names(enterprise_attack_path)

    def create_filtered_dataset(self,
                               original_dataset: Dict[str, Any],
                               min_frequency: int = 5,
                               max_frequency: Optional[int] = None) -> Dict[str, Any]:
        """Create a filtered dataset based on technique frequency.

        A technique's frequency is the number of reports that list it.
        Techniques outside ``[min_frequency, max_frequency]`` are removed from
        every report, and reports left without techniques are dropped.

        Args:
            original_dataset: Dataset dict with a ``'reports'`` list.
            min_frequency: Smallest report count a technique needs to be kept.
            max_frequency: Largest report count allowed; ``None`` disables
                the upper bound.

        Returns:
            ``{'reports': [...], 'metadata': {...}}`` where the metadata
            records the filter settings and before/after counts.
        """
        reports = original_dataset['reports']

        # Count technique frequencies
        tech_counts: Dict[str, int] = {}
        for report in reports:
            for tech in report['mitre_techniques']:
                tech_counts[tech] = tech_counts.get(tech, 0) + 1

        # Filter techniques by frequency
        valid_techniques = {
            tech for tech, count in tech_counts.items()
            if count >= min_frequency
            and (max_frequency is None or count <= max_frequency)
        }

        print(f"Techniques before filtering: {len(tech_counts)}")
        print(f"Techniques after filtering: {len(valid_techniques)}")

        # Filter reports
        filtered_reports = []
        for report in reports:
            filtered_techniques = {
                tech: val for tech, val in report['mitre_techniques'].items()
                if tech in valid_techniques
            }
            if not filtered_techniques:
                continue  # nothing left in this report once rare techniques are gone
            # Shallow copy so the caller's report keeps its full technique map
            new_report = report.copy()
            new_report['mitre_techniques'] = filtered_techniques
            filtered_reports.append(new_report)

        print(f"Reports before filtering: {len(reports)}")
        print(f"Reports after filtering: {len(filtered_reports)}")

        return {
            'reports': filtered_reports,
            'metadata': {
                'filter_type': 'frequency',
                'min_frequency': min_frequency,
                'max_frequency': max_frequency,
                'original_report_count': len(reports),
                'filtered_report_count': len(filtered_reports),
                'original_technique_count': len(tech_counts),
                'filtered_technique_count': len(valid_techniques)
            }
        }

    def augment_dataset(self, dataset: Dict[str, Any],
                       dropout_rate: float = 0.3,
                       seed: Optional[int] = None) -> Dict[str, Any]:
        """Apply data augmentation using technique dropout.

        For every report with more than two techniques, one extra report is
        appended that keeps a random subset of those techniques. Each kept
        technique is stored with the value ``1``; the original per-technique
        values are not carried over.

        Args:
            dataset: Dataset dict with a ``'reports'`` list.
            dropout_rate: Fraction of a report's techniques to drop. At least
                one technique is always kept.
            seed: Optional seed for reproducible sampling. When ``None`` the
                global ``np.random`` state is used, as before.

        Returns:
            A dataset dict holding the original reports followed by the
            augmented ones, with the input metadata extended by the
            augmentation settings.
        """
        original_reports = dataset['reports']
        augmented_reports = list(original_reports)

        # Keep drawing from numpy's global generator when no seed is given so
        # that code relying on np.random.seed() behaves exactly as it used to.
        rng = np.random if seed is None else np.random.RandomState(seed)

        for i, report in enumerate(original_reports):
            techniques = list(report['mitre_techniques'])
            if len(techniques) <= 2:
                continue  # too few techniques to drop any meaningfully

            # Create version with some techniques dropped
            num_to_keep = max(1, int(len(techniques) * (1 - dropout_rate)))
            # .tolist() turns numpy.str_ scalars back into plain str so the
            # dataset keeps using built-in types only
            kept_techniques = rng.choice(techniques, num_to_keep, replace=False).tolist()

            augmented_reports.append({
                'id': f"aug_{i}",
                'mitre_techniques': {tech: 1 for tech in kept_techniques},
                'metadata': {'augmented': True, 'original_id': report.get('id', i)}
            })

        added = len(augmented_reports) - len(original_reports)
        print(f"Original reports: {len(original_reports)}")
        print(f"Augmented reports: {len(augmented_reports)}")
        print(f"Added: {added} reports")

        return {
            'reports': augmented_reports,
            'metadata': {
                **dataset.get('metadata', {}),
                'augmented': True,
                'dropout_rate': dropout_rate,
                'augmented_count': added
            }
        }

# Build the dataset helper from enterprise-attack.json under data/datasets/stix
enterprise_attack_path = project_root.joinpath("data", "datasets", "stix", "enterprise-attack.json")
dataset_builder = DatasetBuilder(enterprise_attack_path=str(enterprise_attack_path))

attack_technique_total = len(dataset_builder.attack_techniques)
print(f"Dataset builder initialized with {attack_technique_total} ATT&CK techniques")

In [None]:
# Step 1: drop techniques that appear in fewer than 5 reports
filtered_dataset = dataset_builder.create_filtered_dataset(dataset, min_frequency=5)

# Step 2: add technique-dropout copies of the remaining reports
augmented_dataset = dataset_builder.augment_dataset(filtered_dataset, dropout_rate=0.3)

# Step 3: persist the result next to the source datasets
processed_dataset_path = project_root.joinpath("data", "datasets", "processed_dataset.json")
with processed_dataset_path.open('w') as f:
    json.dump(augmented_dataset, f, indent=2)

print(f"\nProcessed dataset saved to: {processed_dataset_path}")

## 4. Model Training Experiments {#model-training}

In [None]:
class SimpleModelTrainer:
    """Simplified model trainer for experimentation.

    Every successful training run is appended to ``experiment_results`` so
    the full history stays available after a comparison sweep.
    """

    # Model identifiers accepted by train_and_evaluate_model
    SUPPORTED_MODEL_TYPES = ("wals", "bpr", "top_items")
    # Embedding sizes compare_models sweeps when the caller gives none
    DEFAULT_EMBEDDING_DIMS = (2, 4, 8)

    def __init__(self, enterprise_attack_path: str):
        """Store the ATT&CK file path and start with an empty result history.

        Args:
            enterprise_attack_path: Path forwarded to the matrix builder and
                the inference engine.
        """
        self.enterprise_attack_path = enterprise_attack_path
        self.experiment_results = []

    def train_and_evaluate_model(self,
                                dataset_path: str,
                                model_type: str = "wals",
                                embedding_dimension: int = 4,
                                test_ratio: float = 0.2,
                                validation_ratio: float = 0.1) -> Dict[str, Any]:
        """Train and evaluate a single model configuration.

        Args:
            dataset_path: Dataset file handed to ``ReportTechniqueMatrixBuilder``.
            model_type: One of ``SUPPORTED_MODEL_TYPES``.
            embedding_dimension: Passed to the recommender as ``k``.
            test_ratio: Passed to ``build_train_test_validation``.
            validation_ratio: Passed to ``build_train_test_validation``.

        Returns:
            On success, a dict with ``model_type``, ``embedding_dimension``,
            ``training_time_seconds``, ``mse`` (whatever ``tie.fit`` returned),
            ``metrics`` (precision/recall/NDCG at 10, 20 and 50) and
            ``dataset_info``. On any failure, ``{'error': <message>,
            'model_type': <model_type>}``; the error is printed, not raised.
        """
        print(f"Training {model_type} model with embedding dimension {embedding_dimension}")

        try:
            # Reject unknown model types before paying for matrix construction
            if model_type not in self.SUPPORTED_MODEL_TYPES:
                raise ValueError(f"Unsupported model type: {model_type}")

            # Build data matrices
            data_builder = ReportTechniqueMatrixBuilder(
                combined_dataset_filepath=dataset_path,
                enterprise_attack_filepath=self.enterprise_attack_path
            )

            training_data, test_data, validation_data = data_builder.build_train_test_validation(
                test_ratio, validation_ratio
            )

            print(f"Training data shape: {training_data.shape}")
            print(f"Test data shape: {test_data.shape}")
            print(f"Validation data shape: {validation_data.shape}")

            # Pick the recommender, its prediction method and the fixed
            # hyperparameters used to fit it
            if model_type == "wals":
                model = WalsRecommender(m=training_data.m, n=training_data.n, k=embedding_dimension)
                prediction_method = PredictionMethod.DOT
                fit_kwargs = {'epochs': 25, 'c': 0.01, 'regularization_coefficient': 0.001}
            elif model_type == "bpr":
                model = BPRRecommender(m=training_data.m, n=training_data.n, k=embedding_dimension)
                prediction_method = PredictionMethod.COSINE
                fit_kwargs = {'epochs': 20, 'learning_rate': 0.001, 'regularization': 0.01}
            else:  # "top_items"; anything else was rejected above
                model = TopItemsRecommender(m=training_data.m, n=training_data.n, k=embedding_dimension)
                prediction_method = PredictionMethod.DOT
                fit_kwargs = {}

            # Create TIE engine
            tie = TechniqueInferenceEngine(
                training_data=training_data,
                validation_data=validation_data,
                test_data=test_data,
                model=model,
                prediction_method=prediction_method,
                enterprise_attack_filepath=self.enterprise_attack_path
            )

            # Train with simple hyperparameters; only the fit call is timed
            start_time = datetime.now()
            mse = tie.fit(**fit_kwargs)
            training_time = (datetime.now() - start_time).total_seconds()

            # Evaluate model
            metrics = {}
            for k in [10, 20, 50]:
                metrics[f'precision_at_{k}'] = tie.precision(k=k)
                metrics[f'recall_at_{k}'] = tie.recall(k=k)
                metrics[f'ndcg_at_{k}'] = tie.normalized_discounted_cumulative_gain(k=k)

            result = {
                'model_type': model_type,
                'embedding_dimension': embedding_dimension,
                'training_time_seconds': training_time,
                'mse': mse,
                'metrics': metrics,
                'dataset_info': {
                    'training_reports': training_data.m,
                    'test_reports': test_data.m,
                    'validation_reports': validation_data.m,
                    'num_techniques': training_data.n
                }
            }

            self.experiment_results.append(result)

            print(f"Training completed in {training_time:.2f} seconds")
            print(f"NDCG@20: {metrics['ndcg_at_20']:.4f}")
            print(f"Precision@20: {metrics['precision_at_20']:.4f}")
            print(f"Recall@20: {metrics['recall_at_20']:.4f}")

            return result

        except Exception as e:
            # Deliberately broad: one failing configuration must not abort a
            # whole comparison sweep. The failure is reported to the caller
            # through the 'error' key instead.
            print(f"Error training model: {str(e)}")
            return {'error': str(e), 'model_type': model_type}

    def compare_models(self, dataset_path: str,
                       model_types: Optional[List[str]] = None,
                       embedding_dims: Optional[List[int]] = None) -> pd.DataFrame:
        """Compare different model types.

        Trains every combination of model type and embedding dimension and
        collects the successful runs into one table.

        Args:
            dataset_path: Dataset file used for every run.
            model_types: Model identifiers to try; defaults to all of
                ``SUPPORTED_MODEL_TYPES``.
            embedding_dims: Embedding sizes to try; defaults to
                ``DEFAULT_EMBEDDING_DIMS``.

        Returns:
            One row per successful run with ``model_type``,
            ``embedding_dimension``, ``training_time``, ``mse`` and one column
            per metric. Empty when no run succeeded.
        """
        if model_types is None:
            model_types = list(self.SUPPORTED_MODEL_TYPES)
        if embedding_dims is None:
            embedding_dims = list(self.DEFAULT_EMBEDDING_DIMS)

        comparison_results = []
        for model_type in model_types:
            for embedding_dim in embedding_dims:
                print(f"\n--- Training {model_type} with embedding dim {embedding_dim} ---")
                result = self.train_and_evaluate_model(
                    dataset_path=dataset_path,
                    model_type=model_type,
                    embedding_dimension=embedding_dim
                )

                if 'error' not in result:
                    comparison_results.append(result)

        # Flatten each result: the nested metrics dict becomes plain columns
        df_data = [
            {
                'model_type': result['model_type'],
                'embedding_dimension': result['embedding_dimension'],
                'training_time': result['training_time_seconds'],
                'mse': result['mse'],
                **result['metrics'],
            }
            for result in comparison_results
        ]
        return pd.DataFrame(df_data)

# Create the trainer used by the experiment cells below; it is pointed at the
# same ATT&CK file as the dataset builder.
trainer = SimpleModelTrainer(enterprise_attack_path=str(enterprise_attack_path))
print("Model trainer initialized")

In [None]:
# Train every model/embedding combination on the processed dataset
print("Starting model comparison experiments...")
comparison_df = trainer.compare_models(dataset_path=str(processed_dataset_path))

if comparison_df.empty:
    print("No successful experiments to analyze")
else:
    # Average the headline metrics over the embedding sizes of each model type
    summary_columns = ['ndcg_at_20', 'precision_at_20', 'recall_at_20', 'training_time']
    print("\nExperiment Results Summary:")
    per_model_means = comparison_df.groupby('model_type')[summary_columns].mean()
    print(per_model_means)

## 5. Model Evaluation and Visualization {#model-evaluation}

In [None]:
# Visualize experiment results if available
if not comparison_df.empty:
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # NDCG@20 by model type and embedding dimension
    pivot_ndcg = comparison_df.pivot(index='model_type', columns='embedding_dimension', values='ndcg_at_20')
    sns.heatmap(pivot_ndcg, annot=True, fmt='.4f', ax=axes[0, 0], cmap='YlOrRd')
    axes[0, 0].set_title('NDCG@20 by Model Type and Embedding Dimension')

    # Training time comparison
    comparison_df.boxplot(column='training_time', by='model_type', ax=axes[0, 1])
    axes[0, 1].set_title('Training Time by Model Type')
    axes[0, 1].set_ylabel('Training Time (seconds)')

    # pandas' grouped boxplot replaces the figure suptitle with
    # "Boxplot grouped by model_type", so the figure title has to be set
    # after that call to be the one that is displayed.
    fig.suptitle('Model Comparison Results', fontsize=16)

    # Precision vs Recall scatter plot, one colour per model type
    for model_type in comparison_df['model_type'].unique():
        model_data = comparison_df[comparison_df['model_type'] == model_type]
        axes[1, 0].scatter(model_data['recall_at_20'], model_data['precision_at_20'],
                          label=model_type, s=60, alpha=0.7)
    axes[1, 0].set_xlabel('Recall@20')
    axes[1, 0].set_ylabel('Precision@20')
    axes[1, 0].set_title('Precision vs Recall@20')
    axes[1, 0].legend()
    axes[1, 0].grid(True, alpha=0.3)

    # Performance vs training time trade-off, coloured by embedding dimension
    axes[1, 1].scatter(comparison_df['training_time'], comparison_df['ndcg_at_20'],
                      c=comparison_df['embedding_dimension'], s=60, alpha=0.7)
    axes[1, 1].set_xlabel('Training Time (seconds)')
    axes[1, 1].set_ylabel('NDCG@20')
    axes[1, 1].set_title('Performance vs Training Time')
    plt.colorbar(axes[1, 1].collections[0], ax=axes[1, 1], label='Embedding Dimension')
    axes[1, 1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Find best configuration: the row with the highest NDCG@20
    best_config = comparison_df.loc[comparison_df['ndcg_at_20'].idxmax()]
    print("\nBest Configuration:")
    print(f"Model Type: {best_config['model_type']}")
    print(f"Embedding Dimension: {best_config['embedding_dimension']}")
    print(f"NDCG@20: {best_config['ndcg_at_20']:.4f}")
    print(f"Precision@20: {best_config['precision_at_20']:.4f}")
    print(f"Recall@20: {best_config['recall_at_20']:.4f}")
    print(f"Training Time: {best_config['training_time']:.2f} seconds")
else:
    print("No experiment results to visualize")

## 6. Advanced Data Analysis {#advanced-data-framing}

In [None]:
# Analyze technique co-occurrence patterns
def analyze_technique_cooccurrence(dataset: Dict[str, Any], top_n: int = 20) -> pd.DataFrame:
    """Analyze which techniques commonly appear together.

    Counts, for every unordered pair of techniques, the number of reports
    that list both, and returns the most frequent pairs.

    Args:
        dataset: Dataset dict with a ``'reports'`` list; each report has a
            ``'mitre_techniques'`` mapping keyed by technique ID.
        top_n: Maximum number of pairs to return.

    Returns:
        DataFrame with columns ``technique_1``, ``technique_2`` (with
        ``technique_1 < technique_2``) and an integer ``cooccurrence_count``,
        sorted by count in descending order. Only pairs seen together more
        than once are included; ties are ordered by technique IDs. The frame
        is empty, but still has all three columns, when no pair qualifies.
    """
    # Count pairs directly instead of filling a dense techniques x techniques
    # matrix: memory then scales with the pairs that actually occur.
    pair_counts: Dict[Any, int] = {}
    for report in dataset['reports']:
        # Sorting puts each pair in canonical (smaller ID, larger ID) order
        report_techniques = sorted(report['mitre_techniques'])
        for position, tech1 in enumerate(report_techniques):
            for tech2 in report_techniques[position + 1:]:
                pair = (tech1, tech2)
                pair_counts[pair] = pair_counts.get(pair, 0) + 1

    # Keep pairs that appear together more than once
    cooccurrence_pairs = [
        (tech1, tech2, count)
        for (tech1, tech2), count in sorted(pair_counts.items())
        if count > 1
    ]

    # Naming the columns explicitly keeps sort_values working on an empty result
    cooccurrence_df = pd.DataFrame(
        cooccurrence_pairs,
        columns=['technique_1', 'technique_2', 'cooccurrence_count'],
    )
    # A stable sort keeps the alphabetical order of equally frequent pairs
    cooccurrence_df = cooccurrence_df.sort_values(
        'cooccurrence_count', ascending=False, kind='mergesort'
    )

    return cooccurrence_df.head(top_n)

# Run the co-occurrence analysis on the augmented dataset and list the
# strongest pairs, one per line.
print("Analyzing technique co-occurrence patterns...")
cooccurrence_analysis = analyze_technique_cooccurrence(augmented_dataset, top_n=15)

print("\nTop 15 Technique Co-occurrence Patterns:")
for pair in cooccurrence_analysis.itertuples(index=False):
    print(f"{pair.technique_1} + {pair.technique_2}: {pair.cooccurrence_count} times")

In [None]:
# Analyze dataset quality metrics
def analyze_dataset_quality(dataset: Dict[str, Any]) -> Dict[str, Any]:
    """Analyze various quality metrics of the dataset.

    Args:
        dataset: Dataset dict with a ``'reports'`` list; each report has a
            ``'mitre_techniques'`` mapping keyed by technique ID.

    Returns:
        Dict with these keys:

        - ``total_reports``, ``total_unique_techniques``
        - ``average_techniques_per_report``, ``median_techniques_per_report``,
          ``std_techniques_per_report``
        - ``sparsity``: share of empty cells in the report x technique grid
        - ``single_occurrence_techniques``: techniques listed by one report
        - ``coverage_ratio``: despite the name, the fraction of techniques
          that occur in exactly one report
        - ``most_frequent_technique``: ``(technique, count)`` tuple
        - ``least_frequent_techniques``: the smallest report count of any
          technique (a number, not a list of techniques)

        For a dataset without reports or without techniques the ratios and
        statistics are reported as ``0.0``, ``most_frequent_technique`` as
        ``None`` and ``least_frequent_techniques`` as ``0`` instead of raising.
    """
    reports = dataset['reports']

    # Basic statistics
    report_sizes = [len(report['mitre_techniques']) for report in reports]

    # Technique frequency distribution
    technique_counts: Dict[str, int] = {}
    for report in reports:
        for technique in report['mitre_techniques']:
            technique_counts[technique] = technique_counts.get(technique, 0) + 1

    single_occurrence_techniques = sum(1 for count in technique_counts.values() if count == 1)

    if technique_counts:
        # At least one technique implies at least one report, so neither
        # denominator below can be zero.
        total_possible_entries = len(reports) * len(technique_counts)
        sparsity = 1 - (sum(report_sizes) / total_possible_entries)
        coverage_ratio = single_occurrence_techniques / len(technique_counts)
        most_frequent_technique = max(technique_counts.items(), key=lambda x: x[1])
        least_frequent_count = min(technique_counts.values())
    else:
        # Nothing to measure: the ratios are undefined, report neutral values
        sparsity = 0.0
        coverage_ratio = 0.0
        most_frequent_technique = None
        least_frequent_count = 0

    if report_sizes:
        average_size = float(np.mean(report_sizes))
        median_size = float(np.median(report_sizes))
        std_size = float(np.std(report_sizes))
    else:
        average_size = median_size = std_size = 0.0

    return {
        'total_reports': len(reports),
        'total_unique_techniques': len(technique_counts),
        'average_techniques_per_report': average_size,
        'median_techniques_per_report': median_size,
        'std_techniques_per_report': std_size,
        'sparsity': sparsity,
        'single_occurrence_techniques': single_occurrence_techniques,
        'coverage_ratio': coverage_ratio,
        'most_frequent_technique': most_frequent_technique,
        'least_frequent_techniques': least_frequent_count
    }

# Compute and print the quality metrics of each processing stage. Floats are
# shown with four decimals, everything else with its default formatting.
print("\n=== Dataset Quality Analysis ===")

quality_by_stage = []
for stage_header, stage_dataset in (
    ("\nOriginal Dataset:", dataset),
    ("\nFiltered Dataset:", filtered_dataset),
    ("\nAugmented Dataset:", augmented_dataset),
):
    print(stage_header)
    stage_quality = analyze_dataset_quality(stage_dataset)
    for key, value in stage_quality.items():
        rendered = f"{value:.4f}" if isinstance(value, float) else f"{value}"
        print(f"{key}: {rendered}")
    quality_by_stage.append(stage_quality)

# Keep the per-stage results under the names later cells rely on
original_quality, filtered_quality, augmented_quality = quality_by_stage

## 7. Model Deployment Preparation {#model-deployment}

In [None]:
# Collect everything produced above into one JSON-serialisable summary
summary_timestamp = datetime.now().isoformat()

dataset_info = {
    'original_reports': len(dataset['reports']),
    'filtered_reports': len(filtered_dataset['reports']),
    'augmented_reports': len(augmented_dataset['reports']),
    'processing_steps': [
        'frequency_filtering_min_5',
        'data_augmentation_dropout_30pct'
    ]
}

# Fall back to empty lists when there is nothing to report
recorded_experiments = getattr(trainer, 'experiment_results', [])
if cooccurrence_analysis.empty:
    cooccurrence_records = []
else:
    cooccurrence_records = cooccurrence_analysis.to_dict('records')

experiment_summary = {
    'timestamp': summary_timestamp,
    'dataset_info': dataset_info,
    'experiment_results': recorded_experiments,
    'quality_metrics': {
        'original': original_quality,
        'filtered': filtered_quality,
        'augmented': augmented_quality
    },
    'cooccurrence_analysis': cooccurrence_records
}

# Write the summary to data/configs, creating that directory if needed.
# default=str stringifies any value json cannot encode natively.
summary_path = project_root.joinpath("data", "configs", "experiment_summary.json")
summary_path.parent.mkdir(exist_ok=True)
with summary_path.open('w') as f:
    json.dump(experiment_summary, f, indent=2, default=str)

print(f"Experiment summary saved to: {summary_path}")

# Write a deployment configuration for the best run (highest NDCG@20), if any
if comparison_df.empty:
    print("No successful experiments for deployment configuration")
else:
    best_config = comparison_df.loc[comparison_df['ndcg_at_20'].idxmax()]

    # Cast to built-in int/float so json can serialise the values
    recommended_model = {
        'model_type': best_config['model_type'],
        'embedding_dimension': int(best_config['embedding_dimension']),
        'expected_performance': {
            'ndcg_at_20': float(best_config['ndcg_at_20']),
            'precision_at_20': float(best_config['precision_at_20']),
            'recall_at_20': float(best_config['recall_at_20'])
        }
    }
    deployment_config = {
        'recommended_model': recommended_model,
        'dataset_path': str(processed_dataset_path),
        'enterprise_attack_path': str(enterprise_attack_path),
        'training_parameters': {
            'test_ratio': 0.2,
            'validation_ratio': 0.1,
            'auto_hyperparameter_tuning': True
        }
    }

    config_path = project_root.joinpath("data", "configs", "deployment_config.json")
    with config_path.open('w') as f:
        json.dump(deployment_config, f, indent=2)

    print(f"Deployment configuration saved to: {config_path}")
    print("\nRecommended deployment configuration:")
    print(f"Model Type: {recommended_model['model_type']}")
    print(f"Embedding Dimension: {recommended_model['embedding_dimension']}")
    print(f"Expected NDCG@20: {recommended_model['expected_performance']['ndcg_at_20']:.4f}")

# Closing recap of what the notebook covered
closing_lines = (
    "\n=== Notebook Complete ===",
    "This notebook has demonstrated:",
    "✓ Dataset loading and exploration",
    "✓ Data preprocessing and augmentation",
    "✓ Model training and evaluation",
    "✓ Performance comparison across configurations",
    "✓ Dataset quality analysis",
    "✓ Deployment preparation",
    "\nYou can now use the TIE MCP Server with the optimized configurations!",
)
for closing_line in closing_lines:
    print(closing_line)