In [1]:
# modeling notebook enhanced by https://claude.ai/

import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import (classification_report, accuracy_score, 
                            confusion_matrix, ConfusionMatrixDisplay, 
                            roc_curve, auc)
from sklearn.preprocessing import label_binarize
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from scipy.stats import randint, uniform
import time
import os

In [2]:
# Make sure every output folder used by the pipeline exists before anything is saved.
# exist_ok=True keeps the cell idempotent across re-runs.
for _results_dir in ('results', 'results/baseline', 'results/hybrid'):
    os.makedirs(_results_dir, exist_ok=True)

In [3]:
# Set plot style
# Both calls mutate global plotting defaults, so every figure created later
# in the notebook inherits them. Order is kept as-is: the palette is applied
# after the stylesheet.
plt.style.use('seaborn-v0_8-whitegrid')  # matplotlib stylesheet name
sns.set_palette('viridis')  # default seaborn colour palette

In [4]:
def run_modeling_pipeline(dataset_type='baseline'):
    """Run the end-to-end modeling workflow for one pre-processed dataset.

    The function loads ``../data/train_{dataset_type}.csv`` and
    ``../data/test_{dataset_type}.csv``, prints validation summaries, compares
    four classifiers with stratified 5-fold cross-validation (SMOTE and, for
    the tree models, importance-based feature selection are applied inside
    each fold), tunes the best model with ``RandomizedSearchCV``, retrains it
    on the SMOTE-resampled training set and evaluates it on the test set.
    Plots are written to ``results/{dataset_type}/``.

    Parameters
    ----------
    dataset_type : str, default 'baseline'
        Suffix used to build the CSV paths and the output sub-directory.

    Returns
    -------
    dict or None
        ``None`` when either CSV file cannot be found. Otherwise a dict with
        the keys ``dataset_type``, ``best_model``, ``best_features``,
        ``best_params``, ``cv_accuracy``, ``test_accuracy`` and
        ``classification_report`` (the report as a nested dict).
    """
    print(f"\n{'='*80}")
    print(f"    MODELING PIPELINE FOR {dataset_type.upper()} DATASET")
    print(f"{'='*80}\n")

    # Step 1: Exploratory Data Analysis (EDA)
    print("\n" + "="*50)
    print("1. EXPLORATORY DATA ANALYSIS")
    print("="*50)

    # Load the dataset
    train_path = f'../data/train_{dataset_type}.csv'
    test_path = f'../data/test_{dataset_type}.csv'

    try:
        train_df = pd.read_csv(train_path)
        test_df = pd.read_csv(test_path)
        print(f"Successfully loaded {dataset_type} datasets:")
        print(f"Training shape: {train_df.shape}")
        print(f"Testing shape: {test_df.shape}")
    except FileNotFoundError:
        print(f"Error: Could not find {dataset_type} dataset files. Please check file paths.")
        return

    # Create the output folder here so the function does not depend on another
    # cell having created it (and works for any dataset_type value).
    os.makedirs(f'results/{dataset_type}', exist_ok=True)

    # Basic dataset information
    print("\nTraining Dataset Overview:")
    print(f"Columns: {train_df.columns.tolist()}")
    print("\nData Types:")
    print(train_df.dtypes)

    # Target distribution
    target_col = 'Event Classification'
    target_dist = train_df[target_col].value_counts(normalize=True).reset_index()
    target_dist.columns = ['Class', 'Proportion']
    print("\nTarget Distribution:")
    print(target_dist)

    # Visualize target distribution
    plt.figure(figsize=(10, 6))
    sns.barplot(x='Class', y='Proportion', data=target_dist)
    plt.title(f'Target Distribution - {dataset_type.capitalize()} Dataset')
    plt.ylabel('Proportion')
    plt.savefig(f'results/{dataset_type}/target_distribution.png', dpi=300, bbox_inches='tight')
    plt.close()

    # Step 2: Data Cleaning (Validation)
    print("\n" + "="*50)
    print("2. DATA CLEANING VALIDATION")
    print("="*50)

    # Check for missing values
    missing_values = train_df.isnull().sum()
    print("\nMissing values in training data:")
    print(missing_values[missing_values > 0] if missing_values.sum() > 0 else "No missing values found")

    # Step 3: Data Preparation
    print("\n" + "="*50)
    print("3. DATA PREPARATION")
    print("="*50)

    # Define features (X) and target (y)
    X_train = train_df.drop(target_col, axis=1)
    y_train = train_df[target_col]
    X_test = test_df.drop(target_col, axis=1)
    y_test = test_df[target_col]

    # Ensure test data has the same columns as train data:
    # columns missing from test are filled with 0, extra test columns are
    # dropped and the column order is aligned with the training frame.
    missing_cols = set(X_train.columns) - set(X_test.columns)
    for col in missing_cols:
        X_test[col] = 0
    X_test = X_test[X_train.columns]

    # Encode target
    le = LabelEncoder()
    y_train_encoded = le.fit_transform(y_train)
    y_test_encoded = le.transform(y_test)

    class_names = le.classes_
    print(f"Target Classes: {class_names}")

    # Step 4: Train-Test Split Validation
    print("\n" + "="*50)
    print("4. TRAIN-TEST SPLIT VALIDATION")
    print("="*50)

    # The split was done in preprocessing, but validate proportions
    print("Training set size:", X_train.shape[0])
    print("Test set size:", X_test.shape[0])
    print("Test set proportion: {:.2f}%".format(100 * X_test.shape[0] / (X_train.shape[0] + X_test.shape[0])))

    print("\nTraining class distribution:")
    print(pd.Series(y_train_encoded).value_counts(normalize=True))

    print("\nTest class distribution:")
    print(pd.Series(y_test_encoded).value_counts(normalize=True))

    # Step 5: Feature Engineering Validation
    print("\n" + "="*50)
    print("5. FEATURE ENGINEERING VALIDATION")
    print("="*50)

    # Display number of features
    print(f"Number of features: {X_train.shape[1]}")

    # Display feature categories (grouped by substring match on column names)
    temporal_features = [col for col in X_train.columns if any(x in col for x in ['Month', 'Day', 'Year', 'Week'])]
    text_features = [col for col in X_train.columns if 'text_svd_' in col]
    categorical_features = [col for col in X_train.columns if any(x in col for x in ['Classification', 'Type', 'Status', 'Structure'])]

    print(f"Temporal features: {len(temporal_features)}")
    print(f"Text-derived features: {len(text_features)}")
    print(f"Categorical features: {len(categorical_features)}")

    # Step 6: Pre-processed Dataset Validation
    print("\n" + "="*50)
    print("6. PRE-PROCESSED DATASET VALIDATION")
    print("="*50)

    # Check for any remaining issues. Both splits are inspected: a problem that
    # exists only in the test set would otherwise reach predict() uncleaned.
    print("Checking for infinity or NaN values...")
    inf_count = np.isinf(X_train.values).sum()
    nan_count = np.isnan(X_train.values).sum()
    test_inf_count = np.isinf(X_test.values).sum()
    test_nan_count = np.isnan(X_test.values).sum()

    if inf_count > 0 or nan_count > 0 or test_inf_count > 0 or test_nan_count > 0:
        print(f"Warning: Found {inf_count} infinity values and {nan_count} NaN values")
        print(f"Warning: Found {test_inf_count} infinity values and {test_nan_count} NaN values in test data")
        # Handle inf/nan values if necessary
        X_train = X_train.replace([np.inf, -np.inf], np.nan).fillna(0)
        X_test = X_test.replace([np.inf, -np.inf], np.nan).fillna(0)
    else:
        print("No infinity or NaN values found.")

    # Step 7: Modeling
    print("\n" + "="*50)
    print("7. MODELING")
    print("="*50)

    # Initialize models to test
    models = {
        'Random Forest': RandomForestClassifier(random_state=42, class_weight='balanced'),
        'XGBoost': XGBClassifier(eval_metric='mlogloss', random_state=42),
        'CatBoost': CatBoostClassifier(verbose=0, random_state=42),
        'MLPClassifier': MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42)
    }

    # Step 8-13: Cross-validation with SMOTE and Feature Selection
    print("\n" + "="*50)
    print("8-13. CROSS-VALIDATION WITH SMOTE & FEATURE SELECTION")
    print("="*50)

    # Set up K-fold cross-validation
    n_folds = 5
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

    # Track results for each model
    cv_results = {}
    # Rows = features, columns = models; each cell counts in how many folds
    # the feature passed the importance threshold for that model.
    feature_importance_counts = pd.DataFrame(0, index=X_train.columns, columns=list(models.keys()))

    # For each model
    for model_name, model in models.items():
        print(f"\nEvaluating {model_name}...")

        fold_accuracies = []
        selected_features_per_fold = []

        # For each fold
        for fold, (train_idx, valid_idx) in enumerate(skf.split(X_train, y_train_encoded), 1):
            print(f"  Fold {fold}/{n_folds}")

            # Split data for this fold
            X_fold_train, X_fold_valid = X_train.iloc[train_idx], X_train.iloc[valid_idx]
            y_fold_train, y_fold_valid = y_train_encoded[train_idx], y_train_encoded[valid_idx]

            # Step 9: Apply SMOTE on training portion only, so the validation
            # portion keeps its original class distribution.
            sm = SMOTE(random_state=42)
            X_fold_train_resampled, y_fold_train_resampled = sm.fit_resample(X_fold_train, y_fold_train)
            print(f"    Applied SMOTE: {X_fold_train.shape} → {X_fold_train_resampled.shape}")

            # Step 10: Feature Selection on training portion of fold
            if model_name in ['Random Forest', 'XGBoost', 'CatBoost']:
                # Initialize selector
                if model_name == 'Random Forest':
                    selector_model = RandomForestClassifier(random_state=42)
                elif model_name == 'XGBoost':
                    selector_model = XGBClassifier(eval_metric='mlogloss', random_state=42)
                else:  # CatBoost
                    selector_model = CatBoostClassifier(verbose=0, random_state=42)

                # Fit selector model
                selector_model.fit(X_fold_train_resampled, y_fold_train_resampled)

                # Select features whose importance is at least the mean importance
                selector = SelectFromModel(selector_model, threshold='mean', prefit=True)
                X_fold_train_selected = selector.transform(X_fold_train_resampled)
                X_fold_valid_selected = selector.transform(X_fold_valid)

                # Get selected feature names
                selected_mask = selector.get_support()
                selected_features = X_train.columns[selected_mask].tolist()
                selected_features_per_fold.append(selected_features)

                # Update feature importance count
                for feature in selected_features:
                    feature_importance_counts.loc[feature, model_name] += 1

                print(f"    Selected {len(selected_features)} features")
            else:
                # For MLP, use all features
                X_fold_train_selected = X_fold_train_resampled
                X_fold_valid_selected = X_fold_valid
                selected_features_per_fold.append(X_train.columns.tolist())

            # Step 11: Train model with selected features.
            # The same estimator object is refit on every fold.
            current_model = models[model_name]
            current_model.fit(X_fold_train_selected, y_fold_train_resampled)

            # Step 13: Evaluate on validation portion
            y_fold_pred = current_model.predict(X_fold_valid_selected)
            fold_accuracy = accuracy_score(y_fold_valid, y_fold_pred)
            fold_accuracies.append(fold_accuracy)

            print(f"    Fold accuracy: {fold_accuracy:.4f}")

        # Calculate average performance
        mean_accuracy = np.mean(fold_accuracies)
        std_accuracy = np.std(fold_accuracies)

        print(f"  {model_name} - {n_folds}-Fold CV Results:")
        print(f"  Mean Accuracy: {mean_accuracy:.4f} (±{std_accuracy:.4f})")

        # Count most frequently selected features
        if model_name in ['Random Forest', 'XGBoost', 'CatBoost']:
            all_selected_features = [feature for fold_features in selected_features_per_fold for feature in fold_features]
            feature_counts = pd.Series(all_selected_features).value_counts()
            most_common_features = feature_counts[feature_counts >= 3].index.tolist()
            print(f"  Features selected in at least 3 folds: {len(most_common_features)}")

        # Save model results
        cv_results[model_name] = {
            'fold_accuracies': fold_accuracies,
            'mean_accuracy': mean_accuracy,
            'std_accuracy': std_accuracy,
            'selected_features_per_fold': selected_features_per_fold
        }

    # Step 14: Select best model and feature set
    print("\n" + "="*50)
    print("14. SELECT BEST MODEL AND FEATURE SET")
    print("="*50)

    # Create summary DataFrame
    cv_summary = pd.DataFrame({
        'Model': list(cv_results.keys()),
        'Mean Accuracy': [results['mean_accuracy'] for results in cv_results.values()],
        'Std Dev': [results['std_accuracy'] for results in cv_results.values()]
    }).sort_values('Mean Accuracy', ascending=False)

    print("Cross-validation results summary:")
    print(cv_summary)

    # Select best model
    best_model_name = cv_summary.iloc[0]['Model']
    print(f"\nBest model: {best_model_name}")

    # For the best model, identify most frequently selected features
    if best_model_name in ['Random Forest', 'XGBoost', 'CatBoost']:
        feature_importance = feature_importance_counts[best_model_name].sort_values(ascending=False)

        # Features selected in at least 3 folds
        best_features = feature_importance[feature_importance >= 3].index.tolist()

        # If less than 20 features, take top 20
        if len(best_features) < 20:
            best_features = feature_importance.nlargest(20).index.tolist()

        print(f"Selected {len(best_features)} features for final model")
    else:
        # For MLP, use all features
        best_features = X_train.columns.tolist()
        print(f"Using all {len(best_features)} features for final model (MLP)")

    # Step 12: Hyperparameter Tuning for best model
    print("\n" + "="*50)
    print("12. HYPERPARAMETER TUNING")
    print("="*50)

    # Select subset of data with best features
    X_train_best = X_train[best_features]
    X_test_best = X_test[best_features]

    # Define hyperparameter search space for the best model
    if best_model_name == "Random Forest":
        model = RandomForestClassifier(random_state=42)
        param_grid = {
            'n_estimators': randint(100, 300),
            'max_depth': randint(5, 30),
            'min_samples_split': randint(2, 10),
            'min_samples_leaf': randint(1, 5),
            'max_features': ['sqrt', 'log2', None]
        }

    elif best_model_name == "XGBoost":
        model = XGBClassifier(eval_metric='mlogloss', random_state=42)
        param_grid = {
            'n_estimators': randint(100, 300),
            'max_depth': randint(3, 15),
            'learning_rate': uniform(0.01, 0.3),
            'subsample': uniform(0.5, 0.5),
            'colsample_bytree': uniform(0.5, 0.5)
        }

    elif best_model_name == "CatBoost":
        model = CatBoostClassifier(verbose=0, random_state=42)
        param_grid = {
            'iterations': randint(100, 500),
            'depth': randint(4, 10),
            'learning_rate': uniform(0.01, 0.3),
            'l2_leaf_reg': uniform(1, 10),
            'border_count': randint(32, 255)
        }

    else:  # MLPClassifier
        model = MLPClassifier(random_state=42)
        param_grid = {
            'hidden_layer_sizes': [(50,), (100,), (100, 50), (50, 50, 50)],
            'activation': ['relu', 'tanh'],
            'solver': ['adam'],
            'alpha': [1e-5, 1e-4, 1e-3],
            'learning_rate': ['constant', 'adaptive'],
            'max_iter': [300, 500]
        }

    # Run RandomizedSearchCV
    # NOTE(review): the search is fitted on the un-resampled training data,
    # whereas the model comparison above and the final fit below both use
    # SMOTE. The reported "Best CV accuracy" therefore describes a slightly
    # different training regime than the final model; wrapping SMOTE and the
    # estimator in an imblearn Pipeline would align them.
    print(f"Tuning hyperparameters for {best_model_name}...")
    random_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_grid,
        n_iter=20,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
        random_state=42,
        verbose=1
    )

    # Fit the model
    start_time = time.time()
    random_search.fit(X_train_best, y_train_encoded)
    end_time = time.time()

    print(f"Hyperparameter tuning completed in {end_time - start_time:.2f} seconds")
    print(f"Best parameters: {random_search.best_params_}")
    print(f"Best CV accuracy: {random_search.best_score_:.4f}")

    # Step 15: Train best model on full training set with best parameters
    print("\n" + "="*50)
    print("15. TRAIN FINAL MODEL")
    print("="*50)

    # Create final model with best parameters
    if best_model_name == "Random Forest":
        final_model = RandomForestClassifier(random_state=42, **random_search.best_params_)
    elif best_model_name == "XGBoost":
        final_model = XGBClassifier(eval_metric='mlogloss', random_state=42, **random_search.best_params_)
    elif best_model_name == "CatBoost":
        final_model = CatBoostClassifier(verbose=0, random_state=42, **random_search.best_params_)
    else:  # MLPClassifier
        final_model = MLPClassifier(random_state=42, **random_search.best_params_)

    # Apply SMOTE on full training set
    print("Applying SMOTE on full training set...")
    sm = SMOTE(random_state=42)
    X_train_best_resampled, y_train_encoded_resampled = sm.fit_resample(X_train_best, y_train_encoded)
    print(f"Training data shape after SMOTE: {X_train_best_resampled.shape}")

    # Train final model
    print(f"Training final {best_model_name} model...")
    final_model.fit(X_train_best_resampled, y_train_encoded_resampled)
    print("Final model training complete.")

    # Step 16: Model Evaluation on Test Set
    print("\n" + "="*50)
    print("16. MODEL EVALUATION ON TEST SET")
    print("="*50)

    # Predict on test set
    y_pred = final_model.predict(X_test_best)

    # Calculate accuracy
    accuracy = accuracy_score(y_test_encoded, y_pred)
    print(f"Test Accuracy: {accuracy:.4f}")

    # Print classification report
    print("\nClassification Report:")
    report = classification_report(y_test_encoded, y_pred, target_names=class_names)
    print(report)

    # Create confusion matrix.
    # The axes are created explicitly and handed to ConfusionMatrixDisplay:
    # without `ax=` the display opens its own figure, which leaves the
    # pre-sized figure empty and never closed.
    cm = confusion_matrix(y_test_encoded, y_pred)
    fig, ax = plt.subplots(figsize=(10, 8))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
    disp.plot(ax=ax, cmap='Blues', values_format='d')
    ax.set_title(f'Confusion Matrix - {best_model_name}')
    fig.savefig(f'results/{dataset_type}/{best_model_name}_confusion_matrix.png', dpi=300, bbox_inches='tight')
    plt.close(fig)

    # Normalized confusion matrix (each row divided by its true-class count).
    # Rows with no true samples stay at 0 instead of becoming NaN.
    row_sums = cm.sum(axis=1, keepdims=True)
    cm_normalized = np.divide(
        cm.astype('float'),
        row_sums,
        out=np.zeros(cm.shape, dtype=float),
        where=row_sums != 0
    )
    fig, ax = plt.subplots(figsize=(10, 8))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm_normalized, display_labels=class_names)
    disp.plot(ax=ax, cmap='Blues', values_format='.2f')
    ax.set_title(f'Normalized Confusion Matrix - {best_model_name}')
    fig.savefig(f'results/{dataset_type}/{best_model_name}_normalized_confusion_matrix.png', dpi=300, bbox_inches='tight')
    plt.close(fig)

    # Feature importance for tree-based models
    if best_model_name in ["Random Forest", "XGBoost", "CatBoost"]:
        # Get feature importances
        if best_model_name == "Random Forest":
            importances = final_model.feature_importances_
        elif best_model_name == "XGBoost":
            importances = final_model.feature_importances_
        else:  # CatBoost
            importances = final_model.get_feature_importance()

        # Create DataFrame for plotting
        importance_df = pd.DataFrame({
            'Feature': best_features,
            'Importance': importances
        }).sort_values('Importance', ascending=False)

        # Plot top 15 features
        plt.figure(figsize=(10, 8))
        sns.barplot(x='Importance', y='Feature', data=importance_df.head(15))
        plt.title(f'Top 15 Feature Importances - {best_model_name}')
        plt.tight_layout()
        plt.savefig(f'results/{dataset_type}/{best_model_name}_feature_importance.png', dpi=300, bbox_inches='tight')
        plt.close()

    # ROC curves (one-vs-rest, one curve per class)
    if hasattr(final_model, "predict_proba"):
        n_classes = len(class_names)

        # One-hot encode the true labels with one column per class.
        # A direct comparison is used instead of label_binarize because the
        # latter returns a single column for two classes, which would break
        # the per-class indexing below.
        y_test_bin = (y_test_encoded[:, np.newaxis] == np.arange(n_classes)).astype(int)
        y_score = final_model.predict_proba(X_test_best)

        # Compute ROC curve and ROC area for each class
        fpr = dict()
        tpr = dict()
        roc_auc = dict()

        for i in range(n_classes):
            fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_score[:, i])
            roc_auc[i] = auc(fpr[i], tpr[i])

        # Plot all ROC curves
        plt.figure(figsize=(10, 8))

        for i in range(n_classes):
            plt.plot(fpr[i], tpr[i], lw=2,
                    label=f'{class_names[i]} (AUC = {roc_auc[i]:.2f})')

        plt.plot([0, 1], [0, 1], 'k--', lw=2)  # chance-level diagonal
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f'ROC Curves - {best_model_name}')
        plt.legend(loc="lower right")
        plt.savefig(f'results/{dataset_type}/{best_model_name}_roc_curves.png', dpi=300, bbox_inches='tight')
        plt.close()

    # Save final results
    results = {
        'dataset_type': dataset_type,
        'best_model': best_model_name,
        'best_features': best_features,
        'best_params': random_search.best_params_,
        'cv_accuracy': random_search.best_score_,
        'test_accuracy': accuracy,
        'classification_report': classification_report(y_test_encoded, y_pred, target_names=class_names, output_dict=True)
    }

    return results

Run the complete pipeline for both datasets

In [5]:
# Announce the baseline run with a starred banner, then execute the pipeline
# and keep its result dict for the comparison cell.
print(
    "\n" + "*"*80,
    "RUNNING MODELING PIPELINE FOR BASELINE DATASET",
    "*"*80,
    sep="\n",
)
baseline_results = run_modeling_pipeline(dataset_type='baseline')


********************************************************************************
RUNNING MODELING PIPELINE FOR BASELINE DATASET
********************************************************************************

    MODELING PIPELINE FOR BASELINE DATASET


1. EXPLORATORY DATA ANALYSIS
Successfully loaded baseline datasets:
Training shape: (76065, 30)
Testing shape: (19017, 30)

Training Dataset Overview:
Columns: ['Month_sin', 'Month_cos', 'Day_sin', 'Day_cos', 'DayOfWeek_sin', 'DayOfWeek_cos', 'Years_Since_First', 'Is_US', 'ProductClassification_Class II', 'ProductClassification_Class III', 'ProductType_Devices', 'ProductType_Drugs', 'ProductType_Food/Cosmetics', 'ProductType_Tobacco', 'ProductType_Veterinary', 'Status_Ongoing', 'Status_Terminated', 'Business_Structure_Association', 'Business_Structure_Company', 'Business_Structure_Corporation', 'Business_Structure_Inc', 'Business_Structure_LLC', 'Business_Structure_LLP', 'Business_Structure_LP', 'Business_Structure_Ltd', 'Business_Str

<Figure size 1000x800 with 0 Axes>

<Figure size 1000x800 with 0 Axes>

In [6]:
# Announce the hybrid run with a starred banner, then execute the pipeline
# and keep its result dict for the comparison cell.
print(
    "\n" + "*"*80,
    "RUNNING MODELING PIPELINE FOR HYBRID DATASET",
    "*"*80,
    sep="\n",
)
hybrid_results = run_modeling_pipeline(dataset_type='hybrid')


********************************************************************************
RUNNING MODELING PIPELINE FOR HYBRID DATASET
********************************************************************************

    MODELING PIPELINE FOR HYBRID DATASET


1. EXPLORATORY DATA ANALYSIS
Successfully loaded hybrid datasets:
Training shape: (76065, 330)
Testing shape: (19017, 330)

Training Dataset Overview:
Columns: ['Month_sin', 'Month_cos', 'Day_sin', 'Day_cos', 'DayOfWeek_sin', 'DayOfWeek_cos', 'Years_Since_First', 'Is_US', 'ProductClassification_Class II', 'ProductClassification_Class III', 'ProductType_Devices', 'ProductType_Drugs', 'ProductType_Food/Cosmetics', 'ProductType_Tobacco', 'ProductType_Veterinary', 'Status_Ongoing', 'Status_Terminated', 'Business_Structure_Association', 'Business_Structure_Company', 'Business_Structure_Corporation', 'Business_Structure_Inc', 'Business_Structure_LLC', 'Business_Structure_LLP', 'Business_Structure_LP', 'Business_Structure_Ltd', 'Business_Structu

<Figure size 1000x800 with 0 Axes>

<Figure size 1000x800 with 0 Axes>

In [7]:
# Compare baseline and hybrid results
# Both variables are None when the corresponding pipeline run could not load
# its data, in which case the whole comparison is skipped.
if baseline_results and hybrid_results:
    print("\n" + "*"*80)
    print("COMPARING BASELINE VS HYBRID MODELS")
    print("*"*80)

    # Create comparison table
    comparison_df = pd.DataFrame({
        'Metric': ['Model Type', 'CV Accuracy', 'Test Accuracy', 'Number of Features'],
        'Baseline': [
            baseline_results['best_model'],
            f"{baseline_results['cv_accuracy']:.4f}",
            f"{baseline_results['test_accuracy']:.4f}",
            len(baseline_results['best_features'])
        ],
        'Hybrid': [
            hybrid_results['best_model'],
            f"{hybrid_results['cv_accuracy']:.4f}",
            f"{hybrid_results['test_accuracy']:.4f}",
            len(hybrid_results['best_features'])
        ]
    })

    print("\nComparison Summary:")
    print(comparison_df)

    # Calculate improvement (relative to the baseline test accuracy).
    # A zero baseline accuracy yields NaN instead of a division error.
    accuracy_diff = hybrid_results['test_accuracy'] - baseline_results['test_accuracy']
    if baseline_results['test_accuracy']:
        percent_improvement = (accuracy_diff / baseline_results['test_accuracy']) * 100
    else:
        percent_improvement = float('nan')

    print(f"\nAccuracy difference: {accuracy_diff:.4f}")
    print(f"Percent improvement: {percent_improvement:.2f}%")

    # NOTE(review): the conclusion is based on the raw sign of a single
    # test-set difference; no significance test is applied.
    if accuracy_diff > 0:
        print("\nConclusion: The hybrid model (with text features) performs better than the baseline model.")
    elif accuracy_diff < 0:
        print("\nConclusion: The baseline model performs better than the hybrid model with text features.")
    else:
        print("\nConclusion: Both models perform similarly. Text features did not significantly impact performance.")

    # Class-specific performance comparison
    baseline_report = pd.DataFrame(baseline_results['classification_report'])
    hybrid_report = pd.DataFrame(hybrid_results['classification_report'])

    print("\nClass-specific Performance Comparison:")
    for class_name in baseline_report.columns:
        # Skip the aggregate columns and any class absent from the hybrid
        # report (which would otherwise raise a KeyError on lookup).
        if class_name in ['accuracy', 'macro avg', 'weighted avg']:
            continue
        if class_name not in hybrid_report.columns:
            continue
        print(f"\n{class_name} Performance:")
        for metric in ['precision', 'recall', 'f1-score']:
            baseline_val = baseline_report.loc[metric, class_name]
            hybrid_val = hybrid_report.loc[metric, class_name]
            diff = hybrid_val - baseline_val
            print(f"  {metric}: Baseline={baseline_val:.4f}, Hybrid={hybrid_val:.4f}, Diff={diff:.4f}")

    # Save comparison results (make sure the target folder exists first)
    os.makedirs('results', exist_ok=True)
    comparison_df.to_csv('results/baseline_vs_hybrid_comparison.csv', index=False)

    # Create performance comparison bar chart
    metrics = ['CV Accuracy', 'Test Accuracy']
    baseline_values = [baseline_results['cv_accuracy'], baseline_results['test_accuracy']]
    hybrid_values = [hybrid_results['cv_accuracy'], hybrid_results['test_accuracy']]

    fig, ax = plt.subplots(figsize=(10, 6))
    x = np.arange(len(metrics))
    width = 0.35  # bar width; the two series are offset by half a width each

    ax.bar(x - width/2, baseline_values, width, label='Baseline')
    ax.bar(x + width/2, hybrid_values, width, label='Hybrid (with text)')

    ax.set_ylabel('Accuracy')
    ax.set_title('Performance Comparison: Baseline vs Hybrid Models')
    ax.set_xticks(x)
    ax.set_xticklabels(metrics)
    ax.legend()

    # Zoom in to highlight small differences. The lower bound stays at 0.95
    # when every value is above it, and drops just below the smallest value
    # otherwise so that no bar is clipped out of view.
    lowest_value = min(baseline_values + hybrid_values)
    y_floor = max(0.0, min(0.95, lowest_value - 0.01))
    ax.set_ylim(y_floor, 1.0)

    fig.savefig('results/baseline_vs_hybrid_accuracy_comparison.png', dpi=300, bbox_inches='tight')
    plt.close(fig)

    print("\nResults saved. Analysis complete.")


********************************************************************************
COMPARING BASELINE VS HYBRID MODELS
********************************************************************************

Comparison Summary:
               Metric       Baseline         Hybrid
0          Model Type  MLPClassifier  MLPClassifier
1         CV Accuracy         0.9899         0.9937
2       Test Accuracy         0.9895         0.9924
3  Number of Features             29            329

Accuracy difference: 0.0029
Percent improvement: 0.30%

Conclusion: The hybrid model (with text features) performs better than the baseline model.

Class-specific Performance Comparison:

Class I Performance:
  precision: Baseline=0.9943, Hybrid=0.9967, Diff=0.0025
  recall: Baseline=0.9930, Hybrid=0.9903, Diff=-0.0027
  f1-score: Baseline=0.9937, Hybrid=0.9935, Diff=-0.0001

Class II Performance:
  precision: Baseline=0.9960, Hybrid=0.9953, Diff=-0.0007
  recall: Baseline=0.9896, Hybrid=0.9942, Diff=0.0046
  f1-s