In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix



#"C:\Users\stanv\AppData\Roaming\Python\Python312"

In [2]:
file_name = '../data/filtered_events_country_code.csv'

df = pd.read_csv(file_name, delimiter=',')

In [5]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import (
    fbeta_score, recall_score, classification_report,
    confusion_matrix, roc_auc_score, f1_score
)
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.utils.class_weight import compute_class_weight
import warnings
warnings.filterwarnings('ignore')

# --- Load your DataFrame here ---
# Replace 'your_dataset.csv' with the actual path to your dataset
df = pd.read_csv('../data/filtered_events_country_code.csv')

# --- Step 1: Feature Engineering and Preparation ---

# Define 'violent' based on 'event_type'
# If 'event_type' is 'Riots', it's classified as violent (1), otherwise 0.
df['violent'] = (df['event_type'] == 'Riots').astype(int)

# Convert 'event_date' to datetime objects to enable sorting
df['event_date'] = pd.to_datetime(df['event_date'])

# Create 'month' and 'is_weekend' features
df['month'] = df['event_date'].dt.month
df['is_weekend'] = df['event_date'].dt.dayofweek.apply(lambda x: 1 if x >= 5 else 0)

# Sort by event_date for historical violence rate features
df = df.sort_values('event_date').reset_index(drop=True)

# Define columns for which to calculate historical violence rates
# Excludes 'region' and 'admin1' as per your last specification
columns_for_violence_rates = ['country', 'assoc_actor_1']

for col in columns_for_violence_rates:
    # Ensure column exists and is not entirely NaN before processing
    if col in df.columns and not df[col].isnull().all():
        df[f'{col}_total'] = df.groupby(col).cumcount()
        df[f'{col}_violent_sum'] = df.groupby(col)['violent'].cumsum().shift(1).fillna(0)
        # Avoid division by zero: replace 0 with NaN before division, then fill NaNs with 0
        df[f'{col}_violence_rate'] = df[f'{col}_violent_sum'] / df[f'{col}_total'].replace(0, np.nan)
        df[f'{col}_violence_rate'] = df[f'{col}_violence_rate'].fillna(0)

# Define features for the model
# Excludes 'region' and 'admin1' as per your last specification
categorical_features = ['country', 'assoc_actor_1']
numerical_features = ['year', 'month', 'is_weekend']

# Add violence rate features to numerical features, checking for their existence
violence_rate_features = [f'{col}_violence_rate' for col in columns_for_violence_rates]
for feature in violence_rate_features:
    if feature in df.columns:
        numerical_features.append(feature)

features = categorical_features + numerical_features

# Check if all features exist in the DataFrame before proceeding
missing_features = [f for f in features if f not in df.columns]

X = df[features]
y = df['violent']

# --- Step 2: Train/test split ---
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# --- Step 3: Preprocessing (with sparse output to handle memory) ---
# Ensure only existing categorical features are passed to OneHotEncoder
existing_categorical_features = [f for f in categorical_features if f in X.columns]
existing_numerical_features = [f for f in numerical_features if f in X.columns]

preprocessor = ColumnTransformer(
    transformers=[
        # Set sparse_output=True to return a sparse matrix for memory efficiency
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=True), existing_categorical_features),
        ('num', StandardScaler(), existing_numerical_features)
    ],
    remainder='drop'
)

# --- Step 4: Compute class weights ---
# This computes weights based on the training target distribution
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = dict(zip(np.unique(y_train), class_weights))

# --- Step 5: Define models ---
# Ensure models that support it use sparse input directly or handle class weights
models = {
    'XGBoost_Optimized': XGBClassifier(
        n_estimators=500,
        max_depth=8,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        # scale_pos_weight is for imbalanced datasets in XGBoost
        scale_pos_weight=class_weights[0] / class_weights[1],
        use_label_encoder=False, # Deprecated in newer XGBoost versions, good practice to include for compatibility
        eval_metric='logloss', # Common evaluation metric for binary classification
        random_state=42,
        n_jobs=-1, # Use all available CPU cores
    ),
    'Random_Forest_Optimized': RandomForestClassifier(
        n_estimators=500,
        max_depth=15,
        min_samples_split=5,
        min_samples_leaf=2,
        class_weight='balanced', # Handles class imbalance for RandomForest
        random_state=42,
        n_jobs=-1,
    ),
    'Gradient_Boosting_Optimized': GradientBoostingClassifier(
        n_estimators=500,
        max_depth=8,
        learning_rate=0.05,
        subsample=0.8,
        random_state=42,
        # Note: GradientBoostingClassifier in scikit-learn does not have a direct 'class_weight' parameter
        # It's more sensitive to imbalanced data; you might need to adjust 'sample_weight' during fit
        # or consider alternative boosting libraries (like LightGBM/CatBoost) for severe imbalance.
    ),
    'Logistic_Regression': LogisticRegression(
        max_iter=2000, # Increased max_iter for convergence
        random_state=42,
        class_weight='balanced', # Handles class imbalance for Logistic Regression
        C=0.1, # Regularization parameter
        solver='liblinear' # Good for small datasets and sparse data, faster for L1/L2 regularization
    )
}

# --- Evaluation function with threshold tuning ---
def evaluate_model_with_threshold(model, model_name):
    print(f"\n{'='*60}")
    print(f"Evaluating {model_name}")
    print(f"{'='*60}")
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])
    
    # Train the pipeline
    pipeline.fit(X_train, y_train)
    
    # Predict probabilities on the validation set
    y_proba = pipeline.predict_proba(X_val)[:, 1]
    
    # Tune threshold to maximize F2 score
    thresholds = np.linspace(0.1, 0.9, 17) # 17 points between 0.1 and 0.9 inclusive
    best_threshold = 0.5
    best_f2 = 0
    
    for t in thresholds:
        preds = (y_proba > t).astype(int)
        # Handle potential errors if a class is entirely missing in predictions
        # (e.g., if all preds are 0, and y_val contains 1s)
        f2 = fbeta_score(y_val, preds, beta=2)
        if f2 > best_f2:
            best_f2 = f2
            best_threshold = t
    
    print(f"Best threshold for F2 score: {best_threshold:.2f} with F2: {best_f2:.3f}")
    
    # Final predictions using the best threshold
    y_pred = (y_proba > best_threshold).astype(int)
    
    # Confusion matrix and classification report
    cm = confusion_matrix(y_val, y_pred)
    print("\nConfusion Matrix:")
    print(pd.DataFrame(
        cm,
        index=['Actual 0', 'Actual 1'],
        columns=['Pred 0', 'Pred 1']
    ))
    
    print("\nClassification Report:")
    # The `zero_division=0` parameter prevents warnings/errors if a class has no predicted samples.
    print(classification_report(y_val, y_pred, digits=4, zero_division=0))
    
    # Explicit F1 for class 0 and class 1 (robust to missing classes in prediction)
    f1_class_0 = f1_score(y_val, y_pred, pos_label=0, zero_division=0)
    f1_class_1 = f1_score(y_val, y_pred, pos_label=1, zero_division=0)
    print(f"F1 Score (Class 0): {f1_class_0:.4f}")
    print(f"F1 Score (Class 1): {f1_class_1:.4f}")
    
    # Recall for the positive class (Class 1)
    recall = recall_score(y_val, y_pred, zero_division=0)
    print(f"Recall (Class 1): {recall:.4f}")
    
    # ROC-AUC Score (requires positive class to be present in y_val)
    # Check if y_val has at least two unique classes for ROC-AUC
    if len(np.unique(y_val)) > 1:
        roc_auc = roc_auc_score(y_val, y_proba)
        print(f"ROC-AUC Score: {roc_auc:.4f}")

    # Cross-validation ROC-AUC (requires positive class to be present in y_train)
    if len(np.unique(y_train)) > 1:
        cv_scores = cross_val_score(pipeline, X_train, y_train, cv=3, scoring='roc_auc', n_jobs=-1)
        print(f"Cross-validation ROC-AUC: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
    return pipeline, y_pred, y_proba, best_threshold, roc_auc, cv_scores.mean() # Return ROC-AUC for comparison

# --- Run evaluations for all models ---
results = {}
for name, model in models.items():
    try:
        pipeline, y_pred, y_proba, threshold, roc_auc_val, cv_roc_auc = evaluate_model_with_threshold(model, name)
        results[name] = {
            'pipeline': pipeline,
            'y_pred': y_pred,
            'y_proba': y_proba,
            'best_threshold': threshold,
            'roc_auc_val': roc_auc_val, # Store validation ROC-AUC
            'cv_roc_auc': cv_roc_auc     # Store cross-validation ROC-AUC mean
        }
    except Exception as e:
        print(f"Error in {name}: {e}")

# --- Best model summary ---
if results:
    # Filter for models that successfully ran and have a valid ROC-AUC
    valid_results = {k: v for k, v in results.items() if not np.isnan(v['roc_auc_val'])}
    
    if valid_results:
        # Determine the best model based on validation ROC-AUC
        best_model_name = max(valid_results.keys(), key=lambda n: valid_results[n]['roc_auc_val'])
        print(f"\n{'='*60}")
        print(f"BEST MODEL: {best_model_name}")
        best_model = results[best_model_name]['pipeline']
        print(f"{'='*60}")

        # Feature importance for tree-based models
        try:
            clf = best_model.named_steps['classifier']
            if hasattr(clf, 'feature_importances_'):
                # Dynamically get feature names from the preprocessor
                all_transformed_features = []
                # Ensure the preprocessor has been fitted at least once
                if hasattr(preprocessor, 'named_transformers_'):
                    for transformer_name, _, original_cols in preprocessor.transformers:
                        if transformer_name == 'cat':
                            # Get feature names for one-hot encoded columns
                            all_transformed_features.extend(preprocessor.named_transformers_['cat'].get_feature_names_out(original_cols))
                        elif transformer_name == 'num':
                            # For numerical features, the names remain the same
                            all_transformed_features.extend(original_cols)

                importances = clf.feature_importances_
                
                # Make sure the number of importances matches the number of features
                if len(importances) == len(all_transformed_features):
                    feat_imp_df = pd.DataFrame({
                        'feature': all_transformed_features,
                        'importance': importances
                    }).sort_values('importance', ascending=False)
                    
                    print("\nFeature Importances:")
                    print(feat_imp_df.head(15).to_string(index=False))
                else:
                    print(f"Mismatch between number of feature importances ({len(importances)}) and transformed features ({len(all_transformed_features)}). Cannot display feature importances.")
            else:
                print("Selected model does not have feature_importances_ attribute.")

        except Exception as e:
            print(f"Could not extract feature importances: {e}")
    else:
        print("No valid model results to determine the best model (all ROC-AUC scores were NaN or errors occurred).")
else:
    print("No models were successfully evaluated.")


Evaluating XGBoost_Optimized
Best threshold for F2 score: 0.10 with F2: 0.157

Confusion Matrix:
          Pred 0  Pred 1
Actual 0   53582      46
Actual 1    1842     277

Classification Report:
              precision    recall  f1-score   support

           0     0.9668    0.9991    0.9827     53628
           1     0.8576    0.1307    0.2269      2119

    accuracy                         0.9661     55747
   macro avg     0.9122    0.5649    0.6048     55747
weighted avg     0.9626    0.9661    0.9540     55747

F1 Score (Class 0): 0.9827
F1 Score (Class 1): 0.2269
Recall (Class 1): 0.1307
ROC-AUC Score: 0.8130
Cross-validation ROC-AUC: 0.8129 (+/- 0.0058)

Evaluating Random_Forest_Optimized
Best threshold for F2 score: 0.50 with F2: 0.333

Confusion Matrix:
          Pred 0  Pred 1
Actual 0   42993   10635
Actual 1     754    1365

Classification Report:
              precision    recall  f1-score   support

           0     0.9828    0.8017    0.8830     53628
           1     

In [None]:
#Entire code AI:
"""
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import (
    fbeta_score, recall_score, classification_report,
    confusion_matrix, roc_auc_score, f1_score
)
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.utils.class_weight import compute_class_weight
import warnings
warnings.filterwarnings('ignore')

# --- Load your DataFrame here ---
# Replace 'your_dataset.csv' with the actual path to your dataset
df = pd.read_csv('../data/filtered_events_country_code.csv')

# --- Step 1: Feature Engineering and Preparation ---

# Define 'violent' based on 'event_type'
# If 'event_type' is 'Riots', it's classified as violent (1), otherwise 0.
df['violent'] = (df['event_type'] == 'Riots').astype(int)

# Convert 'event_date' to datetime objects to enable sorting
df['event_date'] = pd.to_datetime(df['event_date'])

# Create 'month' and 'is_weekend' features
df['month'] = df['event_date'].dt.month
df['is_weekend'] = df['event_date'].dt.dayofweek.apply(lambda x: 1 if x >= 5 else 0)

# Sort by event_date for historical violence rate features
df = df.sort_values('event_date').reset_index(drop=True)

# Define columns for which to calculate historical violence rates
# Excludes 'region' and 'admin1' as per your last specification
columns_for_violence_rates = ['country', 'assoc_actor_1']

for col in columns_for_violence_rates:
    # Ensure column exists and is not entirely NaN before processing
    if col in df.columns and not df[col].isnull().all():
        df[f'{col}_total'] = df.groupby(col).cumcount()
        df[f'{col}_violent_sum'] = df.groupby(col)['violent'].cumsum().shift(1).fillna(0)
        # Avoid division by zero: replace 0 with NaN before division, then fill NaNs with 0
        df[f'{col}_violence_rate'] = df[f'{col}_violent_sum'] / df[f'{col}_total'].replace(0, np.nan)
        df[f'{col}_violence_rate'] = df[f'{col}_violence_rate'].fillna(0)
    else:
        # If column is missing or all NaN, create a dummy column with zeros
        df[f'{col}_violence_rate'] = 0.0
        print(f"Warning: Column '{col}' not found or all missing values. Skipping violence rate calculation for this column.")

# Define features for the model
# Excludes 'region' and 'admin1' as per your last specification
categorical_features = ['country', 'assoc_actor_1']
numerical_features = ['year', 'month', 'is_weekend']

# Add violence rate features to numerical features, checking for their existence
violence_rate_features = [f'{col}_violence_rate' for col in columns_for_violence_rates]
for feature in violence_rate_features:
    if feature in df.columns:
        numerical_features.append(feature)
    else:
        print(f"Warning: Violence rate feature '{feature}' not found, skipping.")

features = categorical_features + numerical_features

# Check if all features exist in the DataFrame before proceeding
missing_features = [f for f in features if f not in df.columns]
if missing_features:
    print(f"Error: The following features are missing from the DataFrame: {missing_features}")
    # It's safer to exit or raise an error if critical features are missing
    # For now, we'll remove them and proceed, but this might impact model performance.
    features = [f for f in features if f not in missing_features] 

X = df[features]
y = df['violent']

# Fill missing values for categorical and numerical features before splitting
# This ensures consistency between train and validation sets
for col in categorical_features:
    if col in X.columns:
        X[col] = X[col].fillna('Unknown')
for col in numerical_features:
    if col in X.columns:
        X[col] = X[col].fillna(X[col].median())


# --- Step 2: Train/test split ---
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# --- Step 3: Preprocessing (with sparse output to handle memory) ---
# Ensure only existing categorical features are passed to OneHotEncoder
existing_categorical_features = [f for f in categorical_features if f in X.columns]
existing_numerical_features = [f for f in numerical_features if f in X.columns]

preprocessor = ColumnTransformer(
    transformers=[
        # Set sparse_output=True to return a sparse matrix for memory efficiency
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=True), existing_categorical_features),
        ('num', StandardScaler(), existing_numerical_features)
    ],
    remainder='drop'
)

# --- Step 4: Compute class weights ---
# This computes weights based on the training target distribution
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = dict(zip(np.unique(y_train), class_weights))

# --- Step 5: Define models ---
# Ensure models that support it use sparse input directly or handle class weights
models = {
    'XGBoost_Optimized': XGBClassifier(
        n_estimators=500,
        max_depth=8,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        # scale_pos_weight is for imbalanced datasets in XGBoost
        scale_pos_weight=class_weights[0] / class_weights[1],
        use_label_encoder=False, # Deprecated in newer XGBoost versions, good practice to include for compatibility
        eval_metric='logloss', # Common evaluation metric for binary classification
        random_state=42,
        n_jobs=-1, # Use all available CPU cores
    ),
    'Random_Forest_Optimized': RandomForestClassifier(
        n_estimators=500,
        max_depth=15,
        min_samples_split=5,
        min_samples_leaf=2,
        class_weight='balanced', # Handles class imbalance for RandomForest
        random_state=42,
        n_jobs=-1,
    ),
    'Gradient_Boosting_Optimized': GradientBoostingClassifier(
        n_estimators=500,
        max_depth=8,
        learning_rate=0.05,
        subsample=0.8,
        random_state=42,
        # Note: GradientBoostingClassifier in scikit-learn does not have a direct 'class_weight' parameter
        # It's more sensitive to imbalanced data; you might need to adjust 'sample_weight' during fit
        # or consider alternative boosting libraries (like LightGBM/CatBoost) for severe imbalance.
    ),
    'Logistic_Regression': LogisticRegression(
        max_iter=2000, # Increased max_iter for convergence
        random_state=42,
        class_weight='balanced', # Handles class imbalance for Logistic Regression
        C=0.1, # Regularization parameter
        solver='liblinear' # Good for small datasets and sparse data, faster for L1/L2 regularization
    )
}

# --- Evaluation function with threshold tuning ---
def evaluate_model_with_threshold(model, model_name):
    print(f"\n{'='*60}")
    print(f"Evaluating {model_name}")
    print(f"{'='*60}")
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])
    
    # Train the pipeline
    pipeline.fit(X_train, y_train)
    
    # Predict probabilities on the validation set
    y_proba = pipeline.predict_proba(X_val)[:, 1]
    
    # Tune threshold to maximize F2 score
    thresholds = np.linspace(0.1, 0.9, 17) # 17 points between 0.1 and 0.9 inclusive
    best_threshold = 0.5
    best_f2 = 0
    
    for t in thresholds:
        preds = (y_proba > t).astype(int)
        # Handle potential errors if a class is entirely missing in predictions
        # (e.g., if all preds are 0, and y_val contains 1s)
        try:
            f2 = fbeta_score(y_val, preds, beta=2)
            if f2 > best_f2:
                best_f2 = f2
                best_threshold = t
        except ValueError:
            # This can happen if all predictions are of one class and the true labels contain both.
            # It indicates a very poor model or an edge case in data.
            print(f"Warning: Could not compute F2 score for threshold {t:.2f}. Skipping.")
            continue
    
    print(f"Best threshold for F2 score: {best_threshold:.2f} with F2: {best_f2:.3f}")
    
    # Final predictions using the best threshold
    y_pred = (y_proba > best_threshold).astype(int)
    
    # Confusion matrix and classification report
    cm = confusion_matrix(y_val, y_pred)
    print("\nConfusion Matrix:")
    print(pd.DataFrame(
        cm,
        index=['Actual 0', 'Actual 1'],
        columns=['Pred 0', 'Pred 1']
    ))
    
    print("\nClassification Report:")
    # The `zero_division=0` parameter prevents warnings/errors if a class has no predicted samples.
    print(classification_report(y_val, y_pred, digits=4, zero_division=0))
    
    # Explicit F1 for class 0 and class 1 (robust to missing classes in prediction)
    f1_class_0 = f1_score(y_val, y_pred, pos_label=0, zero_division=0)
    f1_class_1 = f1_score(y_val, y_pred, pos_label=1, zero_division=0)
    print(f"F1 Score (Class 0): {f1_class_0:.4f}")
    print(f"F1 Score (Class 1): {f1_class_1:.4f}")
    
    # Recall for the positive class (Class 1)
    recall = recall_score(y_val, y_pred, zero_division=0)
    print(f"Recall (Class 1): {recall:.4f}")
    
    # ROC-AUC Score (requires positive class to be present in y_val)
    # Check if y_val has at least two unique classes for ROC-AUC
    if len(np.unique(y_val)) > 1:
        roc_auc = roc_auc_score(y_val, y_proba)
        print(f"ROC-AUC Score: {roc_auc:.4f}")
    else:
        print("ROC-AUC Score: N/A (Validation set does not contain both classes)")
        roc_auc = np.nan # Assign NaN if not computable

    # Cross-validation ROC-AUC (requires positive class to be present in y_train)
    if len(np.unique(y_train)) > 1:
        cv_scores = cross_val_score(pipeline, X_train, y_train, cv=3, scoring='roc_auc', n_jobs=-1)
        print(f"Cross-validation ROC-AUC: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
    else:
        print("Cross-validation ROC-AUC: N/A (Training set does not contain both classes)")
        cv_scores = np.array([np.nan]) # Assign NaN if not computable
    
    return pipeline, y_pred, y_proba, best_threshold, roc_auc, cv_scores.mean() # Return ROC-AUC for comparison

# --- Run evaluations for all models ---
results = {}
for name, model in models.items():
    try:
        pipeline, y_pred, y_proba, threshold, roc_auc_val, cv_roc_auc = evaluate_model_with_threshold(model, name)
        results[name] = {
            'pipeline': pipeline,
            'y_pred': y_pred,
            'y_proba': y_proba,
            'best_threshold': threshold,
            'roc_auc_val': roc_auc_val, # Store validation ROC-AUC
            'cv_roc_auc': cv_roc_auc     # Store cross-validation ROC-AUC mean
        }
    except Exception as e:
        print(f"Error in {name}: {e}")

# --- Best model summary ---
if results:
    # Filter for models that successfully ran and have a valid ROC-AUC
    valid_results = {k: v for k, v in results.items() if not np.isnan(v['roc_auc_val'])}
    
    if valid_results:
        # Determine the best model based on validation ROC-AUC
        best_model_name = max(valid_results.keys(), key=lambda n: valid_results[n]['roc_auc_val'])
        print(f"\n{'='*60}")
        print(f"🏆 BEST MODEL: {best_model_name}")
        best_model = results[best_model_name]['pipeline']
        print(f"{'='*60}")

        # Feature importance for tree-based models
        try:
            clf = best_model.named_steps['classifier']
            if hasattr(clf, 'feature_importances_'):
                # Dynamically get feature names from the preprocessor
                all_transformed_features = []
                # Ensure the preprocessor has been fitted at least once
                if hasattr(preprocessor, 'named_transformers_'):
                    for transformer_name, _, original_cols in preprocessor.transformers:
                        if transformer_name == 'cat':
                            # Get feature names for one-hot encoded columns
                            all_transformed_features.extend(preprocessor.named_transformers_['cat'].get_feature_names_out(original_cols))
                        elif transformer_name == 'num':
                            # For numerical features, the names remain the same
                            all_transformed_features.extend(original_cols)

                importances = clf.feature_importances_
                
                # Make sure the number of importances matches the number of features
                if len(importances) == len(all_transformed_features):
                    feat_imp_df = pd.DataFrame({
                        'feature': all_transformed_features,
                        'importance': importances
                    }).sort_values('importance', ascending=False)
                    
                    print("\nFeature Importances:")
                    print(feat_imp_df.head(15).to_string(index=False))
                else:
                    print(f"Mismatch between number of feature importances ({len(importances)}) and transformed features ({len(all_transformed_features)}). Cannot display feature importances.")
            else:
                print("Selected model does not have feature_importances_ attribute.")

        except Exception as e:
            print(f"Could not extract feature importances: {e}")
    else:
        print("No valid model results to determine the best model (all ROC-AUC scores were NaN or errors occurred).")
else:
    print("No models were successfully evaluated.")
"""