In [115]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
import time

In [117]:
# Load the processed dataset
processed_file_path = '/Users/parisakamizi/Downloads/datasets/train_hybrid.csv'
train_df = pd.read_csv(processed_file_path)

print("\nProcessed Dataset Information:")
print(f"Shape: {train_df.shape}")
print("\nFirst 5 rows:")
train_df.head()


Processed Dataset Information:
Shape: (76065, 332)

First 5 rows:


Unnamed: 0,Month_sin,Month_cos,Day_sin,Day_cos,DayOfWeek_sin,DayOfWeek_cos,Years_Since_First,Is_US,ProductClassification_Class II,ProductClassification_Class III,...,text_svd_293,text_svd_294,text_svd_295,text_svd_296,text_svd_297,text_svd_298,text_svd_299,Product Classification,combined_text,Event Classification
0,-1.0,-1.83697e-16,-0.937752,0.347305,0.781831,0.62349,6,0,1.0,0.0,...,0.021836,-0.015965,-0.016958,0.004899,0.018882,-0.017736,0.034006,Class II,supplier agfa system noted potential steel sup...,Class II
1,0.866025,0.5,0.897805,-0.440394,0.0,1.0,8,1,1.0,0.0,...,0.00326,-0.003569,-0.009412,-0.003837,0.012086,0.005998,-0.00499,Class II,blood collected donor whose suitability donate...,Class II
2,-1.0,-1.83697e-16,0.101168,-0.994869,0.433884,-0.900969,10,1,1.0,0.0,...,-0.007428,-0.007476,-0.002405,0.020428,-0.023333,0.044938,0.019889,Class II,lack assurance sterility tri mix injectable ml...,Class II
3,-0.866025,0.5,0.299363,-0.954139,0.781831,0.62349,2,1,1.0,0.0,...,0.005633,0.012897,-0.006782,-0.001761,0.028885,-0.003592,-0.022891,Class II,received several complaints head deck actuator...,Class II
4,-0.866025,0.5,0.848644,0.528964,-0.433884,-0.900969,6,1,1.0,0.0,...,0.013164,-0.029533,0.008793,0.033197,-0.025611,0.022909,-0.014812,Class II,sensors reported fluid inside posterior latera...,Class II


In [119]:
# Define features (X) and target (y)
X = train_df.drop('Event Classification', axis=1)
y = train_df['Event Classification']

In [121]:
# Encode features and target
categorical_cols = X.select_dtypes(include=['object', 'category']).columns
if categorical_cols.any():
    X[categorical_cols] = OrdinalEncoder().fit_transform(X[categorical_cols])
X = X.astype(int)

le = LabelEncoder()
y_encoded = le.fit_transform(y)

In [123]:
# Define fold configurations to test
fold_configs = [3, 5, 10]  

# xperiment with different fold counts
for n_folds in fold_configs:
    print(f"\n{'='*40}\nEvaluating with {n_folds}-fold CV\n{'='*40}")
    
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    fold_accuracies = []
    fold_reports = []
    
    for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y_encoded)):
        print(f"\nFold {fold+1}/{n_folds}")
        
        # Split and resample
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y_encoded[train_idx], y_encoded[valid_idx]
        
        sm = SMOTE(random_state=42)
        X_resampled, y_resampled = sm.fit_resample(X_train, y_train)
        
        # Train and evaluate
        model = RandomForestClassifier(class_weight='balanced', random_state=42)
        model.fit(X_resampled, y_resampled)
        
        y_pred = model.predict(X_valid)
        accuracy = accuracy_score(y_valid, y_pred)
        report = classification_report(y_valid, y_pred, target_names=le.classes_)
        
        fold_accuracies.append(accuracy)
        fold_reports.append(report)
        
        print(f"Accuracy: {accuracy:.4f}")
        print("Classification Report:")
        print(report)
    
    # Summary for this fold configuration
    print(f"\n{'='*30}")
    print(f"{n_folds}-Fold CV Summary:")
    print(f"Mean Accuracy: {np.mean(fold_accuracies):.4f}")
    print(f"Std Dev: {np.std(fold_accuracies):.4f}")
    print(f"Min Accuracy: {np.min(fold_accuracies):.4f}")
    print(f"Max Accuracy: {np.max(fold_accuracies):.4f}")
    print('='*30)


Evaluating with 3-fold CV

Fold 1/3
Accuracy: 0.9916
Classification Report:
              precision    recall  f1-score   support

     Class I       0.99      0.99      0.99      5363
    Class II       1.00      0.99      0.99     17953
   Class III       0.95      0.98      0.97      2039

    accuracy                           0.99     25355
   macro avg       0.98      0.99      0.98     25355
weighted avg       0.99      0.99      0.99     25355


Fold 2/3
Accuracy: 0.9925
Classification Report:
              precision    recall  f1-score   support

     Class I       1.00      0.99      0.99      5363
    Class II       1.00      0.99      0.99     17953
   Class III       0.95      0.98      0.97      2039

    accuracy                           0.99     25355
   macro avg       0.98      0.99      0.99     25355
weighted avg       0.99      0.99      0.99     25355


Fold 3/3
Accuracy: 0.9918
Classification Report:
              precision    recall  f1-score   support

     C

In [125]:
# For each fold after applying SMOTE
for train_idx, valid_idx in skf.split(X, y_encoded):
    X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
    y_train_fold, y_valid_fold = y_encoded[train_idx], y_encoded[valid_idx]

    # Apply SMOTE on training portion
    sm = SMOTE(random_state=42)
    X_train_smote, y_train_smote = sm.fit_resample(X_train_fold, y_train_fold)
    
    # Initialize RandomForestClassifier for feature selection
    rf = RandomForestClassifier(random_state=42)
    rf.fit(X_train_smote, y_train_smote)
    
    # Select features based on model's importance
    selector = SelectFromModel(rf, threshold="mean", max_features=20, importance_getter="auto")
    X_train_selected = selector.transform(X_train_smote)
    X_valid_selected = selector.transform(X_valid_fold)
    
    # Check the number of selected features
    print(f"Selected Features: {X_train_selected.shape[1]}")

    # Now you can proceed with training your model using selected features
    # Train your model (for example, RandomForest, XGBoost, etc.) with X_train_selected
    # and evaluate on X_valid_selected


Selected Features: 18
Selected Features: 18
Selected Features: 18
Selected Features: 18
Selected Features: 18
Selected Features: 18
Selected Features: 18
Selected Features: 18
Selected Features: 18
Selected Features: 18


In [127]:
# Initialize RandomForest model
rf_model = RandomForestClassifier(random_state=42)

# For each fold after SMOTE and feature selection
for train_idx, valid_idx in skf.split(X, y_encoded):
    X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
    y_train_fold, y_valid_fold = y_encoded[train_idx], y_encoded[valid_idx]

    # Apply SMOTE on training portion
    sm = SMOTE(random_state=42)
    X_train_smote, y_train_smote = sm.fit_resample(X_train_fold, y_train_fold)
    
    # Feature Selection using Random Forest model
    rf = RandomForestClassifier(random_state=42)
    rf.fit(X_train_smote, y_train_smote)
    selector = SelectFromModel(rf, threshold="mean", max_features=20, importance_getter="auto")
    X_train_selected = selector.transform(X_train_smote)
    X_valid_selected = selector.transform(X_valid_fold)

    # Train the Random Forest model on selected features
    rf_model.fit(X_train_selected, y_train_smote)

    # Predict on validation set
    y_valid_pred = rf_model.predict(X_valid_selected)

    # Evaluate performance (accuracy)
    accuracy = accuracy_score(y_valid_fold, y_valid_pred)
    print(f"Random Forest Accuracy: {accuracy:.4f}")


Random Forest Accuracy: 0.9933
Random Forest Accuracy: 0.9930
Random Forest Accuracy: 0.9913
Random Forest Accuracy: 0.9912
Random Forest Accuracy: 0.9932
Random Forest Accuracy: 0.9930
Random Forest Accuracy: 0.9905
Random Forest Accuracy: 0.9918
Random Forest Accuracy: 0.9929
Random Forest Accuracy: 0.9933


Compute and print the mean and standard deviation of these accuracies.

Save the selected features used in each fold and look for overlap — helps you identify consistently important features.

Log a confusion matrix or classification report for one of the folds to see where errors occur (e.g., is Class III being confused with Class II?).



In [136]:
xgb_accuracies = []

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y_encoded), 1):
    print(f"=== Fold {fold} ===")
    
    # Split the data
    X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
    y_train_fold, y_valid_fold = y_encoded[train_idx], y_encoded[valid_idx]
    
    # Apply SMOTE
    sm = SMOTE(random_state=42)
    X_train_smote, y_train_smote = sm.fit_resample(X_train_fold, y_train_fold)
    
    # Fit XGBoost
    xgb = XGBClassifier(eval_metric='mlogloss', random_state=42)
    xgb.fit(X_train_smote, y_train_smote)
    
    importances = pd.Series(xgb.feature_importances_, index=X.columns)
    top_features = importances.sort_values(ascending=False).head(20).index.tolist()
    
    X_train_selected = X_train_smote[top_features]
    X_valid_selected = X_valid_fold[top_features]
    
    xgb_selected = XGBClassifier(eval_metric='mlogloss', random_state=42)
    xgb_selected.fit(X_train_selected, y_train_smote)
    
    y_pred = xgb_selected.predict(X_valid_selected)
    acc = accuracy_score(y_valid_fold, y_pred)
    xgb_accuracies.append(acc)
    
    print(f"XGBoost Accuracy: {acc:.4f}")

print("\n=== Final XGBoost Cross-Validation Results ===")
print(f"Mean Accuracy: {np.mean(xgb_accuracies):.4f} (±{np.std(xgb_accuracies):.4f})")


=== Fold 1 ===
XGBoost Accuracy: 0.9888
=== Fold 2 ===
XGBoost Accuracy: 0.9909
=== Fold 3 ===
XGBoost Accuracy: 0.9897
=== Fold 4 ===
XGBoost Accuracy: 0.9883
=== Fold 5 ===
XGBoost Accuracy: 0.9900
=== Fold 6 ===
XGBoost Accuracy: 0.9915
=== Fold 7 ===
XGBoost Accuracy: 0.9884
=== Fold 8 ===
XGBoost Accuracy: 0.9890
=== Fold 9 ===
XGBoost Accuracy: 0.9890
=== Fold 10 ===
XGBoost Accuracy: 0.9907

=== Final XGBoost Cross-Validation Results ===
Mean Accuracy: 0.9896 (±0.0010)


In [138]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Initialize StratifiedKFold
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

mlp_accuracies = []

fold = 1
for train_idx, valid_idx in skf.split(X, y_encoded):
    print(f"=== Fold {fold} ===")
    X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
    y_train_fold, y_valid_fold = y_encoded[train_idx], y_encoded[valid_idx]

    # Apply SMOTE to training data
    sm = SMOTE(random_state=42)
    X_train_smote, y_train_smote = sm.fit_resample(X_train_fold, y_train_fold)

    # Initialize MLPClassifier
    mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42)

    # Fit model
    mlp.fit(X_train_smote, y_train_smote)

    # Predict and evaluate
    y_pred = mlp.predict(X_valid_fold)
    acc = accuracy_score(y_valid_fold, y_pred)
    mlp_accuracies.append(acc)
    print(f"MLPClassifier Accuracy: {acc:.4f}")
    
    fold += 1

mean_acc = sum(mlp_accuracies) / len(mlp_accuracies)
std_acc = (sum((x - mean_acc) ** 2 for x in mlp_accuracies) / len(mlp_accuracies)) ** 0.5

print("\n=== Final MLPClassifier Cross-Validation Results ===")
print(f"Mean Accuracy: {mean_acc:.4f} (±{std_acc:.4f})")


=== Fold 1 ===
MLPClassifier Accuracy: 0.8691
=== Fold 2 ===
MLPClassifier Accuracy: 0.5839
=== Fold 3 ===
MLPClassifier Accuracy: 0.9590
=== Fold 4 ===
MLPClassifier Accuracy: 0.9744
=== Fold 5 ===
MLPClassifier Accuracy: 0.7477
=== Fold 6 ===
MLPClassifier Accuracy: 0.5456
=== Fold 7 ===
MLPClassifier Accuracy: 0.9716
=== Fold 8 ===
MLPClassifier Accuracy: 0.8776
=== Fold 9 ===
MLPClassifier Accuracy: 0.9666
=== Fold 10 ===
MLPClassifier Accuracy: 0.9716

=== Final MLPClassifier Cross-Validation Results ===
Mean Accuracy: 0.8467 (±0.1567)


In [140]:
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Set up cross-validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
catboost_accuracies = []

fold = 1
for train_idx, valid_idx in skf.split(X, y_encoded):
    print(f"=== Fold {fold} ===")
    X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
    y_train_fold, y_valid_fold = y_encoded[train_idx], y_encoded[valid_idx]

    # Apply SMOTE
    sm = SMOTE(random_state=42)
    X_train_smote, y_train_smote = sm.fit_resample(X_train_fold, y_train_fold)

    # Initialize CatBoost (silent training)
    cat_model = CatBoostClassifier(verbose=0, random_state=42)

    # Fit and predict
    cat_model.fit(X_train_smote, y_train_smote)
    y_pred = cat_model.predict(X_valid_fold)

    # Evaluate
    acc = accuracy_score(y_valid_fold, y_pred)
    catboost_accuracies.append(acc)
    print(f"CatBoost Accuracy: {acc:.4f}")

    fold += 1

mean_acc = sum(catboost_accuracies) / len(catboost_accuracies)
std_acc = (sum((x - mean_acc) ** 2 for x in catboost_accuracies) / len(catboost_accuracies)) ** 0.5

print("\n=== Final CatBoost Cross-Validation Results ===")
print(f"Mean Accuracy: {mean_acc:.4f} (±{std_acc:.4f})")


=== Fold 1 ===
CatBoost Accuracy: 0.9892
=== Fold 2 ===
CatBoost Accuracy: 0.9896
=== Fold 3 ===
CatBoost Accuracy: 0.9878
=== Fold 4 ===
CatBoost Accuracy: 0.9884
=== Fold 5 ===
CatBoost Accuracy: 0.9887
=== Fold 6 ===
CatBoost Accuracy: 0.9901
=== Fold 7 ===
CatBoost Accuracy: 0.9874
=== Fold 8 ===
CatBoost Accuracy: 0.9884
=== Fold 9 ===
CatBoost Accuracy: 0.9891
=== Fold 10 ===
CatBoost Accuracy: 0.9905

=== Final CatBoost Cross-Validation Results ===
Mean Accuracy: 0.9889 (±0.0009)


In [146]:
# Define the model
rf_model = RandomForestClassifier(random_state=42)

# Define parameter grid
rf_param_dist = {
    'n_estimators': randint(100, 300),
    'max_depth': randint(5, 30),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 5),
    'max_features': ['sqrt', 'log2', None]
}

# Setup RandomizedSearchCV
rf_random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=rf_param_dist,
    n_iter=20,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
    verbose=2
)

start = time.time()
rf_random_search.fit(X, y)
end = time.time()

print(f"\nBest Random Forest Params: {rf_random_search.best_params_}")
print(f"Best Accuracy: {rf_random_search.best_score_:.4f}")
print(f"Tuning Time: {end - start:.2f} seconds")


Fitting 5 folds for each of 20 candidates, totalling 100 fits

Best Random Forest Params: {'max_depth': 25, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 4, 'n_estimators': 174}
Best Accuracy: 0.9919
Tuning Time: 608.12 seconds


In [197]:
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import cross_val_score
import optuna

def catboost_objective(trial):
    params = {
        "iterations": trial.suggest_int("iterations", 100, 500),
        "depth": trial.suggest_int("depth", 4, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10),
        "random_strength": trial.suggest_float("random_strength", 0.1, 1),
        "border_count": trial.suggest_int("border_count", 32, 255),
        "verbose": 0
    }

    model = CatBoostClassifier(**params)
    scores = cross_val_score(model, X, y, cv=5, scoring="accuracy")
    return scores.mean()

study = optuna.create_study(direction="maximize")
study.optimize(catboost_objective, n_trials=20)

print("Best CatBoost Params:", study.best_params)
print("Best Accuracy:", study.best_value)


[I 2025-04-10 23:20:25,217] A new study created in memory with name: no-name-7c80adcc-3671-4eac-9a1b-6781e3be41af
[I 2025-04-10 23:21:02,337] Trial 0 finished with value: 0.9867218826004075 and parameters: {'iterations': 323, 'depth': 9, 'learning_rate': 0.05499635560911883, 'l2_leaf_reg': 6.088201699358175, 'random_strength': 0.2536876811795584, 'border_count': 196}. Best is trial 0 with value: 0.9867218826004075.
[I 2025-04-10 23:21:14,897] Trial 1 finished with value: 0.9734043252481431 and parameters: {'iterations': 242, 'depth': 5, 'learning_rate': 0.01134061009680689, 'l2_leaf_reg': 1.1795192959426974, 'random_strength': 0.7508737219380018, 'border_count': 58}. Best is trial 0 with value: 0.9867218826004075.
[I 2025-04-10 23:21:38,819] Trial 2 finished with value: 0.9884309472161966 and parameters: {'iterations': 436, 'depth': 6, 'learning_rate': 0.1515952932447508, 'l2_leaf_reg': 4.054509983652165, 'random_strength': 0.8871152507039605, 'border_count': 149}. Best is trial 2 with

Best CatBoost Params: {'iterations': 452, 'depth': 10, 'learning_rate': 0.13471139496824774, 'l2_leaf_reg': 5.988581181994652, 'random_strength': 0.6291669384495817, 'border_count': 228}
Best Accuracy: 0.9897850522579373


In [193]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RandomizedSearchCV

param_dist = {
    'hidden_layer_sizes': [(50,), (100,), (100, 50), (50, 50, 50)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam'],
    'alpha': [1e-5, 1e-4, 1e-3],
    'learning_rate': ['constant', 'adaptive'],
    'max_iter': [300, 500]
}

mlp = MLPClassifier(random_state=42)
mlp_search = RandomizedSearchCV(mlp, param_distributions=param_dist, n_iter=20, cv=5, scoring='accuracy', n_jobs=-1)
mlp_search.fit(X, y)

print("Best MLP Params:", mlp_search.best_params_)
print("Best Accuracy:", mlp_search.best_score_)


Best MLP Params: {'solver': 'adam', 'max_iter': 500, 'learning_rate': 'constant', 'hidden_layer_sizes': (50,), 'alpha': 1e-05, 'activation': 'relu'}
Best Accuracy: 0.9387366068494052


In [201]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform
import time

# Define the model
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Parameter grid
xgb_param_dist = {
    'n_estimators': randint(100, 300),
    'max_depth': randint(3, 15),
    'learning_rate': uniform(0.01, 0.3),
    'subsample': uniform(0.5, 0.5),
    'colsample_bytree': uniform(0.5, 0.5)
}

# Setup RandomizedSearchCV
xgb_random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=xgb_param_dist,
    n_iter=20,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
    verbose=2
)

start = time.time()
xgb_random_search.fit(X, y)
end = time.time()

print(f"\nBest XGBoost Params: {xgb_random_search.best_params_}")
print(f"Best Accuracy: {xgb_random_search.best_score_:.4f}")
print(f"Tuning Time: {end - start:.2f} seconds")


AttributeError: 'super' object has no attribute '__sklearn_tags__'

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder

# Load Datasets 
train_df = pd.read_csv('/Users/parisakamizi/Downloads/datasets/train_hybrid.csv')
test_df = pd.read_csv('/Users/parisakamizi/Downloads/datasets/test_hybrid.csv')

# Separate Features and Target 
X_train = train_df.drop('Event Classification', axis=1)
y_train = train_df['Event Classification']
X_test = test_df.drop('Event Classification', axis=1)
y_test = test_df['Event Classification']

# Encode Categorical Features 
# One-Hot Encoding 
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

# Ensure test data has same columns as train data
missing_cols = set(X_train.columns) - set(X_test.columns)
for col in missing_cols:
    X_test[col] = 0
X_test = X_test[X_train.columns]

# Encode Target Variable 
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

# Apply SMOTE to Handle Imbalance 
sm = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = sm.fit_resample(X_train, y_train_encoded)

# Final Model Training 
final_rf = RandomForestClassifier(
    n_estimators=174,
    max_depth=25,
    min_samples_split=4,
    min_samples_leaf=2,
    max_features=None,
    random_state=42
)
final_rf.fit(X_train_balanced, y_train_balanced)

# Final Evaluation 
y_pred = final_rf.predict(X_test)
acc = accuracy_score(y_test_encoded, y_pred)
report = classification_report(y_test_encoded, y_pred)
conf_matrix = confusion_matrix(y_test_encoded, y_pred)

print(f"Final Test Accuracy: {acc:.4f}")
print("\n=== Classification Report ===")
print(report)
print("\n=== Confusion Matrix ===")
print(conf_matrix)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Get feature importances
importances = final_rf.feature_importances_
feature_names = X_train.columns
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

# Plot
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df.head(15), palette='viridis')
plt.title('Top 15 Feature Importances (Random Forest)')
plt.tight_layout()
plt.show()


In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Predict on the test set
catboost_preds = best_cat.predict(X_test)

# Accuracy
catboost_test_accuracy = accuracy_score(y_test, catboost_preds)
print(f"CatBoost Test Accuracy: {catboost_test_accuracy:.4f}")

# Classification report
print("\nClassification Report (CatBoost):")
print(classification_report(y_test, catboost_preds))

# Confusion matrix
conf_matrix = confusion_matrix(y_test, catboost_preds)
print("\nConfusion Matrix:")
print(conf_matrix)


In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Predict on the test set
xgb_preds = best_xgb.predict(X_test)

# Accuracy
xgb_test_accuracy = accuracy_score(y_test, xgb_preds)
print(f"XGBoost Test Accuracy: {xgb_test_accuracy:.4f}")

# Classification report
print("\nClassification Report (XGBoost):")
print(classification_report(y_test, xgb_preds))

# Confusion matrix
print("\nConfusion Matrix (XGBoost):")
print(confusion_matrix(y_test, xgb_preds))


In [None]:
# Predict on the test set
mlp_preds = best_mlp.predict(X_test)

# Accuracy
mlp_test_accuracy = accuracy_score(y_test, mlp_preds)
print(f"MLPClassifier Test Accuracy: {mlp_test_accuracy:.4f}")

# Classification report
print("\nClassification Report (MLPClassifier):")
print(classification_report(y_test, mlp_preds))

# Confusion matrix
print("\nConfusion Matrix (MLPClassifier):")
print(confusion_matrix(y_test, mlp_preds))


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Function to plot confusion matrix
def plot_conf_matrix(y_true, y_pred, title):
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
    plt.title(f"Confusion Matrix - {title}")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()

# Confusion Matrix for XGBoost
plot_conf_matrix(y_test, y_pred_xgb, "XGBoost")

# Confusion Matrix for MLPClassifier
plot_conf_matrix(y_test, y_pred_mlp, "MLPClassifier")


In [None]:
# Classification Report as Heatmap
def plot_classification_report(y_true, y_pred, title):
    report = classification_report(y_true, y_pred, output_dict=True)
    df = pd.DataFrame(report).iloc[:-1, :].T  # Exclude 'accuracy' row
    plt.figure(figsize=(10, 6))
    sns.heatmap(df, annot=True, cmap="YlGnBu")
    plt.title(f"Classification Report - {title}")
    plt.show()

# XGBoost Report
plot_classification_report(y_test, y_pred_xgb, "XGBoost")

# MLPClassifier Report
plot_classification_report(y_test, y_pred_mlp, "MLPClassifier")


In [None]:
# Accuracy comparison
acc_xgb = accuracy_score(y_test, y_pred_xgb)
acc_mlp = accuracy_score(y_test, y_pred_mlp)

plt.figure(figsize=(6,4))
sns.barplot(x=["XGBoost", "MLPClassifier"], y=[acc_xgb, acc_mlp], palette="pastel")
plt.title("Model Accuracy Comparison")
plt.ylabel("Accuracy")
plt.ylim(0.0, 1.0)
plt.grid(axis="y")
plt.show()


In [None]:
# Confusion Matrix for CatBoost
plot_conf_matrix(y_test, y_pred_catboost, "CatBoost")

# Confusion Matrix for Random Forest
plot_conf_matrix(y_test, y_pred_rf, "Random Forest")


In [None]:
# CatBoost Report
plot_classification_report(y_test, y_pred_catboost, "CatBoost")

# Random Forest Report
plot_classification_report(y_test, y_pred_rf, "Random Forest")


In [None]:
# Accuracy scores
acc_rf = accuracy_score(y_test, y_pred_rf)
acc_cat = accuracy_score(y_test, y_pred_catboost)

# Extended bar chart with all models
model_names = ["Random Forest", "CatBoost", "XGBoost", "MLPClassifier"]
accuracies = [acc_rf, acc_cat, acc_xgb, acc_mlp]

plt.figure(figsize=(8, 5))
sns.barplot(x=model_names, y=accuracies, palette="pastel")
plt.title("Accuracy Comparison Across Models")
plt.ylabel("Accuracy")
plt.ylim(0.0, 1.0)
plt.grid(axis="y")
plt.show()


In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Confusion Matrices (Counts and Normalized)
for model_name, y_pred in zip(
    ["Random Forest", "CatBoost", "XGBoost", "MLPClassifier"],
    [y_pred_rf, y_pred_catboost, y_pred_xgb, y_pred_mlp]
):
    fig, ax = plt.subplots(1, 2, figsize=(14, 5))
    ConfusionMatrixDisplay.from_predictions(y_test, y_pred, ax=ax[0], cmap="Blues")
    ax[0].set_title(f"{model_name} - Confusion Matrix (Counts)")
    ConfusionMatrixDisplay.from_predictions(y_test, y_pred, normalize='true', ax=ax[1], cmap="Oranges")
    ax[1].set_title(f"{model_name} - Confusion Matrix (Normalized)")
    plt.tight_layout()
    plt.show()


In [None]:
from sklearn.metrics import classification_report
import seaborn as sns

def plot_class_report(y_true, y_pred, title):
    report = classification_report(y_true, y_pred, output_dict=True)
    df_report = pd.DataFrame(report).iloc[:-1, :].T  # drop accuracy row
    plt.figure(figsize=(8, 4))
    sns.heatmap(df_report, annot=True, cmap="YlGnBu", fmt=".2f")
    plt.title(f"{title} - Classification Report")
    plt.ylabel("Metrics")
    plt.xlabel("Labels")
    plt.show()

# Plot for all models
plot_class_report(y_test, y_pred_rf, "Random Forest")
plot_class_report(y_test, y_pred_catboost, "CatBoost")
plot_class_report(y_test, y_pred_xgb, "XGBoost")
plot_class_report(y_test, y_pred_mlp, "MLPClassifier")


In [None]:
from sklearn.metrics import accuracy_score

# Accuracy scores
accuracies = {
    "Random Forest": accuracy_score(y_test, y_pred_rf),
    "CatBoost": accuracy_score(y_test, y_pred_catboost),
    "XGBoost": accuracy_score(y_test, y_pred_xgb),
    "MLPClassifier": accuracy_score(y_test, y_pred_mlp)
}

# Bar chart
plt.figure(figsize=(8, 5))
sns.barplot(x=list(accuracies.keys()), y=list(accuracies.values()), palette="pastel")
plt.title("Model Accuracy Comparison")
plt.ylabel("Accuracy")
plt.ylim(0.5, 1.0)
plt.grid(axis="y")
plt.show()


In [None]:
# Random Forest
importances_rf = pd.Series(best_rf_model.feature_importances_, index=X_test.columns)
importances_rf.sort_values().tail(10).plot(kind="barh", title="Random Forest - Top Features", figsize=(8, 4))
plt.show()

# CatBoost
importances_cat = pd.Series(best_cat_model.get_feature_importance(), index=X_test.columns)
importances_cat.sort_values().tail(10).plot(kind="barh", title="CatBoost - Top Features", figsize=(8, 4))
plt.show()

# XGBoost
importances_xgb = pd.Series(best_xgb_model.feature_importances_, index=X_test.columns)
importances_xgb.sort_values().tail(10).plot(kind="barh", title="XGBoost - Top Features", figsize=(8, 4))
plt.show()


In [None]:
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import StandardScaler

# Binarize the output
classes = np.unique(y_test)
y_test_bin = label_binarize(y_test, classes=classes)

def plot_roc(model, X_test, y_test_bin, name):
    y_score = model.predict_proba(X_test)
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    
    for i in range(len(classes)):
        fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
    
    plt.figure(figsize=(8, 5))
    for i in range(len(classes)):
        plt.plot(fpr[i], tpr[i], label=f"Class {i} (AUC = {roc_auc[i]:.2f})")
    
    plt.plot([0, 1], [0, 1], 'k--')
    plt.title(f"Multi-class ROC Curve - {name}")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.legend(loc="lower right")
    plt.grid()
    plt.show()

# For tree-based models
plot_roc(best_rf_model, X_test, y_test_bin, "Random Forest")
plot_roc(best_cat_model, X_test, y_test_bin, "CatBoost")
plot_roc(best_xgb_model, X_test, y_test_bin, "XGBoost")


In [None]:
# Collecting performance metrics for each model
from sklearn.metrics import classification_report

# Creating a dictionary to store metrics
performance = {}

for model_name, y_pred in zip(
    ["Random Forest", "CatBoost", "XGBoost", "MLPClassifier"],
    [y_pred_rf, y_pred_catboost, y_pred_xgb, y_pred_mlp]
):
    report = classification_report(y_test, y_pred, output_dict=True)
    performance[model_name] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision (Class 0)": report["0"]["precision"],
        "Precision (Class 1)": report["1"]["precision"],
        "Precision (Class 2)": report["2"]["precision"],
        "Recall (Class 0)": report["0"]["recall"],
        "Recall (Class 1)": report["1"]["recall"],
        "Recall (Class 2)": report["2"]["recall"],
        "F1 (Class 0)": report["0"]["f1-score"],
        "F1 (Class 1)": report["1"]["f1-score"],
        "F1 (Class 2)": report["2"]["f1-score"]
    }

# Creating DataFrame for better visualization
performance_df = pd.DataFrame(performance).T
performance_df = performance_df.round(4)  # Rounding for clarity

# Displaying the performance table
print("Final Model Performance Summary:")
print(performance_df)


In [None]:
# Saving performance summary to a CSV file
performance_df.to_csv("model_performance_summary.csv", index=True)
print("Performance summary exported as 'model_performance_summary.csv'.")


In [None]:
# Save confusion matrix plots
for model_name, y_pred in zip(
    ["Random Forest", "CatBoost", "XGBoost", "MLPClassifier"],
    [y_pred_rf, y_pred_catboost, y_pred_xgb, y_pred_mlp]
):
    fig, ax = plt.subplots(1, 2, figsize=(14, 5))
    ConfusionMatrixDisplay.from_predictions(y_test, y_pred, ax=ax[0], cmap="Blues")
    ax[0].set_title(f"{model_name} - Confusion Matrix (Counts)")
    ConfusionMatrixDisplay.from_predictions(y_test, y_pred, normalize='true', ax=ax[1], cmap="Oranges")
    ax[1].set_title(f"{model_name} - Confusion Matrix (Normalized)")
    plt.tight_layout()
    plt.savefig(f"{model_name}_confusion_matrix.png")

# Save Classification Report Heatmaps
for model_name, y_pred in zip(
    ["Random Forest", "CatBoost", "XGBoost", "MLPClassifier"],
    [y_pred_rf, y_pred_catboost, y_pred_xgb, y_pred_mlp]
):
    plot_class_report(y_test, y_pred, model_name)
    plt.savefig(f"{model_name}_classification_report_heatmap.png")

# Save Feature Importance Plots
importances_rf.sort_values().tail(10).plot(kind="barh", title="Random Forest - Top Features", figsize=(8, 4))
plt.savefig("Random_Forest_feature_importance.png")
importances_cat.sort_values().tail(10).plot(kind="barh", title="CatBoost - Top Features", figsize=(8, 4))
plt.savefig("CatBoost_feature_importance.png")
importances_xgb.sort_values().tail(10).plot(kind="barh", title="XGBoost - Top Features", figsize=(8, 4))
plt.savefig("XGBoost_feature_importance.png")

# Save ROC Curves
plot_roc(best_rf_model, X_test, y_test_bin, "Random Forest")
plt.savefig("Random_Forest_ROC_curve.png")
plot_roc(best_cat_model, X_test, y_test_bin, "CatBoost")
plt.savefig("CatBoost_ROC_curve.png")
plot_roc(best_xgb_model, X_test, y_test_bin, "XGBoost")
plt.savefig("XGBoost_ROC_curve.png")
