In [42]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, matthews_corrcoef
import pandas as pd
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.naive_bayes import BernoulliNB
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier

# ---------------------------------------------------------------------------
# Experiment configuration
# ---------------------------------------------------------------------------
# Input dataset and its delimiter. Point PATH at another file to evaluate a
# different feature set (e.g. 'sncb_final.csv').
PATH = 'feature_selection\\sncb_mrmr.csv'
SEP = ','

# Number of independent train/test repetitions run for every model.
CV = 5

# Short identifiers of the classifiers that get trained and evaluated.
models_names = ['BNB', 'RFC', 'XGB', 'KNN']

# Text file that receives the aggregated evaluation reports.
reports_file_path = 'models\\reports.txt'

# Read the dataset once here to derive the ordered list of class labels.
# A fixed list can be substituted instead, e.g.
# ['2', '3', '4', '6', '7', '9', '11', '13', '14', '16', '17', '99'].
data = pd.read_csv(PATH, sep=SEP)
target = data['target']
LABELS = sorted(target.unique().tolist())

def plot_confusion_matrix(cm, model_name, labels):
    """Render a confusion matrix as an annotated heatmap and save it as a PNG.

    The image is written to ``models\\figures\\confusion_matrix_<model_name>.png``.

    Parameters
    ----------
    cm : array-like of shape (n_labels, n_labels)
        Confusion-matrix counts. Values are cast to integers so they can be
        annotated with the ``"d"`` format.
    model_name : str
        Identifier of the model; used in the figure title and the file name.
    labels : sequence
        Tick labels, in the same order as the rows/columns of ``cm``.

    Returns
    -------
    None
    """
    # np.asarray lets plain nested lists through as well as ndarrays.
    cm = np.asarray(cm).astype('int')

    # Draw on a dedicated figure instead of the implicit "current" one, so a
    # figure left open by another cell can never be drawn over or closed.
    fig, ax = plt.subplots()
    try:
        sns.heatmap(cm, annot=True, fmt="d", xticklabels=labels, yticklabels=labels, ax=ax)
        # Axis meaning assumes the scikit-learn convention (rows = true class,
        # columns = predicted class), which is how the caller builds ``cm``.
        ax.set_xlabel('Predicted label')
        ax.set_ylabel('True label')
        ax.set_title(f'Confusion matrix - {model_name}')
        fig.savefig(f'models\\figures\\confusion_matrix_{model_name}.png', bbox_inches='tight')
    finally:
        # Always release the figure, even when saving fails.
        plt.close(fig)

### Creating and storing the models

In [43]:
# Load the data
data = pd.read_csv(PATH, sep=SEP)

# Split the data into features and target
X = data.drop(columns=['target'])
y = data['target']

# Base seed for the repeated hold-out splits. Repetition `cv` uses
# BASE_SEED + cv, so a rerun reproduces the same splits and every model is
# trained/evaluated on the same partition for a given repetition.
BASE_SEED = 42

for model_name in models_names:
    for cv in range(CV):
        fold_seed = BASE_SEED + cv

        # Stratified 80/20 split, seeded for reproducibility
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, stratify=y, random_state=fold_seed
        )

        # Create a classifier
        if model_name == 'BNB':
            clf = BernoulliNB()
        elif model_name == 'RFC':
            clf = RandomForestClassifier(random_state=fold_seed)
        elif model_name == 'XGB':
            clf = XGBClassifier(random_state=fold_seed)

            # XGBoost needs class labels encoded as 0..n_classes-1.
            # np.unique already returns the labels in sorted order.
            xgb_lables = np.unique(y_train)
            lable_dict = {label: index for index, label in enumerate(xgb_lables)}
            y_train = y_train.map(lable_dict)
            # NOTE: the XGB test target is stored in this encoded form too, so
            # anything reading it back has to decode it to the original labels.
            y_test = y_test.map(lable_dict)

            # Replace every non-word character in the feature names with "_"
            # before fitting; train and test frames must stay in sync.
            for frame in (X_train, X_test):
                frame.columns = frame.columns.astype(str).str.replace(r"[^\w]", "_", regex=True)
        elif model_name == 'KNN':
            clf = KNeighborsClassifier()
        else:
            # Fail loudly instead of silently re-using the previous classifier.
            raise ValueError(f"Unknown model name: {model_name!r}")

        # Train the classifier
        clf.fit(X_train, y_train)

        # Save the model
        joblib.dump(clf, f'models\\models\\{model_name}_{cv}.pkl')

        # Save the test data
        X_test.to_csv(f'models\\test_data\\{model_name}_test_data_{cv}.csv', index=False)
        y_test.to_csv(f'models\\test_data\\{model_name}_test_target_{cv}.csv', index=False)

### Testing

In [44]:
# Evaluate every stored model on its stored test split, aggregate the metrics
# over the CV repetitions, write a text report and save a confusion-matrix plot.
with open(reports_file_path, 'w') as f:
    # Original class labels in sorted order; position i is the label that the
    # XGB label encoding represents as integer i.
    label_lookup = np.asarray(LABELS)
    # classification_report display names (strings).
    label_names = [str(label) for label in LABELS]

    for model_name in models_names:
        mcc = 0
        cm = None
        reports_agg = None
        for cv in range(CV):
            # NOTE(review): joblib.load unpickles arbitrary objects - only load
            # model files produced by this project.
            clf = joblib.load(f'models\\models\\{model_name}_{cv}.pkl')
            X_test = pd.read_csv(f'models\\test_data\\{model_name}_test_data_{cv}.csv')
            # The target CSV holds a single column; take it as a 1-D Series
            # rather than passing a one-column DataFrame to the metrics.
            y_test = pd.read_csv(f'models\\test_data\\{model_name}_test_target_{cv}.csv').iloc[:, 0]

            y_pred = clf.predict(X_test)

            if model_name == 'XGB':
                # XGB was trained on labels encoded as 0..n-1 and its test
                # target was stored encoded as well. Decode both back to the
                # original labels so they line up with LABELS below.
                if len(clf.classes_) != len(label_lookup):
                    raise ValueError(
                        f"XGB model {cv} was trained on {len(clf.classes_)} classes, "
                        f"but LABELS has {len(label_lookup)}; cannot decode predictions."
                    )
                y_test = label_lookup[y_test.to_numpy().astype(int)]
                y_pred = label_lookup[np.asarray(y_pred).astype(int)]

            cm_cv = confusion_matrix(y_test, y_pred, labels=LABELS)
            mcc_cv = matthews_corrcoef(y_test, y_pred)
            report_cv = classification_report(
                y_test, y_pred, output_dict=True, target_names=label_names,
                zero_division=0, labels=LABELS
            )

            if reports_agg is None:
                reports_agg = report_cv
            else:
                for label, metrics in report_cv.items():
                    if isinstance(metrics, dict):
                        for metric_name, value in metrics.items():
                            reports_agg[label][metric_name] += value
                    else:
                        # Scalar entries (the overall "accuracy") must be
                        # summed too, otherwise only fold 0 is reported.
                        reports_agg[label] = reports_agg.get(label, 0.0) + metrics

            mcc += mcc_cv
            cm = cm_cv if cm is None else cm + cm_cv

        # Normalize classification report metrics over CV folds
        # (per-label dictionaries and scalar entries alike).
        for label, metrics in reports_agg.items():
            if isinstance(metrics, dict):
                for metric_name in metrics:
                    metrics[metric_name] /= CV
            else:
                reports_agg[label] = metrics / CV

        f.write(f"Model: {model_name}\n")
        f.write(f"Average Matthews Correlation Coefficient: {mcc / CV:.4f}\n")
        f.write("Confusion Matrix:\n")
        f.write(f"{cm}\n")
        f.write("Classification Report:\n")
        f.write(pd.DataFrame(reports_agg).transpose().to_string())
        f.write("\n\n")

        # The plotted matrix is the sum of the per-fold confusion matrices.
        plot_confusion_matrix(cm, model_name, LABELS)
        
