In [1]:
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os

In [2]:
# Folder where dataset is stored in Kaggle
folder = "/kaggle/input/research"

# Folder to save results (Kaggle working directory)
save_folder = "/kaggle/working"

# List the dataset files to process.
# os.listdir() returns entries in arbitrary order and also returns
# sub-directories, so keep regular files only (the processing cell passes
# every entry to pd.read_csv) and sort them so that every run handles the
# files in the same order.
files = sorted(
    entry
    for entry in os.listdir(folder)
    if os.path.isfile(os.path.join(folder, entry))
)

In [3]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix
from xgboost import XGBClassifier

def _split_fold_by_camera_and_class(X, y, fold, num_folds):
    """Build the train/test split of one fold, stratified by camera and class.

    The last column of ``X`` is used as the camera id and is dropped from the
    returned feature matrices. Inside every (camera, class) group the rows
    keep their original order and are cut into ``num_folds`` contiguous
    slices; slice number ``fold`` is the test part, the rest is training data.

    Slice boundaries are ``k * n // num_folds``, so every row of a group is
    tested in exactly one fold even when the group size is not a multiple of
    ``num_folds`` (a fixed ``int(n * 0.2)`` slice would never test the
    remainder rows and silently assumes five folds).

    Parameters:
        X: 2-D array, feature columns followed by the camera-id column.
        y: 1-D array of encoded labels aligned with the rows of ``X``.
        fold: zero-based index of the fold to hold out.
        num_folds: total number of folds.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)`` of concatenated arrays.
    """
    X_train, y_train, X_test, y_test = [], [], [], []
    camera_ids = X[:, -1]

    for camera_id in np.unique(camera_ids):
        camera_rows = np.where(camera_ids == camera_id)[0]
        x_camera = X[camera_rows][:, :-1]  # drop the camera-id column
        y_camera = y[camera_rows]

        for label in np.unique(y_camera):
            class_rows = np.where(y_camera == label)[0]
            x_class = x_camera[class_rows]
            y_class = y_camera[class_rows]

            n_samples = len(class_rows)
            start = (fold * n_samples) // num_folds
            end = ((fold + 1) * n_samples) // num_folds

            # Boolean mask keeps the original row order in both parts.
            is_test = np.zeros(n_samples, dtype=bool)
            is_test[start:end] = True

            X_train.append(x_class[~is_test])
            y_train.append(y_class[~is_test])
            X_test.append(x_class[is_test])
            y_test.append(y_class[is_test])

    return (
        np.concatenate(X_train, axis=0),
        np.concatenate(y_train, axis=0),
        np.concatenate(X_test, axis=0),
        np.concatenate(y_test, axis=0),
    )


def _count_outcomes(y_true, y_pred, label):
    """Return one-vs-rest ``(TP, FP, FN, TN)`` counts for ``label``."""
    is_true = y_true == label
    is_pred = y_pred == label
    TP = int(np.sum(is_true & is_pred))
    FP = int(np.sum(~is_true & is_pred))
    FN = int(np.sum(is_true & ~is_pred))
    TN = int(np.sum(~is_true & ~is_pred))
    return TP, FP, FN, TN


# Process every dataset file: run the camera/class-stratified cross-validation,
# write one CSV of per-class statistics per fold, then one CSV with the
# per-class averages over all folds.
for file_name in files:
    print(f"\nProcessing file: {file_name} ...")

    path = os.path.join(folder, file_name)
    dataframe = pd.read_csv(path)
    print("CSV file loaded successfully.")

    # Extract features and labels: the last CSV column is the label; the last
    # remaining column of X is used as the camera id when building the folds.
    X = dataframe.values[:, :-1]
    y = dataframe.values[:, -1]
    print("Features and labels extracted.")

    # Encode labels
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y)
    print("Labels encoded.")

    # Define 5-fold cross-validation
    num_folds = 5
    num_classes = len(label_encoder.classes_)

    # Per-class metrics, one row per class and one column per fold.
    class_accuracies = np.zeros((num_classes, num_folds))
    class_precisions = np.zeros((num_classes, num_folds))
    class_recalls = np.zeros((num_classes, num_folds))
    class_f1_scores = np.zeros((num_classes, num_folds))

    TPs = np.zeros((num_classes, num_folds))
    TNs = np.zeros((num_classes, num_folds))
    FPs = np.zeros((num_classes, num_folds))
    FNs = np.zeros((num_classes, num_folds))

    fold_accuracies = []
    fold_cms = []
    classifiers = []
    print("Initialized cross-validation metrics.")

    # Output files are named after the input file without its extension.
    # splitext keeps inner dots, so "a.b.csv" and "a.c.csv" do not collide.
    base_name = os.path.splitext(file_name)[0]

    for fold in range(num_folds):
        print(f"\nStarting Fold {fold + 1} ...")

        # Initialize classifier
        classifier = XGBClassifier(max_depth=35, random_state=0)
        print("Classifier initialized.")

        X_train, y_train, X_test, y_test = _split_fold_by_camera_and_class(
            X, y, fold, num_folds
        )
        print("Training and testing data prepared.")

        # Normalize features. The scaler is fitted on the training rows only
        # and then applied to the test rows, so no test-set statistics leak
        # into training; the camera-id column has already been dropped.
        scaler = MinMaxScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        print("Feature normalization done.")

        classifier.fit(X_train, y_train)
        print("Classifier trained.")

        # Test the classifier
        y_pred = classifier.predict(X_test)
        print("Classifier tested.")

        # Compute per-class metrics. Classes without test samples in this fold
        # keep the zero they were initialized with.
        for label in np.unique(y_test):
            TP, FP, FN, TN = _count_outcomes(y_test, y_pred, label)

            TPs[label][fold] = TP
            TNs[label][fold] = TN
            FPs[label][fold] = FP
            FNs[label][fold] = FN

            # NOTE: the per-class "accuracy" is TP / (number of true samples of
            # the class), which is numerically the same quantity as recall.
            n_true = TP + FN
            accuracy = TP / n_true if n_true > 0 else 0
            precision = TP / (TP + FP) if (TP + FP) > 0 else 0
            recall = TP / (TP + FN) if (TP + FN) > 0 else 0
            f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

            class_accuracies[label][fold] = accuracy
            class_precisions[label][fold] = precision
            class_recalls[label][fold] = recall
            class_f1_scores[label][fold] = f1_score

        print(f"Metrics computed for Fold {fold + 1}.")

        # Compute overall accuracy for this fold. Passing the full label range
        # keeps every confusion matrix num_classes x num_classes even when a
        # class is missing from this fold's test set or predictions.
        accuracy = accuracy_score(y_test, y_pred)
        cm = confusion_matrix(y_test, y_pred, labels=np.arange(num_classes))
        fold_accuracies.append(accuracy)
        fold_cms.append(cm)
        classifiers.append(classifier)
        print(f"Overall accuracy computed for Fold {fold + 1}.")

        # Save results for this fold (same column order as before).
        df3 = pd.DataFrame({
            "classes": label_encoder.classes_,
            "accuracy": class_accuracies[:, fold],
            "precision": class_precisions[:, fold],
            "recall": class_recalls[:, fold],
            "f1_score": class_f1_scores[:, fold],
            "TPs": TPs[:, fold],
            "FNs": FNs[:, fold],
            "FPs": FPs[:, fold],
            "TNs": TNs[:, fold],
        })

        fold_filename = f"{base_name}_fold_{fold + 1}_stats.csv"
        df3.to_csv(os.path.join(save_folder, fold_filename), index=False)
        print(f"Results saved for Fold {fold + 1}.")

    # Compute and save the averaged results (mean over folds, per class)
    df1_avg = pd.DataFrame({
        "classes": label_encoder.classes_,
        "accuracy": np.mean(class_accuracies, axis=1),
        "precision": np.mean(class_precisions, axis=1),
        "recall": np.mean(class_recalls, axis=1),
        "f1_score": np.mean(class_f1_scores, axis=1),
    })

    avg_filename = f"{base_name}_avg_stats.csv"
    df1_avg.to_csv(os.path.join(save_folder, avg_filename), index=False)
    print(f"Averaged results saved for {file_name}.")

print("\nAll files processed successfully!")



Processing file: ALLvs1.csv ...
CSV file loaded successfully.
Features and labels extracted.
Feature normalization done.
Labels encoded.
Initialized cross-validation metrics.

Starting Fold 1 ...
Classifier initialized.
Training and testing data prepared.
Classifier trained.
Classifier tested.
Metrics computed for Fold 1.
Overall accuracy computed for Fold 1.
Results saved for Fold 1.

Starting Fold 2 ...
Classifier initialized.
Training and testing data prepared.
Classifier trained.
Classifier tested.
Metrics computed for Fold 2.
Overall accuracy computed for Fold 2.
Results saved for Fold 2.

Starting Fold 3 ...
Classifier initialized.
Training and testing data prepared.
Classifier trained.
Classifier tested.
Metrics computed for Fold 3.
Overall accuracy computed for Fold 3.
Results saved for Fold 3.

Starting Fold 4 ...
Classifier initialized.
Training and testing data prepared.
Classifier trained.
Classifier tested.
Metrics computed for Fold 4.
Overall accuracy computed for Fold 4