Since semi-supervised classification produced a desirable result, let's try supervised classification.

In [10]:
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, average_precision_score
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import os
import numpy as np

def load_data(train_path, test_path, target_col="target"):
    """Load the train/test CSV files and split each into features and labels.

    The ``image_name`` column (presumably a per-sample identifier, not a
    numeric feature) is removed from whichever frame contains it, so the
    training and test feature matrices always end up with the same layout.

    Args:
        train_path: Path of the training CSV file.
        test_path: Path of the test/validation CSV file.
        target_col: Name of the label column present in both files.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)`` where the ``X_*`` values
        are DataFrames without the label column and the ``y_*`` values are
        the label Series.

    Raises:
        KeyError: If ``target_col`` is missing from either file.
    """
    splits = []
    for csv_path in (train_path, test_path):
        frame = pd.read_csv(csv_path)

        # Drop identifier columns from BOTH frames; never drop the column the
        # caller explicitly asked to use as the target.
        id_cols = [
            col for col in ("image_name",)
            if col in frame.columns and col != target_col
        ]
        if id_cols:
            frame = frame.drop(columns=id_cols)

        if target_col not in frame.columns:
            raise KeyError(
                f"Target column {target_col!r} not found in {csv_path!r}"
            )

        splits.append(frame.drop(columns=[target_col]))
        splits.append(frame[target_col])

    X_train, y_train, X_test, y_test = splits
    return X_train, y_train, X_test, y_test


def scale_data(X_train, X_test, scaler_path):
    """Standardize both feature sets and pickle the fitted scaler.

    The scaler is fitted on ``X_train`` only; ``X_test`` is transformed with
    the statistics learned from the training data. The fitted scaler is then
    written to ``scaler_path`` with ``pickle``.

    Returns:
        Tuple ``(scaled_train, scaled_test)``.
    """
    standardizer = StandardScaler()
    scaled_train = standardizer.fit_transform(X_train)
    scaled_test = standardizer.transform(X_test)

    # Persist the fitted scaler for later reuse.
    with open(scaler_path, "wb") as out_file:
        pickle.dump(standardizer, out_file)

    return scaled_train, scaled_test


def feature_selection(X_train, y_train, estimator, k_features=14, forward=True, scoring='accuracy', cv=5):
    """Run non-floating Sequential Feature Selection and return chosen indices.

    Wraps ``estimator`` in an ``SFS`` selector configured from the arguments
    (using all available cores via ``n_jobs=-1``), fits it on the training
    data, and returns the selected column indices as a list.
    """
    selector_options = {
        "k_features": k_features,
        "forward": forward,
        "floating": False,
        "scoring": scoring,
        "cv": cv,
        "n_jobs": -1,
    }
    selector = SFS(estimator, **selector_options)
    selector.fit(X_train, y_train)

    chosen_indices = list(selector.k_feature_idx_)
    return chosen_indices


def train_and_save_model(estimator, X_train, y_train, model_path):
    """Fit ``estimator`` on the training data, then pickle it to ``model_path``.

    The estimator is fitted in place; nothing is returned.
    """
    estimator.fit(X_train, y_train)

    # Serialize the fitted estimator in binary mode.
    out_file = open(model_path, "wb")
    with out_file:
        pickle.dump(estimator, out_file)


def load_model(model_path):
    """Unpickle and return the object stored at ``model_path``.

    NOTE: ``pickle`` can execute arbitrary code while loading, so only use
    this on files produced by a trusted source.
    """
    with open(model_path, "rb") as in_file:
        restored = pickle.load(in_file)
    return restored


def compute_metrics(y_true, y_pred, y_score, average_type="weighted"):
    """Compute Accuracy, F1-score, ROC-AUC, and PR-AUC.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        y_score: Per-sample scores, or ``None`` when unavailable. For more
            than two classes a 2-D array with one column per class is passed
            straight to the scorers. For the binary case either a 2-D array
            (column 1 is used as the positive-class score) or a 1-D array
            (e.g. the output of ``decision_function``) is accepted.
        average_type: Averaging mode forwarded to ``f1_score`` and, in the
            multi-class case, to the AUC scorers.

    Returns:
        Dict with the keys ``"Accuracy"``, ``"F1-Score"``, ``"ROC-AUC"`` and
        ``"PR-AUC"``. Both AUC entries are ``None`` when ``y_score`` is
        ``None`` or when either AUC cannot be computed.
    """
    import warnings

    # AUC entries default to None; they are filled in only when BOTH scores
    # were computed successfully.
    metrics = {
        "Accuracy": accuracy_score(y_true, y_pred),
        "F1-Score": f1_score(y_true, y_pred, average=average_type),
        "ROC-AUC": None,
        "PR-AUC": None,
    }

    if y_score is None:
        return metrics

    try:
        scores = np.asarray(y_score)
        if len(np.unique(y_true)) > 2:
            roc_auc = roc_auc_score(
                y_true, scores, multi_class="ovr", average=average_type
            )
            pr_auc = average_precision_score(
                y_true, scores, average=average_type
            )
        else:
            # A 1-D score vector already holds the positive-class score;
            # only a 2-D probability matrix needs its second column picked.
            positive_scores = scores[:, 1] if scores.ndim == 2 else scores
            roc_auc = roc_auc_score(y_true, positive_scores)
            pr_auc = average_precision_score(y_true, positive_scores)
    except Exception as exc:
        # Best-effort: keep Accuracy/F1 usable, but do not fail silently.
        warnings.warn(
            f"ROC-AUC/PR-AUC could not be computed: {exc}",
            RuntimeWarning,
            stacklevel=2,
        )
    else:
        metrics["ROC-AUC"] = roc_auc
        metrics["PR-AUC"] = pr_auc

    return metrics


def run_pipeline(train_csv, test_csv, output_dir, k_features=14):
    """Train, persist and evaluate several classical classifiers.

    For each classifier the pipeline runs sequential feature selection on the
    scaled training data, fits the classifier on the selected columns,
    pickles it to ``output_dir``, reloads it, and scores it on the test set.

    Args:
        train_csv: Path of the training CSV file.
        test_csv: Path of the test/validation CSV file.
        output_dir: Directory (created if missing) that receives
            ``scaler.pkl`` and one ``<name>_model.pkl`` per classifier.
        k_features: Number of features to select per classifier. An integer
            larger than the number of available features is clamped to that
            number; non-integer values are forwarded unchanged.

    Returns:
        Dict mapping classifier name to its metrics dict.

    NOTE: the selected column indices are only printed, not saved; reusing a
    pickled model later requires knowing which columns it was trained on.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Load data
    X_train, y_train, X_test, y_test = load_data(train_csv, test_csv)
    feature_names = X_train.columns.tolist()

    # Never ask the selector for more features than the data provides.
    if isinstance(k_features, int) and not isinstance(k_features, bool):
        k_features = min(k_features, len(feature_names))

    # Scale data
    scaler_path = os.path.join(output_dir, "scaler.pkl")
    X_train_scaled, X_test_scaled = scale_data(X_train, X_test, scaler_path)

    # Define classifiers
    classifiers = {
        "decision_tree": DecisionTreeClassifier(random_state=42),
        "knn": KNeighborsClassifier(),
        "logistic_regression": LogisticRegression(max_iter=1000),
        "gaussian_nb": GaussianNB(),
    }

    results = {}

    for name, clf in classifiers.items():
        print(f"\n=== Processing {name} ===")

        # Feature selection
        selected_idx = feature_selection(
            X_train_scaled, y_train, clf, k_features=k_features
        )
        selected_features = [feature_names[i] for i in selected_idx]
        print(f"Selected feature names: {selected_features}")
        print("Selected feature count:", len(selected_features))

        # Slice the selected columns once instead of on every use.
        X_train_selected = X_train_scaled[:, selected_idx]
        X_test_selected = X_test_scaled[:, selected_idx]

        # Train and save model
        model_path = os.path.join(output_dir, f"{name}_model.pkl")
        train_and_save_model(clf, X_train_selected, y_train, model_path)

        # Load and test model (also verifies the pickle round-trips)
        loaded_model = load_model(model_path)
        y_pred = loaded_model.predict(X_test_selected)

        # Get probability scores if available, else decision scores, else None
        try:
            y_score = loaded_model.predict_proba(X_test_selected)
        except AttributeError:
            try:
                y_score = loaded_model.decision_function(X_test_selected)
            except AttributeError:
                y_score = None

        metrics = compute_metrics(y_test, y_pred, y_score)
        results[name] = metrics

        # Print metrics
        for m, v in metrics.items():
            print(f"{m} ({name}): {v:.4f}" if v is not None else f"{m} ({name}): N/A")

    return results


# Example usage: run the full pipeline on the balanced training set and the
# outlier-free validation set, writing the scaler and models to output_folder.
output_folder = "/Users/ranjanumeshrao/Downloads/DAL_Lab/Mango/5CLASS_CLUSTERING/Classical_Classifier"
test_csv_path = "/Users/ranjanumeshrao/Downloads/DAL_Lab/Mango/5CLASS_CLUSTERING/data/FE_data/Train-test/val_no_outliers.csv"
train_csv_path = "/Users/ranjanumeshrao/Downloads/DAL_Lab/Mango/5CLASS_CLUSTERING/data/FE_data/Train-test/train_balanced_dataset.csv"

results = run_pipeline(
    train_csv=train_csv_path,
    test_csv=test_csv_path,
    output_dir=output_folder,
)
print("\nFinal Results:", results)


=== Processing decision_tree ===
Selected feature names: ['edge_density', 'HSV_H_std', 'HSV_S_mean', 'HSV_S_std', 'HSV_V_mean', 'LAB_L_std', 'LAB_A_mean', 'LAB_A_std', 'LAB_B_mean', 'LAB_B_std', 'area', 'perimeter', 'aspect_ratio', 'eccentricity']
Selected feature count: 14
Accuracy (decision_tree): 0.7444
F1-Score (decision_tree): 0.7503
ROC-AUC (decision_tree): 0.8457
PR-AUC (decision_tree): 0.6626

=== Processing knn ===
Selected feature names: ['edge_density', 'edge_intensity_std', 'HSV_H_mean', 'HSV_H_std', 'HSV_S_mean', 'HSV_S_std', 'HSV_V_mean', 'HSV_V_std', 'LAB_A_mean', 'LAB_B_mean', 'LAB_B_std', 'area', 'perimeter', 'aspect_ratio']
Selected feature count: 14
Accuracy (knn): 0.8022
F1-Score (knn): 0.8124
ROC-AUC (knn): 0.9399
PR-AUC (knn): 0.8192

=== Processing logistic_regression ===
Selected feature names: ['edge_density', 'edge_intensity_mean', 'HSV_H_mean', 'HSV_H_std', 'HSV_S_mean', 'HSV_S_std', 'HSV_V_mean', 'HSV_V_std', 'LAB_L_std', 'LAB_A_mean', 'LAB_A_std', 'LAB_B_m