# Libraries

In [12]:
import pandas as pd
import numpy as np
import os
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, cohen_kappa_score
from sklearn.ensemble import RandomForestClassifier , AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from tqdm import tqdm

In [4]:
# Count the directory entries of every dataset split, for both the original
# images and the hair-removed copies.

def _list_and_report(message, *folders):
    """List each folder, print the combined entry count, return the listings.

    Every folder is listed with ``os.listdir`` in the order given. ``message``
    is printed followed by the total number of entries across all folders.
    The per-folder listings are returned as a list, one item per folder.
    """
    listings = [os.listdir(folder) for folder in folders]
    print(message, sum(len(listing) for listing in listings))
    return listings


# --- Folder layout -----------------------------------------------------------
nevus_path_test = 'test'
nevus_path_train_0 = 'train/nevus'
nevus_path_train_1 = 'train/others'
nevus_path_val_0 = 'val/nevus'
nevus_path_val_1 = 'val/others'

nevus_path_train_HairRemoved_0 = 'train_HairRemoved/nevus'
nevus_path_train_HairRemoved_1 = 'train_HairRemoved/others'
nevus_path_val_HairRemoved_0 = 'val_HairRemoved/nevus'
nevus_path_val_HairRemoved_1 = 'val_HairRemoved/others'

# --- Original images ---------------------------------------------------------
nevus_train_0, nevus_train_1 = _list_and_report(
    'Number of images in the train set:', nevus_path_train_0, nevus_path_train_1
)
total_train = nevus_train_0 + nevus_train_1

nevus_val_0, nevus_val_1 = _list_and_report(
    'Number of images in the validation set:', nevus_path_val_0, nevus_path_val_1
)
total_val = nevus_val_0 + nevus_val_1

(nevus_test,) = _list_and_report('Number of images in the test set:', nevus_path_test)

# --- Hair-removed images -----------------------------------------------------
nevus_train_HairRemoved_0, nevus_train_HairRemoved_1 = _list_and_report(
    'Number of images in the train set(Hair removed):',
    nevus_path_train_HairRemoved_0,
    nevus_path_train_HairRemoved_1,
)
total_train_HairRemoved = nevus_train_HairRemoved_0 + nevus_train_HairRemoved_1

nevus_val_HairRemoved_0, nevus_val_HairRemoved_1 = _list_and_report(
    'Number of images in the validation set(Hair removed):',
    nevus_path_val_HairRemoved_0,
    nevus_path_val_HairRemoved_1,
)
total_HairRemoved_val = nevus_val_HairRemoved_0 + nevus_val_HairRemoved_1

# NOTE(review): this count is labelled "Hair removed" but lists the same
# 'test' folder as above -- confirm whether a hair-removed test folder exists
# and should be counted here instead.
(nevus_test,) = _list_and_report(
    'Number of images in the test set(Hair removed):', nevus_path_test
)

Number of images in the train set: 15195
Number of images in the validation set: 3796
Number of images in the test set: 6340
Number of images in the train set(Hair removed): 15195
Number of images in the validation set(Hair removed): 3796
Number of images in the test set(Hair removed): 6340


In [5]:
def rename_columns(df, suffix):
    """Return a new DataFrame with ``_<suffix>`` appended to selected columns.

    A column is renamed when its name contains the substring 'color_hist' or
    'lbp'; all other columns keep their names. ``df`` itself is not modified.

    Args:
        df: DataFrame whose column labels are strings.
        suffix: Text appended (after an underscore) to each matching column.

    Returns:
        The renamed DataFrame produced by ``DataFrame.rename``.
    """
    markers = ('color_hist', 'lbp')

    def _tag(column_name):
        # Only histogram / LBP columns get the suffix.
        if any(marker in column_name for marker in markers):
            return f"{column_name}_{suffix}"
        return column_name

    return df.rename(columns=_tag)

# Load the per-descriptor feature CSVs for train / val / test, give the
# histogram and LBP columns descriptor-specific names, and join the tables of
# each split into one feature matrix.

def _merge_on_image_path(frames):
    """Join ``frames`` left to right on the 'image_path' column.

    Each step uses ``DataFrame.merge`` with its default join type. Columns of
    the right-hand frame whose names collide with existing ones (e.g. a
    repeated 'label') receive the '_duplicate' suffix and are dropped right
    away, so the left-most copy of such a column is the one that survives.

    Args:
        frames: Non-empty sequence of DataFrames that all have 'image_path'.

    Returns:
        The merged DataFrame.
    """
    merged = frames[0]
    for frame in frames[1:]:
        merged = merged.merge(frame, on='image_path', suffixes=('', '_duplicate'))
        merged = merged.drop(columns=[col for col in merged if col.endswith('_duplicate')])
    return merged


# ----- Training split --------------------------------------------------------
RGB_features = rename_columns(pd.read_csv('results/train/RGB_features_train.csv'), "rgb")
HSV_features = rename_columns(pd.read_csv('results/train/HSV_features_train.csv'), "hsv")
Lab_features = rename_columns(pd.read_csv('results/train/Lab_features_train.csv'), "lab")
lbp_features = rename_columns(pd.read_csv('results/train/lbp_features_train.csv'), "lbp")
glcm_features = rename_columns(pd.read_csv('results/train/glcm_features_train.csv'), "glcm")

merged_data = _merge_on_image_path(
    [RGB_features, HSV_features, Lab_features, lbp_features, glcm_features]
)

# Move 'label' to the last column so the features come first in merged_data.
if 'label' in merged_data.columns:
    label_column = merged_data.pop('label')
    merged_data['label'] = label_column

# 'image_path' is an identifier and 'label' is the target; neither is a feature.
X_train = merged_data.drop(columns=['image_path', 'label'])
y_train = merged_data['label']

# ----- Validation split ------------------------------------------------------
RGB_features_val = rename_columns(pd.read_csv('results/val/RGB_features_val.csv'), "rgb")
HSV_features_val = rename_columns(pd.read_csv('results/val/HSV_features_val.csv'), "hsv")
Lab_features_val = rename_columns(pd.read_csv('results/val/Lab_features_val.csv'), "lab")
lbp_features_val = rename_columns(pd.read_csv('results/val/lbp_features_val.csv'), "lbp")
glcm_features_val = rename_columns(pd.read_csv('results/val/glcm_features_val.csv'), "glcm")
# NOTE(review): the SIFT table is loaded for validation only and is not merged
# below; there is no matching train/test load in this cell -- confirm whether
# it is still needed.
sift_features_val = pd.read_csv('results/val/sift_features_val.csv')

merged_val_data = _merge_on_image_path(
    [RGB_features_val, HSV_features_val, Lab_features_val, lbp_features_val, glcm_features_val]
)

# errors='ignore' / .get() tolerate a validation table without 'label';
# in that case y_val is None.
X_val = merged_val_data.drop(columns=['image_path', 'label'], errors='ignore')
y_val = merged_val_data.get('label')

# ----- Test split ------------------------------------------------------------
RGB_features_test = rename_columns(pd.read_csv('results/test/RGB_features_test.csv'), "rgb")
HSV_features_test = rename_columns(pd.read_csv('results/test/HSV_features_test.csv'), "hsv")
Lab_features_test = rename_columns(pd.read_csv('results/test/Lab_features_test.csv'), "lab")
lbp_features_test = rename_columns(pd.read_csv('results/test/lbp_features_test.csv'), "lbp")
glcm_features_test = rename_columns(pd.read_csv('results/test/glcm_features_test.csv'), "glcm")

merged_test_data = _merge_on_image_path(
    [RGB_features_test, HSV_features_test, Lab_features_test, lbp_features_test, glcm_features_test]
)

# Only the identifier is removed here; the test tables are expected to carry
# no 'label' column.
X_test = merged_test_data.drop(columns=['image_path'])


In [6]:
# Hyper-parameter search spaces, keyed by model name. Each entry holds the
# estimator to tune ('model') and the grid of values to try ('params').
# Only XGBoost is active; the other candidates are kept commented out.
xgb_estimator = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_search_space = {
    'n_estimators': [300],
    'max_depth': [10, 12],
    'learning_rate': [0.01, 0.1, 0.2],
}

param_grids = {
    # 'RandomForest': {
    #     'model': RandomForestClassifier(random_state=42),
    #     'params': {
    #         'n_estimators': [50, 100, 200],
    #         'max_depth': [None, 10, 20],
    #         'min_samples_split': [2, 5, 10]
    #     }
    # },
    # 'SVM': {
    #     'model': SVC(),
    #     'params': {
    #         'C': [0.1, 1, 10],
    #         'kernel': ['linear', 'rbf'],
    #         'gamma': ['scale', 'auto']
    #     }
    # },
    'XGBoost': {'model': xgb_estimator, 'params': xgb_search_space},
    # 'LogisticRegression': {
    #     'model': LogisticRegression(solver='liblinear', random_state=42),
    #     'params': {
    #         'C': [0.01, 0.1, 1, 10, 100],      # Inverse of regularization strength
    #         'penalty': ['l1', 'l2']             # Regularization norms
    #     }
    # },
    # 'AdaBoost': {
    #     'model': AdaBoostClassifier(random_state=42),
    #     'params': {
    #         'n_estimators': [50, 100, 200],     # Number of boosting rounds
    #         'learning_rate': [0.01, 0.1, 1.0]   # Step size shrinkage
    #     }
    # },
    # 'GradientBoosting': {
    #     'model': GradientBoostingClassifier(random_state=42),
    #     'params': {
    #         'n_estimators': [50, 100, 200],
    #         'max_depth': [3, 5, 10],
    #         'learning_rate': [0.01, 0.1, 0.2]
    #     }
    # },
    # 'KNN': {
    #     'model': KNeighborsClassifier(),
    #     'params': {
    #         'n_neighbors': [3, 5, 7, 9, 11],     # Number of neighbors
    #         'weights': ['uniform', 'distance'],  # Weight function
    #         'metric': ['euclidean', 'manhattan'] # Distance metric
    #     }
    # }
}

In [7]:
# Sanity check: show the (rows, columns) shape of every training feature table
# to confirm that each CSV was loaded.
print("Initial DataFrame Shapes:")
for caption, frame in (
    ("RGB Features:", RGB_features),
    ("HSV Features:", HSV_features),
    ("Lab Features:", Lab_features),
    ("LBP Features:", lbp_features),
    ("GLCM Features:", glcm_features),
):
    print(caption, frame.shape)

Initial DataFrame Shapes:
RGB Features: (15195, 10)
HSV Features: (15195, 26)
Lab Features: (15195, 26)
LBP Features: (15195, 20)
GLCM Features: (15195, 10)


In [8]:
# Stack the feature columns of the five training tables side by side into one
# NumPy array. 'image_path' and 'label' are removed by name: slicing with
# .iloc[:, 1:] only removed the first column of each table, which let the
# other non-feature column (identifier or target) through once per table.
# Note that merged_data is rebound to a merged DataFrame in the following cell.
merged_data = np.concatenate(
    [
        frame.drop(columns=['image_path', 'label']).to_numpy()
        for frame in (RGB_features, HSV_features, Lab_features, lbp_features, glcm_features)
    ],
    axis=1,
)

In [9]:
# Verify that every training feature table carries the join keys, then merge
# the tables step by step and print the shape after each merge.
required_columns = {'image_path', 'label'}

# Each display name is paired explicitly with its frame. The previous name
# list was missing a comma ('Lab' 'LBP' became 'LabLBP') and listed a 'SIFT'
# entry with no frame, so error messages could blame the wrong CSV.
named_frames = [
    ('RGB', RGB_features),
    ('HSV', HSV_features),
    ('Lab', Lab_features),
    ('LBP', lbp_features),
    ('GLCM', glcm_features),
]

for df_name, df in named_frames:
    if not required_columns.issubset(df.columns):
        # Raise instead of calling exit(): an exception stops the cell at this
        # point and leaves the kernel alive for inspection.
        raise ValueError(f"Error: {df_name} features CSV is missing required columns {required_columns}")

# Inner-join on both keys so only images present in every table (with the
# same label) survive; the printed shapes make any row loss visible.
merged_data = RGB_features
merged_names = 'RGB'
for df_name, df in named_frames[1:]:
    merged_data = merged_data.merge(df, on=['image_path', 'label'], how='inner')
    merged_names = f"{merged_names} + {df_name}"
    print(f"After merging {merged_names}:", merged_data.shape)

# An empty result means the tables share no (image_path, label) pair.
if merged_data.empty:
    raise ValueError("Error: Merged training data is empty after merging all features. Check CSV files for mismatches in image_path values.")

After merging RGB + HSV: (15195, 34)
After merging RGB + HSV + Lab: (15195, 58)
After merging RGB + HSV + Lab + LBP: (15195, 76)
After merging RGB + HSV + Lab + LBP + GLCM: (15195, 84)


In [10]:
# Tune every configured model with a 5-fold grid search on the training data,
# then score each tuned model on the validation split.

def _report_validation(model_name, model, features, targets):
    """Predict ``features`` with ``model`` and print validation metrics.

    Prints accuracy, Cohen's kappa, the classification report and the
    confusion matrix of the predictions against ``targets``.
    """
    y_pred = model.predict(features)
    accuracy = accuracy_score(targets, y_pred)
    kappa = cohen_kappa_score(targets, y_pred)

    print(f"\nValidation Results for {model_name}:")
    print("Accuracy:", accuracy)
    print("Kappa:", kappa)
    print("Classification Report:\n", classification_report(targets, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(targets, y_pred))


# --- Hyper-parameter search --------------------------------------------------
best_estimators = {}
for model_name, config in param_grids.items():
    print(f"Training and tuning {model_name}...")
    grid_search = GridSearchCV(
        estimator=config['model'],
        param_grid=config['params'],
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
        verbose=4,
    )
    grid_search.fit(X_train, y_train)

    # Keep the estimator refitted with the best parameter combination.
    best_estimators[model_name] = grid_search.best_estimator_
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")
    print(f"Best cross-validation accuracy for {model_name}: {grid_search.best_score_}")

# --- Validation --------------------------------------------------------------
for model_name, model in best_estimators.items():
    _report_validation(model_name, model, X_val, y_val)

Training and tuning XGBoost...
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best parameters for XGBoost: {'learning_rate': 0.1, 'max_depth': 12, 'n_estimators': 300}
Best cross-validation accuracy for XGBoost: 0.805923000987167

Validation Results for XGBoost:
Accuracy: 0.8250790305584826
Kappa: 0.650148446619595
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.82      0.83      1931
           1       0.82      0.83      0.82      1865

    accuracy                           0.83      3796
   macro avg       0.83      0.83      0.83      3796
weighted avg       0.83      0.83      0.83      3796

Confusion Matrix:
 [[1584  347]
 [ 317 1548]]


In [11]:
# Align the test features with the training schema, predict with the tuned
# XGBoost model, and save the predictions.

# Columns the model was trained on but the test table lacks, and vice versa.
missing_cols = set(X_train.columns) - set(X_test.columns)
extra_cols = set(X_test.columns) - set(X_train.columns)

if missing_cols:
    # Zero-filling keeps prediction possible, but usually signals a mismatch
    # in feature extraction, so make it visible instead of silent.
    print(f"Warning: {len(missing_cols)} training columns missing from X_test were filled with 0: {sorted(missing_cols)}")

# A single reindex adds the missing columns (filled with 0), drops the extra
# ones, and puts the columns in the training order. It returns a new frame,
# so re-running this cell gives the same result.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

best_model = best_estimators['XGBoost']
# Predict once; the previous version ran the same prediction twice.
y_test_pred = best_model.predict(X_test)

# Save predictions to CSV.
# NOTE(review): the file contains only 'predicted_label', in the row order of
# merged_test_data, with no image identifier -- confirm this is the expected
# format.
output = pd.DataFrame({
    'predicted_label': y_test_pred
})
output_csv_path = 'results/test/predicted_labels.csv'
output.to_csv(output_csv_path, index=False)
print(f"Test set predictions saved to {output_csv_path}")

Test set predictions saved to results/test/predicted_labels.csv
