Process images

In [1]:
import os
from os.path import exists
import pandas as pd
import numpy as np
import cv2

# Import our own file that has the feature extraction functions
from extract_features import  process_images

#-------------------
# Main script
#-------------------

# Where is the raw data
file_data = '..' + os.sep + 'data' + os.sep + 'metadata.csv'
path_image = '..' + os.sep + 'data' + os.sep + 'images' + os.sep + 'images_original'
path_mask = '..' + os.sep + 'data' + os.sep + 'images' + os.sep + 'masks_original'    

# Where we will store the features
file_features = 'features/features_original.csv'
feature_names = ['assymetry', 'colours', 'dots and globules', 'compactness']

df_features=process_images(file_data, path_image, path_mask,feature_names)
# Save the image_id used + features to a file
#df_features.to_excel(file_features, index=False)
df_features.to_csv(file_features, index=False)

Train classifier

In [3]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GroupKFold
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import pickle



metadata_df = pd.read_csv(file_data)
combined_df = df_features.merge(metadata_df[['img_id', 'diagnostic', 'patient_id']], left_on='image_id', right_on='img_id', how='left')
if combined_df.isnull().values.any():
    raise ValueError("NaN values detected after merge! Check the data integrity.")

# Prepare the dataset
X = combined_df[feature_names].to_numpy()
y = combined_df['diagnostic'].values == 'NEV'  # NEV is assumed to be the healthy class
patient_id = combined_df['patient_id'].values

# Prepare cross-validation
num_folds = 5
group_kfold = GroupKFold(n_splits=num_folds)

# Initialize classifiers
classifiers = [
    KNeighborsClassifier(1),
    KNeighborsClassifier(5),
    make_pipeline(StandardScaler(), SVC(probability=True)),
    RandomForestClassifier(n_estimators=100, random_state=42),
    GradientBoostingClassifier(n_estimators=100, random_state=42),
    AdaBoostClassifier(n_estimators=100, random_state=42),
    DecisionTreeClassifier(random_state=42),
    make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000)),
    make_pipeline(StandardScaler(), SGDClassifier(max_iter=1000, tol=1e-3)),
    GaussianNB(),
    MLPClassifier(max_iter=1000)
]

# Initialize accuracy storage
acc_val = np.empty((num_folds, len(classifiers)))
classifier_names = []

# Perform cross-validation
for j, clf in enumerate(classifiers):
    classifier_name = (clf.named_steps['svc'].__class__.__name__ if 'pipeline' in str(clf)
                       else clf.__class__.__name__)
    classifier_names.append(classifier_name)
    fold_accuracies = []
    
    for i, (train_index, val_index) in enumerate(group_kfold.split(X, y, patient_id)):
        x_train, y_train = X[train_index], y[train_index]
        x_val, y_val = X[val_index], y[val_index]
        
        # Fit and predict
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_val)
        acc = accuracy_score(y_val, y_pred)
        fold_accuracies.append(acc)
        
        # Save the classifier after training on this fold
        fold_filename = f'classifier_{j}_fold_{i}.sav'
        pickle.dump(clf, open(fold_filename, 'wb'))
    
    acc_val[:, j] = fold_accuracies

# Calculate average accuracy for each classifier
average_acc = np.mean(acc_val, axis=0)
for idx, acc in enumerate(average_acc):
    print(f'Classifier {idx + 1} ({classifier_names[idx]}): average accuracy={acc:.3f}')

# Save and evaluate each classifier on the full dataset
eval_results = {}
for idx, clf in enumerate(classifiers):
    classifier_name = classifier_names[idx]
    classifier_filename = f'classifier_{idx}.sav'
    
    # Save the classifier
    pickle.dump(clf, open(classifier_filename, 'wb'))
    
    # Load the classifier
    loaded_clf = pickle.load(open(classifier_filename, 'rb'))
    
    # Predict on the full dataset and calculate evaluation metrics
    y_pred = loaded_clf.predict(X)
    acc = accuracy_score(y, y_pred)
    clf_report = classification_report(y, y_pred)
    
    eval_results[classifier_name] = {'accuracy': acc, 'report': clf_report}

# Display evaluation results
for clf_name, results in eval_results.items():
    print(f"Results for {clf_name}:")
    print(f"Accuracy: {results['accuracy']}")
    #print(f"Classification Report:\n{results['report']}\n")



Classifier 1 (KNeighborsClassifier): average accuracy=0.819
Classifier 2 (KNeighborsClassifier): average accuracy=0.889
Classifier 3 (Pipeline): average accuracy=0.897
Classifier 4 (RandomForestClassifier): average accuracy=0.866
Classifier 5 (GradientBoostingClassifier): average accuracy=0.866
Classifier 6 (AdaBoostClassifier): average accuracy=0.882
Classifier 7 (DecisionTreeClassifier): average accuracy=0.819
Classifier 8 (Pipeline): average accuracy=0.897
Classifier 9 (Pipeline): average accuracy=0.812
Classifier 10 (GaussianNB): average accuracy=0.842
Classifier 11 (MLPClassifier): average accuracy=0.897


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Results for KNeighborsClassifier:
Accuracy: 0.8976377952755905
Results for Pipeline:
Accuracy: 0.8976377952755905
Results for RandomForestClassifier:
Accuracy: 0.968503937007874
Results for GradientBoostingClassifier:
Accuracy: 0.968503937007874
Results for AdaBoostClassifier:
Accuracy: 0.968503937007874
Results for DecisionTreeClassifier:
Accuracy: 0.968503937007874
Results for GaussianNB:
Accuracy: 0.8661417322834646
Results for MLPClassifier:
Accuracy: 0.8976377952755905


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Evaluate classifier

In [3]:
import pickle
from sklearn.metrics import accuracy_score, confusion_matrix
import glob
#features_path = 'features/features_evaluate.csv'
file_data = '..' + os.sep + 'data' + os.sep + 'metadata.csv'
path_image = '..' + os.sep + 'data' + os.sep + 'images' + os.sep + 'images_evaluate'
path_mask = '..' + os.sep + 'data' + os.sep + 'images' + os.sep + 'masks_evaluate'    

metadata_df = pd.read_csv(file_data)

features_df = process_images(file_data, path_image, path_mask,feature_names)
feature_names = ['assymetry', 'colours', 'dots and globules', 'compactness']

# Merge features with metadata on 'image_id'/'img_id'
combined_df = features_df.merge(metadata_df[['img_id', 'diagnostic', 'patient_id']], left_on='image_id', right_on='img_id', how='left')

# Prepare the dataset
X = combined_df[feature_names].to_numpy()
y = combined_df['diagnostic'] == 'NEV'  # NEV is assumed to be the healthy class
patient_id = combined_df['patient_id']

# Function to load a classifier and evaluate it
def load_and_evaluate(model_filename, X, y):
    try:
        with open(model_filename, 'rb') as model_file:
            classifier = pickle.load(model_file)
        print(f"Loaded classifier from {model_filename}")
    except Exception as e:
        print(f"An error occurred while loading the model from {model_filename}: {e}")
        return None, None

    # Predict the labels and calculate accuracy and confusion matrix
    pred_labels = classifier.predict(X)
    accuracy = accuracy_score(y, pred_labels)
    cm = confusion_matrix(y, pred_labels)
    
    return accuracy, cm

# List of classifier filenames
#classifier_filenames = [f'classifier_{i}.sav' for i in range(len(os.listdir(r"C:\Users\tettret\OneDrive - DFDS\Desktop\ITU\Data Science Project\Project_data_science_queen_snakes-3\Classifier\fyp2024")))]
classifier_filenames = glob.glob(os.path.join('..', '*.sav'))
# Evaluate all classifiers
for model_filename in classifier_filenames:
    accuracy, cm = load_and_evaluate(model_filename, X, y)
    print(f"Results for {model_filename}:")
    print("Accuracy:", accuracy)
    print("Confusion Matrix:\n", cm)
