In [1]:
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from skimage.feature import graycomatrix,hog
scaler = StandardScaler()

import xgboost as xgb
from sklearn.metrics import accuracy_score
import cv2
import glob
from tqdm import tqdm
import time
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score  # For evaluation metrics
import joblib

In [2]:
malaria_image_dir = 'Parasitized'
non_malaria_image_dir = 'Uninfected'

malaria_image_paths = glob.glob(malaria_image_dir + '/*.png')  # Adjust file extension if needed
non_malaria_image_paths = glob.glob(non_malaria_image_dir + '/*.png')

all_image_paths = malaria_image_paths + non_malaria_image_paths
labels = [1] * len(malaria_image_paths) + [0] * len(non_malaria_image_paths)

accuracies = []
# Perform K-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=None)
for train_index, val_index in kf.split(all_image_paths):
    X_train_paths, X_val_paths = [all_image_paths[i] for i in train_index], [all_image_paths[i] for i in val_index]
    y_train, y_val = [labels[i] for i in train_index], [labels[i] for i in val_index]
    
    # Load and preprocess training images
    X_train = []
    for path in tqdm(X_train_paths, desc="Training Images"):
        image = cv2.imread(path)  # Load image using OpenCV or any other library
        resized_image = cv2.resize(image, (118, 118))
        # (224, 224) is a common size for pre-trained models and feature extraction.
        gray_image = cv2.cvtColor(resized_image, cv2.COLOR_BGR2GRAY)
        hist = cv2.calcHist([gray_image], [0], None, [256], [0, 256])
        # Normalize the histogram
        hist /= hist.sum()
        # Compute GLCM-like features
        mean = (hist * np.arange(256)).sum()
        variance = (hist * ((np.arange(256) - mean) ** 2)).sum()
        homogeneity = (hist / (1 + np.abs(np.arange(256) - np.arange(256)[:, None]))).sum()
        contrast = (hist * (np.abs(np.arange(256) - np.arange(256)[:, None]) ** 2)).sum()
        X_train.append([mean, variance, homogeneity, contrast])
        # Append extracted features to X_train or X_val based on the loop iteration
        # normalization
    X_train = scaler.fit_transform(X_train)
    joblib.dump(scaler, 'scaler.pkl')
        # Apply preprocessing steps (resizing, normalization, etc.) to the image
        # Append preprocessed image to X_train
        
        # Load and preprocess validation images
    X_val = []
    for path in tqdm(X_val_paths, desc="Validation Images"):
        image = cv2.imread(path) 
        resized_image = cv2.resize(image, (118, 118))
        gray_image = cv2.cvtColor(resized_image, cv2.COLOR_BGR2GRAY)
        # Extract HOG features
        #fd = hog(gray_image, orientations=9, pixels_per_cell=(8, 8), cells_per_block=(2, 2))
        hist = cv2.calcHist([gray_image], [0], None, [256], [0, 256])

    # Normalize the histogram
        hist /= hist.sum()

    # Compute GLCM-like features
        mean = (hist * np.arange(256)).sum()
        variance = (hist * ((np.arange(256) - mean) ** 2)).sum()
        homogeneity = (hist / (1 + np.abs(np.arange(256) - np.arange(256)[:, None]))).sum()
        contrast = (hist * (np.abs(np.arange(256) - np.arange(256)[:, None]) ** 2)).sum()
        X_val.append([mean, variance, homogeneity, contrast])
        # Append extracted features to X_train or X_val based on the loop iteration
        # normalization
    X_val = scaler.transform(X_val)
    
    model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss')

    # Train the model
    model.fit(X_train, y_train)

    # Make predictions on validation data
    y_pred = model.predict(X_val)
    
    # Evaluate the model (using multiple metrics)
    accuracy = accuracy_score(y_val, y_pred)
    
    # Print results for this fold
    print(f"Accuracy for this fold: {accuracy}")
    
    # Store results in lists for later analysis
    accuracies.append(accuracy)

# After the loop, assess overall performance    
print("Mean accuracy:", np.mean(accuracies))
    
        

Training Images: 100%|██████████| 22065/22065 [03:09<00:00, 116.70it/s]
Validation Images: 100%|██████████| 5517/5517 [01:02<00:00, 88.64it/s] 


Accuracy for this fold: 0.5981511691136487


Training Images: 100%|██████████| 22065/22065 [00:36<00:00, 603.38it/s]
Validation Images: 100%|██████████| 5517/5517 [00:08<00:00, 631.77it/s]


Accuracy for this fold: 0.6008700380641653


Training Images: 100%|██████████| 22066/22066 [00:36<00:00, 611.05it/s]
Validation Images: 100%|██████████| 5516/5516 [00:09<00:00, 596.47it/s]


Accuracy for this fold: 0.5933647570703409


Training Images: 100%|██████████| 22066/22066 [00:36<00:00, 600.01it/s]
Validation Images: 100%|██████████| 5516/5516 [00:09<00:00, 576.93it/s]


Accuracy for this fold: 0.5975344452501813


Training Images: 100%|██████████| 22066/22066 [00:36<00:00, 605.99it/s]
Validation Images: 100%|██████████| 5516/5516 [00:09<00:00, 599.07it/s]


Accuracy for this fold: 0.5998912255257433
Mean accuracy: 0.5979623270048159


In [3]:
model.save_model('xgboost_model.json')

In [4]:
#For testing purpose only
# Assuming model and scaler are already trained and defined

# Load and preprocess a single X-ray image
def preprocess_single_image(image_path, scaler):
    # Load image
    image = cv2.imread(image_path)
    
    # Preprocess image (resize, convert to grayscale, extract HOG features, etc.)
    resized_image = cv2.resize(image, (118, 118))
    gray_image = cv2.cvtColor(resized_image, cv2.COLOR_BGR2GRAY)
    # Compute GLCM-like features
    hist = cv2.calcHist([gray_image], [0], None, [256], [0, 256])
    hist /= hist.sum()
    mean = (hist * np.arange(256)).sum()
    variance = (hist * ((np.arange(256) - mean) ** 2)).sum()
    homogeneity = (hist / (1 + np.abs(np.arange(256) - np.arange(256)[:, None]))).sum()
    contrast = (hist * (np.abs(np.arange(256) - np.arange(256)[:, None]) ** 2)).sum()

    # Create feature vector
    X_processed = np.array([mean, variance, homogeneity, contrast]).reshape(1, -1)
    # Ensure the features are in the same format as used during training
    X_processed = scaler.transform(X_processed)  # Assuming you have a trained scaler
    
    return X_processed

# Path to the single X-ray image you want to test
image_path_to_test = 'C33P1thinF_IMG_20150619_121229a_cell_177.png'
# Preprocess the single image
X_single_image = preprocess_single_image(image_path_to_test, scaler)

# Make prediction
y_pred_single_image = model.predict(X_single_image)

# If you want probability scores for binary classification
y_proba_single_image = model.predict_proba(X_single_image)[:, 1]

# Display the prediction
print(f"Image: {image_path_to_test}, Prediction: {y_pred_single_image[0]}, Probability: {y_proba_single_image[0]}")


Image: C33P1thinF_IMG_20150619_121229a_cell_177.png, Prediction: 1, Probability: 0.7838847041130066
