In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
import cv2
import glob
import numpy as np
import joblib
# Initialize a StandardScaler
scaler = StandardScaler()

In [2]:
# Paths to your TB and non-TB image directories
tb_image_dir = 'Tuberculosis'
non_tb_image_dir = 'Normal'

# Collect image paths
tb_image_paths = glob.glob(tb_image_dir + '/*.png')
non_tb_image_paths = glob.glob(non_tb_image_dir + '/*.png')

# Combine paths and create labels
all_image_paths = tb_image_paths + non_tb_image_paths
labels = [1] * len(tb_image_paths) + [0] * len(non_tb_image_paths)


In [3]:
# Initialize an empty list to store accuracies
accuracies = []

# Perform K-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=None)

for train_index, val_index in kf.split(all_image_paths):
    X_train_paths, X_val_paths = [all_image_paths[i] for i in train_index], [all_image_paths[i] for i in val_index]
    y_train, y_val = [labels[i] for i in train_index], [labels[i] for i in val_index]
    
    # Load and preprocess training images
    X_train = []
    for path in tqdm(X_train_paths, desc="Training Images"):
        image = cv2.imread(path)
        resized_image = cv2.resize(image, (224, 224))
        gray_image = cv2.cvtColor(resized_image, cv2.COLOR_BGR2GRAY)
        # Extract features - you can use GLCM or any other feature extraction method here
        # For example, using GLCM:
        hist = cv2.calcHist([gray_image], [0], None, [256], [0, 256])
        hist /= hist.sum()
        mean = (hist * np.arange(256)).sum()
        variance = (hist * ((np.arange(256) - mean) ** 2)).sum()
        homogeneity = (hist / (1 + np.abs(np.arange(256) - np.arange(256)[:, None]))).sum()
        contrast = (hist * (np.abs(np.arange(256) - np.arange(256)[:, None]) ** 2)).sum()
        X_train.append([mean, variance, homogeneity, contrast])
        
    # Normalize and transform features
    X_train = scaler.fit_transform(X_train)
    joblib.dump(scaler, 'scaler.pkl') 
    # Load and preprocess validation images
    X_val = []
    for path in tqdm(X_val_paths, desc="Validation Images"):
        image = cv2.imread(path)
        resized_image = cv2.resize(image, (224, 224))
        gray_image = cv2.cvtColor(resized_image, cv2.COLOR_BGR2GRAY)
        # Extract features - same as in the training set
        hist = cv2.calcHist([gray_image], [0], None, [256], [0, 256])
        hist /= hist.sum()
        mean = (hist * np.arange(256)).sum()
        variance = (hist * ((np.arange(256) - mean) ** 2)).sum()
        homogeneity = (hist / (1 + np.abs(np.arange(256) - np.arange(256)[:, None]))).sum()
        contrast = (hist * (np.abs(np.arange(256) - np.arange(256)[:, None]) ** 2)).sum()
        X_val.append([mean, variance, homogeneity, contrast])

    # Normalize and transform features
    X_val = scaler.transform(X_val)
    
    # Initialize Random Forest classifier
    model = RandomForestClassifier(n_estimators=100, random_state=42)  # You can adjust the hyperparameters
    
    # Train the model
    model.fit(X_train, y_train)
    

    # Make predictions on validation data
    y_pred = model.predict(X_val)
    
    # Evaluate the model
    accuracy = accuracy_score(y_val, y_pred)
    
    # Print results for this fold
    print(f"Accuracy for this fold: {accuracy}")
    
    # Store results in the list for later analysis
    accuracies.append(accuracy)

# After the loop, assess overall performance    
print("Mean accuracy:", np.mean(accuracies))

Training Images: 100%|█████████████████████████████████████████████████████████████| 3864/3864 [03:05<00:00, 20.88it/s]
Validation Images: 100%|█████████████████████████████████████████████████████████████| 967/967 [00:45<00:00, 21.45it/s]


Accuracy for this fold: 0.7538779731127198


Training Images: 100%|█████████████████████████████████████████████████████████████| 3865/3865 [02:16<00:00, 28.35it/s]
Validation Images: 100%|█████████████████████████████████████████████████████████████| 966/966 [00:35<00:00, 27.39it/s]


Accuracy for this fold: 0.7556935817805382


Training Images: 100%|█████████████████████████████████████████████████████████████| 3865/3865 [02:14<00:00, 28.80it/s]
Validation Images: 100%|█████████████████████████████████████████████████████████████| 966/966 [00:36<00:00, 26.45it/s]


Accuracy for this fold: 0.7681159420289855


Training Images: 100%|█████████████████████████████████████████████████████████████| 3865/3865 [02:15<00:00, 28.52it/s]
Validation Images: 100%|█████████████████████████████████████████████████████████████| 966/966 [00:34<00:00, 28.00it/s]


Accuracy for this fold: 0.7536231884057971


Training Images: 100%|█████████████████████████████████████████████████████████████| 3865/3865 [02:20<00:00, 27.43it/s]
Validation Images: 100%|█████████████████████████████████████████████████████████████| 966/966 [00:34<00:00, 28.25it/s]


Accuracy for this fold: 0.7453416149068323
Mean accuracy: 0.7553304600469747


In [4]:
joblib.dump(model, 'random_forest_model.pkl')

['random_forest_model.pkl']

In [5]:
# Load and preprocess a single X-ray image
def preprocess_single_image(image_path):
    # Load image
    image = cv2.imread(image_path)
    
    # Preprocess image (resize, convert to grayscale, extract HOG features, etc.)
    resized_image = cv2.resize(image, (224, 224))
    gray_image = cv2.cvtColor(resized_image, cv2.COLOR_BGR2GRAY)
    # Compute GLCM-like features
    hist = cv2.calcHist([gray_image], [0], None, [256], [0, 256])
    hist /= hist.sum()
    mean = (hist * np.arange(256)).sum()
    variance = (hist * ((np.arange(256) - mean) ** 2)).sum()
    homogeneity = (hist / (1 + np.abs(np.arange(256) - np.arange(256)[:, None]))).sum()
    contrast = (hist * (np.abs(np.arange(256) - np.arange(256)[:, None]) ** 2)).sum()

    # Create feature vector
    X_processed = np.array([mean, variance, homogeneity, contrast]).reshape(1, -1)
    # Ensure the features are in the same format as used during training
    X_processed = scaler.transform(X_processed)  # Assuming you have a trained scaler
    
    return X_processed

# Path to the single X-ray image you want to test
image_path_to_test = 'unseenTB2.png'

# Preprocess the single image
X_single_image = preprocess_single_image(image_path_to_test)

# Make prediction
y_pred_single_image = model.predict(X_single_image)

# If you want probability scores for binary classification
y_proba_single_image = model.predict_proba(X_single_image)[:, 1]

# Display the prediction
print(f"Image: {image_path_to_test}, Prediction: {y_pred_single_image[0]}, Probability: {y_proba_single_image[0]}")


Image: unseenTB2.png, Prediction: 1, Probability: 0.88
