In [1]:
import os
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import torch

# List of possible image extensions
image_formats = {'.jpg', '.jpeg', '.png', '.JPG', '.JPEG', '.PNG', '.heic', '.HEIC'}

def get_image_files(patient_path):
    """ Get all image files in a patient's folder """
    image_files = []
    for file in os.listdir(patient_path):
        if any(file.lower().endswith(ext) for ext in image_formats):
            image_files.append(os.path.join(patient_path, file))
    return image_files

class AnemiaDataset(Dataset):
    def __init__(self, data_dir, transform=None):
        self.data_dir = data_dir
        self.transform = transform
        self.image_paths = []
        self.labels = []

        # Walk through the dataset directories (IRON DEFICIENCY ANEMIA and NON - IRON DEFICIENCY ANEMIA)
        for condition in ['IRON DEFICIENCY ANEMIA', 'NON - IRON DEFICIENCY ANEMIA']:
            condition_path = os.path.join(data_dir, condition)
            if not os.path.exists(condition_path):
                continue
            
            # Iterate through each patient folder
            for patient_folder in os.listdir(condition_path):
                patient_path = os.path.join(condition_path, patient_folder)
                if os.path.isdir(patient_path):
                    # Get all image files for the patient (regardless of their name)
                    img_files = get_image_files(patient_path)
                    
                    # If any images are found, add them to the dataset
                    for img_path in img_files:
                        self.image_paths.append(img_path)
                        # Set label: 1 for "IRON DEFICIENCY ANEMIA", 0 for "NON - IRON DEFICIENCY ANEMIA"
                        self.labels.append(1 if condition == 'IRON DEFICIENCY ANEMIA' else 0)

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img_path = self.image_paths[idx]
        label = self.labels[idx]
        
        # Load the image
        image = Image.open(img_path).convert('RGB')  # Ensure it's RGB format
        
        # Apply any transformations (resize, normalize, etc.)
        if self.transform:
            image = self.transform(image)

        return image, label


# Define the transformations to apply to each image
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize to 224x224 pixels (common input size for CNNs)
    transforms.ToTensor(),  # Convert image to tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalization (ImageNet values)
])

# Define the path to your dataset
dataset_dir = r".\real_data"

# Create the custom dataset
dataset = AnemiaDataset(data_dir=dataset_dir, transform=transform)

# Print the total number of images found
print(f"Total number of images found: {len(dataset)}")

# Split the dataset into training and validation
train_size = int(0.8 * len(dataset))  # 80% for training
val_size = len(dataset) - train_size  # 20% for validation
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

# Create DataLoader for training and validation
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

# Example of how to access the data
for images, labels in train_loader:
    print(images.shape)  # Batch of images
    print(labels)        # Corresponding labels
    break


Total number of images found: 441
torch.Size([64, 3, 224, 224])
tensor([1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0,
        1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0,
        1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1])


In [2]:
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from tqdm import tqdm
import numpy as np

# Load the pre-trained ResNet50 model
device='cuda'
resnet = models.resnet50(pretrained=True)
resnet.fc = torch.nn.Identity()  # Remove the last fully connected layer (so we get features)
resnet = resnet.to(device)
resnet.eval()  # Set the model to evaluation mode

# Function to extract features
def extract_features(data_loader, model, device):
    features = []
    labels_list = []
    
    with torch.no_grad():
        for images, labels in tqdm(data_loader, desc="Extracting Features"):
            images = images.to(device)
            output = model(images)  # Extract features
            
            features.append(output.cpu().numpy())  # Move to CPU and convert to NumPy
            labels_list.append(labels.cpu().numpy())
    
    features = np.concatenate(features, axis=0)
    labels = np.concatenate(labels_list, axis=0)
    
    return features, labels

# Extract features from training and validation sets
train_features, train_labels = extract_features(train_loader, resnet, device)
val_features, val_labels = extract_features(val_loader, resnet, device)


Extracting Features: 100%|██████████| 6/6 [00:14<00:00,  2.36s/it]
Extracting Features: 100%|██████████| 2/2 [00:03<00:00,  1.86s/it]


In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Train Logistic Regression
logistic_clf = LogisticRegression(max_iter=1000)
logistic_clf.fit(train_features, train_labels)

# Validate Logistic Regression
logistic_preds = logistic_clf.predict(val_features)
logistic_accuracy = accuracy_score(val_labels, logistic_preds)
print(f"Logistic Regression Accuracy: {logistic_accuracy * 100:.2f}%")

# Train Support Vector Machine (SVM)
svm_clf = SVC(kernel='poly')  # You can change kernel to 'rbf', 'poly', etc.
svm_clf.fit(train_features, train_labels)

# Validate SVM
svm_preds = svm_clf.predict(val_features)
svm_accuracy = accuracy_score(val_labels, svm_preds)
print(f"SVM Accuracy: {svm_accuracy * 100:.2f}%")


Logistic Regression Accuracy: 83.15%
SVM Accuracy: 86.52%


In [7]:
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report

def visualize_ml_models(logistic_clf, svm_clf, 
                         train_features, train_labels, 
                         val_features, val_labels, 
                         save_dir='ml_visualizations'):
    # Create visualizations directory
    os.makedirs(save_dir, exist_ok=True)
    
    # Predictions
    logistic_preds = logistic_clf.predict(val_features)
    svm_preds = svm_clf.predict(val_features)
    
    # Performance comparison plot
    plt.figure(figsize=(12, 5), facecolor='#2C3E50')
    plt.style.use('dark_background')
    
    # Bar plot for model accuracies
    models = ['Logistic Regression', 'Support Vector Machine']
    accuracies = [
        accuracy_score(val_labels, logistic_preds) * 100,
        accuracy_score(val_labels, svm_preds) * 100
    ]
    
    plt.bar(models, accuracies, color=['#3498DB', '#E74C3C'])
    plt.title('Model Performance Comparison', color='white', fontsize=15)
    plt.ylabel('Accuracy (%)', color='white')
    plt.ylim(0, 100)
    
    # Add accuracy values on top of bars
    for i, acc in enumerate(accuracies):
        plt.text(i, acc+1, f'{acc:.2f}%', 
                 horizontalalignment='center', 
                 color='white', 
                 fontweight='bold')
    
    plt.tight_layout()
    plt.savefig(os.path.join(save_dir, 'model_accuracy_comparison.png'), 
                facecolor='#2C3E50', 
                edgecolor='none', 
                dpi=300)
    plt.close()
    
    # Confusion Matrices
    plt.figure(figsize=(12, 5), facecolor='#2C3E50')
    plt.style.use('dark_background')
    
    # Logistic Regression Confusion Matrix
    plt.subplot(1, 2, 1)
    cm_logistic = confusion_matrix(val_labels, logistic_preds)
    sns.heatmap(cm_logistic, annot=True, fmt='d', cmap='Blues', 
                cbar=False, square=True)
    plt.title('Logistic Regression\nConfusion Matrix', color='white')
    plt.xlabel('Predicted', color='white')
    plt.ylabel('Actual', color='white')
    
    # SVM Confusion Matrix
    plt.subplot(1, 2, 2)
    cm_svm = confusion_matrix(val_labels, svm_preds)
    sns.heatmap(cm_svm, annot=True, fmt='d', cmap='Reds', 
                cbar=False, square=True)
    plt.title('SVM\nConfusion Matrix', color='white')
    plt.xlabel('Predicted', color='white')
    plt.ylabel('Actual', color='white')
    
    plt.tight_layout()
    plt.savefig(os.path.join(save_dir, 'confusion_matrices.png'), 
                facecolor='#2C3E50', 
                edgecolor='none', 
                dpi=300)
    plt.close()
    
    # Classification Reports
    report_logistic = classification_report(val_labels, logistic_preds)
    report_svm = classification_report(val_labels, svm_preds)
    
    with open(os.path.join(save_dir, 'classification_reports.txt'), 'w') as f:
        f.write("LOGISTIC REGRESSION REPORT\n")
        f.write("-" * 30 + "\n")
        f.write(report_logistic + "\n\n")
        f.write("SVM REPORT\n")
        f.write("-" * 30 + "\n")
        f.write(report_svm)
    
    print(f"Visualizations saved in {save_dir}")

# Usage would be:
visualize_ml_models(logistic_clf, svm_clf, 
                    train_features, train_labels, 
                    val_features, val_labels)

Visualizations saved in ml_visualizations
