In [1]:
import os
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from torchvision import transforms
import torch
import torch.nn as nn
import torch.optim as optim
import timm
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score  # <-- Add this line


# Step 1: Define your base directories
base_dir = r'/kaggle/input/cbis-ddsm-breast-cancer-image-dataset'
jpg_dir = os.path.join(base_dir, 'jpeg')
csv_dir = os.path.join(base_dir, 'csv')

In [2]:
import pandas as pd
import numpy as np

# Step 2: Load your CSV data
df_mass_train = pd.read_csv(os.path.join(csv_dir, 'mass_case_description_train_set.csv'))

df_mass_test = pd.read_csv(os.path.join(csv_dir, 'mass_case_description_test_set.csv'))

In [3]:
# Step 3: Correct the file paths in the DataFrame to point to cropped images
def correct_file_path(row):
    directory = os.path.basename(os.path.dirname(row['cropped image file path']))
    full_dir_path = os.path.join(jpg_dir, directory)
    all_files_in_dir = os.listdir(full_dir_path)
    
    if len(all_files_in_dir) > 0:
        correct_filename = all_files_in_dir[0]
        correct_path = os.path.join(full_dir_path, correct_filename)
    else:
        correct_path = None

    return correct_path

# Apply the corrected path to your DataFrame using the cropped images
df_mass_train['image file path'] = df_mass_train.apply(correct_file_path, axis=1)

In [4]:
# Step 4: Filter out missing files and ensure both CC and MLO views are present
df_mass_filtered = df_mass_train[df_mass_train['image file path'].notnull() & df_mass_train['image file path'].apply(os.path.exists)]

# Ensure both CC and MLO views are present for each patient
df_mass_filtered = df_mass_filtered.groupby('patient_id').filter(lambda x: len(x['image view'].unique()) == 2)

print(f"Remaining files after correction: {len(df_mass_filtered)}")

Remaining files after correction: 1111


In [5]:
# Step 5: Define transformations and create DataLoaders
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

In [6]:
df_mass_filtered.head()

Unnamed: 0,patient_id,breast_density,left or right breast,image view,abnormality id,abnormality type,mass shape,mass margins,assessment,pathology,subtlety,image file path,cropped image file path,ROI mask file path
0,P_00001,3,LEFT,CC,1,mass,IRREGULAR-ARCHITECTURAL_DISTORTION,SPICULATED,4,MALIGNANT,4,/kaggle/input/cbis-ddsm-breast-cancer-image-da...,Mass-Training_P_00001_LEFT_CC_1/1.3.6.1.4.1.95...,Mass-Training_P_00001_LEFT_CC_1/1.3.6.1.4.1.95...
1,P_00001,3,LEFT,MLO,1,mass,IRREGULAR-ARCHITECTURAL_DISTORTION,SPICULATED,4,MALIGNANT,4,/kaggle/input/cbis-ddsm-breast-cancer-image-da...,Mass-Training_P_00001_LEFT_MLO_1/1.3.6.1.4.1.9...,Mass-Training_P_00001_LEFT_MLO_1/1.3.6.1.4.1.9...
2,P_00004,3,LEFT,CC,1,mass,ARCHITECTURAL_DISTORTION,ILL_DEFINED,4,BENIGN,3,/kaggle/input/cbis-ddsm-breast-cancer-image-da...,Mass-Training_P_00004_LEFT_CC_1/1.3.6.1.4.1.95...,Mass-Training_P_00004_LEFT_CC_1/1.3.6.1.4.1.95...
3,P_00004,3,LEFT,MLO,1,mass,ARCHITECTURAL_DISTORTION,ILL_DEFINED,4,BENIGN,3,/kaggle/input/cbis-ddsm-breast-cancer-image-da...,Mass-Training_P_00004_LEFT_MLO_1/1.3.6.1.4.1.9...,Mass-Training_P_00004_LEFT_MLO_1/1.3.6.1.4.1.9...
4,P_00004,3,RIGHT,MLO,1,mass,OVAL,CIRCUMSCRIBED,4,BENIGN,5,/kaggle/input/cbis-ddsm-breast-cancer-image-da...,Mass-Training_P_00004_RIGHT_MLO_1/1.3.6.1.4.1....,Mass-Training_P_00004_RIGHT_MLO_1/1.3.6.1.4.1....


In [7]:
df_mass_filtered = df_mass_filtered[['pathology', 'image file path']]
df_mass_filtered.head()

Unnamed: 0,pathology,image file path
0,MALIGNANT,/kaggle/input/cbis-ddsm-breast-cancer-image-da...
1,MALIGNANT,/kaggle/input/cbis-ddsm-breast-cancer-image-da...
2,BENIGN,/kaggle/input/cbis-ddsm-breast-cancer-image-da...
3,BENIGN,/kaggle/input/cbis-ddsm-breast-cancer-image-da...
4,BENIGN,/kaggle/input/cbis-ddsm-breast-cancer-image-da...


In [8]:
def map_pathology(value):
    if value == 'MALIGNANT':
        return 1
    else:  # 'BENIGN' or 'NORMAL'
        return 0

df_mass_filtered['pathology'] = df_mass_filtered['pathology'].apply(map_pathology)

df_mass_filtered.head()

Unnamed: 0,pathology,image file path
0,1,/kaggle/input/cbis-ddsm-breast-cancer-image-da...
1,1,/kaggle/input/cbis-ddsm-breast-cancer-image-da...
2,0,/kaggle/input/cbis-ddsm-breast-cancer-image-da...
3,0,/kaggle/input/cbis-ddsm-breast-cancer-image-da...
4,0,/kaggle/input/cbis-ddsm-breast-cancer-image-da...


In [9]:
df_mass_filtered.isnull().sum()

pathology          0
image file path    0
dtype: int64

In [10]:
!pip install open_clip_torch

Collecting open_clip_torch
  Downloading open_clip_torch-2.28.0-py3-none-any.whl.metadata (31 kB)
Collecting ftfy (from open_clip_torch)
  Downloading ftfy-6.3.0-py3-none-any.whl.metadata (7.1 kB)
Downloading open_clip_torch-2.28.0-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m30.8 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hDownloading ftfy-6.3.0-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ftfy, open_clip_torch
Successfully installed ftfy-6.3.0 open_clip_torch-2.28.0


In [11]:
import torch
import open_clip
from PIL import Image
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
import numpy as np
from sklearn.metrics import accuracy_score, classification_report

In [12]:
# Set device for model
device = "cuda" if torch.cuda.is_available() else "cpu"

In [13]:
# Load CLIP model and preprocessing using open_clip
model, _, preprocess = open_clip.create_model_and_transforms("ViT-B-32", pretrained="laion400m_e32")
model = model.to(device)

100%|███████████████████████████████████████| 605M/605M [00:06<00:00, 93.2MiB/s]


In [14]:
# Define dataset class
class CLIPDataset(Dataset):
    def __init__(self, image_paths, labels, transform=None):
        self.image_paths = image_paths
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image = Image.open(self.image_paths[idx]).convert("RGB")
        label = self.labels[idx]
        if self.transform:
            image = self.transform(image)
        return image, label

In [15]:
# Load and preprocess data
image_paths = df_mass_filtered['image file path'].tolist()  # List of image file paths
labels = df_mass_filtered['pathology'].tolist()       # List of labels (0 for Benign/Normal, 1 for Malignant)

In [16]:
# Apply CLIP's preprocessing
transform = transforms.Compose([
    preprocess,
])

In [17]:
# Create dataset and dataloader
dataset = CLIPDataset(image_paths, labels, transform=transform)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [18]:
# Define CLIP text tokens for classification labels
class_names = ["a photo of a benign tumor", "a photo of a malignant tumor"]
text_inputs = open_clip.tokenize(class_names).to(device)

In [19]:
# Training setup (parameters can be adjusted)
num_epochs = 5
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()

In [20]:
# Training loop
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    for images, labels in dataloader:
        images = images.to(device)
        labels = labels.to(device)
        
        # Forward pass: calculate CLIP features for images and text
        image_features = model.encode_image(images)
        text_features = model.encode_text(text_inputs)
        
        # Calculate logits and loss
        logits_per_image = image_features @ text_features.T  # Matrix multiplication for similarity
        loss = loss_fn(logits_per_image, labels)
        epoch_loss += loss.item()
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss/len(dataloader)}")

# Evaluation
model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for images, labels in dataloader:
        images = images.to(device)
        labels = labels.to(device)
        
        image_features = model.encode_image(images)
        logits_per_image = image_features @ text_features.T  # Matrix multiplication for similarity
        probs = logits_per_image.softmax(dim=-1)
        preds = probs.argmax(dim=-1)
        
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Performance metrics
accuracy = accuracy_score(all_labels, all_preds)
report = classification_report(all_labels, all_preds, target_names=class_names)
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", report)

Epoch 1/5, Loss: 0.9334361689431326
Epoch 2/5, Loss: 0.6779544132096427
Epoch 3/5, Loss: 0.6122253554207938
Epoch 4/5, Loss: 0.5542237545762744
Epoch 5/5, Loss: 0.36146032001291
Accuracy: 0.9298
Classification Report:
                               precision    recall  f1-score   support

   a photo of a benign tumor       0.96      0.90      0.93       570
a photo of a malignant tumor       0.90      0.96      0.93       541

                    accuracy                           0.93      1111
                   macro avg       0.93      0.93      0.93      1111
                weighted avg       0.93      0.93      0.93      1111

