In [45]:
import os
from glob import glob
import pandas as pd

# Step 1: read the metadata table that accompanies the images.
csv_path = "/kaggle/input/data/Data_Entry_2017.csv"
df = pd.read_csv(csv_path)

# Step 2: keep only rows whose 'View Position' equals 'PA' (posteroanterior).
# The explicit copy makes df_pa an independent frame, so the column added in
# Step 5 is not written into a slice of df.
df_pa = df.loc[df['View Position'].eq('PA')].copy()

print("Number of PA images:", df_pa.shape[0])
print(df_pa.loc[:, ['Image Index', 'Finding Labels', 'View Position']].head())

# Step 3: collect every PNG path from the image folders.
# Folder names follow the pattern images_001 ... images_012.
image_folders = [f"/kaggle/input/data/images_{i:03d}/images" for i in range(1, 13)]

all_image_paths = []
for folder in image_folders:
    png_pattern = os.path.join(folder, "*.png")
    all_image_paths += glob(png_pattern)

# Step 4: build a lookup of file name -> full path.
# If the same file name occurs more than once, the last path found wins.
image_paths_dict = dict(zip(map(os.path.basename, all_image_paths), all_image_paths))

# Step 5: attach the full path to every PA row (NaN where no file was found).
df_pa['image_path'] = df_pa['Image Index'].map(image_paths_dict)

# Step 6: report how many rows have no matching file (expected: 0).
print("Missing paths:", df_pa['image_path'].isna().sum())
print(df_pa.loc[:, ['Image Index', 'image_path']].head())


Number of PA images: 67310
        Image Index          Finding Labels View Position
0  00000001_000.png            Cardiomegaly            PA
1  00000001_001.png  Cardiomegaly|Emphysema            PA
2  00000001_002.png   Cardiomegaly|Effusion            PA
3  00000002_000.png              No Finding            PA
4  00000003_000.png                  Hernia            PA
Missing paths: 0
        Image Index                                         image_path
0  00000001_000.png  /kaggle/input/data/images_001/images/00000001_...
1  00000001_001.png  /kaggle/input/data/images_001/images/00000001_...
2  00000001_002.png  /kaggle/input/data/images_001/images/00000001_...
3  00000002_000.png  /kaggle/input/data/images_001/images/00000002_...
4  00000003_000.png  /kaggle/input/data/images_001/images/00000003_...


In [2]:
import pandas as pd
import os
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import torch
from torchvision import transforms
from sklearn.model_selection import train_test_split
from glob import glob

# 🟢 1. Load the CSV
csv_path = "/kaggle/input/single-disease-only/single_disease_only.csv"
df = pd.read_csv(csv_path)

# 🟢 2. Extract disease label columns
# Every column after the first is treated as a label; this assumes the first
# column is 'Image Index' (the column used for the path lookup below).
all_labels = list(df.columns[1:])
print("Diseases:", all_labels)

# 🟢 3. Split data (fixed random_state keeps the split reproducible)
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# 🟢 4. Map image names to their full paths
images_dir = "/kaggle/input/data"
image_folders = [f"{images_dir}/images_{str(i).zfill(3)}/images" for i in range(1, 13)]

image_paths_dict = {}
for folder in image_folders:
    for path in glob(os.path.join(folder, "*.png")):
        image_paths_dict[os.path.basename(path)] = path

# Build new frames with .assign instead of writing a column into the frames
# returned by train_test_split: those are derived from df, and item assignment
# on them can raise SettingWithCopyWarning depending on the pandas /
# scikit-learn versions in use.
train_df = train_df.assign(image_path=train_df["Image Index"].map(image_paths_dict))
val_df = val_df.assign(image_path=val_df["Image Index"].map(image_paths_dict))

# Fail fast on unmapped images. Series.map leaves NaN for names that are not in
# the lookup, and a NaN path would otherwise only surface later as an opaque
# error when the image is opened during training.
for split_name, split_df in (("train", train_df), ("validation", val_df)):
    n_missing = int(split_df["image_path"].isna().sum())
    if n_missing:
        examples = split_df.loc[split_df["image_path"].isna(), "Image Index"].head().tolist()
        raise FileNotFoundError(
            f"{n_missing} {split_name} image(s) not found under {images_dir}; "
            f"first missing: {examples}"
        )

# 🟢 5. Define PyTorch Dataset
class ChestXrayDataset(Dataset):
    """Map-style dataset yielding ``(image, label)`` pairs from a DataFrame.

    Args:
        dataframe: Frame with an ``image_path`` column and one column per
            label. Its index is reset, so any index is accepted.
        transform: Optional callable applied to the RGB PIL image. When
            ``None`` the PIL image is returned unchanged.
        label_columns: Names of the label columns. When ``None`` the
            notebook-level ``all_labels`` list is used, which keeps existing
            ``ChestXrayDataset(df, transform=...)`` calls working.

    Attributes:
        df: The input frame with a fresh 0..n-1 index.
        labels: ``float32`` NumPy array of shape ``(len(df), n_labels)``.
        label_columns: The label column names actually used.
    """

    def __init__(self, dataframe, transform=None, label_columns=None):
        # Resolve the label columns once, at construction time, so the dataset
        # does not silently change if the global list is rebound later.
        if label_columns is None:
            label_columns = all_labels
        self.label_columns = list(label_columns)

        self.df = dataframe.reset_index(drop=True)
        self.transform = transform
        # Row i of `labels` corresponds to row i of `self.df`.
        self.labels = self.df[self.label_columns].to_numpy(dtype="float32")

    def __len__(self):
        """Return the number of samples."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(image, label)``.

        ``label`` is a 1-D float32 tensor with one entry per label column.
        """
        img_path = self.df.loc[idx, "image_path"]
        # The context manager closes the underlying file as soon as the pixel
        # data has been converted; `convert` returns an independent image.
        with Image.open(img_path) as img:
            image = img.convert("RGB")
        # Identity check rather than truthiness, so a callable that happens to
        # evaluate as falsy is still applied.
        if self.transform is not None:
            image = self.transform(image)
        label = torch.tensor(self.labels[idx])
        return image, label

# 🟢 6. Transforms
# Both pipelines resize to 224x224, convert to a tensor and normalize per
# channel; only the training pipeline adds random augmentation.
# NOTE(review): the mean/std values look like the usual ImageNet statistics —
# confirm they match the backbone used downstream.
train_transform = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(degrees=10),
        transforms.ColorJitter(brightness=0.1, contrast=0.1),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)

val_transform = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)

# 🟢 7. Datasets & Loaders
train_dataset = ChestXrayDataset(train_df, transform=train_transform)
val_dataset = ChestXrayDataset(val_df, transform=val_transform)

# Settings common to both loaders; only the training loader shuffles.
loader_kwargs = dict(batch_size=32, num_workers=2)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)

print("✅ Data ready. Number of training samples:", len(train_dataset))
print("✅ Number of validation samples:", len(val_dataset))


Diseases: ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion', 'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass', 'No Finding', 'Nodule', 'Pleural_Thickening', 'Pneumonia', 'Pneumothorax']
✅ Data ready. Number of training samples: 45704
✅ Number of validation samples: 11426
