**Data Processing**

Quick Recap about we try to do :
1. Loaded and filtered the X-ray metadata (PA view).

2. Converted disease labels into multi-label binary format.

3. Created a PyTorch Dataset class to load and process each image.

4. Applied standard image transforms (resize, normalize).

5. Split data into training and validation sets.

6. Built DataLoaders to handle data batching and shuffling.



In [19]:
import pandas as pd
import pandas as pd
import os
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import torch
from torchvision import transforms
from sklearn.model_selection import train_test_split

df = pd.read_csv(r"C:\Users\LENOVO\Documents\Lung\chest-xray-diagnosis\Data_Entry_2017_v2020.csv")


df_pa = df[df['View Position'] == 'PA'].copy()

print("NUM Of Images PA:", len(df_pa))
df_pa.head()


NUM Of Images PA: 67310


Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y]
0,00000001_000.png,Cardiomegaly,0,1,57,M,PA,2682,2749,0.143,0.143
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168
3,00000002_000.png,No Finding,0,2,80,M,PA,2500,2048,0.171,0.171
4,00000003_001.png,Hernia,0,3,74,F,PA,2500,2048,0.168,0.168


In [None]:
# Extract unique disease labels
# Multi-label Binarization
all_labels = set()
for label_str in df_pa["Finding Labels"]:
    for label in label_str.split('|'):
        all_labels.add(label.strip())
all_labels = sorted(list(all_labels))
print("NUM of Diseases", len(all_labels))
print(all_labels)        

NUM of Diseases 15
['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion', 'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass', 'No Finding', 'Nodule', 'Pleural_Thickening', 'Pneumonia', 'Pneumothorax']


In [7]:
for disease in all_labels:
    df_pa[disease] = 0

for index, row in df_pa.iterrows():
    for label in row["Finding Labels"].split('|'):
        label = label.strip()
        if label in all_labels:
            df_pa.at[index, label] = 1
columns_to_keep = ['Image Index'] + all_labels
df_pa_final = df_pa[columns_to_keep]     
save_path = r"C:\Users\LENOVO\Documents\Lung\train_processed.csv"
df_pa_final.to_csv(save_path, index=False)
       

In [None]:
#Define Custom PyTorch Dataset
class ChestXrayDataset(Dataset):
    def __init__(self, dataframe, images_folder, transform=None):
        self.df = dataframe.reset_index(drop=True)
        self.images_folder = images_folder
        self.transform = transform
        self.labels = dataframe[all_labels].values.astype('float32')  # Convert labels to float tensor

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        img_name = self.df.loc[idx, 'Image Index']
        img_path = os.path.join(self.images_folder, img_name)

        image = Image.open(img_path).convert('RGB')  

        if self.transform:
            image = self.transform(image)

        label = torch.tensor(self.labels[idx])
        return image, label


In [21]:
#  Define Transformations & DataLoader
# Resizes all images to 224x224 to match DenseNet's expected input.

# Transformations for training (with augmentation)
train_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ColorJitter(brightness=0.1, contrast=0.1),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406],
                        [0.229, 0.224, 0.225])
])

# Validation Transform (no augmentation)
val_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406],
                        [0.229, 0.224, 0.225])
])

images_dir = r"C:\Users\LENOVO\Documents\Lung\chest-xray-diagnosis\images"
dataset = ChestXrayDataset(df_pa_final, images_dir, transform=image_transform)
# Split dataset (80% train, 20% val)
train_df, val_df = train_test_split(df_pa_final, test_size=0.2, random_state=42)
train_dataset = ChestXrayDataset(train_df, images_dir, transform=train_transform)
val_dataset = ChestXrayDataset(val_df, images_dir, transform=val_transform)



# Create DataLoaders
# DataLoaders are used to  load data in batches,Shuffle training data and Feed data into the model during training.


train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=2)

print("✅ DataLoader ready for training!")



TypeError: 'DataFrame' object is not callable

In [6]:
import pandas as pd

# 1. اقرأ الفايل
df_labels = pd.read_csv(r"C:\Users\LENOVO\Documents\Lung\chest-xray-diagnosis\train_processed.csv")  # عدلي الاسم حسب اللي عندك

# 2. احسبي عدد الأمراض المشخّصة في كل صف
disease_cols = df_labels.columns[1:]  # أول عمود هو Image Index
df_labels['disease_count'] = df_labels[disease_cols].sum(axis=1)

# 3. فلتر الصفوف اللي فيها مرض واحد فقط
df_single_disease = df_labels[df_labels['disease_count'] == 1].copy()

# 4. احذف عمود العد (لو مش عايزاه)
df_single_disease.drop(columns='disease_count', inplace=True)

# 5. احفظ الملف
df_single_disease.to_csv("single_disease_only.csv", index=False)


disease_counts = {
    col: int(df_single_disease[col].sum()) for col in disease_cols
}

# 7. رتبيهم تنازليًا
sorted_counts = sorted(disease_counts.items(), key=lambda x: x[1], reverse=True)

# 8. اطبعهم
print("✅ عدد الصور لكل مرض (مرتبة من الأعلى للأقل):\n")
for disease, count in sorted_counts:
    print(f"{disease}: {count}")

✅ عدد الصور لكل مرض (مرتبة من الأعلى للأقل):

No Finding: 39302
Infiltration: 5270
Atelectasis: 2210
Effusion: 2086
Nodule: 1924
Pneumothorax: 1506
Mass: 1367
Pleural_Thickening: 875
Cardiomegaly: 746
Fibrosis: 648
Emphysema: 525
Consolidation: 346
Pneumonia: 176
Hernia: 98
Edema: 51
