In [1]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm


from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

In [2]:
# Paths used by this cell. They are machine-specific absolute Windows paths;
# update them before running the notebook anywhere else.
GROUND_TRUTH_PATH = r"D:\Projects\Disease_Detection\Ground_Truth.csv"  # Update with correct path
OUTPUT_PATH = r"D:\Projects\Disease_Detection\yolov5\data_yolo\labels\train"
# NOTE(review): OUTPUT_PATH already ends in "labels\train", so this resolves to
# "...\labels\train\labels" - confirm the extra "labels" level is intended.
YOLO_LABELS_PATH = os.path.join(OUTPUT_PATH, "labels")

# Create the label output directory (no error if it already exists).
os.makedirs(YOLO_LABELS_PATH, exist_ok=True)

# Load the ground-truth table from the configured path, so that editing
# GROUND_TRUTH_PATH above is enough to point the notebook at another file.
df = pd.read_csv(GROUND_TRUTH_PATH)

# 🔹 Step 1: Normalize Age
# The scaled values go into a new "Normalized Age" column; the raw
# "Patient Age" column is left untouched.
scaler = MinMaxScaler()
df["Normalized Age"] = scaler.fit_transform(df[["Patient Age"]])

# 🔹 Step 2: Encode Gender (M → 0, F → 1)
# Series.map with a dict yields NaN for any value other than "M" / "F".
# NOTE(review): cell In [11] further down encodes the opposite way
# (M -> 1, F -> 0) - confirm which convention downstream code expects.
df["Gender Encoded"] = df["Patient Gender"].map({"M": 0, "F": 1})

# 🔹 Step 3: One-Hot Encode View Position
encoder = OneHotEncoder(sparse_output=False)
view_encoded = encoder.fit_transform(df[["View Position"]])
# Give the encoded frame df's own index: pd.concat(axis=1) aligns rows on index
# labels, so a fresh 0..n-1 index would misalign rows (and introduce NaNs)
# whenever df's index is not the default RangeIndex.
view_encoded_df = pd.DataFrame(
    view_encoded,
    columns=encoder.get_feature_names_out(["View Position"]),
    index=df.index,
)
df = pd.concat([df, view_encoded_df], axis=1)

# 🔹 Step 4: Convert Labels to YOLO Format
# Class ids follow the position of each name in the list below (first name -> 0).
# Any finding name that is not listed here is skipped by convert_to_yolo_format.
# NOTE(review): if the CSV uses the NIH ChestX-ray14 label vocabulary,
# "Cardiomegaly" is not in this mapping and would be skipped - confirm.
disease_labels = {
    name: class_id
    for class_id, name in enumerate([
        "Atelectasis",
        "Effusion",
        "Mass",
        "Infiltration",
        "Pneumonia",
        "Nodule",
        "Pneumothorax",
        "Consolidation",
        "Edema",
        "Emphysema",
        "Fibrosis",
        "Pleural_Thickening",
        "Hernia",
    ])
}


def convert_to_yolo_format(image_name, labels):
    """Build the YOLO annotation text for one image.

    Every entry of ``labels`` that is a key of ``disease_labels`` produces one
    line ``"<class_id> 0.5 0.5 1.0 1.0"``: a placeholder box centred on the
    image and covering its full width and height. Entries that are not in
    ``disease_labels`` are ignored.

    Args:
        image_name: Not used by the function body; part of the call signature.
        labels: Iterable of finding names for the image.

    Returns:
        The annotation lines joined with newlines, in the order the findings
        appear in ``labels``; an empty string when no entry is recognised.
    """
    known_class_ids = (
        disease_labels[finding] for finding in labels if finding in disease_labels
    )
    # Dummy bounding box (full image) for every recognised class.
    return "\n".join(f"{class_id} 0.5 0.5 1.0 1.0" for class_id in known_class_ids)

# Process dataset: write one YOLO label file per image that has at least one
# recognised finding.
# Only the two needed columns are iterated; this avoids DataFrame.iterrows(),
# which builds a full Series object for every row.
for image_name, finding_labels in tqdm(
    zip(df["Image Index"], df["Finding Labels"]), total=df.shape[0]
):
    # Convert multi-labels to YOLO format ("A|B|C" -> ["A", "B", "C"]).
    # A non-string value (e.g. a missing cell) raises AttributeError here.
    labels = finding_labels.split("|")  # Multi-label splitting
    yolo_annotations = convert_to_yolo_format(image_name, labels)

    # Save YOLO annotation file; images with no recognised finding get no file.
    if yolo_annotations:
        # Swap only the file extension. str.replace('.png', '.txt') would also
        # rewrite ".png" elsewhere in the name and would leave any other
        # extension unchanged, writing the label text to a file named like the image.
        label_file = os.path.join(
            YOLO_LABELS_PATH, os.path.splitext(image_name)[0] + ".txt"
        )
        with open(label_file, "w") as f:
            f.write(yolo_annotations)

print("✅ Feature Engineering & YOLO Annotations Completed.")


100%|██████████| 111010/111010 [01:08<00:00, 1617.29it/s]

✅ Feature Engineering & YOLO Annotations Completed.





In [9]:
# Load the dataset
# The path is relative, so it is resolved against the kernel's current working
# directory. Rebinding `df` here discards the frame (and the columns added to
# it) built by the earlier cells.
file_path = "Ground_Truth.csv"
df = pd.read_csv(file_path)

In [10]:
# 1️⃣ Normalize 'Patient Age'
# The scaler output overwrites the 'Patient Age' column, so the raw ages are no
# longer available in `df` after this cell runs.
scaler = MinMaxScaler()
df["Patient Age"] = scaler.fit_transform(df[["Patient Age"]])

In [11]:
# 2️⃣ Convert 'Patient Gender' to binary (M -> 1, F -> 0)
# Series.map with a dict yields NaN for every value that is not a key, so
# re-running this cell on the already-encoded column turns all values into NaN.
# NOTE(review): cell In [2] encodes the opposite way (M -> 0, F -> 1) - confirm
# which convention downstream code expects.
df["Patient Gender"] = df["Patient Gender"].map({"M": 1, "F": 0})

In [12]:
# 3️⃣ One-Hot Encode 'View Position'
encoder = OneHotEncoder(sparse_output=False)
view_encoded = encoder.fit_transform(df[["View Position"]])
view_labels = encoder.get_feature_names_out(["View Position"])
# Give the encoded frame df's own index: pd.concat(axis=1) aligns rows on index
# labels, so a fresh 0..n-1 index would misalign rows (and introduce NaNs)
# whenever df's index is not the default RangeIndex.
df_view = pd.DataFrame(view_encoded, columns=view_labels, index=df.index)

# Append the indicator columns and drop the source column in one expression
# rather than mutating the concatenated frame with inplace=True.
# Re-running this cell raises KeyError because 'View Position' is gone by then.
df = pd.concat([df, df_view], axis=1).drop(columns=["View Position"])

In [13]:
# 4️⃣ Generate Synthetic Bounding Boxes (Placeholder Values)
# The boxes are random placeholders; nothing in them is derived from the image.
def generate_synthetic_bbox(image_size=(1024, 1024)):
    """Return a random placeholder box ``(x_min, y_min, x_max, y_max)`` in pixels.

    The sampling ranges are defined on a 1024-pixel reference grid
    (min corner in [100, 400), max corner in [600, 900)) and are scaled
    proportionally to ``image_size``. With the default size the ranges, and the
    order of the four random draws, are exactly the reference ones.

    Args:
        image_size: ``(width, height)`` of the image in pixels.

    Returns:
        Tuple of four ints ``(x_min, y_min, x_max, y_max)`` with
        ``x_min < x_max`` and ``y_min < y_max``.

    Raises:
        ValueError: propagated from ``np.random.randint`` when a dimension is
            so small that a scaled range becomes empty.
    """
    width, height = image_size

    def scaled(reference_px, extent):
        # Map a coordinate from the 1024-pixel reference grid onto `extent` pixels.
        return reference_px * extent // 1024

    x_min = np.random.randint(scaled(100, width), scaled(400, width))
    y_min = np.random.randint(scaled(100, height), scaled(400, height))
    x_max = np.random.randint(scaled(600, width), scaled(900, width))
    y_max = np.random.randint(scaled(600, height), scaled(900, height))
    return x_min, y_min, x_max, y_max

# Seed NumPy's global RNG so the placeholder boxes (and the label files written
# from them later) are identical on every Restart & Run All.
np.random.seed(42)
# One independent random box per row; the lambda ignores the row's contents.
df[['x_min', 'y_min', 'x_max', 'y_max']] = df.apply(lambda row: generate_synthetic_bbox(), axis=1, result_type="expand")

# Convert bounding boxes to YOLO format
# NOTE: this reuses the name convert_to_yolo_format, so from here on it replaces
# the (image_name, labels) function defined in cell In [2].
def convert_to_yolo_format(row, img_width=1024, img_height=1024):
    """Format one pixel-space box as a YOLO annotation line.

    Args:
        row: Mapping-like object providing 'x_min', 'y_min', 'x_max', 'y_max'.
        img_width: Width used to normalise the x centre and the box width.
        img_height: Height used to normalise the y centre and the box height.

    Returns:
        ``"0 <x_center> <y_center> <width> <height>"`` with the four values
        divided by the image dimensions; the class id is always 0.
    """
    left, top = row['x_min'], row['y_min']
    right, bottom = row['x_max'], row['y_max']

    # Box centre and extent, each expressed as a fraction of the image size.
    cx = (left + right) / 2.0 / img_width
    cy = (top + bottom) / 2.0 / img_height
    box_w = (right - left) / img_width
    box_h = (bottom - top) / img_height

    # Assuming single class '0'
    return f"0 {cx} {cy} {box_w} {box_h}"

# Build one annotation string per row; apply(axis=1) hands each row to
# convert_to_yolo_format, which then uses its default 1024x1024 image size.
df['yolo_annotation'] = df.apply(convert_to_yolo_format, axis=1)

In [14]:
# 5️⃣ Save YOLO Annotations
# The directory is relative to the kernel's current working directory.
output_dir = "yolo_labels"
os.makedirs(output_dir, exist_ok=True)

# Iterate over just the two needed columns instead of DataFrame.iterrows(),
# which builds a full Series object for every row.
for image_index, annotation in zip(df["Image Index"], df["yolo_annotation"]):
    # Swap only the file extension. str.replace(".png", ".txt") would also
    # rewrite ".png" elsewhere in the name and would leave any other extension
    # unchanged, writing the label text to a file named like the image.
    label_name = os.path.splitext(image_index)[0] + ".txt"
    with open(os.path.join(output_dir, label_name), "w") as f:
        f.write(annotation)

print("Feature Engineering Completed Successfully! ✅")

Feature Engineering Completed Successfully! ✅
