# Download the RSNA dataset
- RSNA Pneumonia Detection Challenge for predicting whether pneumonia exists in a given image.
- First download the dataset with the Kaggle CLI by following the commands below:
```
cd ~/datasets/
mkdir RSNA_Pneumonia
cd RSNA_Pneumonia
kaggle competitions download -c rsna-pneumonia-detection-challenge
unzip rsna-pneumonia-detection-challenge.zip

```

## Preprocess for classification/detection.
- The original RSNA annotations are given per bounding box; change them to per-image annotations.
- First convert each bounding box from x,y,w,h to x1,y1,x2,y2.
- Aggregate multiple boxes into one patient.
- Group by patient.
- Make a patient-level label: if the patient has a bounding box, unhealthy: 1; otherwise healthy: 0.
- Number of annotations: 30227 -> 26684, because they are aggregated to patient level.


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Where the unpacked Kaggle archive lives, and the pieces of it used below.
dataset_folder = r'C:\Users\Vishi\VSC Codes\VIsLM_seminar\VLP-Seminar\data\rsna-pneumonia-detection-challenge'
annotations = 'stage_2_train_labels.csv'
image_folder = f'{dataset_folder}/stage_2_train_images'

# Read the label CSV and report how many rows it contains.
df = pd.read_csv("/".join((dataset_folder, annotations)))
print('original-df len:', df.shape[0])

# Create bounding boxes (x1, y1, x2, y2)
def create_bbox(row):
    """Return the corner-format box ``[x1, y1, x2, y2]`` for one annotation row.

    ``row`` is indexed by the keys "Target", "x", "y", "width" and "height".
    When ``row["Target"]`` equals 0 the other keys are not read and the
    placeholder box ``[0.0, 0.0, 0.0, 0.0]`` is returned; otherwise the
    top-left corner plus the width/height gives the bottom-right corner.
    """
    if row["Target"] != 0:
        left, top = row["x"], row["y"]
        return [left, top, left + row["width"], top + row["height"]]
    # Target == 0: no box to convert, emit the all-zero placeholder.
    return [0.0, 0.0, 0.0, 0.0]

# Attach an [x1, y1, x2, y2] box to every annotation row.
df["bbox"] = df.apply(create_bbox, axis=1)

# Collapse to one row per patientId, gathering that patient's boxes into a list.
df = df[["patientId", "bbox"]].groupby("patientId").agg(list).reset_index()

# Patient-level label: 0 when the only box is the all-zero placeholder, else 1.
df["Target"] = df["bbox"].map(lambda boxes: 0 if boxes == [[0.0, 0.0, 0.0, 0.0]] else 1)

# Location of the DICOM file belonging to each patientId.
df["path"] = df["patientId"].map(lambda pid: f"{image_folder}/{pid}.dcm")

# Hold out 2 * test_fac of the rows, then halve that hold-out into test and validation.
test_fac = 0.15
train_df, test_val_df = train_test_split(df, random_state=0, test_size=2 * test_fac)
test_df, valid_df = train_test_split(test_val_df, random_state=0, test_size=0.5)

print(f"Number of train samples: {len(train_df)}")
print(f"Number of valid samples: {len(valid_df)}")
print(f"Number of test samples: {len(test_df)}")

# Destination files for the three splits.
RSNA_TRAIN_CSV = r'C:\Users\Vishi\VSC Codes\VIsLM_seminar\VLP-Seminar\annotations\rsna\train.csv'
RSNA_VALID_CSV = r'C:\Users\Vishi\VSC Codes\VIsLM_seminar\VLP-Seminar\annotations\rsna\val.csv'
RSNA_TEST_CSV = r'C:\Users\Vishi\VSC Codes\VIsLM_seminar\VLP-Seminar\annotations\rsna\test.csv'

# The exported files call the label column "label" rather than "Target";
# rename in place so the split frames themselves carry the new name too.
for split_df in (train_df, valid_df, test_df):
    split_df.rename(columns={"Target": "label"}, inplace=True)

# Write path, patientId, label and bbox for each split, without the index.
export_columns = ["path", "patientId", "label", "bbox"]
for split_df, csv_path in (
    (train_df, RSNA_TRAIN_CSV),
    (valid_df, RSNA_VALID_CSV),
    (test_df, RSNA_TEST_CSV),
):
    split_df[export_columns].to_csv(csv_path, index=False)

print(f'Train CSV saved to: {RSNA_TRAIN_CSV}')
print(f'Valid CSV saved to: {RSNA_VALID_CSV}')
print(f'Test CSV saved to: {RSNA_TEST_CSV}')


original-df len: 30227
Number of train samples: 18678
Number of valid samples: 4003
Number of test samples: 4003
Train CSV saved to: C:\Users\Vishi\VSC Codes\VIsLM_seminar\VLP-Seminar\annotations\rsna\train.csv
Valid CSV saved to: C:\Users\Vishi\VSC Codes\VIsLM_seminar\VLP-Seminar\annotations\rsna\val.csv
Test CSV saved to: C:\Users\Vishi\VSC Codes\VIsLM_seminar\VLP-Seminar\annotations\rsna\test.csv
