# CLASSIFICATION DATA PREPROCESSING

### Imports

In [1]:
import os

import pandas as pd
import torch

# The `utils` package and the relative "data/..." and "model/..." paths used
# in this notebook are resolved against the working directory, which is
# presumably the parent of the folder this notebook lives in.
# A bare os.chdir("..") climbs one level further every time this cell is
# re-executed, so only step up while `utils` is not reachable yet. That makes
# the cell safe to run more than once in the same kernel.
if not os.path.isdir("utils"):
    os.chdir("..")

from utils.classification_data import DataProcessor
from utils.unet import UNet

### Loading CSV

In [2]:
# Label table mapping each patientId to its class; consumed by the
# reordering cells further down.
csv_file = "data/clsif/stage_2_detailed_class_info.csv"

# A comma is read_csv's default delimiter, so it is not spelled out here.
df = pd.read_csv(csv_file)

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,patientId,class
0,0004cfab-14fd-4e49-80ba-63a80b6bddd6,No Lung Opacity / Not Normal
1,00313ee0-9eaa-42f4-b0ab-c148ed3241cd,No Lung Opacity / Not Normal
2,00322d4d-1c29-4943-afc9-b6754be640eb,No Lung Opacity / Not Normal
3,003d8fa0-6bf1-40ed-b54c-ac657f8495c5,Normal
4,00436515-870c-4b36-a041-de91049b9ab4,Lung Opacity


### Data Format Conversion

In [2]:
# Create the data processor; the same instance is reused by the later
# preprocessing cells of this notebook.
processor = DataProcessor()

# Named `dicom_dir` rather than `dir` so that the built-in dir() is not
# shadowed for the rest of the kernel session.
dicom_dir = "data/clsif/stage_2_train_images"

# Judging by its name, dcm_to_png converts the DICOM files in this folder to
# PNG -- see utils.classification_data.DataProcessor for the exact behaviour.
processor.dcm_to_png(dicom_dir)

### Training Data Reordering

In [4]:
# Source folder handed to reorder_files; also reused by the validation
# reordering cell that follows.
input_dir = "data/clsif/stage_2_train_images"

# One destination folder per value of the CSV's `class` column.
output_dirs = dict(
    [
        ("Normal", "data/clsif/train/normal"),
        ("Lung Opacity", "data/clsif/train/pneumonia"),
        ("No Lung Opacity / Not Normal", "data/clsif/train/unsure"),
    ]
)

# Arguments stay positional: label table, source folder, class -> folder map.
processor.reorder_files(df, input_dir, output_dirs)

Moved 26676 files.
Skipped 3551 missing files.


### Validation Data Reordering

In [5]:
# Same class -> folder mapping as for training, but pointing at the
# validation tree. Note that this rebinds `output_dirs`.
output_dirs = {}
output_dirs["Normal"] = "data/clsif/val/normal"
output_dirs["Lung Opacity"] = "data/clsif/val/pneumonia"
output_dirs["No Lung Opacity / Not Normal"] = "data/clsif/val/unsure"

# NOTE(review): the recorded output of this cell reports 0 files moved and
# every row skipped as missing -- presumably because the training reordering
# had already taken the files out of `input_dir`. The validation set is
# actually produced by the split step later in the notebook; confirm whether
# this call is still needed (e.g. for creating the val folders).
processor.reorder_files(df, input_dir, output_dirs)

Moved 0 files.
Skipped 30227 missing files.


> Since there are no labels for `stage_2_test_images`, I will need to split the data from `stage_2_train_images` as described: `80%` train and `20%` validation data.

### Check the class distribution

In [3]:
# The three per-class training folders whose file counts are reported.
dirs = list(
    (
        "data/clsif/train/normal",
        "data/clsif/train/pneumonia",
        "data/clsif/train/unsure",
    )
)

# Per the recorded output: prints a file count and percentage per folder,
# followed by the overall total.
processor.data_distribution_info(dirs)

data/clsif/train/normal: 8850 files (33.18%)
data/clsif/train/pneumonia: 6010 files (22.53%)
data/clsif/train/unsure: 11816 files (44.29%)

Total files: 26676


### Splitting The Data

In [4]:
# Source folders, keyed by the short class name.
train_dirs = dict(
    normal="data/clsif/train/normal",
    pneumonia="data/clsif/train/pneumonia",
    unsure="data/clsif/train/unsure",
)

# Matching destination folders; the keys mirror those of `train_dirs`.
val_dirs = dict(
    normal="data/clsif/val/normal",
    pneumonia="data/clsif/val/pneumonia",
    unsure="data/clsif/val/unsure",
)

# The recorded output shows roughly 20% of every class being moved from its
# train folder to the corresponding val folder.
# NOTE(review): since files are moved, running this cell a second time would
# presumably move a further share out of train -- confirm against
# DataProcessor.split_data before re-executing.
processor.split_data(train_dirs, val_dirs)

Moved 1770/8850 files from data/clsif/train/normal -> data/clsif/val/normal
Moved 1202/6010 files from data/clsif/train/pneumonia -> data/clsif/val/pneumonia
Moved 2364/11816 files from data/clsif/train/unsure -> data/clsif/val/unsure


### Load The Segmentation Model

In [3]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the network with the same channel configuration as before and place
# it on the selected device.
model = UNet(in_channels=3, out_channels=1).to(device)

# NOTE(security): torch.load unpickles the file, which can execute arbitrary
# code if the checkpoint comes from an untrusted source. On PyTorch versions
# that support it, prefer passing weights_only=True for plain state dicts.
model.load_state_dict(torch.load("model/segmentation_weights.pth", map_location=device))

# The model is only used for inference in this notebook, so switch layers
# such as dropout / batch normalisation (if the UNet has any) to their
# evaluation behaviour instead of leaving the module in training mode.
model.eval()

print("Pretrained weights loaded successfully!")

Pretrained weights loaded successfully!


In [5]:
# Folder maps are rebuilt here so this cell does not rely on the split cell
# having been executed in the current kernel session.
train_dirs = dict(
    [
        ("normal", "data/clsif/train/normal"),
        ("pneumonia", "data/clsif/train/pneumonia"),
        ("unsure", "data/clsif/train/unsure"),
    ]
)

val_dirs = dict(
    [
        ("normal", "data/clsif/val/normal"),
        ("pneumonia", "data/clsif/val/pneumonia"),
        ("unsure", "data/clsif/val/unsure"),
    ]
)

# Run the loaded segmentation model over the training folders first, then
# the validation folders. The method is invoked on the DataProcessor class
# itself (not on the `processor` instance), exactly as in the original cell.
for split_dirs in (train_dirs, val_dirs):
    DataProcessor.data_segmentation(split_dirs, model)

Data segmentation complete!
Data segmentation complete!
