### Goals of this notebook:
1. Figure out exactly how many examples of each label are present in the dataset (positive vs. negative).

In [1]:
import torch
import torchvision
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torchvision.transforms as T
import matplotlib.patches as mpatches
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import sys
import pickle as pkl
import glob

from tqdm.notebook import tqdm
sys.path.append("../")
import constants

### Goal 1a: Build object detection dataset and dataloader.

In [4]:
# Read the Data_Entry_2017 metadata CSV into the notebook-wide frame `df`.
df = pd.read_csv("dataset/Data_Entry_2017.csv")
# Report the row count of the frame that was just loaded.
print("Num examples:", df.shape[0])
# Preview the first five rows as the cell's rich output.
df.head(n=5)

Num examples: 112120


Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143,
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143,
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168,
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171,
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143,


In [26]:
# Derive per-label class weights from the 'Finding Labels' column of `df`.
#
# Each entry of 'Finding Labels' may combine several labels separated by '|',
# so a row contributes once to every label it mentions. The dictionary
# `count_dict` is rewritten in place through four stages, printing each one:
#   1. raw number of rows mentioning the label
#   2. fraction of all rows mentioning the label
#   3. that fraction relative to the most frequent label (which becomes 1.0)
#   4. the reciprocal of stage 3 -- the final weights
counts = df['Finding Labels'].value_counts()
labels = counts.index
count_dict = dict()
for combined_label, count in counts.items():
    # Split e.g. "Cardiomegaly|Emphysema" into its individual labels.
    for label in combined_label.split('|'):
        count_dict[label] = count_dict.get(label, 0) + count
print(count_dict)

# Stage 2: normalise by the actual dataset size instead of a hard-coded count.
num_examples = len(df)
for key in count_dict:
    count_dict[key] = count_dict[key] / num_examples
print(count_dict)

# Stage 3: rescale by the largest frequency so the most common label maps to
# 1.0 (in the recorded output this is 'No Finding'). `default` keeps an empty
# table from raising.
max_frequency = max(count_dict.values(), default=1.0)
for key in count_dict:
    count_dict[key] = count_dict[key] / max_frequency
print(count_dict)

# Stage 4: invert, so rarer labels receive proportionally larger weights.
for key in count_dict:
    count_dict[key] = 1 / count_dict[key]
print(count_dict) # --- These are the final weights! ---

{'No Finding': 60361, 'Infiltration': 19894, 'Atelectasis': 11559, 'Effusion': 13317, 'Nodule': 6331, 'Pneumothorax': 5302, 'Mass': 5782, 'Consolidation': 4667, 'Pleural_Thickening': 3385, 'Cardiomegaly': 2776, 'Emphysema': 2516, 'Fibrosis': 1686, 'Edema': 2303, 'Pneumonia': 1431, 'Hernia': 227}
{'No Finding': 0.5383606849803781, 'Infiltration': 0.17743489118801284, 'Atelectasis': 0.10309489832322512, 'Effusion': 0.11877452729218695, 'Nodule': 0.05646628612201213, 'Pneumothorax': 0.04728861933642526, 'Mass': 0.05156974669996432, 'Consolidation': 0.0416250445950767, 'Pleural_Thickening': 0.030190866928291118, 'Cardiomegaly': 0.024759186585800928, 'Emphysema': 0.022440242597217268, 'Fibrosis': 0.015037459864430967, 'Edema': 0.020540492329646807, 'Pneumonia': 0.012763110952550838, 'Hernia': 0.0020246164823403494}
{'No Finding': 1.0, 'Infiltration': 0.32958367157601764, 'Atelectasis': 0.19149782144099667, 'Effusion': 0.22062258743228244, 'Nodule': 0.10488560494358941, 'Pneumothorax': 0.087

In [86]:
def display_bbox_image(idx):
    """Show the image for row ``idx`` of ``df`` with a red box drawn on it.

    The row's "Image Index" file name has its extension swapped for ``.npy``
    and is looked up in the ``train/``, ``test/`` and ``val/`` folders under
    ``../dataset/numpy_classification_data/``; the first folder that contains
    the file is used. The image shape is printed and the figure is shown.

    NOTE(review): this reads the notebook-global ``df`` and takes the values
    at column positions 2..5 of the row as ``x, y, w, h``. In the
    Data_Entry_2017 table those positions hold "Follow-up #", "Patient ID",
    "Patient Age" and "Patient Gender", so this presumably expects a
    bounding-box table to be loaded into ``df`` -- confirm before use.

    Args:
        idx: Positional row index into ``df``.

    Raises:
        FileNotFoundError: If the ``.npy`` file is in none of the split
            folders.
    """
    path = df.iloc[idx]["Image Index"].split(".")[0] + ".npy"
    x, y, w, h = df.iloc[idx, 2:6]

    np_dir = "../dataset/numpy_classification_data/"
    img = None
    for d in ["train/", "test/", "val/"]:
        try:
            img = np.load(np_dir + d + path)
        except FileNotFoundError:
            continue
        # Stop probing once the file has been found.
        break
    if img is None:
        # Fail with a clear message instead of an unbound-variable error below.
        raise FileNotFoundError(
            f"{path} not found in train/, test/ or val/ under {np_dir}"
        )

    plt.figure(figsize=(10, 10))
    plt.imshow(img)
    print(img.shape)
    # Rectangle takes (xy, width, height): pass w as width and h as height.
    rect = mpatches.Rectangle((x, y), w, h,
                              fill=False,
                              color="red",
                              linewidth=4,
                              linestyle="-.")
    plt.gca().add_patch(rect)
    plt.show()
    