In [None]:
import os
import pandas as pd
from google.colab import drive
# Mount Google Drive inside the Colab VM at /content/drive so later cells can read files from it.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%cd drive/MyDrive/'BIOMEDIN220-F2022'/

[Errno 2] No such file or directory: 'drive/MyDrive/BIOMEDIN220-F2022/'
/content/drive/.shortcut-targets-by-id/1LfeaAXd6DFfJ4uV-9TNT08K_8udJ1Lf6/BIOMEDIN220-F2022


In [None]:
!ls $OUTPUT_DIR

model			  train_meta.csv
multilabel_cls_train.csv  vinbigdata-chest-xray-resized-png-256x256.zip
test.csv		  vindrcxr_test.csv
train.csv		  vindrcxr_train.csv


In [None]:
# Maps each label name to its integer class id. The names are listed in id
# order, so a name's position in the list is its id ('No finding' is 14).
class_name_to_id = {
    class_name: class_id
    for class_id, class_name in enumerate([
        'Aortic enlargement',
        'Atelectasis',
        'Calcification',
        'Cardiomegaly',
        'Consolidation',
        'ILD',
        'Infiltration',
        'Lung Opacity',
        'Nodule/Mass',
        'Other lesion',
        'Pleural effusion',
        'Pleural thickening',
        'Pneumothorax',
        'Pulmonary fibrosis',
        'No finding',
    ])
}

# Folder (relative to the working directory) that the generated CSVs are written into.
OUTPUT_DIR = "vinbigdata-chest-xray-resized-png-256x256"

### Build Training Dataset for Model

In [None]:
# Raw training inputs, read relative to the current working directory.
train_annotations_df = pd.read_csv("annotations_train.csv")
image_labels_train_df = pd.read_csv("image_labels_train.csv")

## From https://github.com/Scu-sen/VinBigData-Chest-X-ray-Abnormalities-Detection/tree/main/vinbigdata_classifierPP_2021
## Uses:
## 1) Use the 5-fold splits from this file.
## 2) Compare our training dataset to multilabel_cls_train.csv as a sanity check.
# The path is built from OUTPUT_DIR (same folder the generated CSVs are written
# to) so the folder name lives in one place instead of being repeated here.
multilabel_cls_train_df = pd.read_csv(os.path.join(OUTPUT_DIR, "multilabel_cls_train.csv"))

In [None]:
# Lookup table image_id -> fold, taken from the reference split file.
image_id_to_fold = {
    img_id: fold_idx
    for img_id, fold_idx in zip(multilabel_cls_train_df["image_id"], multilabel_cls_train_df["fold"])
}

#### Method 1: Create Training Dataset using `annotations_train.csv`

In [None]:
#@title
# Translate every annotation's class name into its integer id; names that are
# not keys of class_name_to_id get the sentinel -1.
train_annotations_df["class_id"] = train_annotations_df["class_name"].map(
    lambda name: class_name_to_id.get(name, -1)
)

In [None]:
#@title
# Discard annotations whose class name was not in class_name_to_id (class_id == -1),
# then preview what is left.
train_annotations_df = train_annotations_df.loc[train_annotations_df["class_id"].ne(-1)]
train_annotations_df.head()

Unnamed: 0,image_id,rad_id,class_name,x_min,y_min,x_max,y_max,class_id
0,000434271f63a053c4128a0ba6352c7f,R2,No finding,,,,,14
1,000434271f63a053c4128a0ba6352c7f,R3,No finding,,,,,14
2,000434271f63a053c4128a0ba6352c7f,R6,No finding,,,,,14
3,00053190460d56c53cc3e57321387478,R11,No finding,,,,,14
4,00053190460d56c53cc3e57321387478,R2,No finding,,,,,14


In [None]:
#@title
# Method 1: one row per image, one column per class id, holding
# (number of distinct radiologists who annotated that class) / 3.
df = (
    train_annotations_df
    .groupby(["image_id", "class_id"], as_index=False)["rad_id"]
    .nunique()                    # distinct rad_id values per (image, class)
    .pivot_table(index="image_id", columns="class_id", values="rad_id")
    .fillna(0)                    # (image, class) pairs with no annotation -> 0
    .div(3)                       # NOTE(review): 3 is presumably the number of radiologists per image — confirm
    .reset_index()                # image_id back to a regular column
    .rename_axis(None, axis=1)    # drop the "class_id" name from the columns axis
)
# Attach the fold from the reference split; images missing from it get -1.
df["fold"] = df["image_id"].map(lambda img_id: image_id_to_fold.get(img_id, -1))

In [None]:
#@title
# Preview the first rows of the Method 1 training table.
df.head()

Unnamed: 0,image_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,fold
0,000434271f63a053c4128a0ba6352c7f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1,00053190460d56c53cc3e57321387478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0
2,0005e8e3701dfb1dd93d53e2ff537b6e,0.0,0.0,0.0,0.0,0.333333,0.0,0.333333,0.666667,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,2.0
3,0006e0a85696f6bb578e84fafa9a5607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
4,0007d316f756b3fa0baea2ff514ce945,0.666667,0.0,0.0,0.333333,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.666667,0.0,0.0


#### Method 2 (Preferred): Create Training Dataset using `image_labels_train.csv`

In [None]:
# Keep image_id plus the label columns named in class_name_to_id, then relabel
# those columns with their integer class ids.
# Note: the frame is reassigned in place, so running this cell a second time
# fails because the name-labelled columns no longer exist.
image_labels_train_df = (
    image_labels_train_df
    .loc[:, ["image_id", *class_name_to_id]]
    .rename(columns=class_name_to_id)
)
image_labels_train_df.head()

Unnamed: 0,image_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,000434271f63a053c4128a0ba6352c7f,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,000434271f63a053c4128a0ba6352c7f,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,000434271f63a053c4128a0ba6352c7f,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,00053190460d56c53cc3e57321387478,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,00053190460d56c53cc3e57321387478,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [None]:
# Method 2: average the 0/1 label columns over all rows that share an image_id,
# giving one row per image.
# A plain groupby(...).mean() replaces agg(['mean']) with as_index=False followed
# by droplevel/reset_index: that combination only works while pandas ignores
# as_index=False for list aggregations.
# NOTE(review): newer pandas releases appear to honor as_index=False there, in
# which case the old reset_index would add a spurious "index" column — confirm
# against the pandas version in use.
image_labels_train_agg_df = (
    image_labels_train_df
    .groupby("image_id")
    .mean()
    .reset_index()                # image_id back to a regular column
    .rename_axis(None, axis=1)    # make sure the columns axis carries no name
)
# Attach the fold from the reference split; images missing from it get -1.
image_labels_train_agg_df["fold"] = image_labels_train_agg_df["image_id"].apply(lambda image_id : image_id_to_fold.get(image_id, -1))

In [None]:
# Preview the first rows of the Method 2 training table.
image_labels_train_agg_df.head()

Unnamed: 0,image_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,fold
0,000434271f63a053c4128a0ba6352c7f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1,00053190460d56c53cc3e57321387478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0
2,0005e8e3701dfb1dd93d53e2ff537b6e,0.0,0.0,0.0,0.0,0.333333,0.0,0.333333,0.666667,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,2.0
3,0006e0a85696f6bb578e84fafa9a5607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
4,0007d316f756b3fa0baea2ff514ce945,0.666667,0.0,0.0,0.333333,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.666667,0.0,0.0


In [None]:
# Persist the Method 2 training table as vindrcxr_train.csv inside OUTPUT_DIR,
# without writing the row index.
image_labels_train_agg_df.to_csv(
    path_or_buf=os.path.join(OUTPUT_DIR, "vindrcxr_train.csv"),
    index=False,
)

### Build Test Dataset for Model

In [None]:
# Raw test-split inputs, read relative to the current working directory.
# NOTE(review): test_annotations_df is not referenced by any later cell shown
# here — confirm whether it is still needed.
image_labels_test_df = pd.read_csv(filepath_or_buffer="image_labels_test.csv")
test_annotations_df = pd.read_csv(filepath_or_buffer="annotations_test.csv")

In [None]:
# Keep image_id plus the label columns named in class_name_to_id and relabel
# them with their integer class ids (same preparation as the training labels).
# The stray leading `.head()` was removed: it was not the cell's last
# expression, so it displayed nothing.
image_labels_test_df = image_labels_test_df[["image_id"] + list(class_name_to_id.keys())]
image_labels_test_df = image_labels_test_df.rename(columns = class_name_to_id)
# Show a short preview rather than the whole frame.
image_labels_test_df.head()

Unnamed: 0,image_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,e0dc2e79105ad93532484e956ef8a71a,0,1,1,1,0,1,0,0,0,0,1,0,1,0,0
1,0aed23e64ebdea798486056b4f174424,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0
2,aa15cfcfca7605465ca0513902738b95,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0
3,665c4a6d2693dc0286d65ab479c9b169,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0
4,42da2c134b53cb5594774d3d29faac59,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,a039af299f86007d0d77da077a6def9a,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2996,aba3d1f5b1c04236f52a8980929b2cfa,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2997,6d3d6b53f358a983b486e9e03144eb62,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2998,d6678cb7ae39f575d35ab9da6d7cb171,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [None]:
# Persist the test label table as vindrcxr_test.csv inside OUTPUT_DIR,
# without writing the row index.
image_labels_test_df.to_csv(
    path_or_buf=os.path.join(OUTPUT_DIR, "vindrcxr_test.csv"),
    index=False,
)

### Testing

#### Test that Train Dataframes obtained via Method 1 and 2 are the same

In [None]:
# Method 1 and Method 2 must produce the same table. assert_frame_equal compares
# floats with a small tolerance (count/3 vs. mean of 0/1 values need not be
# bit-identical) and, on failure, reports which column differs, whereas
# DataFrame.equals only returns a bare False.
pd.testing.assert_frame_equal(image_labels_train_agg_df, df)

#### Test Differences between our Train Dataframe and `multilabel_cls_train.csv`

In [None]:
# Summary statistics of our training table's numeric columns (per-class columns and fold).
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,fold
count,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0
mean,0.159733,0.005933,0.015378,0.120667,0.011222,0.013778,0.021089,0.045444,0.0318,0.036511,0.046911,0.072378,0.004422,0.072689,0.704111,1.999133
std,0.337886,0.058264,0.095923,0.301689,0.079538,0.093589,0.11363,0.161668,0.144956,0.141296,0.189311,0.206813,0.060393,0.229721,0.454374,1.413577
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0


In [None]:
# Same summary statistics for the reference table, for side-by-side comparison.
multilabel_cls_train_df.describe()

Unnamed: 0,fold,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
count,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0
mean,1.999133,0.159044,0.005911,0.015244,0.120311,0.011222,0.013533,0.021089,0.045244,0.031378,0.036067,0.046778,0.071733,0.004422,0.0726,0.707067
std,1.413577,0.337871,0.058202,0.095712,0.301634,0.079538,0.093188,0.11363,0.161518,0.144357,0.140885,0.189227,0.206518,0.060393,0.229685,0.455123
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
NUM_CLASSES = 15
# Absolute tolerance when comparing label values, so float noise from CSV
# round-trips is not reported as a disagreement.
LABEL_TOLERANCE = 1e-6

# Our table labels the class columns with ints (0..14); the reference CSV
# labels the same columns with strings ("0".."14").
ours_cols = list(range(NUM_CLASSES))
ref_cols = [str(class_id) for class_id in ours_cols]

# One vectorised left join replaces the per-row boolean filter inside
# iterrows() (quadratic in the number of images). how="left" keeps every image
# of `df`; images missing from the reference get NaN in the reference columns.
merged = df[["image_id"] + ours_cols].merge(
    multilabel_cls_train_df[["image_id"] + ref_cols],
    on="image_id",
    how="left",
)
ours_values = merged[ours_cols].to_numpy(dtype=float)
ref_values = merged[ref_cols].to_numpy(dtype=float)

# A cell differs unless it is within tolerance. NaN fails the `<=` comparison,
# so images absent from the reference are flagged as well (the original loop
# flagged them through its bare `except:`). Unlike that bare except, a missing
# column now raises a KeyError instead of silently flagging every image.
is_different = ~(abs(ours_values - ref_values) <= LABEL_TOLERANCE)

# Ids of images with at least one differing class. Each id appears once (the
# original loop appended an id once per differing class).
image_id_diffs = merged.loc[is_different.any(axis=1), "image_id"].drop_duplicates().tolist()

In [None]:
# Distinct image ids for which at least one class value differs from the reference table.
set(image_id_diffs)

{'0021df30f3fddef551eb3df4354b1d06',
 '01a93e09220daa5b0316d426b701c76c',
 '01ce66ee84ef4045a06269aa0f5c2756',
 '02f48a4f3bcfa77917203130505a12fc',
 '042dafdd2ec468835d672cb01f5dd136',
 '05aae40e8904366f92a7f738d7533338',
 '07eb2311cb7791dc47b03edfb9c357d8',
 '088869b047b7336c8d31bea5cf10cf24',
 '093e0876aa291878d6725561b08addcc',
 '0d2e947bb475050c9039828b754ff6fa',
 '0d40f562896c51b30af1949901fd2d69',
 '0ef13359541e4c9886d255ade0a95aba',
 '10eee9c43ac039dd0d0514d2e0f1eba0',
 '113f39bcbaf112a7d29c192340bb88ea',
 '11eda2bb6f6db5355ffc8ec8e36a8866',
 '12c57b0679d77f63009d79239f5cb1ba',
 '13fcd440ee477324ffe130645d60b350',
 '1406af2d31790c267b8b17d301ff1ab3',
 '1409d26e536b423b2a8d4fb117ea1db4',
 '1666f93daf79b5abe474279e391fc406',
 '1de6be43e9bff89c503cfc750aeb69e6',
 '2c223ca77aac52d4300da000ba30c7d8',
 '2e8298b8075f4bfad421b6a97a0b0d8a',
 '2e8abd47841a31c459718dca0700f486',
 '30d4ef50751edf174a36b9594a654b23',
 '31ad9a7330a3d2abe38bce55c4bf1109',
 '325bd96eac745c2604b9f7e7b60ebc36',
 

In [None]:
# Preview the first rows of our training table again for reference.
df.head()

Unnamed: 0,image_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,000434271f63a053c4128a0ba6352c7f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,00053190460d56c53cc3e57321387478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0005e8e3701dfb1dd93d53e2ff537b6e,0.0,0.0,0.0,0.0,0.333333,0.0,0.333333,0.666667,0.333333,0.0,0.0,0.0,0.0,0.0,0.0
3,0006e0a85696f6bb578e84fafa9a5607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0007d316f756b3fa0baea2ff514ce945,0.666667,0.0,0.0,0.333333,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.666667,0.0


### Debugging

In [None]:
# Pick one image id to inspect and show all of its annotation rows.
image = "07eb2311cb7791dc47b03edfb9c357d8"
train_annotations_df[train_annotations_df["image_id"] == image]

Unnamed: 0,image_id,rad_id,class_name,x_min,y_min,x_max,y_max,class_id
2182,07eb2311cb7791dc47b03edfb9c357d8,R10,No finding,,,,,14
2183,07eb2311cb7791dc47b03edfb9c357d8,R8,No finding,,,,,14
2184,07eb2311cb7791dc47b03edfb9c357d8,R9,Pleural thickening,1274.23999,420.342987,1367.650024,457.548004,11
2185,07eb2311cb7791dc47b03edfb9c357d8,R9,Other lesion,219.330002,1817.910034,780.807983,2230.23999,9
2186,07eb2311cb7791dc47b03edfb9c357d8,R9,Other lesion,1558.180054,1938.130005,1780.98999,2151.719971,9


In [None]:
# Load train.csv from the resized-PNG dataset folder for comparison with our annotations.
dataset_path = "vinbigdata-chest-xray-resized-png-256x256"
vinbigdata_train_df = pd.read_csv(
    filepath_or_buffer=os.path.join(dataset_path, "train.csv"),
)

In [None]:
# Rows of the dataset's train.csv for the image under inspection.
vinbigdata_train_df[vinbigdata_train_df["image_id"] == image]

Unnamed: 0,image_id,class_name,class_id,rad_id,x_min,y_min,x_max,y_max,width,height
4592,07eb2311cb7791dc47b03edfb9c357d8,No finding,14,R8,,,,,2048,2500
13729,07eb2311cb7791dc47b03edfb9c357d8,No finding,14,R10,,,,,2048,2500
17823,07eb2311cb7791dc47b03edfb9c357d8,No finding,14,R9,,,,,2048,2500


In [None]:
# Show the raw image-level label rows for the image under inspection.
# image_labels_train_df was narrowed earlier in the notebook to image_id plus
# class-id columns, so the raw file is re-read here; otherwise a top-to-bottom
# run could not show the rad_id and remaining label columns.
pd.read_csv("image_labels_train.csv").query("image_id==@image")

Unnamed: 0,image_id,rad_id,Aortic enlargement,Atelectasis,Calcification,Cardiomegaly,Clavicle fracture,Consolidation,Edema,Emphysema,...,Pneumothorax,Pulmonary fibrosis,Rib fracture,Other lesion,COPD,Lung tumor,Pneumonia,Tuberculosis,Other diseases,No finding
1398,07eb2311cb7791dc47b03edfb9c357d8,R10,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1399,07eb2311cb7791dc47b03edfb9c357d8,R8,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1400,07eb2311cb7791dc47b03edfb9c357d8,R9,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
