In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ast
from glob import glob
import os 
from tqdm import tqdm
import shutil

In [2]:
!pip install -q torchxrayvision

In [3]:
import torchxrayvision as xrv

  from tqdm.autonotebook import tqdm


In [4]:
meta_df = pd.read_csv('/kaggle/input/padchest-small-dataset/PC/PADCHEST_chest_x_ray_images_labels_160K_01.02.19.csv')

In [5]:
meta_df['Projection'].value_counts()

Projection
PA               91728
L                49579
AP_horizontal    14346
AP                4559
COSTAL             630
EXCLUDE             11
UNK                  8
Name: count, dtype: int64

In [6]:
meta_df['ViewPosition_DICOM'].value_counts()

ViewPosition_DICOM
POSTEROANTERIOR    54869
LATERAL            24527
PA                  8506
LL                  3383
ANTEROPOSTERIOR      369
AP                   104
OBLICUA               11
RL                     5
LLD                    1
GENERICA               1
Name: count, dtype: int64

In [7]:
df_pc = xrv.datasets.PC_Dataset(imgpath='/kaggle/input/padchest-small-dataset/PC/images-224/images-224/',
                                csvpath='/kaggle/input/padchest-small-dataset/PC/PADCHEST_chest_x_ray_images_labels_160K_01.02.19.csv',
                                views=["PA","AP"], unique_patients=False)

In [8]:
df = df_pc.csv
df = df.reset_index(drop=True)

In [9]:
df['Labels'] = df['Labels'].apply(ast.literal_eval)
df['Labels'] = df['Labels'].apply(lambda x: [label.strip() for label in x])

In [10]:
df[df['Labels'].apply(lambda x: 'normal' in x)]['Labels'].value_counts()

Labels
[normal]                                                                                                             34562
[cardiomegaly, normal]                                                                                                  11
[infiltrates, normal]                                                                                                    8
[cardiomegaly, normal, aortic elongation]                                                                                3
[nodule, normal]                                                                                                         3
                                                                                                                     ...  
[COPD signs, normal, kyphosis, vertebral anterior compression]                                                           1
[normal, unchanged, alveolar pattern, cardiomegaly, costophrenic angle blunting, interstitial pattern, pneumonia]        1
[COPD sig

In [11]:
allowed_labels = ['normal',
                  'pleural effusion', 'loculated pleural effusion', 'pericardial effusion', 'loculated fissural effusion',
                  'pneumothorax', 'hydropneumothorax',
                  'consolidation',
                  'pulmonary fibrosis', 
                  'emphysema', 'subcutaneous emphysema',
                  'pulmonary edema', 
                  'atelectasis', 'laminar atelectasis','lobar atelectasis','segmental atelectasis',
                  'cardiomegaly',
                  'pneumonia','atypical pneumonia',]

In [12]:
tmp = df['Labels'].explode()

filtered_tmp = tmp.apply(lambda x: x if x in allowed_labels else None).dropna()
filtered_tmp = filtered_tmp.groupby(filtered_tmp.index).agg('|'.join)

df = df.loc[filtered_tmp.index]
df['Labels'] = filtered_tmp

In [13]:
def label_concat(row):
    if row in ['normal']:
        return 'No Finding'
    elif row in ['pleural effusion', 'loculated pleural effusion', 'pericardial effusion', 'loculated fissural effusion']:
        return 'Effusion'
    elif row in ['pneumothorax', 'hydropneumothorax']:
        return 'Pneumothorax'
    elif row in ['consolidation']:
        return 'Consolidation'
    elif row in ['pulmonary fibrosis']:
        return 'Fibrosis'
    elif row in ['emphysema', 'subcutaneous emphysema']:
        return 'Emphysema'
    elif row in ['pulmonary edema']:
        return 'Edema'
    elif row in ['atelectasis', 'laminar atelectasis','lobar atelectasis','segmental atelectasis']:
        return 'Atelectasis'
    elif row in ['cardiomegaly']:
        return 'Cardiomegaly'
    elif row in ['pneumonia','atypical pneumonia']:
        return 'Pneumonia'

In [14]:
tmp = df['Labels'].str.split('|').explode()

df['Labels'] = tmp.apply(lambda x: label_concat(x)).groupby(tmp.index).agg('|'.join)

In [15]:
df = df.reset_index(drop=True)

In [16]:
df = df[['ImageID','PatientID','Labels','view']]

In [17]:
%%time
tmp = {os.path.basename(x): x for x in glob('/kaggle/input/*/PC/images-224/images-224/*')}    
    
df['path'] = df['ImageID'].map(tmp)

CPU times: user 1.01 s, sys: 278 ms, total: 1.29 s
Wall time: 3.52 s


In [18]:
df = df[~df['Labels'].apply(lambda x: '|No Finding' in x)]
df = df[~df['Labels'].apply(lambda x: 'No Finding|' in x)]

In [19]:
tmp = df.groupby('PatientID')['Labels'].nunique()
tmp = tmp.reset_index().rename(columns={'Labels':'label_nunique'})

df = df.merge(tmp, on='PatientID', how='left')

tmp1 = df.loc[df['label_nunique'] > 1][df['Labels'] != 'No Finding']
tmp2 = df.loc[df['label_nunique'] == 1]

df = pd.concat([tmp1, tmp2], axis=0).reset_index(drop=True)

In [20]:
tmp1 = df[df['Labels'] == 'No Finding'].sample(10_000, random_state=42)
tmp2 = df[df['Labels'] != 'No Finding']

df = pd.concat([tmp1,tmp2])
df = df.reset_index(drop=True)

In [21]:
df['Labels'].str.split('|').explode().value_counts()

Labels
No Finding       10000
Cardiomegaly      8721
Atelectasis       4420
Effusion          4164
Pneumonia         3625
Consolidation     1032
Emphysema          961
Fibrosis           682
Edema              458
Pneumothorax       249
Name: count, dtype: int64

In [22]:
df.to_csv('metadata.csv', index=False)

In [23]:
PATH = '/kaggle/working/lung_diseases/'

os.makedirs(os.path.join(PATH), exist_ok=True)
    
sub_count = 0
    
for i, file in enumerate(tqdm(df['path'].unique())):
    index = df[df['path'] == file]['ImageID'].values[0]
    
    if (i == 0) or (i % 5000 == 0):
        sub_count += 1
        sub_path = os.path.join(PATH, f'image_00{sub_count}')
        os.makedirs(sub_path, exist_ok=True)
    
    shutil.copy(file, os.path.join(sub_path, index))

100%|██████████| 29907/29907 [09:08<00:00, 54.54it/s]
