In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

  from pandas.core import (
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was too old on your system - pyarrow 10.0.1 is the current minimum supported version as of this release.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
pd.set_option('display.max_rows', None)

In [3]:
metadata = pd.read_csv('metadata.csv', encoding='utf-8' )

In [4]:
metadata.head()

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y]
0,00000001_000.png,Cardiomegaly,0,1,57,M,PA,2682,2749,0.143,0.143
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168
3,00000002_000.png,No Finding,0,2,80,M,PA,2500,2048,0.171,0.171
4,00000003_001.png,Hernia,0,3,74,F,PA,2500,2048,0.168,0.168


In [5]:
def replace_pipe_with_space(x):
    if isinstance(x, str):
        return x.replace("|", " ")
    else:
        return x

# Apply the custom function to all elements in the DataFrame
metadata = metadata.applymap(replace_pipe_with_space)

In [6]:
metadata.head()

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y]
0,00000001_000.png,Cardiomegaly,0,1,57,M,PA,2682,2749,0.143,0.143
1,00000001_001.png,Cardiomegaly Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143
2,00000001_002.png,Cardiomegaly Effusion,2,1,58,M,PA,2500,2048,0.168,0.168
3,00000002_000.png,No Finding,0,2,80,M,PA,2500,2048,0.171,0.171
4,00000003_001.png,Hernia,0,3,74,F,PA,2500,2048,0.168,0.168


In [7]:
metadata.rename(columns = {'Image Index':'imageindex', 'Finding Labels': 'labels'}, inplace = True)

In [8]:
### We need to replace the empty space in No Finding with an underscore so it's not counted as a disease
metadata['labels'] = metadata['labels'].replace('No Finding', 'No_Finding')

In [9]:
metadata.to_csv('final_metadata.csv', index=False)

## Count the number of images and how many disease they have

In [10]:
### first get the final_metadata to use the csv with empty spaces and not pipelines
df = pd.read_csv('final_metadata.csv', encoding='utf-8' )

In [11]:
# Counting empty spaces in each row and creating a new column
df['disease_count'] = df['labels'].apply(lambda x: x.count(' ') + 1)

In [12]:
df.disease_count.value_counts()

1    91324
2    14306
3     4856
4     1247
5      301
6       67
7       16
9        2
8        1
Name: disease_count, dtype: int64

## We need to see which diseases are the most present in each disease_count 
#### so we can try to eliminate imbalances, and so we don't have to train on 110k images

In [13]:
### see what are the most common diseases when patients have 1 disease
one_disease = df[(df['disease_count'] == 1) & (df['labels'] != 'No_Finding')]

In [14]:
one_disease.labels.value_counts()

Infiltration          9547
Atelectasis           4215
Effusion              3955
Nodule                2705
Pneumothorax          2194
Mass                  2139
Consolidation         1310
Pleural_Thickening    1126
Cardiomegaly          1093
Emphysema              892
Fibrosis               727
Edema                  628
Pneumonia              322
Hernia                 110
Name: labels, dtype: int64

In [15]:
one_disease[one_disease['labels'] == 'Infiltration']['View Position'].value_counts()

PA    5270
AP    4277
Name: View Position, dtype: int64

In [16]:
### see what are the most common diseases when patients have 2 diseases
two_diseases = df[df['disease_count'] == 2]

In [17]:
two_diseases.labels.value_counts()

Effusion Infiltration               1603
Atelectasis Infiltration            1350
Atelectasis Effusion                1165
Infiltration Nodule                  829
Cardiomegaly Effusion                484
Consolidation Infiltration           441
Infiltration Mass                    420
Effusion Pneumothorax                403
Effusion Mass                        402
Atelectasis Consolidation            398
Mass Nodule                          394
Edema Infiltration                   392
Infiltration Pneumothorax            345
Emphysema Pneumothorax               337
Consolidation Effusion               337
Effusion Pleural_Thickening          251
Effusion Nodule                      249
Atelectasis Pneumothorax             240
Infiltration Pleural_Thickening      210
Cardiomegaly Infiltration            200
Infiltration Pneumonia               199
Atelectasis Nodule                   199
Edema Effusion                       189
Fibrosis Infiltration                185
Mass Pneumothora

In [18]:
### see what are the most common diseases when patients have 3 diseases
three_diseases = df[df['disease_count'] == 3]

In [19]:
three_diseases.labels.value_counts()

Atelectasis Effusion Infiltration                737
Atelectasis Consolidation Effusion               200
Edema Infiltration Pneumonia                     137
Consolidation Effusion Infiltration              134
Edema Effusion Infiltration                      127
Cardiomegaly Effusion Infiltration               120
Effusion Infiltration Nodule                     106
Atelectasis Consolidation Infiltration           104
Effusion Infiltration Mass                        99
Atelectasis Effusion Pneumothorax                 99
Effusion Infiltration Pneumothorax                88
Atelectasis Effusion Mass                         72
Emphysema Infiltration Pneumothorax               72
Effusion Infiltration Pleural_Thickening          71
Atelectasis Infiltration Pneumothorax             71
Atelectasis Cardiomegaly Effusion                 68
Atelectasis Infiltration Mass                     67
Infiltration Mass Nodule                          67
Consolidation Effusion Mass                   

In [20]:
### see what are the most common diseases when patients have 4 diseases
four_diseases = df[df['disease_count'] == 4]

In [21]:
four_diseases.labels.value_counts()

Atelectasis Consolidation Effusion Infiltration               81
Atelectasis Cardiomegaly Effusion Infiltration                54
Atelectasis Effusion Infiltration Pneumothorax                44
Atelectasis Effusion Infiltration Pleural_Thickening          41
Atelectasis Effusion Infiltration Mass                        38
Atelectasis Consolidation Effusion Mass                       37
Atelectasis Emphysema Infiltration Pneumothorax               27
Atelectasis Edema Effusion Infiltration                       24
Effusion Infiltration Mass Nodule                             22
Edema Effusion Infiltration Pneumonia                         21
Atelectasis Effusion Infiltration Nodule                      21
Atelectasis Effusion Infiltration Pneumonia                   18
Atelectasis Effusion Emphysema Infiltration                   18
Effusion Infiltration Nodule Pleural_Thickening               17
Effusion Infiltration Mass Pleural_Thickening                 16
Consolidation Effusion In

In [22]:
### see what are the most common diseases when patients have 5 diseases
five_diseases = df[df['disease_count'] == 5]

In [23]:
### see what are the most common diseases when patients have 5 diseases
five_diseases = df[df['disease_count'] == 5]

In [24]:
five_diseases.labels.value_counts()

Atelectasis Consolidation Effusion Mass Nodule                            16
Atelectasis Cardiomegaly Consolidation Effusion Infiltration              12
Atelectasis Consolidation Effusion Mass Pleural_Thickening                11
Atelectasis Effusion Infiltration Mass Nodule                              9
Effusion Infiltration Mass Nodule Pleural_Thickening                       8
Consolidation Effusion Infiltration Mass Nodule                            7
Atelectasis Consolidation Effusion Infiltration Pleural_Thickening         6
Atelectasis Effusion Infiltration Mass Pneumothorax                        6
Atelectasis Cardiomegaly Effusion Infiltration Nodule                      6
Atelectasis Cardiomegaly Effusion Infiltration Pleural_Thickening          6
Atelectasis Consolidation Effusion Infiltration Pneumothorax               5
Atelectasis Consolidation Effusion Infiltration Mass                       5
Atelectasis Consolidation Edema Effusion Infiltration                      4

In [25]:
### see what are the most common diseases when patients have 6 diseases
six_diseases = df[df['disease_count'] == 6]

In [26]:
six_diseases.labels.value_counts()

Atelectasis Consolidation Effusion Infiltration Mass Nodule                        13
Atelectasis Consolidation Effusion Infiltration Mass Pleural_Thickening             7
Atelectasis Consolidation Effusion Emphysema Infiltration Mass                      4
Atelectasis Effusion Infiltration Mass Nodule Pleural_Thickening                    2
Atelectasis Cardiomegaly Consolidation Edema Effusion Pleural_Thickening            2
Atelectasis Cardiomegaly Consolidation Edema Effusion Infiltration                  2
Atelectasis Cardiomegaly Edema Effusion Infiltration Pneumothorax                   1
Atelectasis Cardiomegaly Effusion Infiltration Mass Pleural_Thickening              1
Atelectasis Cardiomegaly Consolidation Effusion Mass Pleural_Thickening             1
Effusion Fibrosis Infiltration Nodule Pleural_Thickening Pneumothorax               1
Atelectasis Cardiomegaly Effusion Infiltration Mass Pneumothorax                    1
Cardiomegaly Consolidation Effusion Infiltration Mass 

In [27]:
### see what are the most common diseases when patients have 7 diseases
seven_diseases = df[df['disease_count'] == 7]

In [28]:
seven_diseases.labels.value_counts()

Atelectasis Effusion Infiltration Mass Nodule Pleural_Thickening Pneumothorax             2
Atelectasis Cardiomegaly Consolidation Effusion Infiltration Mass Pleural_Thickening      2
Consolidation Effusion Infiltration Mass Nodule Pleural_Thickening Pneumonia              1
Atelectasis Consolidation Effusion Infiltration Mass Nodule Pneumothorax                  1
Atelectasis Cardiomegaly Consolidation Effusion Infiltration Nodule Pleural_Thickening    1
Atelectasis Consolidation Effusion Emphysema Fibrosis Mass Nodule                         1
Atelectasis Cardiomegaly Consolidation Edema Effusion Infiltration Mass                   1
Cardiomegaly Edema Effusion Infiltration Mass Pleural_Thickening Pneumonia                1
Cardiomegaly Consolidation Edema Effusion Mass Nodule Pleural_Thickening                  1
Atelectasis Consolidation Emphysema Mass Nodule Pleural_Thickening Pneumothorax           1
Atelectasis Consolidation Effusion Fibrosis Infiltration Mass Pleural_Thickening

In [29]:
# List of diseases
disease_list = ['Infiltration', 'Pneumonia', 'Atelectasis', 'Effusion', 'Nodule', 'Pneumothorax', 'Mass', 
                'Consolidation', 'Pleural_Thickening', 'Cardiomegaly', 'Emphysema', 'Fibrosis', 'Edema', 'Pneumonia']

# Identify the indices of rows with diseases in the list
disease_indices = df[df['labels'].isin(disease_list)].index

# Create an empty DataFrame to store the sampled indices
sampled_indices_df = pd.DataFrame()

# Iterate through each disease and View Position group
for disease in disease_list:
    for view_position in df['View Position'].unique():
        # Identify indices for the current disease and View Position
        subset_indices = (df['labels'] == disease) & (df['View Position'] == view_position)

        # Sample the indices for the current disease and View Position, keeping only 100
        sampled_indices = df.loc[subset_indices].sample(min(subset_indices.sum(), 50), random_state=42).index

        # Append the sampled indices to the DataFrame
        sampled_indices_df = sampled_indices_df.append(df.loc[sampled_indices])

# Drop duplicates from the sampled indices DataFrame
sampled_indices_df = sampled_indices_df.drop_duplicates()

# Update only the rows with diseases in the list in the original DataFrame with the sampled indices
df.loc[disease_indices, :] = df.loc[sampled_indices_df.index, :]

In [30]:
#List of diseases
disease_list = ['Effusion Infiltration', 'Atelectasis Infiltration', 'Atelectasis Effusion', 'Infiltration Nodule', 
                'Cardiomegaly Effusion', 'Consolidation Infiltration', 'Infiltration Mass', 'Effusion Pneumothorax',
                'Effusion Mass', 'Atelectasis Consolidation', 'Mass Nodule', 'Edema Infiltration', 'Infiltration Pneumothorax', 
                'Emphysema Pneumothorax', 'Consolidation Effusion', 'Effusion Pleural_Thickening', 'Effusion Nodule', 'Atelectasis Pneumothorax',
               'Infiltration Pleural_Thickening', 'Cardiomegaly Infiltration', 'Infiltration Pneumonia', 'Atelectasis Nodule',
               'Edema Effusion', 'Fibrosis Infiltration', 'Mass Pneumothorax', 'Atelectasis Mass', 'Consolidation Mass']

# Identify the indices of rows with diseases in the list
disease_indices = df['labels'].isin(disease_list)

# Create an empty DataFrame to store the sampled indices
sampled_indices_df = pd.DataFrame()

# Iterate through each disease and View Position group
for disease in disease_list:
    for view_position in df['View Position'].unique():
        # Identify indices for the current disease and View Position
        subset_indices = (df['labels'] == disease) & (df['View Position'] == view_position)

        # Sample the indices for the current disease and View Position
        sampled_indices = df.loc[subset_indices].sample(min(subset_indices.sum(), 50), random_state=42).index

        # Append the sampled indices to the DataFrame
        sampled_indices_df = sampled_indices_df.append(df.loc[sampled_indices])

# Update only the rows with diseases in the list in the original DataFrame with the sampled indices
df.loc[disease_indices, :] = df.loc[sampled_indices_df.index, :]

In [31]:
#List of diseases
disease_list = ['Atelectasis Effusion Infiltration', 'Atelectasis Consolidation Effusion']

# Identify the indices of rows with diseases in the list
disease_indices = df['labels'].isin(disease_list)

# Create an empty DataFrame to store the sampled indices
sampled_indices_df = pd.DataFrame()

# Iterate through each disease and View Position group
for disease in disease_list:
    for view_position in df['View Position'].unique():
        # Identify indices for the current disease and View Position
        subset_indices = (df['labels'] == disease) & (df['View Position'] == view_position)

        # Sample the indices for the current disease and View Position
        sampled_indices = df.loc[subset_indices].sample(min(subset_indices.sum(), 50), random_state=42).index

        # Append the sampled indices to the DataFrame
        sampled_indices_df = sampled_indices_df.append(df.loc[sampled_indices])

# Update only the rows with diseases in the list in the original DataFrame with the sampled indices
df.loc[disease_indices, :] = df.loc[sampled_indices_df.index, :]

In [32]:
# Identify the rows with at least 5 occurrences for each disease_count
mask = df.groupby('disease_count')['labels'].transform('count') >= 5

# Apply the mask to filter the DataFrame
df = df[mask]

In [33]:
#List of diseases
disease_list = ['No_Finding']

# Identify the indices of rows with diseases in the list
disease_indices = df['labels'].isin(disease_list)

# Create an empty DataFrame to store the sampled indices
sampled_indices_df = pd.DataFrame()

# Iterate through each disease and View Position group
for disease in disease_list:
    for view_position in df['View Position'].unique():
        # Identify indices for the current disease and View Position
        subset_indices = (df['labels'] == disease) & (df['View Position'] == view_position)

        # Sample the indices for the current disease and View Position
        sampled_indices = df.loc[subset_indices].sample(min(subset_indices.sum(), 1000), random_state=42).index

        # Append the sampled indices to the DataFrame
        sampled_indices_df = sampled_indices_df.append(df.loc[sampled_indices])

# Update only the rows with diseases in the list in the original DataFrame with the sampled indices
df.loc[disease_indices, :] = df.loc[sampled_indices_df.index, :]

In [34]:
df.labels.value_counts()

No_Finding                                                                                2000
Nodule Pleural_Thickening                                                                  140
Edema Infiltration Pneumonia                                                               137
Consolidation Effusion Infiltration                                                        134
Mass Pleural_Thickening                                                                    132
Nodule Pneumothorax                                                                        130
Consolidation Nodule                                                                       128
Edema Effusion Infiltration                                                                127
Emphysema Infiltration                                                                     126
Cardiomegaly Effusion Infiltration                                                         120
Atelectasis Emphysema                             

In [35]:
df.head()

Unnamed: 0,imageindex,labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],disease_count
1,00000001_001.png,Cardiomegaly Emphysema,1.0,1.0,58.0,M,PA,2894.0,2729.0,0.143,0.143,2.0
3,,,,,,,,,,,,
4,00000003_001.png,Hernia,0.0,3.0,74.0,F,PA,2500.0,2048.0,0.168,0.168,1.0
5,00000003_002.png,Hernia,1.0,3.0,75.0,F,PA,2048.0,2500.0,0.168,0.168,1.0
6,00000003_003.png,Hernia Infiltration,2.0,3.0,76.0,F,PA,2698.0,2991.0,0.143,0.143,2.0


In [36]:
df = df.dropna()

In [37]:
len(df)

14401

In [38]:
df.to_csv('reduced_metadata.csv', index=False)

In [39]:
# Keep only the first two columns
labels_df = df.iloc[:, :2]

In [40]:
labels_df.to_csv('labels.csv', index=False)