In [23]:
# Import libraries for data cleaning and preprocessing
import pandas as pd
import os 



In [24]:
# Load labeled data sample_labels.csv
labels = pd.read_csv('Resources/sample_labels.csv')

labels


Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImageWidth,OriginalImageHeight,OriginalImagePixelSpacing_x,OriginalImagePixelSpacing_y
0,00000013_005.png,Emphysema|Infiltration|Pleural_Thickening|Pneu...,5,13,060Y,M,AP,3056,2544,0.139000,0.139000
1,00000013_026.png,Cardiomegaly|Emphysema,26,13,057Y,M,AP,2500,2048,0.168000,0.168000
2,00000017_001.png,No Finding,1,17,077Y,M,AP,2500,2048,0.168000,0.168000
3,00000030_001.png,Atelectasis,1,30,079Y,M,PA,2992,2991,0.143000,0.143000
4,00000032_001.png,Cardiomegaly|Edema|Effusion,1,32,055Y,F,AP,2500,2048,0.168000,0.168000
...,...,...,...,...,...,...,...,...,...,...,...
5601,00030712_000.png,No Finding,0,30712,058Y,M,PA,2021,2021,0.194311,0.194311
5602,00030786_005.png,Cardiomegaly|Effusion|Emphysema,5,30786,061Y,F,AP,3056,2544,0.139000,0.139000
5603,00030789_000.png,Infiltration,0,30789,052Y,F,PA,2021,2021,0.194311,0.194311
5604,00030792_000.png,No Finding,0,30792,010Y,F,PA,1775,1712,0.194311,0.194311


### Convert Age

The 'Patient Age' column contains ages with a 'Y' suffix indicating years. Convert these into integer values for easier processing.

In [4]:
# Display unique age values before processing
print("Unique age values before processing:", labels['Patient Age'].unique())

# Use regular expressions to remove non-digit characters and convert to integer
labels['Patient Age'] = labels['Patient Age'].str.extract('(\d+)').astype(int)

# Check the operation
print("Unique age values after processing:", labels['Patient Age'].unique())

Unique age values before processing: ['060Y' '057Y' '077Y' '079Y' '055Y' '068Y' '071Y' '063Y' '067Y' '056Y'
 '058Y' '075Y' '073Y' '054Y' '078Y' '047Y' '050Y' '045Y' '084Y' '030Y'
 '049Y' '061Y' '069Y' '051Y' '044Y' '087Y' '059Y' '031Y' '064Y' '053Y'
 '082Y' '072Y' '046Y' '080Y' '048Y' '021Y' '038Y' '062Y' '052Y' '023Y'
 '037Y' '022Y' '032Y' '070Y' '065Y' '040Y' '042Y' '041Y' '028Y' '035Y'
 '034Y' '066Y' '019Y' '039Y' '074Y' '026Y' '024Y' '085Y' '016Y' '029Y'
 '033Y' '013Y' '017Y' '020Y' '014Y' '043Y' '012Y' '009Y' '011Y' '027Y'
 '036Y' '081Y' '025Y' '018Y' '076Y' '083Y' '008Y' '006Y' '015Y' '002Y'
 '005Y' '010Y' '003Y' '089Y' '007Y' '086Y' '004Y' '094Y' '013M' '411Y'
 '001D' '088Y']
Unique age values after processing: [ 60  57  77  79  55  68  71  63  67  56  58  75  73  54  78  47  50  45
  84  30  49  61  69  51  44  87  59  31  64  53  82  72  46  80  48  21
  38  62  52  23  37  22  32  70  65  40  42  41  28  35  34  66  19  39
  74  26  24  85  16  29  33  13  17  20  14  43  12 

Removing Outliers: Ages that are not physiologically plausible (such as 411 years old) should be treated as data entry errors and removed or corrected based on context or additional information.

Handling Special Cases: The 1 in 001D may be a special case or error. This needs a review to decide whether it should be modified or removed.

In [5]:
# Review entries with age values that are unusually high
print("Entries with unusually high ages:")
print(labels[labels['Patient Age'] > 120])

# Assuming you want to remove these entries
labels = labels[labels['Patient Age'] <= 120]

# Confirm changes
print("Updated age values after removing outliers:")
print(labels['Patient Age'].unique())


Entries with unusually high ages:
           Image Index Finding Labels  Follow-up #  Patient ID  Patient Age  \
4242  00020900_002.png     No Finding            2       20900          411   

     Patient Gender View Position  OriginalImageWidth  OriginalImageHeight  \
4242              M            AP                3056                 2544   

      OriginalImagePixelSpacing_x  OriginalImagePixelSpacing_y  
4242                        0.139                        0.139  
Updated age values after removing outliers:
[60 57 77 79 55 68 71 63 67 56 58 75 73 54 78 47 50 45 84 30 49 61 69 51
 44 87 59 31 64 53 82 72 46 80 48 21 38 62 52 23 37 22 32 70 65 40 42 41
 28 35 34 66 19 39 74 26 24 85 16 29 33 13 17 20 14 43 12  9 11 27 36 81
 25 18 76 83  8  6 15  2  5 10  3 89  7 86  4 94  1 88]


### Handle Multiple Labels
Convert the Finding Labels into a one-hot encoded format for each possible diagnosis. This means creating a binary column for each label (e.g., Emphysema, Infiltration, etc.),

In [6]:
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder

#### Transformation of Finding Labels
The 'Finding Labels' column was originally a single string containing multiple diagnoses separated by '|'. I transformed this column into a list of labels for each row, which will facilitate further data processing, such as one-hot encoding for model training.


In [7]:
# Split 'Finding Labels' into a list of labels
labels['Finding Labels'] = labels['Finding Labels'].apply(lambda x: x.split('|'))

In [11]:
# Display the first few rows to show how 'Finding Labels' are now lists of conditions
print(labels[['Image Index', 'Finding Labels']].head())

# Optionally, use .sample() to display a random subset of the dataset
# print(labels[['Image Index', 'Finding Labels']].sample(5))


        Image Index                                     Finding Labels
0  00000013_005.png  [Emphysema, Infiltration, Pleural_Thickening, ...
1  00000013_026.png                          [Cardiomegaly, Emphysema]
2  00000017_001.png                                       [No Finding]
3  00000030_001.png                                      [Atelectasis]
4  00000032_001.png                    [Cardiomegaly, Edema, Effusion]


#### One-Hot Encoding of Diagnosis Labels
To prepare the 'Finding Labels' for machine learning modeling, I used the `MultiLabelBinarizer` from `sklearn` to convert the list of labels in each row into a one-hot encoded format. This transformation creates a new column for each unique label in the dataset, where each column represents the presence (1) or absence (0) of that specific condition in the x-ray images. This format is necessary for training classification models, as it allows the model to learn to predict the presence of each condition independently.


In [12]:
# One-hot encode the findings
mlb = MultiLabelBinarizer()
labels_encoded = mlb.fit_transform(labels['Finding Labels'])
labels_encoded_df = pd.DataFrame(labels_encoded, columns=mlb.classes_)

# Concatenate with original dataframe
labels = pd.concat([labels, labels_encoded_df], axis=1)

In [13]:
# Display a few rows from the original and transformed dataframe to show the effect of one-hot encoding
print("Original DataFrame with lists of Finding Labels:")
print(labels[['Image Index', 'Finding Labels']].head())

print("\nDataFrame after One-Hot Encoding of Finding Labels:")
print(labels.iloc[:, -len(mlb.classes_):].head())  # Adjust the slicing as needed


Original DataFrame with lists of Finding Labels:
        Image Index                                     Finding Labels
0  00000013_005.png  [Emphysema, Infiltration, Pleural_Thickening, ...
1  00000013_026.png                          [Cardiomegaly, Emphysema]
2  00000017_001.png                                       [No Finding]
3  00000030_001.png                                      [Atelectasis]
4  00000032_001.png                    [Cardiomegaly, Edema, Effusion]

DataFrame after One-Hot Encoding of Finding Labels:
   Atelectasis  Cardiomegaly  Consolidation  Edema  Effusion  Emphysema  \
0          0.0           0.0            0.0    0.0       0.0        1.0   
1          0.0           1.0            0.0    0.0       0.0        1.0   
2          0.0           0.0            0.0    0.0       0.0        0.0   
3          1.0           0.0            0.0    0.0       0.0        0.0   
4          0.0           1.0            0.0    1.0       1.0        0.0   

   Fibrosis  Hernia  

In [14]:
# Show the count of each label in the dataset
label_counts = labels.iloc[:, -len(mlb.classes_):].sum().sort_values(ascending=False)
print(label_counts)


No Finding            3043.0
Infiltration           967.0
Effusion               644.0
Atelectasis            508.0
Nodule                 313.0
Mass                   284.0
Pneumothorax           271.0
Consolidation          226.0
Pleural_Thickening     176.0
Cardiomegaly           141.0
Emphysema              127.0
Edema                  118.0
Fibrosis                84.0
Pneumonia               62.0
Hernia                  13.0
dtype: float64


#### Encoding Categorical Data

In this step, I encoded the 'Patient Gender' and 'View Position' columns using the `LabelEncoder` from `sklearn.preprocessing`. This transformation is crucial for machine learning models as they typically require numerical input. Encoding transforms categorical data into a format that can be better interpreted by the algorithms:

- **Patient Gender**: Originally a string that indicates the gender of the patient. This column was transformed into numeric codes where each unique string label is replaced by a number (e.g., 'M' might be encoded as 0 and 'F' as 1).

- **View Position**: Indicates the position in which the x-ray was taken. Similar to gender, this categorical string data is encoded into numeric values, allowing the model to incorporate this information effectively.

This encoding process is not just a necessary technical step but also a foundational part of preparing our data for the subsequent model training phase.


In [15]:
# Encode other categorical data
le_gender = LabelEncoder()
labels['Patient Gender'] = le_gender.fit_transform(labels['Patient Gender'])

le_view = LabelEncoder()
labels['View Position'] = le_view.fit_transform(labels['View Position'])

In [16]:
# Display the processed data
print(labels.head())

        Image Index                                     Finding Labels  \
0  00000013_005.png  [Emphysema, Infiltration, Pleural_Thickening, ...   
1  00000013_026.png                          [Cardiomegaly, Emphysema]   
2  00000017_001.png                                       [No Finding]   
3  00000030_001.png                                      [Atelectasis]   
4  00000032_001.png                    [Cardiomegaly, Edema, Effusion]   

   Follow-up #  Patient ID  Patient Age  Patient Gender  View Position  \
0          5.0        13.0         60.0               1              0   
1         26.0        13.0         57.0               1              0   
2          1.0        17.0         77.0               1              0   
3          1.0        30.0         79.0               1              1   
4          1.0        32.0         55.0               0              0   

   OriginalImageWidth  OriginalImageHeight  OriginalImagePixelSpacing_x  ...  \
0              3056.0         

In [19]:
# Check for missing values
print("Missing values in each column:\n", labels.isnull().sum())

# Summary statistics for numerical columns
print("\nSummary Statistics:\n", labels.describe())

# Check the balance of encoded labels
label_counts = labels.iloc[:, -len(mlb.classes_):].sum()
print("\nCounts of each label:\n", label_counts)


Missing values in each column:
 Image Index                    1
Finding Labels                 1
Follow-up #                    1
Patient ID                     1
Patient Age                    1
Patient Gender                 0
View Position                  0
OriginalImageWidth             1
OriginalImageHeight            1
OriginalImagePixelSpacing_x    1
OriginalImagePixelSpacing_y    1
Atelectasis                    1
Cardiomegaly                   1
Consolidation                  1
Edema                          1
Effusion                       1
Emphysema                      1
Fibrosis                       1
Hernia                         1
Infiltration                   1
Mass                           1
No Finding                     1
Nodule                         1
Pleural_Thickening             1
Pneumonia                      1
Pneumothorax                   1
dtype: int64

Summary Statistics:
        Follow-up #    Patient ID  Patient Age  Patient Gender  View Positio

#### Handling Missing Values
Since there appears to be only one row with missing values (assuming the dataset is large), removing this row might be the simplest solution without significantly affecting the dataset’s integrity.

In [20]:
# Check how many rows have missing values
print("Number of rows with any missing values:", labels.isnull().any(axis=1).sum())

# Remove rows with any missing values
labels_cleaned = labels.dropna()

# Verify changes
print("Data after removing rows with missing values:")
print(labels_cleaned.isnull().sum())
print("\nRemaining data points:", labels_cleaned.shape[0])


Number of rows with any missing values: 2
Data after removing rows with missing values:
Image Index                    0
Finding Labels                 0
Follow-up #                    0
Patient ID                     0
Patient Age                    0
Patient Gender                 0
View Position                  0
OriginalImageWidth             0
OriginalImageHeight            0
OriginalImagePixelSpacing_x    0
OriginalImagePixelSpacing_y    0
Atelectasis                    0
Cardiomegaly                   0
Consolidation                  0
Edema                          0
Effusion                       0
Emphysema                      0
Fibrosis                       0
Hernia                         0
Infiltration                   0
Mass                           0
No Finding                     0
Nodule                         0
Pleural_Thickening             0
Pneumonia                      0
Pneumothorax                   0
dtype: int64

Remaining data points: 5604


In [21]:
# Summary statistics for cleaned data
print(labels_cleaned.describe())

# Re-check the balance of encoded labels
label_counts_cleaned = labels_cleaned.iloc[:, -len(mlb.classes_):].sum()
print("\nCounts of each label in cleaned data:\n", label_counts_cleaned)


       Follow-up #    Patient ID  Patient Age  Patient Gender  View Position  \
count  5604.000000   5604.000000  5604.000000     5604.000000    5604.000000   
mean      8.619379  14326.506424    46.709136        0.558887       0.605282   
std      15.567916   8409.643416    16.713873        0.496565       0.488834   
min       0.000000     13.000000     1.000000        0.000000       0.000000   
25%       0.000000   7286.000000    35.000000        0.000000       0.000000   
50%       3.000000  13993.000000    49.000000        1.000000       1.000000   
75%      10.000000  20650.250000    59.000000        1.000000       1.000000   
max     177.000000  30792.000000    94.000000        1.000000       1.000000   

       OriginalImageWidth  OriginalImageHeight  OriginalImagePixelSpacing_x  \
count         5604.000000          5604.000000                  5604.000000   
mean          2644.833690          2491.161849                     0.155463   
std            347.107231           399.14

In [22]:
# Save dataset to a new CSV file
labels_cleaned.to_csv('Resources/sample_labels_cleaned.csv', index=False)