In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import os

# modify csv files to follow a certain distribution and keep only relevant columns

## MIMIC

In [2]:
path = <path to csv containing labels and demographic groups> 
pathologies_metadata = pd.read_csv(path)

### total distributions of front X-Rays

In [3]:
pa_ap_samples = pathologies_metadata.loc[pathologies_metadata['ViewPosition'].isin(['PA', 'AP'])]


In [4]:
count_subjects_df = pa_ap_samples.loc[:, ['subject_id','study_id', 'gender', 'dicom_id','split','ViewPosition',
            'Atelectasis',
            'Cardiomegaly',
            'Consolidation',
            'Edema',
            'Enlarged Cardiomediastinum',
            'Fracture',
            'Lung Lesion',
            'Lung Opacity',
            'No Finding',
            'Pleural Effusion',
            'Pleural Other',
            'Pneumonia',
            'Pneumothorax',
            'Support Devices' 
            ]]

In [5]:
len(count_subjects_df)

243149

### subset of ['Edema','Cardiomegaly','Support Devices','Atelectasis','Pleural Effusion',  'Lung Opacity']

In [9]:
#label_subset = labels

label_subset = ['Edema','Cardiomegaly','Atelectasis','Lung Opacity','Pleural Effusion','Support Devices']
#label_subset = ['Edema','Cardiomegaly','Atelectasis']
#label_subset = ['Edema','Support Devices','Cardiomegaly']
column_subset = ['subject_id','study_id', 'gender', 'dicom_id','split'] 
column_subset = column_subset + label_subset

data_subset = count_subjects_df.loc[:,column_subset]
test_validate = count_subjects_df.loc[count_subjects_df["split"] != "train",column_subset]
data_subset[label_subset] = data_subset[label_subset].applymap(lambda x: np.nan if x <= 0 else x)
data_subset.dropna(subset=label_subset, how='all', inplace=True)
data_subset[label_subset] = data_subset[label_subset].applymap(lambda x: 0 if np.isnan(x) else x)
#data_subset.to_csv('total_AP_PA_ds.csv', index=False)


# Aggregations

In [None]:
def aggregate(df,labels, byGender):
    # Initialize an empty dictionary to store the aggregation functions
    aggregation_functions = {}

    # Iterate over the columns and add them to the aggregation functions dictionary
    for column in labels:
        aggregation_functions[column] = 'sum'
    if byGender:
        # Perform the dynamic aggregation
        result = df.groupby(['gender']).agg(aggregation_functions)
    else:
        result = pd.DataFrame(df.agg(aggregation_functions)).T

    return result

# Label powerset

In [15]:
ids_array = [1, 3, 5, 10, 20, 40] #,80,160,320,640,1280,2560,5120,10240
data_subset["powerset"] = data_subset.apply(lambda x: sum(val * x[col] for val, col in zip(ids_array, label_subset)), axis=1)


In [16]:
np.unique(data_subset['powerset'])

array([ 1.,  3.,  4.,  5.,  6.,  8.,  9., 10., 11., 13., 14., 15., 16.,
       18., 19., 20., 21., 23., 24., 25., 26., 28., 29., 30., 31., 33.,
       34., 35., 36., 38., 39., 40., 41., 43., 44., 45., 46., 48., 49.,
       50., 51., 53., 54., 55., 56., 58., 59., 60., 61., 63., 64., 65.,
       66., 68., 69., 70., 71., 73., 74., 75., 76., 78., 79.])

# ROS_c from 
## "A First Approach to Deal with Imbalance in Multi-label Datasets"

In [29]:
def add_samples(df, x, gender = None):
    if gender is not None:
        temp_df = df.iloc[np.random.choice(np.where(df['gender'] == gender)[0],size=x)]
    else:
        sampled_rows = random.choices(df.index.tolist(), k=x)
        temp_df = df.loc[sampled_rows]
         #temp_df = df.sample(n=x)
    result_df = pd.concat([df, temp_df], ignore_index=True)
    
    return result_df

In [None]:
def lp_ros(D, size_increasing):
    increment =  int(len(D) * size_increasing)
    print("Total samples to add: ",increment)
    unique_values = np.unique(D['powerset'])
    labelset_Bag = []
    for value in unique_values:
        powerset = D.loc[D['powerset'] ==value]
        labelset_Bag.append((powerset,value))

    meanSize = sum(D['powerset'].value_counts()) / len(unique_values)  
    
    minBag = []

    for labelset,powerset in labelset_Bag:
        if len(labelset) < meanSize:
            minBag.append((labelset,powerset))
            # if powerset greater meanSize drop this powerset and append it at the end, after
            # its size has been increased
            


    meanIncrement = increment/len(minBag)
    minBag.sort(key=lambda df: len(df), reverse=True)
    total_samples_inc = 0
    for i,(minSet,powerset) in enumerate(minBag):
        #if total_samples_inc >= increment:
        #    break
        
        incrementBag = int(min(abs(len(minSet)-meanSize),meanIncrement))
        remainder = int(meanIncrement - incrementBag)
        # distribute among Bags könnte meinen die meanReduction für die nachfolgenden Bags zu erhöhen;
        # gleichverteilt auf die verbleibenden Klassen 
        # print(gender, "- samples deleted: ", reductionBag)
        total_samples_inc += incrementBag
        #D = D.drop(D[D['powerset'] == powerset].index)
        minSet = add_samples(minSet,incrementBag)
        D = pd.concat([D, minSet])
        
        num_remaining_cl = ((len(minBag)-1)-i)
        if num_remaining_cl > 0:
            meanIncrement += remainder/num_remaining_cl
    print("Total samples added:", total_samples_inc)
    return D
lp_ros(data_subset, 0.3)

# ROS_g:
## considering gender

In [None]:
def det_ovs_num_samples_gender(dataset, size_increasing):
    num_fem_samples = dataset.loc[dataset['gender'] == 'F'].value_counts().size

    
    # Determine the number of samples to increase
    increasing_count = int((len(dataset) * size_increasing) - len(dataset))
    
    female_ratio = num_fem_samples / len(dataset)
    
    # Calculate the desired number of removed samples for each gender
    male_increasing = int(increasing_count * female_ratio )
    female_increasing = int(increasing_count * (1- female_ratio))
    increments = {'M':male_increasing,'F':female_increasing}
    return increments

det_ovs_num_samples_gender(data_subset, 1.3)

In [None]:
def lp_ros_gender(D, size_increasing):
    increment =  det_ovs_num_samples_gender(D, size_increasing)
    print("Total samples to add: Male -", increment['M'],", Female - ", increment['F'])
    unique_values = np.unique(D['powerset'])
    labelset_Bag = []
    meanSize = {'M':0, 'F':0}
    for gender in meanSize.keys():
        value_counts = 0
   
        for value in unique_values:
            powerset = D.loc[(D['powerset']==value) & (D['gender'] == gender)]
            labelset_Bag.append((powerset,value,gender))
            value_counts += powerset.value_counts().size


        meanSize[gender] = (value_counts / len(unique_values))

    mean_values = sum(meanSize.values()) / len(meanSize)
    meanSize = {key: mean_values for key in meanSize}
    
    minBag = []
    bag_count = {'M':0, 'F':0}

    for labelset, powerset_value, gender in labelset_Bag:
        gender_size = meanSize[gender] 

        if len(labelset) < gender_size:
            minBag.append((labelset,powerset_value, gender))
            bag_count[gender] += 1


    mean_increment_per_gender = {
        gender: increment[gender] / bag_count[gender] if bag_count[gender] != 0 else 0
        for gender in bag_count.keys()
    }

    minBag = sorted(minBag, key=lambda tup: len(tup[0]), reverse=True)

    total_samples_added = {'M':0, 'F':0}  
    for i,(minSet,powerset_value,gender) in enumerate(minBag):
        if bag_count['M'] ==3:
            print("sth")

        gender_size = meanSize[gender] 

        incrementBag = int(min(abs(len(minSet)-gender_size),mean_increment_per_gender[gender]))
        remainder = int(mean_increment_per_gender[gender] - incrementBag)
        # distribute among Bags könnte meinen die meanReduction für die nachfolgenden Bags zu erhöhen;
        # gleichverteilt auf die verbleibenden Klassen 

        total_samples_added[gender] += incrementBag

        minSet = add_samples(minSet,incrementBag, gender)
        D = pd.concat([D, minSet])
        # calculate remaining classes for each gender
        bag_count[gender]-=1 
        if bag_count[gender] > 0:
            mean_increment_per_gender[gender] += remainder/bag_count[gender]
    print("Total samples added:", total_samples_added)
    return D
lp_ros_gender(data_subset, 1.1)

# RUS_c: 
## From "A First Approach to Deal with Imbalance in Multi-label Datasets"

In [33]:
def delete_samples(df, x, gender = None):
    if gender is not None:
        samples = df[df['gender'] == gender].sample(n=x)
    else:
        samples = df.sample(n=x)
  
    df = df.drop(samples.index)
    return df

In [34]:
def lp_rus(D, size_reduction):
    samples_to_delete = int(len(D) * size_reduction)
    print("Total samples to delete:", samples_to_delete)
    unique_values = np.unique(D['powerset'])
    labelset_Bag = []
    for value in unique_values:
        powerset = D.loc[D['powerset'] ==value]
        labelset_Bag.append((powerset,value))

    meanSize = sum(D['powerset'].value_counts()) / len(unique_values)  
    majBag = []
    for labelset,powerset in labelset_Bag:
        if len(labelset) > meanSize:
            majBag.append(labelset)
            # if powerset greater meanSize drop this powerset and append it at the end, after
            # its size has been reduced
            D = D.drop(D[D['powerset'] == powerset].index)
    
    meanReduction = samples_to_delete/len(majBag)
    majBag.sort(key=lambda df: len(df))
    total_samples_del = 0
    for i,majSet in enumerate(majBag):
        if total_samples_del >= samples_to_delete:
            break
        reductionBag = int(min(len(majSet)-meanSize,meanReduction))
        remainder = meanReduction - reductionBag
        # distribute among Bags könnte meinen die meanReduction für die nachfolgenden Bags zu erhöhen;
        # gleichverteilt auf die verbleibenden Klassen 
        # print("samples deleted: ", reductionBag)
        total_samples_del += reductionBag
        majSet = delete_samples(majSet,reductionBag)
        D = pd.concat([D, majSet])
        num_remaining_cl = ((len(majBag)-1)-i)
        if num_remaining_cl > 0:
            meanReduction += remainder/num_remaining_cl
    print("Total samples deleted:", total_samples_del)
    return D


## RUS_g: considering gender distribution

In [37]:
#The `det_excess` function takes a dataset as input and calculates the excess ratio between the majority and minority groups in terms of gender.
def det_excess(dataset):

    group_counts = dataset['gender'].value_counts()
    majority_group = group_counts.idxmax()
    minority_group = group_counts.idxmin()
    max_occurrences = group_counts[majority_group]
    min_occurrences = group_counts[minority_group]
    print("The majority group is:", majority_group)
    print("Number of occurrences:", max_occurrences)

    return max_occurrences,min_occurrences, majority_group, minority_group

det_excess(data_subset)

The majority group is: M
Number of occurrences: 84953


(0.5603205487583682, 84953, 66662, 'M', 'F')

In [38]:
# The `det_num_samples_group` function takes in a dataset and a size reduction value as parameters to define how many samples have to be deleted
# It first calls the `det_excess` function to determine the current ratio of occurrences between two groups in the dataset, 
# as well as the maximum and minimum occurrences and the majority and minority groups.
def det_num_samples_group(dataset, size_reduction):
    max_occurrences,min_occurrences,majority_group,minority_group = det_excess(dataset)
    
    
    # Determine the number of samples to remove
    removal_count = int(len(dataset) * size_reduction)
    difference = max_occurrences - min_occurrences
    
    max_removals = 0 
    min_removals = 0
    max_removals += difference 
    if removal_count > difference:
        excess_difference = (removal_count - difference) / 2
        max_removals += excess_difference 
        min_removals += excess_difference
    
    removals = {majority_group:max_removals,minority_group:min_removals}
    return removals


The majority group is: M
Number of occurrences: 84953


{'M': 24307.0, 'F': 6016.0}

In [39]:
def lp_rus_gender(D, size_reduction):
    removals = det_num_samples_group(D, size_reduction) 
    print("Total samples to delete: Male -", removals['M'],", Female - ", removals['F'])
    unique_values = np.unique(D['powerset'])
    labelset_Bag = []
    meanSize = {'M':0, 'F':0}
    for gender in meanSize.keys():
            value_counts = 0
    
            for value in unique_values:
                powerset = D.loc[(D['powerset']==value) & (D['gender'] == gender)]
                labelset_Bag.append((powerset,value,gender))
                value_counts += powerset.value_counts().size


            meanSize[gender] = (value_counts / len(unique_values))

    mean_values = sum(meanSize.values()) / len(meanSize)
    meanSize = {key: mean_values for key in meanSize}

    
    majBag = []
    bag_count = {'M':0, 'F':0}

    for labelset, powerset_value, gender in labelset_Bag:
        gender_mean_size = meanSize[gender]

        if len(labelset) > gender_mean_size:
            majBag.append((labelset, gender))
            bag_count[gender] += 1
            # if powerset greater meanSize drop this powerset and append it at the end, after
            # its size has been reduced
            D = D.drop(D[(D['powerset'] == powerset_value) & (D['gender'] == gender)].index)

    mean_reduction_per_gender = {
        gender: removals[gender] / bag_count[gender] if bag_count[gender] != 0 else 0
        for gender in bag_count.keys()
    }

    majBag.sort(key=lambda tup: len(tup[0]))

    total_samples_del = 0
    for i,(majSet,gender) in enumerate(majBag):
        gender_mean_size = meanSize[gender]
        reductionBag = int(min(len(majSet)-gender_mean_size,mean_reduction_per_gender[gender]))
        remainder = mean_reduction_per_gender[gender] - reductionBag
        # distribute among Bags könnte meinen die meanReduction für die nachfolgenden Bags zu erhöhen;
        # gleichverteilt auf die verbleibenden Klassen 

        total_samples_del += reductionBag
        majSet = delete_samples(majSet,reductionBag, gender)
        D = pd.concat([D, majSet])
        # calculate remaining classes for each gender
        bag_count[gender]-=1 #((len(majBag)-1)-i)
        if bag_count[gender] > 0:
            mean_reduction_per_gender[gender] += remainder/bag_count[gender]
    print("Total samples deleted:", total_samples_del)
    return D
df_lp_rus_gender = lp_rus_gender(data_subset, 0.2)      

The majority group is: M
Number of occurrences: 84953
Total samples to delete: Male - 24307.0 , Female -  6016.0
Total samples deleted: 30321


In [50]:
percentage = 0.3
training_data = data_subset.loc[data_subset["split"] == "train"]
test_vaildate = data_subset.loc[data_subset["split"] != "train"]
df_lp_rus_gender = lp_rus_gender(training_data, percentage)   

df_lp_rus_gender = pd.concat([df_lp_rus_gender, test_vaildate], ignore_index=True)

df_lp_rus_gender = df_lp_rus_gender.drop(['powerset'],axis=1)

file_name = "RUS/rus_gender" + str(percentage).replace(".","")+".csv"

df_lp_rus_gender.to_csv(file_name,index=False)

The majority group is: M
Number of occurrences: 82817
Total samples to delete: Male - 31104.5 , Female -  13219.5
Total samples deleted: 44323


In [48]:
percentage = 1.1
training_data = data_subset.loc[data_subset["split"] == "train"]
test_vaildate = data_subset.loc[data_subset["split"] != "train"]
df_lp_ros_gender = lp_ros_gender(training_data, percentage)   

df_lp_ros_gender = pd.concat([df_lp_ros_gender, test_vaildate], ignore_index=True)

df_lp_ros_gender = df_lp_ros_gender.drop(['powerset'],axis=1)

file_name = "ROS/ros_gender" + str(percentage).replace(".","")+".csv"

df_lp_ros_gender.to_csv(file_name,index=False)

Total samples to add: Male - 6492 , Female -  8281
sth
sth
Total samples added: {'M': 6468, 'F': 8268}
