In [1]:
import imbalance_degree.imbalance_degree as ib
import pandas as pd
import numpy as np
import openpyxl
def aggregate(df,labels, byGender):
     #Initialize an empty dictionary to store the aggregation functions
    aggregation_functions = {}

     #Iterate over the columns and add them to the aggregation functions dictionary
    for column in labels:
        aggregation_functions[column] = 'sum'
    if byGender:
         #Perform the dynamic aggregation
        result = df.groupby(['gender']).agg(aggregation_functions)
    else:
        result = pd.DataFrame(df.agg(aggregation_functions)).T

    return result

In [2]:
def format_with_commas(num):
    return '{:,.2f}'.format(num).replace('.', ',')

In [3]:
def compute_IR(df):

    # take an aggregated dataframe which contains the count of the labels per gender and disease and calculate the IR
    columns = list(df.columns)
    sum_result = []
    column_names = []
    for column_outer in columns[:]:
        max_min_values = []
        for column_inner in columns[:]:
            if column_outer == column_inner:
                continue
            else:
            # Calculate the max/min values for each combination of columns
                max_min_values.append(df.apply(lambda x: max(x[column_inner], x[column_outer])/min(x[column_inner], x[column_outer]),
                          axis=1).values)
        sum_result.append([sum(elements) for elements in zip(*max_min_values)])

    df_IR_disease = pd.DataFrame(data=sum_result,index=columns, columns=df.index.values)

    rows = df.index
    ir_result = []

    for column in columns[:]:
        sum_row_result = []
        for row_outer in rows:
            max_min_values = []
            for row_inner in rows:
                if row_outer >= row_inner:
                    continue
                else:

                    outer = df.loc[row_outer, column]
                    inner = df.loc[row_inner, column]
                    max_min_values.append( max(df.loc[row_outer, column], df.loc[row_inner, column]) / min(df.loc[row_outer, column], df.loc[row_inner, column]) )
            if len(max_min_values) > 0:
                sum_row_result.append(sum(max_min_values) )
        ir_result.append(sum(sum_row_result))
        

    ir_df = pd.DataFrame(ir_result, index=columns).T

   
    return df_IR_disease, ir_df

In [6]:
def mmult(df): #file_name
    df_IR_disease, df_IR_gender = compute_IR(df)
    df_IR_disease.to_numpy()
    df_IR_gender.to_numpy()
    matrix_mul = np.matmul(df_IR_gender,df_IR_disease)
    correction = (df_IR_disease.shape[0] - 1) * (df_IR_gender.shape[1]) 
    result = matrix_mul - correction
    result_df = pd.DataFrame(result)
    # Concatenate the DataFrames horizontally
    combined_df = pd.concat([df_IR_disease, df_IR_gender, result_df], axis=1)
    rounded_df = combined_df.round(6)
    formatted_df = rounded_df.applymap(format_with_commas)
    normalized_df = result_df.copy()
    normalized_df = normalized_df.sum(axis=1) / (len(df.columns) * len(df))
   
    return result_df.applymap(format_with_commas)#,formatted_df



In [None]:
data = {
    'Edema': [5000, 15000],
    'Cardiomegaly': [5000, 15000],
    'Atelectasis': [5000, 15000]
}

index = ['0', '1']

df = pd.DataFrame(data, index=index)


result_df = pd.DataFrame(columns=['mmult_Result'])
result = mmult(df)


result_df = result_df.append({'mmult_Result': result.values}, ignore_index=True)

# Loop through each class
for col in df.columns:
    current_freq = df.at['0', col]
    while current_freq <= 15000:
        current_freq += 5000
        if current_freq > 15000:
            continue
        else:
            df.at['0', col] = current_freq
        result = mmult(df)
        result_df = result_df.append({'mmult_Result': result.values}, ignore_index=True)
        


#result_df.to_excel('result_interpretation.xlsx', index=False)

In [15]:
data = {
    'Edema': [5000, 15000],
    'Cardiomegaly': [5000, 15000],
    'Atelectasis': [5000, 15000]
}

index = ['0', '1']

df = pd.DataFrame(data, index=index)
aggregated_df = pd.DataFrame(columns=df.columns)
aggregated_result = df.sum()
aggregated_df = aggregated_df.append(aggregated_result, ignore_index=True)

for col in df.columns:
    current_freq = df.at['0', col]
    while current_freq <= 15000:
        current_freq += 5000
        if current_freq > 15000:
            continue
        else:
            df.at['0', col] = current_freq
        
        aggregated_result = df.sum()
        aggregated_df = aggregated_df.append(aggregated_result, ignore_index=True)

        

            

# Print the aggregated results
print(aggregated_df)

# List to store data for each class
#class_data = {col: [] for col in df.columns}
class_data = [] #['classes']
results = []
# Iterate through each row in the aggregated DataFrame
for index, row in aggregated_df.iterrows():
    class_names = row.index
    class_frequencies = row.values

    # Iterate through each class and its frequency
    for class_outer, frequency in zip(class_names, class_frequencies):
        class_data.extend([class_names.get_loc(class_outer)] * int(frequency))

    results.append(ib.imbalance_degree(class_data, distance="EU"))
    
    class_data = []
result_df = pd.DataFrame(data=results,columns=["ID"])
#result_df.to_excel('result_ID.xlsx', index=False)

   Edema Cardiomegaly Atelectasis
0  20000        20000       20000
1  25000        20000       20000
2  30000        20000       20000
3  30000        25000       20000
4  30000        30000       20000
5  30000        30000       25000
6  30000        30000       30000


  aggregated_df = aggregated_df.append(aggregated_result, ignore_index=True)
  aggregated_df = aggregated_df.append(aggregated_result, ignore_index=True)
  aggregated_df = aggregated_df.append(aggregated_result, ignore_index=True)
  aggregated_df = aggregated_df.append(aggregated_result, ignore_index=True)
  aggregated_df = aggregated_df.append(aggregated_result, ignore_index=True)
  aggregated_df = aggregated_df.append(aggregated_result, ignore_index=True)
  aggregated_df = aggregated_df.append(aggregated_result, ignore_index=True)


In [16]:
data = {
    'Edema': [5000, 15000],
    'Cardiomegaly': [5000, 15000],
    'Atelectasis': [5000, 15000]
}

index = ['0', '1']

df = pd.DataFrame(data, index=index)

aggregated_df = pd.DataFrame(columns=df.columns)
aggregated_result = df.sum()
aggregated_df = aggregated_df.append(aggregated_result, ignore_index=True)


for col in df.columns:
    current_freq = df.at['0', col]
    while current_freq <= 15000:
        current_freq += 5000
        if current_freq > 15000:
            continue
        else:
            df.at['0', col] = current_freq
        aggregated_result = df.sum()
        aggregated_df = aggregated_df.append(aggregated_result, ignore_index=True)

IRperLabel_values = []
mean_IRpLabel = []
for index, row in aggregated_df.iterrows():
    output = max(row.values)/row.values
    IRperLabel_values.append(output)
    mean_IRpLabel.append(np.mean(output))
        
result_df = pd.DataFrame(data=zip(IRperLabel_values,mean_IRpLabel))
result_df


  aggregated_df = aggregated_df.append(aggregated_result, ignore_index=True)
  aggregated_df = aggregated_df.append(aggregated_result, ignore_index=True)
  aggregated_df = aggregated_df.append(aggregated_result, ignore_index=True)
  aggregated_df = aggregated_df.append(aggregated_result, ignore_index=True)
  aggregated_df = aggregated_df.append(aggregated_result, ignore_index=True)
  aggregated_df = aggregated_df.append(aggregated_result, ignore_index=True)
  aggregated_df = aggregated_df.append(aggregated_result, ignore_index=True)


Unnamed: 0,0,1
0,"[1.0, 1.0, 1.0]",1.0
1,"[1.0, 1.25, 1.25]",1.166667
2,"[1.0, 1.5, 1.5]",1.333333
3,"[1.0, 1.2, 1.5]",1.233333
4,"[1.0, 1.0, 1.5]",1.166667
5,"[1.0, 1.0, 1.2]",1.066667
6,"[1.0, 1.0, 1.0]",1.0


# Random initialisation across three groups

In [5]:
data = {
    'Edema': [5000,10000,15000, 15000, 25000, 30000, 35000],
    'Cardiomegaly': [2500,5000,7500, 10000, 12500, 15000, 17500],
    'Atelectasis': [7500,12500,17500, 22500, 27500, 32500, 37500]
}

index = list(range(7))

df_class_distributions = pd.DataFrame(data, index=index)

# Print the transformed DataFrame
df_class_distributions.to_clipboard()

In [None]:
excel_index = 1
with pd.ExcelWriter('output_results_42.xlsx') as writer:
    for index, row in df_class_distributions.iterrows():
        np.random.seed(42)
        new_df = pd.DataFrame([row], columns=df_class_distributions.columns, index=[0])
        class_frequencies_np = np.array(row.values)


        total_values = {col: value for col, value in new_df.iloc[0].items()}

        num_groups = 3
        groups = []
        mean_value = class_frequencies_np / 3

        remaining_values = class_frequencies_np.copy()
        
        group_values = {group_index: np.zeros_like(new_df.values[0]) for group_index in range(num_groups)}
    
        for group_index in range(num_groups - 1):
            range_percentage = 0.45  # 30% around the mean
            

            lower_bound = mean_value * (1 - range_percentage)
            upper_bound = mean_value * (1 + range_percentage)

            group_row = np.random.randint(lower_bound, upper_bound + 1, size=num_groups)


            values =  np.minimum(group_row, remaining_values)
            group_values[group_index] = values
            
            remaining_values -= group_row
            mean_value = remaining_values / (num_groups - (group_index + 1))

        group_values[2] = remaining_values

        # Create a new DataFrame with the split data and groups
        result_df = pd.DataFrame(group_values, index=['Edema','Cardiomegaly','Atelectasis'])
        result_df = result_df.T
        result_df.to_excel(writer, sheet_name='Group_Sheet', startrow=excel_index, index=False)

        
        imbalance_score = mmult(result_df)
        imbalance_score = imbalance_score.T
        #imbalance_score_df = pd.DataFrame({'Imbalance Score': [imbalance_score]})
        imbalance_score.to_excel(writer, sheet_name='Group_Sheet',startrow=excel_index, startcol=5, index=False)
        excel_index +=20
        #result_df
        #print(result_df,imbalance_score)