In [1]:
import pandas as pd
import ast,re

In [None]:
def _fix_list_format(s):
    """Insert the commas missing from a whitespace-separated list string.

    Strings such as "[0.1 0.2\n 0.3]" (no commas between elements) are turned
    into "[0.1, 0.2, 0.3]". Strings that are already comma-separated are left
    unchanged. Non-string values are returned as-is.
    """
    if not isinstance(s, str):
        return s
    # Only whitespace that sits between two elements becomes ", ". Whitespace
    # next to "[", "]" or an existing comma is skipped, so "[0.1, 0.2]" is not
    # rewritten into the unparsable "[0.1,, 0.2]".
    s = re.sub(r'(?<![\[,\s])\s+(?![\],\s])', ', ', s.strip())
    # Drop a stray comma directly after an opening bracket, if one is present.
    return s.replace('[,', '[')


def _first_similarity(value):
    """Return the first element of a list, NaN for an empty list, or the value itself if it is not a list."""
    if isinstance(value, list):
        return value[0] if value else float('nan')
    return value


def _extract_top_concept(description):
    """Return the first entry of a stringified list, or None when there is none.

    None is returned when the value is not a string, cannot be parsed as a
    Python literal, is not a list, or is an empty list.
    """
    if not isinstance(description, str):
        return None
    try:
        description_list = ast.literal_eval(description)
    except (ValueError, SyntaxError):
        # An unparsable description counts as "not a valid list".
        return None
    if isinstance(description_list, list) and description_list:
        return description_list[0]
    return None


def csv_with_top_similarity(input_csv_path, output_csv_path):
    """
    Add 'Top_similarity' and 'Top_concept' columns for each neuron (row), taken from the
    first entry of the row's 'similarity' and 'description' lists respectively, then
    sort the rows by 'layer' (ascending) and 'Top_similarity' (descending) and save them.

    Args:
        input_csv_path (str): Path to the input CSV file. It must contain the columns
            'layer', 'similarity' and 'description'.
        output_csv_path (str): Path to save the resulting CSV file.

    Raises:
        ValueError, SyntaxError: If a 'similarity' string still cannot be parsed as a
            Python literal after the missing commas have been inserted.
    """
    # Step 1: Read the CSV file
    df = pd.read_csv(input_csv_path)

    # Step 2: Repair list strings that lack commas, then parse them into Python lists
    df['similarity'] = df['similarity'].apply(_fix_list_format).apply(
        lambda x: ast.literal_eval(x) if isinstance(x, str) else x
    )

    # Step 3: First similarity value per row (NaN when the list is empty)
    df['Top_similarity'] = df['similarity'].apply(_first_similarity)

    # Step 4: First description entry per row
    df['Top_concept'] = df['description'].apply(_extract_top_concept)

    # Step 5: Order by layer, highest top similarity first within each layer, and save
    df = df.sort_values(by=['layer', 'Top_similarity'], ascending=[True, False])
    df.to_csv(output_csv_path, index=False)

In [None]:
# Input CSVs produced by mammoclip-dissect, one for the mammo-pretrained model and
# one for the general-pretrained model. Fill both in before running the notebook.
mammo_pretrained_input_csv_path = ""
gen_pretrained_input_csv_path = ""

In [None]:
# Destination CSV file paths for the processed outputs (they are handed to
# DataFrame.to_csv by csv_with_top_similarity, so they must be files, not directories).
mammo_pretrained_output_csv_path = ""
gen_pretrained_output_csv_path = ""

In [None]:
# Write one processed CSV per model: each gains the Top_similarity and Top_concept
# columns and is sorted by layer, then by Top_similarity in descending order.
csv_with_top_similarity(mammo_pretrained_input_csv_path, mammo_pretrained_output_csv_path)
csv_with_top_similarity(gen_pretrained_input_csv_path, gen_pretrained_output_csv_path)

In [None]:
# Next stage: compute and add the mean similarity of each layer.
# Start by reloading the processed CSVs written by csv_with_top_similarity.
mammo_processed_df = pd.read_csv(mammo_pretrained_output_csv_path)
gen_processed_df = pd.read_csv(gen_pretrained_output_csv_path)

In [None]:
# Distinct values of the 'layer' column in each processed frame.
# NOTE(review): neither variable is referenced by any other visible cell; they look
# like inspection helpers only - confirm before relying on them.
unique_layers_mammo = mammo_processed_df['layer'].unique()
unique_layers_gen = gen_processed_df['layer'].unique()

In [None]:
#function to calculate mean similarities for a given DataFrame and output path
def calculate_and_save_mean_similarities(df, output_path):
    """
    Add a 'Mean_similarity' column holding the mean 'Top_similarity' of each row's layer
    and save the result to a CSV file.

    The column is added to ``df`` itself (the caller's DataFrame is modified in place);
    nothing is returned.

    Args:
        df (pd.DataFrame): DataFrame containing the processed data with 'layer' and 'Top_similarity' columns.
        output_path (str): Path to save the updated CSV file with mean similarities.
    """
    # transform('mean') computes one mean per layer and broadcasts it back onto every
    # row of that layer, replacing the explicit filter-per-layer loop with one pass.
    df['Mean_similarity'] = df.groupby('layer')['Top_similarity'].transform('mean')
    # Save the updated DataFrame to a new CSV file
    df.to_csv(output_path, index=False)

In [None]:
# Add the Mean_similarity column for the M and G (mammo- and general-pretrained) MammoCLIP-Dissect results.
# NOTE(review): both placeholders below are the same string. They must be replaced by two
# different file paths, otherwise the second call overwrites the CSV written by the first.
mammo_out_pth="Path/For/Output"
gen_out_pth="Path/For/Output"
calculate_and_save_mean_similarities(mammo_processed_df, mammo_out_pth)
calculate_and_save_mean_similarities(gen_processed_df, gen_out_pth)

In [None]:
#For each df with mean similarities, check row by row whether Top_similarity is greater than or equal to Mean_similarity, and store the result as True/False in a new column 'Local_Threshold_met'
def add_threshold_met_column(df, output_path):
    """
    Flag the rows whose Top_similarity reaches their Mean_similarity and save the frame.

    A boolean 'Local_Threshold_met' column is added to ``df`` itself: True where
    Top_similarity is greater than or equal to Mean_similarity, False otherwise.
    The function returns nothing.

    Args:
        df (pd.DataFrame): DataFrame containing 'Top_similarity' and 'Mean_similarity' columns.
        output_path (str): Path of the CSV file the updated DataFrame is written to.
    """
    # Element-wise "greater than or equal" between the two columns
    reaches_mean = df['Top_similarity'].ge(df['Mean_similarity'])
    df['Local_Threshold_met'] = reaches_mean
    # Persist the flagged frame without the index column
    df.to_csv(output_path, index=False)

In [None]:
# Reload the CSVs written by calculate_and_save_mean_similarities (they now carry Mean_similarity).
mammo_mean_df = pd.read_csv(mammo_out_pth)
gen_mean_df = pd.read_csv(gen_out_pth)

In [None]:
# Add the Local_Threshold_met column to each frame and write it to its own CSV.
# The two output paths are placeholders: fill them in with two different files.
mammo_thresh_out_pth=""
add_threshold_met_column(mammo_mean_df, mammo_thresh_out_pth)
gen_thresh_out_pth=""
add_threshold_met_column(gen_mean_df, gen_thresh_out_pth)

In [None]:
# Mean_similarity of mammo pretrained is global threshold for mammo pretrained and gen pretrained
#using df of mammo pretrained and gen pretrained, see if Top_similarity in gen pretrained is greater than or equal to Mean_similarity of mammo pretrained and create a new column 'Global_Threshold_met' with True/False in gen pretrained df
def compare_similarity(csv1_path, csv2_path, output_path):
    """
    Use the per-layer Mean_similarity of csv1 as the threshold for csv2: add a boolean
    'Global_thresh_met' column to csv2 that is True where a row's Top_similarity is
    greater than or equal to the Mean_similarity csv1 lists for the same layer.

    Layers of csv2 that do not occur in csv1 have no threshold (it is treated as
    negative infinity), so their rows are marked True.

    Args:
        csv1_path (str): Path to the first CSV file (needs 'layer' and 'Mean_similarity').
        csv2_path (str): Path to the second CSV file (needs 'layer' and 'Top_similarity').
        output_path (str): Path to save the updated csv2 with the new column.
    Returns:
        pd.DataFrame: Updated DataFrame for csv2 with the new column.
    """
    # Load the CSVs into DataFrames
    df1 = pd.read_csv(csv1_path)
    df2 = pd.read_csv(csv2_path)
    # Map each layer to its Mean_similarity from csv1 (if a layer occurs on several
    # rows, the value of its last row is the one kept by to_dict)
    mean_similarity_map = df1.set_index('layer')['Mean_similarity'].to_dict()
    # Look up the threshold of every csv2 row once, then compare whole columns
    # instead of evaluating a Python lambda per row.
    thresholds = df2['layer'].map(lambda layer: mean_similarity_map.get(layer, float('-inf')))
    df2['Global_thresh_met'] = df2['Top_similarity'] >= thresholds
    # Save the updated csv2 to the specified output path
    df2.to_csv(output_path, index=False)
    return df2

In [None]:
def _global_thresh_met(df, threshold_map):
    """Return a boolean Series: Top_similarity >= the threshold of the row's layer (-inf if the layer has none)."""
    thresholds = df['layer'].map(lambda layer: threshold_map.get(layer, float('-inf')))
    return df['Top_similarity'] >= thresholds


def compare_similarity2(csv1_path, csv2_path, output_path1, output_path2):
    """
    For each layer, take the higher of the Mean_similarity values found in csv1 and csv2
    as the global threshold, and add a boolean 'Global_thresh_met' column to both
    DataFrames that is True where a row's Top_similarity is greater than or equal to
    the global threshold of its layer.

    A layer present in only one of the files uses that file's Mean_similarity.

    Args:
        csv1_path (str): Path to the first CSV file.
        csv2_path (str): Path to the second CSV file.
        output_path1 (str): Path to save the updated csv1 with the new column.
        output_path2 (str): Path to save the updated csv2 with the new column.

    Both input files need the columns 'layer', 'Top_similarity' and 'Mean_similarity'.
    output_path1 and output_path2 should differ: csv2 is written last and would
    overwrite csv1's output if both point at the same file.

    Returns:
        pd.DataFrame, pd.DataFrame: Updated DataFrames for csv1 and csv2 with the new column.
    """
    # Load the CSVs into DataFrames
    df1 = pd.read_csv(csv1_path)
    df2 = pd.read_csv(csv2_path)

    # Map each layer to its Mean_similarity, separately for csv1 and csv2
    mean_similarity_map_df1 = df1.set_index('layer')['Mean_similarity'].to_dict()
    mean_similarity_map_df2 = df2.set_index('layer')['Mean_similarity'].to_dict()

    # Global threshold per layer: the larger of the two means. A layer missing from
    # one file contributes -inf there, so the other file's mean is used.
    all_layers = set(mean_similarity_map_df1).union(mean_similarity_map_df2)
    global_threshold_map = {
        layer: max(
            mean_similarity_map_df1.get(layer, float('-inf')),
            mean_similarity_map_df2.get(layer, float('-inf')),
        )
        for layer in all_layers
    }

    # Flag both frames against the same thresholds
    df1['Global_thresh_met'] = _global_thresh_met(df1, global_threshold_map)
    df2['Global_thresh_met'] = _global_thresh_met(df2, global_threshold_map)

    # Save the updated DataFrames to the specified output paths
    df1.to_csv(output_path1, index=False)
    df2.to_csv(output_path2, index=False)

    return df1, df2

In [None]:
#Output paths for the general- and mammo-pretrained CSVs after comparing with the global threshold, which is the higher of the M and G mean similarity values at a given layer
#The two paths must differ: compare_similarity2 writes one file per model, and an identical path would make the second write overwrite the first
rev_gen_out="/your_dir/Rev_Global_Thresh_vindr_gen_pretrained_breast_clip_descriptions.csv"
rev_mammo_out="/your_dir/Rev_Global_Thresh_vindr_mammo_pretrained_breast_clip_descriptions.csv"
#Alternative that uses the mammo-pretrained mean alone as the threshold for the general-pretrained model:
#compare_similarity(mammo_thresh_out_pth, gen_thresh_out_pth, rev_gen_out)

In [None]:
# Inputs are the per-model CSVs written by add_threshold_met_column; they carry the
# layer, Top_similarity and Mean_similarity columns that compare_similarity2 reads.
compare_similarity2(mammo_thresh_out_pth, gen_thresh_out_pth, rev_mammo_out, rev_gen_out)

In [None]:
#Add categories to which Top_concept belongs to based on excel
#Assign category to each row
def map_concepts_to_categories(csv_path, excel_path, output_path):
    """
    Map Top_concept entries from a CSV to columns and worksheets in an Excel file.
    Assign the column name as 'Category' and worksheet name as 'Broad_categories' in the CSV.

    A Top_concept is matched when it equals a non-empty cell of a worksheet column.
    Rows without any match keep an empty Category / Broad_categories. When a concept
    appears in several columns or worksheets, the last one visited wins.

    Args:
        csv_path (str): Path to the input CSV file (needs a 'Top_concept' column).
        excel_path (str): Path to the input Excel file.
        output_path (str): Path to save the updated CSV file.
    Returns:
        pd.DataFrame: Updated DataFrame with new columns.
    """
    # Load the CSV into a DataFrame
    csv_df = pd.read_csv(csv_path)
    # Initialize new columns in the CSV
    csv_df['Category'] = None
    csv_df['Broad_categories'] = None
    # Open the workbook as a context manager so its file handle is closed even if
    # parsing a sheet fails.
    with pd.ExcelFile(excel_path) as excel_data:
        # Iterate through each worksheet in the Excel file
        for sheet_name in excel_data.sheet_names:
            # Load the worksheet into a DataFrame
            sheet_df = excel_data.parse(sheet_name)
            # Iterate through each column in the worksheet
            for column in sheet_df.columns:
                # Rows whose Top_concept occurs among the non-empty cells of this column
                matches = csv_df['Top_concept'].isin(sheet_df[column].dropna())
                # Assign the column name and worksheet name to the matching rows
                csv_df.loc[matches, 'Category'] = column
                csv_df.loc[matches, 'Broad_categories'] = sheet_name
    # Save the updated CSV to the specified output path
    csv_df.to_csv(output_path, index=False)
    return csv_df


In [None]:
# Excel workbook passed to map_concepts_to_categories: its worksheet names become
# 'Broad_categories' and its column names become 'Category'.
excel_path = "/Your_directory/Breast_text_categories_revised.xlsx"

In [None]:
# Add Category / Broad_categories to the general-pretrained results.
# out_gen is a placeholder for the destination CSV path; fill it in before running.
out_gen=""
map_concepts_to_categories(rev_gen_out, excel_path, out_gen)

In [None]:
# Add Category / Broad_categories to the mammo-pretrained results.
# out_mammo is a placeholder for the destination CSV path; fill it in before running.
out_mammo=""
map_concepts_to_categories(rev_mammo_out,excel_path, out_mammo)