In [1]:
import os
import pandas as pd
import re
import xml.etree.ElementTree as ET
import numpy as np

In [2]:
def show_index_xml(path):
    """
    Return the text of the first index.xml found directly inside a 'flexp' folder.

    The tree under ``path`` is walked with ``os.walk``; only directories whose
    own name is exactly 'flexp' are considered.

    Parameters:
        path (str): Root path to search for index.xml files.

    Returns:
        str: Content of the first index.xml file found, or the message
            "No index.xml file found." when the walk finds none.
    """
    target_name = 'index.xml'

    for current_dir, _subdirs, filenames in os.walk(path):
        # Skip every directory that is not a 'flexp' folder holding an index.xml.
        if os.path.basename(current_dir) != 'flexp' or target_name not in filenames:
            continue

        with open(os.path.join(current_dir, target_name), 'r') as handle:
            return handle.read()

    return "No index.xml file found."

def get_channel_mapping(root_path):
    """
    Parse the index.xml files and extract the mapping of ChannelIDs to ChannelNames, including experiment, plate, and panel information.

    Every directory named 'flexp' under ``root_path`` that contains an
    index.xml is parsed. Experiment, plate and panel are taken from the
    directory path via the patterns ``Exp<digits>``, ``plate<digits>``
    (case-insensitive) and ``Panel<A-Z>``; a component that is not found is
    reported as 'Unknown'. Only <Entry> elements that carry both a
    ``ChannelID`` attribute and a non-empty <ChannelName> child are kept.

    Parameters:
        root_path (str): Root path containing all experiments and plates.

    Returns:
        pd.DataFrame: A DataFrame containing Experiment, Plate, Panel, ChannelID, and ChannelName.
            The columns are always present, even when no entry was found, so
            callers can select or merge on them without a KeyError.

    Raises:
        xml.etree.ElementTree.ParseError: If an index.xml is not well-formed XML.
        ValueError: If a ChannelID attribute is not an integer literal.
    """
    columns = ['Experiment', 'Plate', 'Panel', 'ChannelID', 'ChannelName']
    data = []

    for root, dirs, files in os.walk(root_path):
        if os.path.basename(root) != 'flexp' or 'index.xml' not in files:
            continue

        root_element = ET.parse(os.path.join(root, 'index.xml')).getroot()

        # Extract Experiment, Plate and Panel from the directory structure
        experiment_match = re.search(r'Exp(\d+)', root)
        experiment = experiment_match.group(1) if experiment_match else 'Unknown'

        plate_match = re.search(r'plate(\d+)', root, re.IGNORECASE)
        plate = plate_match.group(1) if plate_match else 'Unknown'

        panel_match = re.search(r'Panel([A-Z])', root)
        panel = panel_match.group(1) if panel_match else 'Unknown'

        # ElementTree spells namespaced tags as '{uri}tag'. Reuse the root's
        # '{uri}' prefix when there is one; a document without a namespace
        # gets an empty prefix instead of having its root tag name mistaken
        # for a namespace URI.
        root_tag = root_element.tag
        ns_prefix = root_tag[:root_tag.index('}') + 1] if root_tag.startswith('{') else ''
        map_path = './/' + ns_prefix + 'Map'
        entry_path = './/' + ns_prefix + 'Entry'
        channel_name_path = ns_prefix + 'ChannelName'

        # Traverse <Map> tags and their <Entry> descendants
        for map_elem in root_element.findall(map_path):
            for entry in map_elem.findall(entry_path):
                channel_id = entry.get("ChannelID")
                channel_name = entry.findtext(channel_name_path)
                if channel_id and channel_name:
                    data.append({
                        'Experiment': experiment,
                        'Plate': plate,
                        'Panel': panel,
                        'ChannelID': int(channel_id),
                        'ChannelName': channel_name
                    })

    # Passing explicit columns keeps the schema stable for an empty result
    return pd.DataFrame(data, columns=columns)

def rename_channel_names(df, mapping=None):
    """
    Rename channel names in the DataFrame according to a name mapping.

    The input frame is left untouched; a new DataFrame with the replaced
    'ChannelName' column is returned, so re-running a cell that calls this
    function always starts from the same input.

    Parameters:
        df (pd.DataFrame): The input DataFrame containing a 'ChannelName' column.
        mapping (dict, optional): Exact channel names to replace, as
            ``{old_name: new_name}``. Defaults to
            ``{'Alexa 488': 'Cy2', 'HOECHST 33342': 'DAPI'}``.

    Returns:
        pd.DataFrame: A copy of ``df`` with renamed channel names.

    Raises:
        KeyError: If ``df`` has no 'ChannelName' column.
    """
    if mapping is None:
        mapping = {
            'Alexa 488': 'Cy2',
            'HOECHST 33342': 'DAPI'
        }
    # assign() builds a new frame instead of writing into the caller's object
    return df.assign(ChannelName=df['ChannelName'].replace(mapping))

def parse_image_files(root_path):
    """
    Iterate over image files within the flexp folders and extract their paths and metadata.

    Only files located directly in a directory named 'flexp', ending in
    '.tiff' or '.tif', and whose name starts with
    ``r<row>c<col>f<fov>-ch<channel>`` are recorded. Experiment, plate and
    panel are taken from the directory path via the patterns ``Exp<digits>``,
    ``plate<digits>`` (case-insensitive) and ``Panel<A-Z>``; a component that
    is not found is reported as 'Unknown'.

    Parameters:
        root_path (str): Root path containing all experiments and plates.

    Returns:
        pd.DataFrame: A DataFrame containing image metadata with the columns
            Experiment, Plate, Panel, ImageName, Row, Column, FOV, ChannelID
            and Path. The columns are always present, even when no image was
            found, so later merges on them do not fail with a KeyError.
    """
    columns = ['Experiment', 'Plate', 'Panel', 'ImageName',
               'Row', 'Column', 'FOV', 'ChannelID', 'Path']
    # Compiled once: the same pattern is applied to every image file name
    name_pattern = re.compile(r'r(\d+)c(\d+)f(\d+)-ch(\d+).*\.tiff?')
    data = []

    for root, dirs, files in os.walk(root_path):
        if os.path.basename(root) != 'flexp':
            continue

        # Experiment, Plate and Panel depend only on the directory, so they
        # are extracted once per folder rather than once per file.
        experiment_match = re.search(r'Exp(\d+)', root)
        experiment = experiment_match.group(1) if experiment_match else 'Unknown'

        plate_match = re.search(r'plate(\d+)', root, re.IGNORECASE)
        plate = plate_match.group(1) if plate_match else 'Unknown'

        panel_match = re.search(r'Panel([A-Z])', root)
        panel = panel_match.group(1) if panel_match else 'Unknown'

        for file in files:
            if not file.endswith(('.tiff', '.tif')):
                continue

            # Parse file name for metadata; files with other naming are skipped
            match = name_pattern.match(file)
            if not match:
                continue
            row, col, fov, channel = (int(group) for group in match.groups())

            data.append({
                'Experiment': experiment,
                'Plate': plate,
                'Panel': panel,
                'ImageName': file,
                'Row': row,
                'Column': col,
                'FOV': fov,
                'ChannelID': channel,
                'Path': os.path.join(root, file)
            })

    # Passing explicit columns keeps the schema stable for an empty result
    return pd.DataFrame(data, columns=columns)

In [3]:
# Root folder that is walked for 'flexp' directories. Set the FUNOVA_ROOT
# environment variable to run the notebook against another location; when it
# is unset the original lab path is used.
path = os.environ.get("FUNOVA_ROOT", "/home/labs/hornsteinlab/Collaboration/FUNOVA")

In [4]:
# One row per image file found under `path`
df_images = parse_image_files(path)

# ChannelID -> ChannelName per experiment/plate/panel, with channel names renamed
df_channel_mapping = rename_channel_names(get_channel_mapping(path))

# Both annotation sheets come from the same workbook; passing a list of sheet
# names returns a dict of DataFrames keyed by sheet name.
marker_sheets = pd.read_excel("marker_info.xlsx", sheet_name=['cols_info', 'rows_info'])
cols_info = marker_sheets['cols_info']
rows_info = marker_sheets['rows_info']

In [5]:
# Optional filter, currently disabled: uncomment the two lines below to keep only Experiment 4 images (Experiment 3 was already processed).

# df_images = df_images.loc[df_images.Experiment == '4']
# df_images.index = list(range(len(df_images)))

In [6]:
# df_channel_mapping.loc[df_channel_mapping.Experiment == '4']

In [7]:
channel_columns = ['Panel', 'ChannelID', 'ChannelName', 'Experiment', 'Plate']

# Left joins keep every image row even when no annotation matches:
#   1. rows_info by Row
#   2. channel names by Panel / ChannelID / Experiment / Plate
#   3. cols_info by Panel and ChannelName
combined_df = (
    df_images
    .merge(rows_info, on='Row', how='left')
    .merge(df_channel_mapping[channel_columns],
           on=['Panel', 'ChannelID', 'Experiment', 'Plate'], how='left')
    .merge(cols_info, on=['Panel', 'ChannelName'], how='left')
)

# Overwrite Staining, Category and Function for the DAPI channel
is_dapi = combined_df['ChannelName'] == 'DAPI'
combined_df.loc[is_dapi, ['Staining', 'Category']] = 'Nucleus'
combined_df.loc[is_dapi, ['Function']] = 'DAPI'

# Replace spaces in Function with underscores (literal replacement, not a regex)
combined_df['Function'] = combined_df['Function'].str.replace(' ', '_', regex=False)

In [8]:
# Renumber the plates of Experiment 4: plate '1' becomes '3' and plate '2' becomes '4'.
# NOTE(review): presumably this keeps plate numbers distinct from Experiment 3's — confirm.
plate_renumbering = {'1': '3', '2': '4'}
is_exp4 = combined_df['Experiment'] == '4'
for old_plate, new_plate in plate_renumbering.items():
    combined_df.loc[is_exp4 & (combined_df['Plate'] == old_plate), 'Plate'] = new_plate

In [9]:
# Display the merged per-image metadata table (the notebook shows a truncated preview).
combined_df

Unnamed: 0,Experiment,Plate,Panel,ImageName,Row,Column,FOV,ChannelID,Path,CellLine,PatientID,Stress,ChannelName,Staining,Function,Category
0,3,2,H,r12c16f93-ch1t1.tiff,12,16,93,1,/home/labs/hornsteinlab/Collaboration/FUNOVA/E...,C9orf72-HRE,1008566,1,DAPI,Nucleus,DAPI,Nucleus
1,3,2,H,r05c16f96-ch3t1.tiff,5,16,96,3,/home/labs/hornsteinlab/Collaboration/FUNOVA/E...,Control,1001733,0,Cy5,S139-phosphorylated-Histone2Ax staining,DNA_damage_pH2Ax,Neuronal Cell death/Senescence
2,3,2,H,r04c16f29-ch1t1.tiff,4,16,29,1,/home/labs/hornsteinlab/Collaboration/FUNOVA/E...,C9orf72-HRE,1008566,0,DAPI,Nucleus,DAPI,Nucleus
3,3,2,H,r16c15f61-ch2t1.tiff,16,15,61,2,/home/labs/hornsteinlab/Collaboration/FUNOVA/E...,C9orf72-HRE,981344,1,Cy2,Poly-ADP-ribosoe (PAR) staining,Parthanatos_early,Neuronal Cell death/Senescence
4,3,2,H,r12c16f12-ch3t1.tiff,12,16,12,3,/home/labs/hornsteinlab/Collaboration/FUNOVA/E...,C9orf72-HRE,1008566,1,Cy5,S139-phosphorylated-Histone2Ax staining,DNA_damage_pH2Ax,Neuronal Cell death/Senescence
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
473595,4,4,B,r02c04f80-ch3t1.tiff,2,4,80,3,/home/labs/hornsteinlab/Collaboration/FUNOVA/E...,TDP--43-G348V,1057052,0,Cy5,FK-2,Ubiquitin_levels,Proteostasis
473596,4,4,B,r07c03f87-ch2t1.tiff,7,3,87,2,/home/labs/hornsteinlab/Collaboration/FUNOVA/E...,Control,1017118,0,Cy2,phospho-IRE1a (Ser724) staining,UPR_IRE1a,Proteostasis
473597,4,4,B,r05c03f74-ch3t1.tiff,5,3,74,3,/home/labs/hornsteinlab/Collaboration/FUNOVA/E...,Control,1001733,0,Cy5,FK-2,Ubiquitin_levels,Proteostasis
473598,4,4,B,r07c04f96-ch3t1.tiff,7,4,96,3,/home/labs/hornsteinlab/Collaboration/FUNOVA/E...,Control,1017118,0,Cy5,FK-2,Ubiquitin_levels,Proteostasis


In [10]:
# Sorted distinct values of the Function column, as a check on the annotation merge.
# NOTE(review): np.unique sorts its input; a missing (NaN) Function mixed with
# strings may make that sort raise — confirm no row is left unannotated.
np.unique(combined_df['Function'])

array(['Aberrant_splicing', 'Apoptosis', 'Autophagy', 'Cytoskeleton',
       'DAPI', 'DNA_damage_P53BP1', 'DNA_damage_pH2Ax',
       'Necroptosis_HMGB1', 'Necroptosis_pMLKL', 'Necrosis',
       'Neuronal_activity', 'Nuclear_speckles_SC35',
       'Nuclear_speckles_SON', 'Parthanatos_early', 'Parthanatos_late',
       'Protein_degradation', 'Senescence_signaling',
       'Splicing_factories', 'Stress_initiation', 'TDP-43', 'UPR_ATF4',
       'UPR_ATF6', 'UPR_IRE1a', 'Ubiquitin_levels',
       'impaired_Autophagosome', 'mature_Autophagosome'], dtype=object)

In [11]:
# Save combined_df to a CSV file in the current working directory.
# index=False leaves the DataFrame's row index out of the file.
combined_df.to_csv("funova_metadata_Exp3-4.csv", index=False)

In [12]:
# List the column names of the merged metadata table.
combined_df.columns

Index(['Experiment', 'Plate', 'Panel', 'ImageName', 'Row', 'Column', 'FOV',
       'ChannelID', 'Path', 'CellLine', 'PatientID', 'Stress', 'ChannelName',
       'Staining', 'Function', 'Category'],
      dtype='object')