# Calculate, summarize, plot and export (as an Excel file) the image analysis for nuclear import
Uses Fiji generated .txt files as input.  
Exports results and summarized results as excel file.  
Generates violinplots out of all single cells.  

## Initial processing and plotting of individual replicates

In [None]:
""" open folders sequentially to combine all txt files into one dict, which is used subsequently """

import tkinter as tk
from tkinter import filedialog
import os
import pandas as pd
import numpy as np
import re

# Maps "<folder date> <file name without extension>" -> DataFrame read from that file.
rawfiles = {}
# --- GUI setup---
root = tk.Tk()
root.withdraw()

# Delimiter used by pd.read_csv for each supported file extension.
delimiters = {".txt": "\t", ".csv": ","}

try:
    while True:
        folder_selected = filedialog.askdirectory(title="Choose folder with files, 'Cancel' when you're done, to continue")
        if not folder_selected:  # user cancelled
            break  # exits the while loop

        # The SECOND six-digit group in the folder path is used as the date
        # prefix of every key. Without it the folder is skipped: continuing
        # would either raise a NameError (first folder) or silently reuse the
        # date of the previously selected folder.
        date_matches = re.findall(r'(\d{6})', folder_selected)
        if len(date_matches) < 2:
            print(f"No date found in '{folder_selected}', folder skipped")
            continue
        folder_date = date_matches[1]

        for filename in sorted(os.listdir(folder_selected)):
            print(filename)
            file_path = os.path.join(folder_selected, filename)
            # skip hidden resource-fork files ("._") and Office lock files ("~$")
            if filename.startswith(("._", "~$")):
                continue
            file_name, file_extension = os.path.splitext(filename)

            if file_extension not in delimiters:
                continue
            file_data = pd.read_csv(file_path, delimiter=delimiters[file_extension], encoding='latin1')
            # files whose name contains "20" are read but not stored
            if "20" not in file_name:
                rawfiles[f"{folder_date} {file_name}"] = file_data
finally:
    # always close the hidden Tk root, even if reading a file failed
    root.destroy()

rawfiles.keys()

In [None]:
import pandas as pd
pd.options.mode.copy_on_write = True
import numpy as np
import re
import copy

def Filter (rawfiles):
    """Pair every cell measurement with its nucleus measurement, one row per cell.

    Each raw table is processed as follows:
    * the last two rows and the "Message" column are removed,
    * "Total Signal_Mean" (Area * Mean) and "Signal per Cell" (Mean / Area) are added,
    * the table is split in the middle: the upper half is treated as cells
      (indexed by the unnamed " " column), the lower half as nuclei,
    * the cell number of a nucleus is the three digits following ":0" in its "Label",
    * nuclei whose cell number occurs more than once are discarded,
    * cells are right-joined onto the remaining nuclei; nucleus columns that
      clash with cell columns get the suffix "_Nuc" and "Mean" becomes "Px Intst Mean".

    Returns a dict with the same keys as `rawfiles` holding the merged tables.
    """
    name_pattern = r"(?P<cond>.*) (?P<time>\d* min)_" #when name is: date, condition, timepoint
    nucleus_no_pattern = r"(?<=:0)\d{3}"

    def extract_cell_no(label):
        # missing labels and labels without a number both yield None
        if pd.isna(label):
            return None
        hit = re.search(nucleus_no_pattern, label)
        return hit.group() if hit else None

    filtered_files = {}
    for name, data in rawfiles.copy().items():
        # Not used further; a key that does not follow the naming scheme fails here.
        cond, time = re.search(name_pattern, name).group("cond", "time")

        # strip the two trailing rows and the "Message" column
        measurements = data[:-2].drop(columns=['Message'])

        # derived per-object signals
        measurements['Total Signal_Mean'] = measurements['Area'] * measurements['Mean']
        measurements['Signal per Cell'] = measurements['Mean'] / measurements['Area']

        split_at = len(measurements) // 2

        # upper half: cells, indexed by the unnamed running-number column
        cells = measurements.iloc[:split_at].drop(columns=["Label"])
        cells.columns = [re.sub(r'^ ', 'Cell_No', col) for col in cells.columns]
        cells["Cell_Label"] = cells["Cell_No"]
        cells = cells.set_index("Cell_No")

        # lower half: nuclei, with the signal columns renamed accordingly
        nuclei = measurements.iloc[split_at:].copy().drop(columns=[" "])
        nuclei.columns = [
            re.sub(r'per Cell', 'per Nuc', re.sub(r'^Total', 'Nuclear', col))
            for col in nuclei.columns
        ]
        nuclei["Cell_No"] = nuclei["Label"].apply(extract_cell_no).astype("float")

        # keep=False removes every nucleus that shares its cell number with another one
        single_nuclei = nuclei.drop_duplicates(subset=["Cell_No"], keep=False)
        single_nuclei = single_nuclei.set_index("Cell_No").dropna()

        merged = cells.join(single_nuclei.drop(columns="Label"), how="right", rsuffix="_Nuc")
        merged.columns = [re.sub(r'^Mean', 'Px Intst Mean', col) for col in merged.columns]
        filtered_files[name] = merged
    return filtered_files

    
def load_bkg ():
    """Ask for the NLS-neon background workbook and return its "CombinedSummary" sheet.

    A file dialog is opened on a hidden Tk root. The first two columns of the
    sheet are used as a two-level row index.

    Returns
    -------
    pandas.DataFrame
        Content of the "CombinedSummary" sheet.

    Raises
    ------
    FileNotFoundError
        If the dialog is closed without selecting a file.
    """
    root = tk.Tk()
    root.withdraw()
    try:
        bkg_path = filedialog.askopenfilename(title="Please select the NLS-neon bkg.xlsx")
        print("bkg file:", bkg_path)
        if not bkg_path:  # dialog cancelled
            raise FileNotFoundError("No background file selected")
        bkg_file = pd.read_excel(bkg_path, sheet_name="CombinedSummary", index_col=[0,1]) #define which columns are the multiindex
    finally:
        # close the hidden root window even when loading fails
        root.destroy()
    return bkg_file

def BkgSub_Calc (filtered):
    """Compute nuclear enrichment / nuclear fraction per cell plus a per-file summary.

    Unless the basename of the module-level `file_dir` contains "BFP" or "bkg",
    a background workbook is requested via `load_bkg()` and its combined mean
    intensities (looked up by "<first word of condition> no estr" and timepoint)
    are subtracted from the cell and nucleus mean intensities. Cells whose
    background-subtracted intensities or nuclear enrichment are below 1 are dropped.

    The input tables are not modified; every table is copied before columns are added.

    Parameters
    ----------
    filtered : dict
        Maps "<date> <condition> <time> min_..." to the merged cell/nucleus table.

    Returns
    -------
    dict
        Same keys; each value is the per-cell table (index reset) with the
        summary statistics of that file stored in its first row.

    Raises
    ------
    ValueError
        If a key does not match the "<date> <condition> <time> min_" scheme.
    """
    regex = r"(?P<date>\d*) (?P<cond>\D*) (?P<time>\d* min)_" #when name is: date, condition, timepoint

    # NOTE: `file_dir` is not a parameter; it is read from module scope and has
    # to be defined before this function is called.
    folder_name = os.path.basename(file_dir)
    subtract_bkg = "BFP" not in folder_name and "bkg" not in folder_name
    # the background workbook is only needed (and only asked for) when subtracting
    bkg_file = load_bkg() if subtract_bkg else None

    calc_files = {}
    for name, source in filtered.items():
        match = re.search(regex, name)
        if match is None:
            raise ValueError(f"File name '{name}' does not match '<date> <condition> <time> min_'")
        cond = match.group("cond").split()[0]+" no estr"
        time = match.group("time")

        # copy, so the caller's DataFrame does not gain the columns added below
        file = source.copy()

        if subtract_bkg:
            #Subtract mean background of each Cell or Nucleus Signal
            bkg_cell = bkg_file["Combined Mean: Px Intst Mean"].loc[(cond, time)]
            bkg_nuc = bkg_file["Combined Mean: Px Intst Mean_Nuc"].loc[(cond, time)]
            file["Bkg. Sub Px Intst_Cell"] = file["Px Intst Mean"] - bkg_cell
            file["Bkg. Sub Px Intst_Nuc"] = file["Px Intst Mean_Nuc"] - bkg_nuc
            #all intensities after bkg subtraction need to be >=1 to be a valid signal
            valid = (file["Bkg. Sub Px Intst_Cell"] >= 1) & (file["Bkg. Sub Px Intst_Nuc"] >= 1)
            file = file[valid].copy()
            file['Bkg. Sub Total Signal_Mean'] = file['Area'] * file['Bkg. Sub Px Intst_Cell']
            file['Bkg. Sub Nuclear Signal_Mean'] = file['Area_Nuc'] * file['Bkg. Sub Px Intst_Nuc']
            file["Nuclear enrichment"] = file["Bkg. Sub Px Intst_Nuc"] / file["Bkg. Sub Px Intst_Cell"]
            #only cells with a nuclear enrichment of at least 1 are kept
            file = file[file["Nuclear enrichment"] >= 1].copy()
            file['Nuclear Fraction'] = file['Bkg. Sub Nuclear Signal_Mean'] / file['Bkg. Sub Total Signal_Mean']
            intst_prefix = "Bkg. Sub Px Intst"
            cell_col, nuc_col = "Bkg. Sub Px Intst_Cell", "Bkg. Sub Px Intst_Nuc"
        else:
            #Ratio of nucleus to cell intensity and fraction of the nucleus signal in the whole cell
            file["Nuclear enrichment"] = file["Px Intst Mean_Nuc"] / file["Px Intst Mean"]
            file['Nuclear Fraction'] = file['Nuclear Signal_Mean'] / file['Total Signal_Mean']
            intst_prefix = "Px Intst"
            cell_col, nuc_col = "Px Intst Mean", "Px Intst Mean_Nuc"

        #One-row summary: means, standard errors and the number of cells of this file
        stats = {
            "Mean: Area_Cell": file["Area"].mean(),
            "Mean: Area_Nucleus": file["Area_Nuc"].mean(),
            f"Mean: {intst_prefix}_Cell": file[cell_col].mean(),
            f"Mean: {intst_prefix}_Nuc": file[nuc_col].mean(),
            f"SEM: {intst_prefix}_Cell": file[cell_col].sem(),
            f"SEM: {intst_prefix}_Nuc": file[nuc_col].sem(),
            "Mean: Signal per Cell": file["Signal per Cell"].mean(),
            "Mean: Signal per Nucleus": file["Signal per Nuc"].mean(),
            "Mean: Nuclear enrichment": file["Nuclear enrichment"].mean(),
            "SEM: Nuclear enrichment": file["Nuclear enrichment"].sem(),
            "Mean: Nuclear Fraction": file["Nuclear Fraction"].mean(),
            "SEM: NucFrac": file["Nuclear Fraction"].sem(),
            "# of cells": file["Cell_Label"].count(),
        }
        summary = pd.DataFrame([stats])  # single row with index 0

        #Left join on the row number: the summary lands in row 0, all other rows get NaN
        final = file.reset_index().join(summary, how="left")

        calc_files[f"{name}"] = final
    return calc_files

def _summary_columns(has_bkg_sub):
    """Return (per-file summary columns, per-cell columns) for one kind of file."""
    if has_bkg_sub:
        prefix = "Bkg. Sub Px Intst"
        single_intensity = ["Bkg. Sub Px Intst_Cell", "Bkg. Sub Px Intst_Nuc"]
    else:
        prefix = "Px Intst"
        single_intensity = ["Px Intst Mean", "Px Intst Mean_Nuc"]

    subset = ["Mean: Area_Cell", "Mean: Area_Nucleus",
              f"Mean: {prefix}_Cell", f"Mean: {prefix}_Nuc",
              f"SEM: {prefix}_Cell", f"SEM: {prefix}_Nuc",
              "Mean: Signal per Cell", "Mean: Signal per Nucleus",
              "Mean: Nuclear enrichment", "SEM: Nuclear enrichment",
              "Mean: Nuclear Fraction", "SEM: NucFrac",
              "# of cells"]
    single_subset = ["Area", "Area_Nuc",
                     *single_intensity,
                     "Signal per Cell", "Signal per Nuc",
                     "Nuclear enrichment",
                     "Nuclear Fraction",
                     "# of cells"]
    return subset, single_subset


def Summarize (calc_files):
    """Combine all per-file tables into a per-file summary and a pooled summary.

    Every input table holds one row per cell plus the file's summary statistics
    in its first row. Each table is indexed by (Replicate, Condition, Timepoint)
    parsed from its key. Tables containing "Bkg. Sub" columns use the
    background-subtracted intensity columns, all others the raw ones.

    The input tables are not modified.

    Parameters
    ----------
    calc_files : dict
        Maps "<date> <condition> <time>min_..." to a per-file table.

    Returns
    -------
    tuple of pandas.DataFrame
        summary       - one row per file holding that file's summary statistics
        single_merged - per (Condition, Timepoint): mean of every per-cell column
                        ("Combined Mean: "), SEM of the intensity / enrichment /
                        fraction columns ("SEM: ") and the summed cell count
                        ("Analyzed # of cells"), pooled over all cells

    Raises
    ------
    ValueError
        If `calc_files` is empty or a key does not match the naming scheme.
    """
    regex = r"(?P<date>\d*) (?P<cond>\D*) (?P<time>.*min)_\d*(?P<col>.*)"
    level_names = ["Replicate", "Condition", "Timepoint"]
    group_keys = ["Condition", "Timepoint"]

    summaries = []
    details = []
    for name, file in calc_files.items():
        match = re.search(regex, name)
        if match is None:
            raise ValueError(f"File name '{name}' does not match '<date> <condition> <time>min_'")
        date, cond, time = match.group("date", "cond", "time")

        has_bkg_sub = any("Bkg. Sub" in str(col) for col in file.columns)
        subset, single_subset = _summary_columns(has_bkg_sub)

        # Label every row with replicate / condition / timepoint directly instead of
        # encoding them in the column names and parsing them back out again.
        n_rows = len(file)
        tidy = file.copy()
        tidy.index = pd.MultiIndex.from_arrays(
            [[date] * n_rows, [cond] * n_rows, [time] * n_rows], names=level_names
        )
        tidy.columns = pd.Index(list(tidy.columns), name="Col")

        # only the first row of a file carries the summary values, the rest is NaN
        summaries.append(tidy[subset].dropna())
        details.append(tidy[single_subset])

    if not details:
        raise ValueError("Summarize received no files")

    # concatenating once keeps every file, even if an early summary is empty
    summary = pd.concat(summaries, axis=0)
    detailed_summary = pd.concat(details, axis=0)

    #subset the df, groupby condition and timepoint, calculate mean or sum, add a prefix to columns
    mean_cols = [col for col in detailed_summary.columns if "#" not in col]
    sem_cols = [col for col in mean_cols if "Area" not in col and "Signal" not in col]

    single_mean = detailed_summary.loc[:, mean_cols].groupby(group_keys).mean().add_prefix("Combined Mean: ")
    single_sem = detailed_summary.loc[:, sem_cols].groupby(group_keys).sem().add_prefix("SEM: ")
    #sum up all analyzed cells
    single_sum = detailed_summary.loc[:, ["# of cells"]].groupby(group_keys).sum().add_prefix("Analyzed ")

    single_merged = pd.concat([single_mean, single_sem, single_sum], axis=1)

    return summary, single_merged

## Call the functions above to calculate the enrichment of NLS-neon in the nucleus and save the results as an Excel file

In [None]:
# Run the three processing steps in order on the tables loaded into `rawfiles`:
# pair cells with nuclei, compute enrichment (with optional background
# subtraction), then build the per-file and the pooled summary.
filtered_files = Filter(rawfiles)
quant_files = BkgSub_Calc(filtered_files)
summary, single_merge = Summarize(quant_files)
# bare expression: shown as the output of this notebook cell
single_merge

In [None]:
"""generate an excel file with all single files as sheets and summary sheets and save"""

import pandas as pd
import numpy as np
import re

# NOTE: `file_dir` has to exist in the notebook namespace before this cell runs;
# it is not assigned anywhere in this cell.
# Second six-digit group found in the folder path (not used further in this cell).
folder_date = re.findall(r'(\d{6})', file_dir)[1]
folder_name = os.path.basename(file_dir)
# Workbook path: "<directory two levels above file_dir>/<prefix>.xlsx", where
# <prefix> is the non-digit text directly before " Quantification" in the folder name.
save_path = os.path.join(
    os.path.dirname(os.path.dirname(file_dir)), # two directory levels above file_dir
    re.search(r"\D*(?= Quantification)", os.path.basename(file_dir))[0] # text before " Quantification", e.g. NLS-neon
    )+".xlsx"
print(save_path)

with pd.ExcelWriter(save_path, engine='openpyxl') as writer: # the file is saved automatically when the "with" block ends
    # pooled summary and per-file summary go to the first two sheets
    single_merge.to_excel(writer, sheet_name="CombinedSummary", index=True)
    summary.to_excel(writer, sheet_name="Summary", index=True)
    # every per-file table is written to its own sheet of the same workbook
    for name, file in quant_files.items():
        # the sheet name is the part of the key in front of " hyperstack"
        sheet_name = re.search(r".*(?= hyperstack)", name).group()
        file.to_excel(writer, sheet_name=sheet_name, index=False)

# Plot the enrichment for each single cell: Violinplots

In [None]:
#load the excel file
#put all columns underneath each other in one df
#calculate log2
import tkinter as tk
from tkinter import filedialog
import os
import numpy as np
import pandas as pd
pd.options.mode.copy_on_write = True
import re

def inrow_log2 ():
    """Load a results workbook and collect the nuclear enrichment of all cells with its log2.

    A file dialog asks for the workbook. Every sheet whose name does not contain
    "Summary" contributes its "Nuclear enrichment" column and the log2 of it.
    The two columns of each sheet are labelled with a three-level column index
    (Condition, Timepoint, Col), where Condition is the first word of the
    condition parsed from the sheet name. All sheets are stacked below each
    other, so a column only holds values in the rows of its own sheets.

    The result is also stored in the module-level name `file_inrow`.

    Returns
    -------
    tuple
        (path of the selected workbook, combined DataFrame)

    Raises
    ------
    FileNotFoundError
        If the dialog is closed without selecting a file.
    ValueError
        If a sheet name does not match "<condition> <time>min_...".
    """
    global file_inrow  # kept so that code reading this module-level name keeps working

    root = tk.Tk()
    root.withdraw()
    try:
        data_path = filedialog.askopenfilename(title="Please select the .xlsx file to plot")
        print(data_path)
        if not data_path:  # dialog cancelled
            raise FileNotFoundError("No .xlsx file selected")
        data_dict = pd.read_excel(data_path, sheet_name=None)
    finally:
        # close the hidden root window even when loading fails
        root.destroy()

    sheet_pattern = re.compile(r"(?P<cond>\D*) (?P<time>.*min)_(?P<col>.*)")
    value_cols = ["Nuclear enrichment", "log2"]

    frames = []
    for name, sheet in data_dict.items():
        if "Summary" in name:
            continue
        match = sheet_pattern.search(name)
        if match is None:
            raise ValueError(f"Sheet name '{name}' does not match '<condition> <time>min_'")
        cond = match.group("cond").split()[0]
        time = match.group("time")

        enrichment = sheet["Nuclear enrichment"]
        frame = pd.DataFrame({"Nuclear enrichment": enrichment, "log2": np.log2(enrichment)})
        # label the columns directly instead of parsing them back out of a combined name
        frame.columns = pd.MultiIndex.from_arrays(
            [[cond] * len(value_cols), [time] * len(value_cols), value_cols],
            names=["Condition", "Timepoint", "Col"],
        )
        frames.append(frame)

    if frames:
        file_inrow = pd.concat(frames, axis=0, ignore_index=True)
    else:
        file_inrow = pd.DataFrame()
    return data_path, file_inrow

In [None]:
# Save the new df as an excel file

import pandas as pd
import numpy as np
import re

# Two file dialogs: first the NLS-neon workbook, then the BFP workbook.
NLSneon_path, NLSneon = inrow_log2()
BFP_path, BFP = inrow_log2()

# splitext removes only the final extension; splitting at the first "." would
# cut the path short as soon as any folder or file name contains a dot.
save_path = os.path.splitext(NLSneon_path)[0]+"_log2.xlsx"
print(save_path)

with pd.ExcelWriter(save_path, engine='openpyxl') as writer: # the file is saved automatically when the "with" block ends
    NLSneon.to_excel(writer, sheet_name="NLSneon_allcells", index=True)
    BFP.to_excel(writer, sheet_name="BFP_allcells", index=True)
# bare expression: shown as the output of this notebook cell
NLSneon

In [None]:
""" statistics on the 'inrow file' """
from scipy import stats
# Two file dialogs: first the NLS-neon workbook, then the BFP workbook.
NLSneon_path, NLSneon = inrow_log2()
BFP_path, BFP = inrow_log2()

# The typed text must equal the Timepoint level of the columns exactly.
timepoint = input("Type the timpoint in xx min")
# Untransformed nuclear enrichment of the "nt" and "DTT" cells at that timepoint;
# dropna removes the rows that belong to other sheets.
N_group1 = NLSneon.loc[:, ("nt", timepoint, "Nuclear enrichment")].dropna()
N_group2 = NLSneon.loc[:, ("DTT", timepoint, "Nuclear enrichment")].dropna()

B_group1 = BFP.loc[:, ("nt", timepoint, "Nuclear enrichment")].dropna()
B_group2 = BFP.loc[:, ("DTT", timepoint, "Nuclear enrichment")].dropna()

# Two-sided Mann-Whitney U test, nt vs. DTT, separately for each reporter.
# `u_stat` is overwritten by the second call; only the p-values are reported.
u_stat, N_p_value = stats.mannwhitneyu(N_group1, N_group2, alternative="two-sided")
u_stat, B_p_value = stats.mannwhitneyu(B_group1, B_group2, alternative="two-sided")

print("NLS-neon:", N_p_value)
print("BFP:", B_p_value)

### the actual plotting

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Two file dialogs: first the NLS-neon workbook, then the BFP workbook.
NLSneon_path, NLSneon = inrow_log2()
BFP_path, BFP = inrow_log2()

save_path = os.path.dirname(NLSneon_path) # folder that contains the selected NLS-neon workbook

# fill colour of the violins per condition
color_map = {
    'nt': '#F5F5F5',
    'DTT': '#707070',
}
order = ["nt", "DTT"]

# Long format: the Condition and Timepoint column levels become regular columns,
# leaving one "Nuclear enrichment" and one "log2" column; rows without values are dropped.
# NOTE(review): the sort key converts BOTH sort columns to categoricals with the
# categories in `order`; Timepoint values are presumably not among them, so the
# Timepoint column likely does not influence the order - confirm.
NLSneon = NLSneon.stack(["Condition", "Timepoint"], future_stack=True).droplevel(0).reset_index().dropna().sort_values(
    by=['Condition', 'Timepoint'], key=lambda x: pd.Categorical(x, categories=order))

BFP = BFP.stack(["Condition", "Timepoint"], future_stack=True).droplevel(0).reset_index().dropna().sort_values(
    by=['Condition', 'Timepoint'], key=lambda x: pd.Categorical(x, categories=order))

fig, axes = plt.subplots(2, 1, figsize=(6, 9))  # 2 rows, 1 column

# First plot: NLS-neon, one violin per timepoint and condition
sns.violinplot(data=NLSneon,
               x='Timepoint', y='log2',
               ax=axes[0],
               hue="Condition", #split=True,
               palette=color_map, dodge=True,
              inner="quart"
              )

# Second plot: BFP, same layout
sns.violinplot(data=BFP,
               x='Timepoint', y='log2',
               ax=axes[1],
               hue="Condition", #split=True,
               palette=color_map, dodge=True,
              inner="quart"
              )

# Customize the first subplot (NLS-neon violins)
axes[0].set_title("Fold enrichment: NLS-neon", fontsize=16)
axes[0].set_ylabel("log2 fold change", fontsize=12)
axes[0].set_yticks(np.arange(-0.5, 3, 0.5)) # ticks from -0.5 to 2.5 with a step size of 0.5
axes[0].set_xticklabels(NLSneon['Timepoint'].unique(), rotation=45)
axes[0].legend(frameon=False)

# Customize the second subplot (BFP violins)
axes[1].set_title("Fold enrichment: BFP", fontsize=16)
axes[1].set_ylabel("log2 fold change", fontsize=12)
axes[1].set_yticks(np.arange(-0.5, 3, 0.5)) # same ticks as the first subplot
axes[1].set_xticklabels(BFP['Timepoint'].unique(), rotation=45)
axes[1].legend(frameon=False, loc="upper right")

plt.tight_layout(pad=3.0)

# save the plots

saveplots = input("Do you want to save the plots? -yes -no")

# only the exact answer "yes" saves the figure, as .svg and as .png next to the workbook
if saveplots == "yes":
    plt.savefig(os.path.join(save_path, f"Import fold change.svg"), 
                format='svg', dpi=300, bbox_inches='tight')  # crop whitespace around the figure
    plt.savefig(os.path.join(save_path, f"Import fold change.png"), 
                dpi=300, bbox_inches='tight')  # 300 dpi, crop whitespace around the figure

# Show the plot
plt.show()