In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import os
import sys
import glob
from pptx import Presentation 
from pptx.util import Inches 
import seaborn as sns 
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.neighbors import LocalOutlierFactor
import scipy
from plotly.tools import FigureFactory as FF
from plotly.offline import iplot
import cufflinks as cf
from plotly import __version__ 
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
cf.go_offline()
import plotly.express as px

In [None]:
##FUNCTIONS 

In [None]:
def summarize_dataframe(df, group_col):
    """
    Summarize every numeric column of ``df`` within each group of ``group_col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. ``group_col`` may be a regular column, an index level,
        or both at the same time.
    group_col : str
        Name of the column (or index level) to group by.

    Returns
    -------
    pandas.DataFrame
        One row per group, indexed by ``group_col``. Columns are named
        ``"<column>_<stat>"`` where stat is one of count, mean, std, min,
        25%, median, 75%, max. Missing values are ignored by every statistic.

    Raises
    ------
    ValueError
        If ``df`` has no numeric column besides ``group_col``.
    """
    # set_index(..., drop=False) leaves the key as both a column and an index
    # level, which groupby rejects as ambiguous; keep only the column.
    if group_col in df.columns and group_col in df.index.names:
        df = df.reset_index(level=group_col, drop=True)

    # 'mean'/'std'/quantiles are undefined for text columns and would make
    # the whole aggregation raise, so only numeric columns are summarized.
    value_cols = [
        col for col in df.columns
        if col != group_col and pd.api.types.is_numeric_dtype(df[col])
    ]
    if not value_cols:
        raise ValueError(
            f"No numeric columns to summarize besides {group_col!r}"
        )

    # (output name, aggregation) pairs applied to every numeric column.
    # Series.quantile skips NaN like the other statistics do; np.quantile
    # would return NaN for any group that contains a missing value.
    stats = [
        ('count', 'count'),
        ('mean', 'mean'),
        ('std', 'std'),
        ('min', 'min'),
        ('25%', lambda x: x.quantile(0.25)),
        ('median', 'median'),
        ('75%', lambda x: x.quantile(0.75)),
        ('max', 'max'),
    ]

    summary_df = df.groupby(group_col).agg({col: stats for col in value_cols})

    # Flatten the (column, stat) MultiIndex into single-level names
    summary_df.columns = [f'{col}_{stat}' for col, stat in summary_df.columns]
    summary_df.index.name = group_col

    return summary_df

In [None]:
def dist_check_if_pre(frame, out_dir=None):
    """
    Save and display three distribution diagnostics for every column of ``frame``.

    For each column a histogram, a normal Q-Q plot and a histogram with a KDE
    overlay are written to ``out_dir`` as ``hist_pre<col>.pdf``,
    ``Q-Q_pre<col>.pdf`` and ``dist_pre<col>.pdf``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Columns to inspect. Values are plotted as given (no transformation);
        missing values are dropped before plotting.
    out_dir : str, optional
        Destination folder. Defaults to the notebook-level ``curr_out_path``
        so existing one-argument calls keep working.
    """
    # Import the submodule explicitly: a bare `import scipy` does not load
    # scipy.stats on every SciPy version.
    from scipy import stats

    if out_dir is None:
        out_dir = curr_out_path

    print("frame_ready")
    for col in frame:
        values = frame[col].dropna()
        if values.empty:
            print(f"skipping {col}: no non-missing values")
            continue

        # Every figure is saved BEFORE plt.show(): showing releases the
        # figure, so saving afterwards writes an empty page.
        fig, ax = plt.subplots()
        ax.hist(values)
        ax.set_title(f"Histogram of {col}")
        ax.set_xlabel(f"{col}")
        ax.set_ylabel("Frequency")
        fig.savefig(os.path.join(out_dir, f"hist_pre{col}.pdf"), dpi=600)
        plt.show()
        plt.close(fig)
        print("hist_done!")

        fig, ax = plt.subplots()
        stats.probplot(values, dist="norm", plot=ax)
        ax.set_title(f"Q-Q-{col}")
        fig.savefig(os.path.join(out_dir, f"Q-Q_pre{col}.pdf"), dpi=600)
        plt.show()
        plt.close(fig)
        print("qq_done:)")

        # histplot(kde=True, stat="density") replaces the deprecated distplot
        fig, ax = plt.subplots()
        sns.histplot(values, kde=True, stat="density", color='red', bins=30, ax=ax)
        ax.set_title(f"dist-{col}")
        fig.savefig(os.path.join(out_dir, f"dist_pre{col}.pdf"), dpi=600)
        plt.show()
        plt.close(fig)
        print("on it:)")
        print("done:)")
        

In [None]:
def dist_check_if_max(frame, out_dir=None):
    """
    Save and display three distribution diagnostics for every column of ``frame``.

    For each column a histogram, a normal Q-Q plot and a histogram with a KDE
    overlay are written to ``out_dir`` as ``hist_max<col>.pdf``,
    ``Q-Q_max<col>.pdf`` and ``dist_max<col>.pdf``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Columns to inspect. Values are plotted as given (no transformation);
        missing values are dropped before plotting.
    out_dir : str, optional
        Destination folder. Defaults to the notebook-level ``curr_out_path``
        so existing one-argument calls keep working.
    """
    # Import the submodule explicitly: a bare `import scipy` does not load
    # scipy.stats on every SciPy version.
    from scipy import stats

    if out_dir is None:
        out_dir = curr_out_path

    print("frame_ready")
    for col in frame:
        values = frame[col].dropna()
        if values.empty:
            print(f"skipping {col}: no non-missing values")
            continue

        # Every figure is saved BEFORE plt.show(): showing releases the
        # figure, so saving afterwards writes an empty page.
        fig, ax = plt.subplots()
        ax.hist(values)
        ax.set_title(f"Histogram of {col}")
        ax.set_xlabel(f"{col}")
        ax.set_ylabel("Frequency")
        fig.savefig(os.path.join(out_dir, f"hist_max{col}.pdf"), dpi=600)
        plt.show()
        plt.close(fig)
        print("hist_done!")

        fig, ax = plt.subplots()
        stats.probplot(values, dist="norm", plot=ax)
        ax.set_title(f"Q-Q-{col}")
        fig.savefig(os.path.join(out_dir, f"Q-Q_max{col}.pdf"), dpi=600)
        plt.show()
        plt.close(fig)
        print("qq_done:)")

        # histplot(kde=True, stat="density") replaces the deprecated distplot
        fig, ax = plt.subplots()
        sns.histplot(values, kde=True, stat="density", color='red', bins=30, ax=ax)
        ax.set_title(f"dist-{col}")
        fig.savefig(os.path.join(out_dir, f"dist_max{col}.pdf"), dpi=600)
        plt.show()
        plt.close(fig)
        print("on it:)")
        print("done:)")

In [None]:
##PAS ASSAY ANALYSIS
# For every CSV in PAS_DATA: label the rows, drop sparse rows, keep rows whose
# nuclei count is inside the accepted range and export the result.
MIN_CELL_COUNT = 100    # rows must have strictly more nuclei than this
MAX_CELL_COUNT = 1200   # ... and strictly fewer than this
MIN_NON_NA = 30         # rows need at least this many non-missing values

path = 'Results_PAS'
if not os.path.exists(path):
    os.makedirs(path)
    print('Output folder created')
# sorted -> files are processed in a reproducible order
input_path = sorted(glob.glob(os.path.join('PAS_DATA', '*.csv')))
print(input_path)
verbose = False
for path_i in input_path:
    CLEAN_df = pd.read_csv(path_i, skip_blank_lines=True)
    # File stem via os.path works with both '/' and '\\' separators
    name = os.path.splitext(os.path.basename(path_i))[0]
    print('Current file name {}'.format(name))
    curr_out_path = os.path.join(path, name)
    if not os.path.exists(curr_out_path):
        os.makedirs(curr_out_path)
        print('Output folder created')
    ##CLEAN AND INDEX
    CLEAN_df = CLEAN_df.drop(columns=['Plate ID', 'Row', 'Column'])
    cell_type = CLEAN_df["Cell Type"].astype(str)
    compound = CLEAN_df["Compound"].astype(str)
    CLEAN_df["group"] = cell_type + compound
    # (a leading unary '+' on a string Series raises TypeError, so none here)
    CLEAN_df["group_with_id"] = cell_type + compound + CLEAN_df["CELL ID"].astype(str)
    # Append both labels to the index while keeping them as columns
    CLEAN_df = CLEAN_df.set_index(["group", "group_with_id"], append=True, drop=False)
    #drop NA -- dropna returns a new frame, so the result must be kept
    n_before = len(CLEAN_df)
    CLEAN_df = CLEAN_df.dropna(axis=0, thresh=MIN_NON_NA)
    print('Rows dropped for missing values: {}'.format(n_before - len(CLEAN_df)))
    #remove unwanted cells in analysis
    # clear low and high cell count
    cell_count = CLEAN_df["Nuclei Nuclei Count wv1"].to_numpy()
    CLEAN_df = CLEAN_df.loc[(cell_count > MIN_CELL_COUNT) & (cell_count < MAX_CELL_COUNT)]
    ##EXPORT FOR STATISTICS AND VIS
    df_clean = CLEAN_df.copy()
    df_clean.to_csv(os.path.join(curr_out_path, '{}_forstat.csv'.format(name)))
    print('done')

In [None]:
## IF per MARKER
# For every CSV in 'folder': label the rows, drop sparse rows, keep rows whose
# object count is inside the accepted range, then summarize, plot and export
# the channel-3 intensity (raw and max-scaled).
MIN_CELL_COUNT = 100    # rows must have strictly more objects than this
MAX_CELL_COUNT = 1200   # ... and strictly fewer than this
MIN_NON_NA = 30         # rows need at least this many non-missing values
INTENSITY_COL = 'all_cells - chanel_3_Intensity Mean - Mean per Well'

path = 'Results_IF_markerx'
if not os.path.exists(path):
    os.makedirs(path)
    print('Output folder created')
# sorted -> files are processed in a reproducible order
input_path = sorted(glob.glob(os.path.join('folder', '*.csv')))
print(input_path)
verbose = False
for path_i in input_path:
    CLEAN_df = pd.read_csv(path_i, skip_blank_lines=True)
    # File stem via os.path works with both '/' and '\\' separators
    name = os.path.splitext(os.path.basename(path_i))[0]
    print('Current file name {}'.format(name))
    curr_out_path = os.path.join(path, name)
    if not os.path.exists(curr_out_path):
        os.makedirs(curr_out_path)
        print('Output folder created')
    # clean and index
    CLEAN_df = CLEAN_df.drop(columns=['Number of Analyzed Fields', 'Time [s]', 'Row', 'Column',
                           'Concentration', 'Cell Count', 'Plane', 'Timepoint'])
    cell_type = CLEAN_df["Cell Type"].astype(str)
    compound = CLEAN_df["Compound"].astype(str)
    CLEAN_df["group"] = cell_type + compound
    # (a leading unary '+' on a string Series raises TypeError, so none here)
    CLEAN_df["group_with_id"] = cell_type + compound + CLEAN_df["CELL ID"].astype(str)
    # Append both labels to the index while keeping them as columns
    CLEAN_df = CLEAN_df.set_index(["group", "group_with_id"], append=True, drop=False)
    #drop NA -- dropna returns a new frame, so the result must be kept
    n_before = len(CLEAN_df)
    CLEAN_df = CLEAN_df.dropna(axis=0, thresh=MIN_NON_NA)
    print('Rows dropped for missing values: {}'.format(n_before - len(CLEAN_df)))
    # clear low and high cell count
    cell_count = CLEAN_df["all_cells - Number of Objects"].to_numpy()
    CLEAN_df = CLEAN_df.loc[(cell_count > MIN_CELL_COUNT) & (cell_count < MAX_CELL_COUNT)]
    print(f'Total number of cells {len(CLEAN_df)}')
    ## examine data
    df_clean = CLEAN_df.copy()
    # The summary needs 'group' as a plain column only (it is also an index
    # level here, which groupby treats as ambiguous) and numeric value columns.
    numeric_cols = df_clean.select_dtypes(include="number").columns.tolist()
    summary_input = df_clean.reset_index(drop=True)[["group"] + numeric_cols]
    sum_data_marker = summarize_dataframe(summary_input, "group")
    # Written next to the other per-file outputs so one input file does not
    # overwrite the summary of the previous one.
    file1 = os.path.join(curr_out_path, 'marker-summary.csv')
    sum_data_marker.to_csv(file1)
    df_vis = df_clean[[INTENSITY_COL]]
    dist_check_if_pre(df_vis)
    df_vis.plot(kind = 'bar')
    plt.show()
    print('done')
    # Scale by the largest absolute value; computed as a new frame rather
    # than assigned into a slice of df_clean.
    df_max_scaled = df_vis / df_vis.abs().max()
    df_vis2 = df_max_scaled[[INTENSITY_COL]]
    dist_check_if_max(df_vis2)
    df_vis2.plot(kind = 'bar')
    plt.show()
    print('done')
    df_max_scaled.to_csv(os.path.join(curr_out_path, '{}_max_forstat.csv'.format(name)))
    print('max_ready')

In [None]:
##mitophagy
# Read every CSV in 'mitophagy data' and stack them into a single frame.
# sorted -> the row order (and the fresh 0..n-1 index) is reproducible.
all_files = sorted(glob.glob('mitophagy data/*.csv'))
if not all_files:
    # pd.concat([]) would only report "No objects to concatenate"
    raise FileNotFoundError("No CSV files found in 'mitophagy data/'")

li = [pd.read_csv(filename) for filename in all_files]

data = pd.concat(li, axis=0, ignore_index=True)

In [47]:
#index
# Build the two label columns, then append both to the existing index in one
# step while keeping them available as ordinary columns.
data["group"] = data["Cell Type"].astype(str) + data["Compound"].astype(str)
data["group_with_id"] = data["group"] + data["CELL ID"].astype(str)
data.set_index(["group", "group_with_id"], append=True, drop=False, inplace=True)

In [None]:
##clear cell count
# Keep rows with an object count strictly between 100 and 1200 and show the
# count distribution after each cut.
# The mask is computed from `data` itself before each filter; relying on a
# `cell_count` left over from another cell gives a mask of the wrong length.
cell_count =  data["all_cells - Number of Objects"].values
data = data.loc[cell_count<1200]
sns.histplot(data["all_cells - Number of Objects"].values)
plt.show()
cell_count =  data["all_cells - Number of Objects"].values
data = data.loc[cell_count>100]
sns.histplot(data["all_cells - Number of Objects"].values)
plt.show()

In [None]:
# Drop the metadata columns from the filtered mitophagy frame. The labels
# 'group' / 'group_with_id' are dropped as columns only; they remain index levels.
# NOTE(review): this used to read `N_df`, which is only created further down
# the notebook; `data` is the frame that carries these columns -- confirm.
data_final = data.drop(columns=['CELL ID','Cell Type', 'Column',
       'Compound', 'Height [µm]','Number of Analyzed Fields', 'Plane', 'Row', 'Time [s]', 'Timepoint',
       'all_cells - Number of Objects', 'group', 'group_with_id'
                                                         ])

In [53]:
# Move every index level back into regular columns so the labels can be
# selected by name. Rebinds data_final, so re-running this cell resets the
# index again.
data_final = data_final.reset_index()

In [None]:
#create final frame
# Keep the group label and the two per-well channel intensities, give the
# channels short names and add their sum.
d = data_final.loc[:, [
    "group",
    "Image Region chanel_2 - chanel_2_in_chanel_2_intensity Mean - Mean per Well",
    "Image Region chanel_3 - chanel_3_in_chanel_3_intensity Mean - Mean per Well",
]]
df_clean_final = d.rename(columns={
    "Image Region chanel_2 - chanel_2_in_chanel_2_intensity Mean - Mean per Well": "m_cherry",
    "Image Region chanel_3 - chanel_3_in_chanel_3_intensity Mean - Mean per Well": "GFP",
}).assign(total=lambda f: f["m_cherry"] + f["GFP"])

In [58]:
# Share of the m_cherry signal in the combined (m_cherry + GFP) signal
df_clean_final["mitophagy"] = df_clean_final["m_cherry"].div(df_clean_final["total"])

In [64]:
#index and normalize
# Express every numeric measurement relative to the mean of the 'HCFCCP' rows.
if "group" not in df_clean_final.index.names:
    # Guard keeps the cell re-runnable: without it every run appends another
    # 'group' level to the index.
    df_clean_final.set_index(["group"], inplace = True,
                            append = True, drop = False)
hc_frame = df_clean_final[(df_clean_final['group'] == 'HCFCCP')]
if hc_frame.empty:
    # An empty reference group would silently turn every value into NaN
    raise ValueError("No rows with group 'HCFCCP'; cannot normalize")
# Only numeric columns are averaged and divided; the text column 'group'
# cannot be averaged and is carried over unchanged.
value_cols = df_clean_final.select_dtypes(include="number").columns
hc_mean = hc_frame[value_cols].mean()
N_df = df_clean_final.copy()
N_df[value_cols] = df_clean_final[value_cols] / hc_mean

In [66]:
#export for data vis and stats
# Write the normalized table as CSV into the current working directory.
file_mitophagy = 'mito_proccesed.csv'
N_df.to_csv(path_or_buf=file_mitophagy)

In [None]:
#####################################################