# <font color='purple'>FUNOVA Preprocessing QC statistics </font> 
### January 2025

In [None]:
import os
# Root of the NOVA code checkout. The kernel's working directory is switched
# here (see os.chdir below), presumably so the `tools.*` imports and the
# relative notebook path used by the export cell resolve — TODO confirm.
NOVA_HOME = "/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA"
# Root of the FUNOVA data tree; logs, images and outputs are addressed relative to it.
NOVA_DATA_HOME = '/home/labs/hornsteinlab/Collaboration/FUNOVA'

# Directory of preprocessing logs handed to log_files_qc.
LOGS_PATH = os.path.join(NOVA_DATA_HOME, "outputs/preprocessing/logs/")
# Directory handed to the validation/diff helpers as their plot path.
PLOT_PATH = os.path.join(NOVA_DATA_HOME, "outputs/logs/")

# Must run before the project imports that follow in this cell.
os.chdir(NOVA_HOME)

import pandas as pd
import numpy as np
import contextlib
import io
from IPython.display import display, Javascript
import seaborn as sns
from tools.preprocessing_tools.qc_reports.qc_utils import log_files_qc, run_validate_folder_structure, display_diff, sample_and_calc_variance, \
                                                show_site_survival_dapi_brenner, show_site_survival_dapi_cellpose, \
                                                show_site_survival_dapi_tiling, show_site_survival_target_brenner, \
                                                calc_total_sums, plot_filtering_heatmap, show_total_sum_tables, \
                                                plot_cell_count, plot_catplot, plot_hm_combine_batches, plot_hm, \
                                                run_calc_hist_new, plot_marker_data, find_bad_wells,\
                                                show_site_survival_by_brenner_on_dapi_tiles, show_site_survival_target_brenner_tiles
                                                
from tools.preprocessing_tools.qc_reports.qc_config import (
    funova_markers as markers,
    funova_cell_lines as cell_lines,
    funova_cell_lines_to_cond as cell_lines_to_cond,
    funova_cell_lines_for_disp as cell_lines_for_disp,
    funova_reps as reps,
    funova_line_colors as line_colors,
    funova_lines_order as lines_order,
    funova_custom_palette as custom_palette,
    funova_expected_dapi_raw as expected_dapi_raw,
    funova_panels as panels,
    funova_marker_info as marker_info
)

%load_ext autoreload
%autoreload 2

In [None]:
# choose batches
batches = ['Batch3', 'Batch4']#, 'batch2', 'batch3']
batches

I have created a folder called 'Batch3' in the logs dir and put all files inside

In [None]:
# Forwarded as the `validate_antibody` keyword to both
# run_validate_folder_structure calls (raw and processed) in this notebook.
validate_antibody = False

In [None]:
# Load the preprocessing logs of the selected batches into `df`.
df = log_files_qc(
    LOGS_PATH,
    batches,
    only_wt_cond=False,
)

In [None]:
# Keep only the text before the first '-' in both identifier columns.
for column in ('filename', 'site_num'):
    df[column] = df[column].str.split('-').str[0]

In [None]:
# Split the log rows into the DAPI channel and every other (target) marker.
is_dapi = df['marker'] == 'DAPI'
df_dapi = df[is_dapi]
df_target = df[~is_dapi]

## Actual Files Validation

### Raw Files Validation

1. How many site **tiff** files do we have in each folder?
2. Are all existing files valid? (tif, at least 2049kB, not corrupted)

In [None]:
# Raw image tree: <NOVA_DATA_HOME>/input/images/raw
root_directory_raw = os.path.join(NOVA_DATA_HOME, 'input', 'images', 'raw')

In [None]:
# root_path = "/home/labs/hornsteinlab/Collaboration/FUNOVA/input/images/raw/"
# marker_info_df = create_marker_info_df(root_path)

In [None]:
# Batch names as used under the raw directory: any "_16bit_no_downsample"
# part of the name is removed.
batches_raw = [name.replace("_16bit_no_downsample", "") for name in batches]

# Validate the raw folder structure (second positional argument is False here,
# True in the processed-files call). A copy of `markers` is passed, so the
# shared list itself is not handed to the helper.
raws = run_validate_folder_structure(
    root_directory_raw,
    False,
    panels,
    markers.copy(),
    PLOT_PATH,
    marker_info,
    cell_lines_to_cond,
    reps,
    cell_lines_for_disp,
    expected_dapi_raw,
    batches=batches_raw,
    fig_width=8,
    fig_height=40,
    expected_count=100,
    validate_antibody=validate_antibody,
)

### Processed Files Validation

1. How many site **npy** files do we have in each folder? -> How many sites survived the pre-processing?
2. Are all existing files valid? (at least 100kB, npy not corrupted)

In [None]:
# Processed image tree: <NOVA_DATA_HOME>/input/images/processed
root_directory_proc = os.path.join(NOVA_DATA_HOME, 'input', 'images', 'processed')

# Validate the processed folder structure (second positional argument is True).
# NOTE(review): `markers` is passed directly here, whereas the raw-files call
# passes `markers.copy()` — confirm the helper does not mutate the shared list.
procs = run_validate_folder_structure(
    root_directory_proc,
    True,
    panels,
    markers,
    PLOT_PATH,
    marker_info,
    cell_lines_to_cond,
    reps,
    cell_lines_for_disp,
    expected_dapi_raw,
    batches=batches,
    fig_width=8,
    fig_height=40,
    expected_count=100,
    validate_antibody=validate_antibody,
)

### Difference between Raw and Processed

In [None]:
# Compare the raw and processed validation results per batch.
display_diff(
    batches,
    raws,
    procs,
    PLOT_PATH,
    fig_width=8,
    fig_height=40,
)

### Variance in each batch (of processed files)

In [None]:
for batch in batches:
    # Anything the sampler prints to stdout is captured and discarded, so only
    # the one summary line per batch shows up in the cell output.
    discarded_output = io.StringIO()
    with contextlib.redirect_stdout(discarded_output):
        var = sample_and_calc_variance(
            root_directory_proc,
            batch,
            sample_size_per_markers=200,
            cond_count=1,
            rep_count=len(reps),
            num_markers=len(markers),
        )
    print(f'{batch} var: ',var)

## Preprocessing Filtering qc
By order of filtering

### 1. % site survival after Brenner on DAPI channel
Percentage out of the total sites

In [None]:
# Filtering step 1; the result is the input of the Cellpose step.
dapi_filter_by_brenner = show_site_survival_dapi_brenner(
    df_dapi,
    batches,
    line_colors,
    panels,
    figsize=(6, 18),
    reps=reps,
    vmax=100,
)

### 2. % Site survival after Cellpose
Percentage out of the sites that passed the previous filter. In parentheses are the absolute values.

**A site will be filtered out if Cellpose found 0 cells in it.**

In [None]:
# Filtering step 2; takes the step-1 result and feeds the tiling step.
dapi_filter_by_cellpose = show_site_survival_dapi_cellpose(
    df_dapi,
    batches,
    dapi_filter_by_brenner,
    line_colors,
    panels,
    figsize=(6, 18),
    reps=reps,
)

### 3. % Site survival by tiling
Percentage out of the sites that passed the previous filter. In parentheses are the absolute values.

**A site will be filtered out if, after tiling, no tile contains at least 85% of a cell that Cellpose detected.**

In [None]:
# Filtering step 3; takes the step-2 (Cellpose) result.
dapi_filter_by_tiling = show_site_survival_dapi_tiling(
    df_dapi,
    batches,
    dapi_filter_by_cellpose,
    line_colors,
    panels,
    figsize=(6, 18),
    reps=reps,
)

### 4. % Site survival after Brenner on DAPI's tiles
Percentage out of the sites that passed the previous filter. In parentheses are the absolute values (if different from the percentages).

In [None]:
# Filtering step 4; takes the step-3 (tiling) result.
dapi_filter_by_brenner_tiles = show_site_survival_by_brenner_on_dapi_tiles(
    df_dapi,
    batches,
    dapi_filter_by_tiling,
    line_colors,
    panels,
    figsize=(6, 18),
    reps=reps,
)

### 5. % Site survival after Brenner on target channel
Percentage out of the sites that passed the previous filter. In parentheses are the absolute values (if different from the percentages).

In [None]:
# Filtering step 5.
# NOTE(review): this receives the step-3 result (dapi_filter_by_tiling), while
# the step-4 result (dapi_filter_by_brenner_tiles) is the immediately preceding
# filter — confirm which baseline is intended.
show_site_survival_target_brenner(
    df_dapi,
    df_target,
    dapi_filter_by_tiling,
    figsize=(6, 24),
    markers=markers,
)

In [None]:
# DAPI rows whose tile count differs before vs. after the tile-level Brenner
# filter (i.e. the difference is non-zero), showing just the two counts.
tile_count_columns = ['n_valid_tiles', 'n_valid_tiles_after_tiles_brenner']
tile_count_change = df_dapi['n_valid_tiles'] - df_dapi['n_valid_tiles_after_tiles_brenner']
changed_positions = np.where(tile_count_change)
df_dapi.iloc[changed_positions][tile_count_columns]

Find non valid wells

In [None]:
# Raw Brenner metrics CSV, addressed relative to NOVA_DATA_HOME so that it
# follows the data root configured in the first cell instead of repeating
# the absolute path.
brenner_metrics_csv = os.path.join(
    NOVA_DATA_HOME, "outputs/preprocessing/brenner/raw_metrics280125_exp3.csv"
)
dfb = pd.read_csv(brenner_metrics_csv)
# Extract the panel letter (e.g. "D" from ".../panelD/...") from the Path column.
dfb['Panel'] = dfb['Path'].str.extract(r'/panel([A-Z])/', expand=True)
# threshold / percentage_filter are forwarded unchanged to find_bad_wells.
find_bad_wells(dfb, threshold=10000, percentage_filter=60)

### 6. % Site survival after Brenner on target channel tiles
Percentage out of the tiles that passed the previous filter. In parentheses are the absolute values (if different from the percentages).

In [None]:
# Filtering step 6; uses the step-4 (Brenner on DAPI tiles) result as its baseline.
show_site_survival_target_brenner_tiles(
    df_dapi,
    df_target,
    dapi_filter_by_brenner_tiles,
    figsize=(6, 24),
    markers=markers,
)

## Statistics About the Processed Files

In [None]:
# Display names (not used by the call below).
names = [
    'Total number of tiles',
    'Total number of whole cells',
]
# Log columns handed to calc_total_sums.
stats = [
    'n_valid_tiles_after_tiles_brenner',
    'site_whole_cells_counts_sum',
    'site_cell_count',
    'site_cell_count_sum',
]
total_sum = calc_total_sums(df_target, df_dapi, stats, markers)

In [None]:
# Expose the post-tile-Brenner tile count under the shorter name
# `n_valid_tiles`, which the cells below read. Rebinding the name (rather than
# renaming with inplace=True) avoids mutating the frame returned by
# calc_total_sums; re-running the cell is a no-op once the column is renamed.
total_sum = total_sum.rename(columns={'n_valid_tiles_after_tiles_brenner': 'n_valid_tiles'})

In [None]:
# Marker-level plot of the totals, split by cell line.
plot_marker_data(total_sum, split_by_cell_line=True)

In [None]:
# Same plot as the previous cell, without the per-cell-line split.
plot_marker_data(total_sum, split_by_cell_line=False)

### Total tiles

In [None]:
# Sum of the tile counts over every row of total_sum (all markers).
total_sum['n_valid_tiles'].sum()

### Total whole nuclei in tiles

In [None]:
# Whole-cell counts summed over the DAPI rows only.
dapi_rows = total_sum['marker'] == 'DAPI'
total_sum.loc[dapi_rows, 'site_whole_cells_counts_sum'].sum()

### Total nuclei in sites

In [None]:
# Site-level cell counts summed over the DAPI rows only.
dapi_rows = total_sum['marker'] == 'DAPI'
total_sum.loc[dapi_rows, 'site_cell_count'].sum()

In [None]:
# Render the summary tables for the totals computed above.
show_total_sum_tables(total_sum)

### Show Total Tile Counts
Total number of tiles for each batch, cell line, replicate and marker.

In [None]:
# The tile-count column is renamed to 'index' before being handed to the
# heatmap helper.
to_heatmap = total_sum.rename(columns={'n_valid_tiles': 'index'})
plot_filtering_heatmap(
    to_heatmap,
    extra_index='marker',
    vmin=None,
    vmax=None,
    xlabel='Total number of tiles',
    show_sum=True,
    figsize=(6, 24),
)

### Show Total Whole Cell Counts
Total number of whole cells for each batch, cell line, replicate and marker.

In [None]:
# The whole-cell-count column is renamed to 'index' before being handed to the
# heatmap helper.
to_heatmap = total_sum.rename(columns={'site_whole_cells_counts_sum': 'index'})
plot_filtering_heatmap(
    to_heatmap,
    extra_index='marker',
    vmin=None,
    vmax=None,
    xlabel='Total number of whole cells',
    show_sum=True,
    figsize=(6, 24),
)

### Show **Cell Count** Statistics per Batch

In [None]:
# Drop DAPI rows whose post-tile-Brenner tile count is exactly 0.
has_valid_tiles = df_dapi['n_valid_tiles_after_tiles_brenner'] != 0
df_no_empty_sites = df_dapi[has_valid_tiles]

# Three views of the same rows, differing only in the plotted column and title.
plot_cell_count(
    df_no_empty_sites,
    lines_order,
    custom_palette,
    y='site_cell_count_sum',
    title='Cell Count Average per Site (from tiles)',
    figsize=(16, 6),
)

plot_cell_count(
    df_no_empty_sites,
    lines_order,
    custom_palette,
    y='site_whole_cells_counts_sum',
    title='Whole Cell Count Average per Site',
    figsize=(16, 6),
)

plot_cell_count(
    df_no_empty_sites,
    lines_order,
    custom_palette,
    y='site_cell_count',
    title='Cellpose Cell Count Average per Site',
    figsize=(16, 6),
)


### Show **Tiles** per Site Statistics


In [None]:
# Mean post-tile-Brenner tile count per cell_line_cond group (DAPI rows).
tiles_by_line_cond = df_dapi.groupby(['cell_line_cond'])['n_valid_tiles_after_tiles_brenner']
tiles_by_line_cond.mean()

In [None]:
# Valid-tile counts per cell line, coloured by replicate.
plot_catplot(
    df_dapi,
    sns.color_palette('colorblind'),
    reps=reps,
    x='cell_line',
    y_title='Valid Tiles Count',
    x_title='Cell Line',
    y='n_valid_tiles_after_tiles_brenner',
    hue='rep',
    height=4,
    aspect=2,
)


### Show Mean of cell count in valid tiles

In [None]:
# plot_hm(df_dapi, split_by='rep', rows='cell_line', columns='panel', vmax=3)

# Assessing Staining Reproducibility and Outliers

In [None]:
# for batch in batches:
#     print(batch)
#     run_calc_hist_new(batch,cell_lines_for_disp, markers, root_directory_raw, root_directory_proc,
#                            hist_sample=10,sample_size_per_markers=10, ncols=4, nrows=1, figsize=(6,2))
#     print("="*30)
    

In [None]:
# Save the notebook, then export it to HTML with nbconvert.
# The notebook path is relative to the current working directory (set to
# NOVA_HOME in the first cell); the HTML target is given by --output, i.e.
# under {NOVA_HOME}/manuscript/preprocessing_qc_reports/.
# NOTE(review): the save is requested through front-end Javascript, so it may
# not have finished (or may be unavailable outside the classic Notebook UI)
# by the time nbconvert runs — confirm the exported HTML is up to date.
display(Javascript('IPython.notebook.save_checkpoint();'))
os.system(f'jupyter nbconvert --to html tools/preprocessing_tools/qc_reports/qc_report_funova_exp4.ipynb --output {NOVA_HOME}/manuscript/preprocessing_qc_reports/qc_report_funova_exp4_11.03.25.html')