# <font color='purple'>NIH Ward Lab - Neurons Day8 - Preprocessing QC statistics </font> 
### July 2025 - Nancy Y

Reran by Sagy on Sep 15 (2025) - removing CD41 and FUS lines

In [None]:
import io
import os
import sys
import pandas as pd
import contextlib
from IPython.display import display, Javascript

# Project root and data root (both currently point at the same directory).
NOVA_HOME = '/home/projects/hornsteinlab/Collaboration/NOVA'
NOVA_DATA_HOME = '/home/projects/hornsteinlab/Collaboration/NOVA'

# Export the root through the environment and put it on the import path so
# that the `tools.*` project packages can be imported further down.
os.environ['NOVA_HOME'] = NOVA_HOME
sys.path.insert(1, NOVA_HOME)
print(f"NOVA_HOME: {NOVA_HOME}")


# Raw and processed image directories for the NIH data.
_images_dir = os.path.join(NOVA_DATA_HOME, 'input', 'images')
root_directory_raw = os.path.join(_images_dir, 'raw', 'NIH', 'indi-image-pilot-20241128')
root_directory_proc = os.path.join(_images_dir, 'processed', 'ManuscriptFinalData_80pct', 'NIH')

# Preprocessing output locations: log files (read) and QC figures.
_preprocessing_out = os.path.join(NOVA_HOME, 'outputs', 'preprocessing', 'ManuscriptFinalData_80pct', 'NIH')
LOGS_PATH = os.path.join(_preprocessing_out, 'logs')
PLOT_PATH = os.path.join(_preprocessing_out, 'QC_figures')


from tools.preprocessing_tools.qc_reports.qc_utils import log_files_qc, run_validate_folder_structure, display_diff, sample_and_calc_variance, \
                                                show_site_survival_dapi_brenner, show_site_survival_dapi_cellpose, \
                                                show_site_survival_dapi_tiling, show_site_survival_target_brenner, \
                                                calc_total_sums, plot_filtering_heatmap, show_total_sum_tables, \
                                                plot_cell_count, plot_catplot, plot_hm_of_mean_cell_count_per_tile, \
                                                run_calc_hist_new, show_total_valid_tiles_per_marker_and_batch
                                                
from tools.preprocessing_tools.qc_reports.qc_config import NIH_d8_panels, NIH_d8_markers, NIH_d8_marker_info, NIH_d8_cell_lines, NIH_d8_cell_lines_to_cond,\
                                    NIH_d8_cell_lines_for_disp, NIH_d8_reps, NIH_d8_line_colors, NIH_d8_lines_order, NIH_d8_custom_palette,\
                                    NIH_d8_expected_dapi_raw
%load_ext autoreload
%autoreload 2




In [None]:
# Batches covered by this report: batch1, batch2 and batch3.
batches = ['batch' + str(batch_no) for batch_no in (1, 2, 3)]
batches

In [None]:
# Build the QC table from the preprocessing logs under LOGS_PATH for the
# selected batches.
# NOTE(review): filename_split / site_location are forwarded to log_files_qc
# as-is; their exact semantics live in qc_utils - confirm there.
df = log_files_qc(
    LOGS_PATH,
    only_wt_cond=False,
    batches=batches,
    filename_split='-',
    site_location=0,
)

#df['cell_line_cond'] = df['cell_line_cond'].str.replace(" ", "_")
# Restrict the report to rows whose cell_line is 'WT'.
df = df.loc[df['cell_line'] == 'WT']

# Split into the DAPI rows and the rows of every other marker.
df_dapi = df.loc[df['marker'] == 'DAPI']
df_target = df.loc[df['marker'] != 'DAPI']

## Actual Files Validation

### Raw Files Validation

1. How many site **tiff** files do we have in each folder?
2. Are all existing files valid? (tif, at least 2049kB, not corrupted)

In [None]:



# Validate the raw image folder tree (second positional argument is False for
# the raw tree; the processed-files cell passes True).
# A copy of NIH_d8_markers is handed over, so the helper cannot modify the
# shared list imported from qc_config.
raws = run_validate_folder_structure(
    root_directory_raw,
    False,
    NIH_d8_panels,
    NIH_d8_markers.copy(),
    PLOT_PATH,
    NIH_d8_marker_info,
    NIH_d8_cell_lines_to_cond,
    NIH_d8_reps,
    NIH_d8_cell_lines_for_disp,
    NIH_d8_expected_dapi_raw,
    batches=batches,
    expected_count=25,
    check_antibody=False,
    fig_width=8,
    fig_height=40,
)


In [None]:
## Missing data issue was fixed

# Cell-by-cell comparison of the first two entries of `raws`; every position
# where they disagree becomes one row of `differences`.
# NOTE(review): the three column names below assume the stacked index has
# exactly three levels in (marker, rep, condition) order - confirm against
# what run_validate_folder_structure returns.
mismatch_mask = (raws[0] != raws[1]).stack()
differences = mismatch_mask[mismatch_mask].index.to_frame(index=False)
differences.columns = ["Marker", "Rep", "Condition"]

# Print the mismatching markers grouped by condition, then by replicate,
# in order of first appearance.
for cond_name in differences["Condition"].unique():
    print(f"Condition: {cond_name}")
    rows_for_cond = differences.loc[differences["Condition"] == cond_name]
    for rep_name in rows_for_cond["Rep"].unique():
        rep_markers = rows_for_cond.loc[rows_for_cond["Rep"] == rep_name, "Marker"].tolist()
        print(f"  Rep: {rep_name}")
        print(f"    Markers: {', '.join(rep_markers)}")

### Processed Files Validation

1. How many site **npy** files do we have in each folder? -> How many sites survived the pre-processing?
2. Are all existing files valid? (at least 100kB, npy not corrupted)

In [None]:

# Validate the processed (npy) folder tree; second positional argument is True
# here, versus False for the raw tree.
# NOTE(review): unlike the raw-files call, NIH_d8_markers is passed without
# .copy(); if the helper mutates its markers argument, later cells will see
# the modified list - confirm whether that is intended.
procs = run_validate_folder_structure(
    root_directory_proc,
    True,
    NIH_d8_panels,
    NIH_d8_markers,
    PLOT_PATH,
    NIH_d8_marker_info,
    NIH_d8_cell_lines_to_cond,
    NIH_d8_reps,
    NIH_d8_cell_lines_for_disp,
    NIH_d8_expected_dapi_raw,
    batches=batches,
    expected_count=25,
    check_antibody=False,
    fig_width=8,
    fig_height=40,
)

### Difference between Raw and Processed

In [None]:
# Compare the raw and processed validation results for every selected batch.
display_diff(
    batches,
    raws,
    procs,
    PLOT_PATH,
    fig_width=8,
    fig_height=40,
)

### Variance in each batch (of processed files)

In [None]:
# Only the first entry of `batches` is sampled in this loop.
for batch in batches[:1]:
    # Whatever the helper writes to stdout is captured and thrown away, so
    # only the summary line below reaches the notebook output.
    with contextlib.redirect_stdout(io.StringIO()):
        var = sample_and_calc_variance(
            root_directory_proc,
            batch,
            num_markers=len(NIH_d8_markers),
            rep_count=len(NIH_d8_reps),
            cond_count=2,
            sample_size_per_markers=50,
        )
    print(f'{batch} var: ',var)

## Preprocessing Filtering qc
By order of filtering

### 1. % site survival after Brenner on DAPI channel
Percentage out of the total sites

In [None]:

# Filtering step 1 (Brenner on the DAPI channel). The result is kept because
# the Cellpose step takes it as input.
dapi_filter_by_brenner = show_site_survival_dapi_brenner(
    df_dapi,
    batches,
    NIH_d8_line_colors,
    NIH_d8_panels,
    NIH_d8_reps,
    vmax=25,
    figsize=(6, 18),
)


### 2. % Site survival after Cellpose
Percentage out of the sites that passed the previous filter. In parentheses are absolute values.

**A site will be filtered out if Cellpose found 0 cells in it.**

In [None]:
# Filtering step 2 (Cellpose). Takes the Brenner-step result as input; its own
# result feeds the tiling step.
dapi_filter_by_cellpose = show_site_survival_dapi_cellpose(
    df_dapi,
    batches,
    dapi_filter_by_brenner,
    NIH_d8_line_colors,
    NIH_d8_panels,
    NIH_d8_reps,
    figsize=(6, 18),
)

### 3. % Site survival by tiling
Percentage out of the sites that passed the previous filter. In parentheses are absolute values.

**A site will be filtered out if, after tiling, no tile contains at least one whole cell that Cellpose detected.**

In [None]:
# Filtering step 3 (tiling). Takes the Cellpose-step result as input; its own
# result feeds the target-channel Brenner step.
dapi_filter_by_tiling = show_site_survival_dapi_tiling(
    df_dapi,
    batches,
    dapi_filter_by_cellpose,
    NIH_d8_line_colors,
    NIH_d8_panels,
    NIH_d8_reps,
    figsize=(6, 18),
)

### 4. % Site survival after Brenner on target channel
Percentage out of the sites that passed the previous filter. In parentheses are absolute values (if different from the percentages).

In [None]:
# Filtering step 4 (Brenner on the target channel), using the tiling-step
# result as input.
show_site_survival_target_brenner(
    df_dapi,
    df_target,
    dapi_filter_by_tiling,
    NIH_d8_markers,
    figsize=(6, 18),
)

## Statistics About the Processed Files

In [None]:
# Display labels (not referenced again in this cell).
names = [
    'Total number of tiles',
    'Total number of whole cells',
]
# Columns handed to calc_total_sums.
stats = [
    'n_valid_tiles',
    'site_whole_cells_counts_sum',
    'site_cell_count',
    'site_cell_count_sum',
]
total_sum = calc_total_sums(df_target, df_dapi, stats, NIH_d8_markers)

### Total tiles

In [None]:
# markers_for_dnls = markers.copy() #TODO need to change according to - if we use all markers or just the d8 ones!!!!
# markers_for_dnls.remove('TIA1')
# markers_for_dnls += ['TDP43B']

# Sum of n_valid_tiles over the rows whose marker is listed in NIH_d8_markers.
total_sum.loc[total_sum['marker'].isin(NIH_d8_markers), 'n_valid_tiles'].sum()

### Total whole nuclei in tiles

In [None]:
# Sum of site_whole_cells_counts_sum over the DAPI rows only.
total_sum.loc[total_sum['marker'] == 'DAPI', 'site_whole_cells_counts_sum'].sum()

### Total nuclei in sites

In [None]:
# Sum of site_cell_count over the DAPI rows only.
total_sum.loc[total_sum['marker'] == 'DAPI', 'site_cell_count'].sum()

In [None]:
# Hand the aggregated totals to the qc_utils table helper.
# NOTE(review): presumably renders summary tables inline - confirm in qc_utils.
show_total_sum_tables(total_sum)

### Show Total Tile Counts
For each batch, cell line, replicate and marker: Total number of tiles

#### First, we look at all cell lines together:

In [None]:
# Valid-tile totals per marker and batch across all cell lines.
# NOTE(review): vmax=15000 presumably caps the colour scale - confirm in qc_utils.
show_total_valid_tiles_per_marker_and_batch(total_sum, vmax=15000)


#### Separating into cell lines & batches:

In [None]:
# The tile-count column is renamed to 'index' before being handed to
# plot_filtering_heatmap.
to_heatmap = total_sum.rename(columns={'n_valid_tiles': 'index'})
plot_filtering_heatmap(
    to_heatmap,
    extra_index='marker',
    xlabel='Total number of tiles',
    fmt=".0f",
    figsize=(7, 28),
    show_sum=True,
    vmin=None,
    vmax=None,
)

### Show Total Whole Cell Counts
For each batch, cell line, replicate and marker: Total number of whole cells

In [None]:
# The whole-cell-count column is renamed to 'index' before being handed to
# plot_filtering_heatmap.
to_heatmap = total_sum.rename(columns={'site_whole_cells_counts_sum': 'index'})
plot_filtering_heatmap(
    to_heatmap,
    extra_index='marker',
    xlabel='Total number of whole cells',
    fmt=".0f",
    figsize=(7, 28),
    show_sum=True,
    vmin=None,
    vmax=None,
)

### Show **Cell Count** Statistics per Batch

In [None]:
# Drop DAPI rows that ended up with zero valid tiles before plotting.
df_no_empty_sites = df_dapi.loc[df_dapi['n_valid_tiles'] != 0]

# Same plot three times, once per count column.
plot_cell_count(
    df_no_empty_sites,
    NIH_d8_lines_order,
    NIH_d8_custom_palette,
    y='site_cell_count_sum',
    title='Cell Count Average per Site (from tiles)',
    figsize=(16, 6),
)

plot_cell_count(
    df_no_empty_sites,
    NIH_d8_lines_order,
    NIH_d8_custom_palette,
    y='site_whole_cells_counts_sum',
    title='Whole Cell Count Average per Site',
    figsize=(16, 6),
)

plot_cell_count(
    df_no_empty_sites,
    NIH_d8_lines_order,
    NIH_d8_custom_palette,
    y='site_cell_count',
    title='Cellpose Cell Count Average per Site',
    figsize=(16, 6),
)


### Show **Tiles** per Site Statistics


In [None]:
# Mean n_valid_tiles of the DAPI rows, per cell_line_cond group.
df_dapi.groupby(['cell_line_cond'])['n_valid_tiles'].mean()

In [None]:
# number of valid tiles per site (on average):
# the unweighted mean of the per-cell_line_cond means.
import numpy as np
np.mean(df_dapi.groupby(['cell_line_cond'])['n_valid_tiles'].mean())

In [None]:
# Mean of site_cell_count over all DAPI rows.
df_dapi.loc[:, ['site_cell_count']].mean()

In [None]:
# plot_catplot(df_dapi, 
#              NIH_d8_custom_palette,
#              NIH_d8_reps, 
#              x='n_valid_tiles', 
#              x_title='valid tiles count', 
#              batch_min=1, 
#              batch_max=3, 
#              height=6)


### Show Mean of cell count in valid tiles

In [None]:
# plot_hm_of_mean_cell_count_per_tile(df_dapi, 
#                                     split_by='rep', 
#                                     rows='cell_line_cond', 
#                                     columns='panel', 
#                                     figsize=(18,6))


In [None]:
# Mean of cells_count_in_valid_tiles_mean over all DAPI rows.
df_dapi.loc[:, ['cells_count_in_valid_tiles_mean']].mean()


In [None]:
# Mean of site_cell_count over all DAPI rows (same figure as shown earlier).
df_dapi.loc[:, ['site_cell_count']].mean()


# Assessing Staining Reproducibility and Outliers

In [None]:
# for batch in batches:
#     print(batch)
#     run_calc_hist_new(f'{batch}', dnls_opera_cell_lines_for_disp, dnls_opera_markers,
#                       root_directory_raw, root_directory_proc,
#                            hist_sample=10,sample_size_per_markers=200, ncols=8, nrows=4, dnls=True)
#     print("="*30)

In [None]:
# # save notebook as HTML 
# from IPython.display import display, Javascript
# display(Javascript('IPython.notebook.save_checkpoint();'))
# os.system(f'jupyter nbconvert --to html {NOVA_HOME}/tools/preprocessing_tools/qc_reports/qc_report_NIH_NeuronsDay8.ipynb --output {NOVA_HOME}/manuscript/preprocessing_qc_reports/ManuscriptFinalData/qc_report_NIH_NeuronsDay8.html')