# Preprocessing Notebook

This notebook runs through the data processing steps using the default settings and saves the resulting dataframes for use in the accompanying notebooks.

In [1]:
import cellPLATO as cp

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Set to True to let the guarded save cell at the end of this notebook
# write the processed dataframes to disk.
OVERWRITE_DATAFRAMES = False

Dataset in current notebook:  ultimate_high_temp_res
Initializing:  ultimate_high_temp_res
Hypthesis testing using:  st.ttest_ind
Plots will be exported to:  Z://Collaboration_data/Mace_Lab/my_generated/cellPLATO(dev)/ultimate_high_temp_res\2022-07-18_13-33-28-161389\plots/
Using unique embedding per dataset shortname:  ultimate_high_temp_res
Exporting static Superplots
Exporting static Plots of Differences
Exporting static Marginal scatterplots
Exporting static Timeplots
Exporting Bar plots
Dataset in current notebook:  ultimate_high_temp_res
Finished running cellPLATO initialization and loaded config.


Finished intializing visualizations
Finished initializing data_processing
Finished initializing cellPLATO


In [2]:
# Build the experiment list from the experiments listed in the cellPLATO config,
# then render it so the entries can be checked before any processing starts.
exp_list = cp.populate_experiment_list()
display(exp_list)

Unnamed: 0,Condition,Experiment,Replicate_ID,Condition_shortlabel,Replicate_shortlabel
0,cytoD40uM_day1,cytoD40uM_day1,cytoD40uM_day1,cytoD_day1,cytoD_day1_0
1,cytoD40uM_day2,cytoD40uM_day2,cytoD40uM_day2,cytoD_day2,cytoD_day2_0
2,untreated_day1,untreated_day1,untreated_day1,ctl_day1,ctl_day1_0
3,untreated_day2,untreated_day2,untreated_day2,ctl_day2,ctl_day2_0


In [3]:
# Load, process and combine the dataframes of every experiment in exp_list
# (including segmentation and migration calculations).
comb_df = cp.combine_dataframes(exp_list)

Loading existing file: cytoD40uM_day1, cytoD40uM_day1.csv
Loading existing file: cytoD40uM_day2, cytoD40uM_day2.csv
Loading existing file: untreated_day1, untreated_day1.csv
Loading existing file: untreated_day2, untreated_day2.csv
max x_pix:  1022.4574584960938 , image width:  1024
max y_pix:  1023.0775146484376 , image height:  1024
max x_um:  549.0596552124024 , MICRONS_PER_PIXEL:  0.537
max y_um:  549.3926253662111 , MICRONS_PER_PIXEL:  0.537


In [4]:
# Run the measurement pipeline on the combined dataframe.
# NOTE(review): comb_df is reassigned from itself, so re-running this cell feeds the
# already-processed dataframe back in — re-run the previous cell first if in doubt.
comb_df = cp.measurement_pipeline(comb_df)

Calculating ripleys p, K and L with radius:  25  (pixels)
Calibrating with mixed_scaling =  False


In [None]:
# Process a time-averaged DataFrame from the combined dataframe and display it.
tavg_df = cp.time_average(comb_df)
display(tavg_df)

In [7]:
# Make summary calculations from the time-averaged dataframe.
# Neither result is written to disk by the save cells in this notebook.

# Per condition:
avg_df = cp.average_per_condition(tavg_df)

# Per replicate (same function, with avg_per_rep=True):
repavg_df = cp.average_per_condition(tavg_df, avg_per_rep=True)

In [10]:
# Dimension reduction pipeline.
# DR_FACTORS is read from the cellPLATO namespace: this notebook only runs
# `import cellPLATO as cp`, so the bare name is undefined on a fresh kernel.
dr_df = cp.dr_pipeline(comb_df, dr_factors=cp.DR_FACTORS)

Running dr_pipeline...
tSNE perplexity =  185
UMAP nearest neighbors =  10  min distance =  0.5
Using standardized factors for dimensionality reduction, matrix shape:  (1574530, 28)
Using openTSNE with perplexity =  185
Using openTSNE to calculate new embedding for input data.
Embedding shape:  (1574530, 2)


In [11]:
# Clustering cell behavior with HDBSCAN on the dimension-reduced dataframe.
# CLUSTER_BY is read from the cellPLATO namespace: this notebook only runs
# `import cellPLATO as cp`, so the bare name is undefined on a fresh kernel.
lab_dr_df = cp.hdbscan_clustering(dr_df, cluster_by=cp.CLUSTER_BY, plot=False)

hdbscan_clustering() with min_cluster_size =  20
DBScan clustering by UMAP...


In [None]:
# Run the trajectory clustering pipeline on the cluster-labelled dataframe.
# lab_dr_df is replaced by the pipeline's returned dataframe; traj_list and
# cluster_lst are the two additional return values.
# NOTE(review): traj_factor is hard-coded to 'umap' here, while the single-cell
# pipeline further down passes CLUSTER_BY — confirm the two are meant to agree.
lab_dr_df, traj_list, cluster_lst = cp.trajectory_clustering_pipeline(lab_dr_df, traj_factor='umap', dist_metric='hausdorff', filename_out='std_dr_df_traj')

In [None]:
# Run cellPLATO's cluster switching pipeline on lab_dr_df.
# Any return value is not stored by this cell.
cp.cluster_switching_pipeline(lab_dr_df)

In [None]:
# Undecided, which is better:
#     - saving the distinct dataframes
#     OR
#     - saving only the most processed dataframe, then stripping columns as needed to revert.

# Save dataframes to the shared data folder.
# The folder is read from the cellPLATO namespace (cp.SAVED_DATA_PATH): the bare
# name is undefined when only `import cellPLATO as cp` has been run.
# Existing CSVs are left alone unless OVERWRITE_DATAFRAMES is True, so a
# Restart & Run All cannot silently replace previously saved results.
dataframes_to_save = {
    'tavg_df.csv': tavg_df,
    'comb_df.csv': comb_df,
    'dr_df.csv': dr_df,
    'lab_dr_df.csv': lab_dr_df,
}

for csv_name, frame in dataframes_to_save.items():
    # Plain concatenation keeps the exact path the original cell wrote to.
    csv_path = cp.SAVED_DATA_PATH + csv_name
    if os.path.exists(csv_path) and not OVERWRITE_DATAFRAMES:
        print('Skipping existing file (OVERWRITE_DATAFRAMES is False): ', csv_path)
        continue
    frame.to_csv(csv_path)

### Alternatively, run all the pipelines in a single cell, then generate and save all outputs

In [6]:
# Minimal pipelines: the full processing chain in a single cell.
# All configuration constants are read from the cellPLATO namespace (cp.*),
# because this notebook only runs `import cellPLATO as cp`.
comb_df = cp.combine_dataframes(cp.populate_experiment_list())
comb_df = cp.measurement_pipeline(comb_df)
dr_df = cp.dr_pipeline(comb_df, dr_factors=cp.DR_FACTORS)
cp.comparative_visualization_pipeline(dr_df)
lab_dr_df = cp.cluster_analysis_pipeline(dr_df, cp.CLUSTER_BY)
# traj_factor previously used the bare name CLUSTER_BY, which is undefined on a fresh kernel.
lab_dr_df, traj_list, cluster_lst = cp.trajectory_clustering_pipeline(lab_dr_df, traj_factor=cp.CLUSTER_BY, dist_metric='hausdorff', filename_out='std_dr_df_traj')
cp.cluster_switching_pipeline(lab_dr_df)

Loading existing file: cytoD40uM_day1, cytoD40uM_day1.csv
Loading existing file: cytoD40uM_day2, cytoD40uM_day2.csv
Loading existing file: untreated_day1, untreated_day1.csv
Loading existing file: untreated_day2, untreated_day2.csv
max x_pix:  1022.4574584960938 , image width:  1024
max y_pix:  1023.0775146484376 , image height:  1024
max x_um:  549.0596552124024 , MICRONS_PER_PIXEL:  0.537
max y_um:  549.3926253662111 , MICRONS_PER_PIXEL:  0.537
Calculating ripleys p, K and L with radius:  25  (pixels)
Calibrating with mixed_scaling =  False


KeyboardInterrupt: 

In [None]:
# Write the processed dataframes to the shared data folder only when explicitly enabled.
# The folder is read from the cellPLATO namespace (cp.SAVED_DATA_PATH): the bare
# name is undefined when only `import cellPLATO as cp` has been run.
# NOTE(review): tavg_df is not produced by the single-cell pipeline above; it must
# already exist from the step-by-step cells earlier in the notebook.
if OVERWRITE_DATAFRAMES:
    # Save dataframes to shared data folder
    tavg_df.to_csv(cp.SAVED_DATA_PATH + 'tavg_df.csv')
    comb_df.to_csv(cp.SAVED_DATA_PATH + 'comb_df.csv')
    dr_df.to_csv(cp.SAVED_DATA_PATH + 'dr_df.csv')
    lab_dr_df.to_csv(cp.SAVED_DATA_PATH + 'lab_dr_df.csv')