# Preprocessing Notebook

Notebook to run through the data processing steps using default settings and save the resulting dataframes for use in the accompanying notebooks.

In [1]:
import cellPLATO as cp

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Switch for the save cell near the end of this notebook: the processed
# dataframes are written to CSV only when this is True.
OVERWRITE_DATAFRAMES = True

Finished running cellPLATO initialization and loaded config.
Initializing:  20x_100x_hightr
Hypthesis testing using:  st.ttest_ind
Plots will be exported to:  Z://Collaboration_data/Mace_Lab/my_generated/cellPLATO(dev)/20x_100x_hightr\2022-07-19_10-57-08-496096\plots/
Using unique embedding per dataset shortname:  20x_100x_hightr
Exporting static Superplots
Exporting static Plots of Differences
Exporting static Marginal scatterplots
Exporting static Timeplots
Exporting Bar plots
Using corresponding CTL_SHORTLABEL:  WT_20x  for condition:  CAMWT_20x
Dataset in current notebook:  20x_100x_hightr
Finished initializing data_processing
Finished intializing visualizations


Finished initializing cellPLATO


In [2]:
# Get the experiment list from the experiments listed in the config.
# The resulting table is what gets passed to cp.combine_dataframes() in the
# next cell; it is displayed here so the conditions and replicates can be
# checked before starting the long-running processing step.
exp_list = cp.populate_experiment_list()
display(exp_list)

Unnamed: 0,Condition,Experiment,Replicate_ID,Condition_shortlabel,Replicate_shortlabel
0,CAMWT_20x,Condition_20x_ICAMWT_tracks,Condition_20x_ICAMWT_tracks,WT_20x,WT_20x_0
1,CAMKO_20x,Condition_20x_ICAMKO_tracks,Condition_20x_ICAMKO_tracks,KO_20x,KO_20x_0
2,NK92WT_100x,WT_100x_mScar_2,WT_100x_mScar_2,WT_100x,WT_100x_9
3,NK92WT_100x,NK92WT_8,NK92WT_8,WT_100x,WT_100x_8
4,NK92WT_100x,NK92WT_5,NK92WT_5,WT_100x,WT_100x_7
5,NK92WT_100x,NK92WT_4,NK92WT_4,WT_100x,WT_100x_6
6,NK92WT_100x,NK92WT_3,NK92WT_3,WT_100x,WT_100x_5
7,NK92WT_100x,NK92WT_2,NK92WT_2,WT_100x,WT_100x_4
8,NK92WT_100x,NK92WT2_7,NK92WT2_7,WT_100x,WT_100x_3
9,NK92WT_100x,NK92WT2_6,NK92WT2_6,WT_100x,WT_100x_2


In [None]:
# Load, process and combine the dataframes (including segmentation and migration calculations).
# NOTE: long-running step. The recorded output below shows progress bars of
# roughly 2-4 minutes per experiment, and a seg_mig_calcs.csv being saved into
# the experiment's data folder as a side effect.
comb_df = cp.combine_dataframes(exp_list)

----
['CAMWT_20x', 'CAMKO_20x', 'NK92WT_100x', 'NK92KO_100x']
['WT_20x', 'KO_20x', 'WT_100x', 'KO_100x']
[CAMWT_20x, CAMKO_20x, NK92WT_100x, NK92KO_100x]
Categories (4, object): [CAMWT_20x, CAMKO_20x, NK92WT_100x, NK92KO_100x]
---
0    Condition_20x_ICAMWT_tracks
Name: Experiment, dtype: object
h5 file contents:  <KeysViewHDF5 ['objects', 'segmentation', 'tracks']>
2D track with zero as z component. Forcing STC
h5 file contains dummies
btrack_unpack() found h5 file containing regionprops: 
<HDF5 group "/objects/obj_type_1/properties" (12 members)>
False
h5_data passed t h5_to_df() contains regionprops, adding to df.
Index(['area', 'bbox_area', 'eccentricity', 'equivalent_diameter', 'extent',
       'filled_area', 'major_axis_length', 'minor_axis_length', 'orientation',
       'perimeter', 'solidity'],
      dtype='object')
props_arr:  (437663, 11)
coords:  (437663, 5)
0 CAMWT_20x 0.537
No column Replicate_ID, renaming Experiment column
Overlap: []
CALIBRATED_POS ==  False , Input posit

100%|█████████████████████████████████████████████████████████████████████████████| 4989/4989 [01:54<00:00, 43.64it/s]


Saving file: Z://Collaboration_data/Mace_Lab/20x_100x/20x_100x_high_temp_res\CAMWT_20x\Condition_20x_ICAMWT_tracks\seg_mig_calcs.csv
1    Condition_20x_ICAMKO_tracks
Name: Experiment, dtype: object
h5 file contents:  <KeysViewHDF5 ['objects', 'segmentation', 'tracks']>
2D track with zero as z component. Forcing STC
h5 file contains dummies
btrack_unpack() found h5 file containing regionprops: 
<HDF5 group "/objects/obj_type_1/properties" (12 members)>
False
h5_data passed t h5_to_df() contains regionprops, adding to df.
Index(['area', 'bbox_area', 'eccentricity', 'equivalent_diameter', 'extent',
       'filled_area', 'major_axis_length', 'minor_axis_length', 'orientation',
       'perimeter', 'solidity'],
      dtype='object')
props_arr:  (761064, 11)
coords:  (761064, 5)
1 CAMKO_20x 0.537
No column Replicate_ID, renaming Experiment column
Overlap: []
CALIBRATED_POS ==  False , Input positions in pixels.
Index(['frame', 'x', 'y', 'z', '_', 'particle', 'area', 'bbox_area',
       'eccen

  0%|                                                                                       | 0/11167 [00:00<?, ?it/s]

Processing migration_calcs() for condition:  CAMKO_20x
Processing migration_calcs() for experiment:  Condition_20x_ICAMKO_tracks


100%|███████████████████████████████████████████████████████████████████████████| 11167/11167 [03:35<00:00, 51.72it/s]


In [None]:
# Run the cellPLATO measurement pipeline on the combined dataframe.
# NOTE(review): comb_df is overwritten with the pipeline's result, so re-running
# this cell feeds already-processed data back in. Re-run the combine_dataframes
# cell first unless the pipeline is known to be idempotent - confirm.
comb_df = cp.measurement_pipeline(comb_df)

In [None]:
# Build a time-averaged DataFrame from the combined dataframe and display it
# for inspection. tavg_df is the input to the per-condition / per-replicate
# summaries below and is one of the dataframes written by the save cell.
tavg_df = cp.time_average(comb_df)
display(tavg_df)

In [None]:
# Make summary calculations from the time-averaged dataframe.
# Both summaries come from the same function; passing avg_per_rep=True
# requests the per-replicate variant instead of the per-condition one.

# Per condition:
avg_df = cp.average_per_condition(tavg_df)

# Per replicate:
repavg_df = cp.average_per_condition(tavg_df, avg_per_rep=True)

In [None]:
# Dimension reduction pipeline.
# Runs on the measured comb_df using the factor list exposed by the package as
# cp.DR_FACTORS (presumably set in the cellPLATO config - confirm).
dr_df = cp.dr_pipeline(comb_df, dr_factors=cp.DR_FACTORS) 

In [None]:
# Clustering cell behavior: run cellPLATO's HDBSCAN clustering step on the
# dimension-reduced dataframe. The columns to cluster on come from
# cp.CLUSTER_BY; plot=False is passed, presumably to skip plotting - confirm.
lab_dr_df = cp.hdbscan_clustering(dr_df, cluster_by=cp.CLUSTER_BY, plot=False)

In [None]:
# Run the trajectory clustering pipeline.
# lab_dr_df is overwritten with the pipeline's output; traj_list and
# cluster_lst are returned as well but are not used again in this notebook.
# NOTE(review): traj_factor is hard-coded to 'umap' here, whereas the
# single-cell pipeline at the end of the notebook passes the configured
# CLUSTER_BY value - confirm which one is intended.
lab_dr_df, traj_list, cluster_lst = cp.trajectory_clustering_pipeline(lab_dr_df, traj_factor='umap', dist_metric='hausdorff', filename_out='std_dr_df_traj')

In [None]:
# Run the cluster switching pipeline on the labelled dataframe produced by the
# clustering cells above. The call's result is not assigned to a variable.
cp.cluster_switching_pipeline(lab_dr_df)

In [None]:
if OVERWRITE_DATAFRAMES:
    # Save dataframes to shared data folder.
    # SAVED_DATA_PATH is never defined in this notebook, so the bare name raised
    # a NameError on a fresh kernel. Read it from the cellPLATO namespace, the
    # same way cp.DR_FACTORS and cp.CLUSTER_BY are read in the cells above.
    # The path is concatenated directly with the file name, so it is expected
    # to end with a path separator.
    tavg_df.to_csv(cp.SAVED_DATA_PATH + 'tavg_df.csv')
    comb_df.to_csv(cp.SAVED_DATA_PATH + 'comb_df.csv')
    dr_df.to_csv(cp.SAVED_DATA_PATH + 'dr_df.csv')
    lab_dr_df.to_csv(cp.SAVED_DATA_PATH + 'lab_dr_df.csv')

### Alternatively, run all the pipelines in a single cell to generate and save all outputs

In [None]:
# Minimal pipelines: the full workflow in one cell, starting from the
# experiment list in the config and ending with the cluster switching analysis.
comb_df = cp.combine_dataframes(cp.populate_experiment_list())
comb_df = cp.measurement_pipeline(comb_df)
dr_df = cp.dr_pipeline(comb_df, dr_factors=cp.DR_FACTORS) 
cp.comparative_visualization_pipeline(dr_df)
lab_dr_df = cp.cluster_analysis_pipeline(dr_df,cp.CLUSTER_BY)
# CLUSTER_BY is not defined in the notebook namespace (only cellPLATO is
# imported, as cp), so the bare name raised a NameError; use cp.CLUSTER_BY,
# matching the cluster_analysis_pipeline call on the line above.
lab_dr_df, traj_list, cluster_lst = cp.trajectory_clustering_pipeline(lab_dr_df, traj_factor=cp.CLUSTER_BY, dist_metric='hausdorff', filename_out='std_dr_df_traj')
cp.cluster_switching_pipeline(lab_dr_df)