In [2]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path
import re
import utils
import functools

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# defined for each pipeline to simplify data parsing
def read_data(directory, regex=None, dfc=True, bins=4, debug_regex=False):
    """Build one combined measurement table from a pipeline output directory.

    Each CSV below is passed through ``utils.analyze`` in turn, threading the
    running result forward via ``previous_result``:
    ``Image.csv`` -> ``DilatedGC.csv`` -> ``InitialGC.csv`` -> ``InitialFC.csv``
    -> ``ExtraNucleolarFCs.csv`` -> ``CombinedObjects.csv``.

    Parameters
    ----------
    directory : str or Path
        Folder holding the CSV files listed above.
    regex : str, optional
        Handed to ``utils.ImageParser`` to parse information out of the image
        file name.
    dfc : bool
        Accepted but not referenced anywhere in this function.
    bins : int
        Forwarded as ``bins`` to ``utils.RimEnrichmentParser``.
    debug_regex : bool
        Forwarded to ``utils.ImageParser``.

    Returns
    -------
    The merged result of the final ``utils.analyze`` call (used as a pandas
    DataFrame by the callers in this notebook).
    """
    base = Path(directory)

    # Columns identifying an object's image and its parent DilatedGC; every
    # reduced table below is merged on these.
    key_columns = ['ImageNumber', 'Parent_DilatedGC']

    def reduced_merge_options(**merge_kwargs):
        # Keyword bundle for utils.analyze; merge_kwargs go to utils.merge_reduced.
        return {
            'extra_columns': key_columns,
            'reduce': True,
            'merge_fcn': functools.partial(utils.merge_reduced, merge_on=key_columns, **merge_kwargs),
        }

    default_merge = reduced_merge_options()
    left_merge = reduced_merge_options(how='left')

    # Step 1: per-image information parsed from the file name with `regex`.
    image_parsers = [utils.ImageParser(regex, debug_regex=debug_regex)]
    result, _ = utils.analyze(base / 'Image.csv', parsers=image_parsers)

    # Step 2: attach the DilatedGC objects (left merge on ImageNumber) so later
    # tables can be keyed on their parent DilatedGC.
    image_merge = functools.partial(utils.merge_result, merge_on=['ImageNumber'], how='left')
    result, _ = utils.analyze(
        base / 'DilatedGC.csv',
        previous_result=result,
        parsers=[utils.BlankParser(['ObjectNumber'])],
        extra_columns=['ImageNumber', ],
        merge_fcn=image_merge,
    )
    result = result.rename(columns={'ObjectNumber': 'Parent_DilatedGC'})

    # Step 3: GC object features.
    gc_parsers = [
        utils.CountingParser(),
        utils.ShapeParser(),
        utils.IntensityParser(images=['NOP56', 'NPM1', 'Probe'], locations=True, total_intens=True),
        utils.RimEnrichmentParser(images=['RPA194', 'NOP56'], area_normalization='GCObjectImage', bins=bins, total_bins=20),
    ]
    result, _ = utils.analyze(
        base / 'InitialGC.csv',
        previous_result=result,
        parsers=gc_parsers,
        region='GC',
        **default_merge
    )

    # Step 4: FC object features, left-merged onto the existing rows.
    fc_parsers = [
        utils.CountingParser(),
        utils.ShapeParser(),
        utils.IntensityParser(images=['RPA194', 'Probe'], locations=True, total_intens=True),
    ]
    result, _ = utils.analyze(
        base / 'InitialFC.csv',
        previous_result=result,
        parsers=fc_parsers,
        region='FC',
        **left_merge
    )

    # Step 5: count of FCs lying outside the GC, also left-merged.
    result, _ = utils.analyze(
        base / 'ExtraNucleolarFCs.csv',
        previous_result=result,
        parsers=[utils.CountingParser()],
        region='Nucleoplasmic_FC',
        **left_merge
    )
    # Missing values at this point (e.g. rows the left merges could not match)
    # are replaced with 0 before the correlation table is merged in.
    result = result.fillna(0)

    # Step 6: correlation / overlap measures over the combined objects.
    combined_parsers = [
        utils.CorrelationParser(measures=['Correlation', 'Overlap'], reduce=True),
    ]
    result, _ = utils.analyze(
        base / 'CombinedObjects.csv',
        previous_result=result,
        parsers=combined_parsers,
        region='Combined',
        **default_merge
    )

    return result

# Load the RPL5 experiment and tag every row with exp='RPL5'. Frames from other
# experiments can be combined with this one as long as each carries a unique
# `exp` value; they may share only a subset of variables (e.g. time or treatment).
full_data = read_data(
    directory='/scratch/gpfs/tcomi/cp_paper_redo/morphology/testing/outputs',
    regex=r'/[A-G]\d+_(?P<treatment>SCR|RPL5KD)_15p(?P<time>\d+)c.*nd2',
    bins=4,
)
full_data = full_data.assign(exp='RPL5')
len(full_data)

289

In [None]:
# Select the 'Fib' experiment rows.
# NOTE(review): the loading cell visible in this notebook only tags rows with
# exp='RPL5' and its regex only matches SCR/RPL5KD treatments -- confirm that
# 'Fib' data (treatments 'FibKD'/'ctl') is loaded before running this cell.
df = full_data[full_data.exp == 'Fib'].copy()
df['time'] = df['time'].astype(int)

# Metrics to plot: everything after the first six columns except totals,
# centers, medians and the grouping columns themselves.
toplot = [c for c in df.columns[6:]
          if not c.startswith('total')
          and not c.startswith('center')
          and 'median' not in c
          and c not in ('treatment', 'time', 'probe', 'exp')
         ]
# Count of non-null CellNumber entries per treatment/time group.
print(df.groupby(['treatment', 'time']).CellNumber.count())

n_cols = 4
n_rows = int(np.ceil(len(toplot) / n_cols))

figsize = (5*n_cols+5, 5*n_rows)

# One ECDF panel per metric: hue encodes time, line style encodes treatment.
fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize)
for col, ax in zip(toplot, axes.flatten()):
    for treatment, style in zip(('FibKD', 'ctl'), ('-', '--')):
        sns.ecdfplot(data=df[df.treatment == treatment], x=col, hue='time', linestyle=style, ax=ax, label=treatment)

# Probe correlation against each compartment over time. The axes and figure
# are passed explicitly instead of relying on pyplot's "current" state.
fig, axes = plt.subplots()
sns.lineplot(data=df, x='time', y='Correlation_FC_Probe', label='FC_Probe', style='treatment', ax=axes)
sns.lineplot(data=df, x='time', y='Correlation_DFC_Probe', label='DFC_Probe', style='treatment', ax=axes)
sns.lineplot(data=df, x='time', y='Correlation_GC_Probe', label='GC_Probe', style='treatment', ax=axes)
fig.savefig('EU_fib.pdf')

In [None]:
# Select the 'FVP_EU' experiment rows.
# NOTE(review): the loading cell visible in this notebook only tags rows with
# exp='RPL5' -- confirm that 'FVP_EU' data is loaded before running this cell.
df = full_data[full_data.exp == 'FVP_EU'].copy()
# Rows without a time value come from another experiment; drop them before
# the integer cast, which would fail on NaN.
df = df[~df.time.isna()]
df['time'] = df['time'].astype(int)

# Metrics to plot: everything after the first six columns except totals,
# centers, medians and the grouping columns themselves.
toplot = [c for c in df.columns[6:]
          if not c.startswith('total')
          and not c.startswith('center')
          and 'median' not in c
          and c not in ('treatment', 'time', 'probe', 'exp')
         ]
# Count of non-null CellNumber entries per exp/treatment/time group.
print(df.groupby(['exp', 'treatment', 'time']).CellNumber.count())

n_cols = 4
n_rows = int(np.ceil(len(toplot) / n_cols))

figsize = (5*n_cols+5, 5*n_rows)

# One ECDF panel per metric: hue encodes time, line style encodes treatment.
fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize)
for col, ax in zip(toplot, axes.flatten()):
    for treatment, style in zip(('FVP', 'DMSO'), ('-', 'dotted')):
        sns.ecdfplot(data=df[df.treatment == treatment], x=col, hue='time', linestyle=style, ax=ax, label=treatment)

# Probe correlation against each compartment over time, drawn on an explicit
# axes and saved from the explicit figure.
fig, axes = plt.subplots()
sns.lineplot(data=df, x='time', y='Correlation_FC_Probe', label='FC_Probe', style='treatment', ax=axes)
sns.lineplot(data=df, x='time', y='Correlation_DFC_Probe', label='DFC_Probe', style='treatment', ax=axes)
sns.lineplot(data=df, x='time', y='Correlation_GC_Probe', label='GC_Probe', style='treatment', ax=axes)
fig.savefig('EU_fvp.pdf')

In [None]:
# Select the 'FVP_nodfc' experiment rows.
# NOTE(review): the loading cell visible in this notebook only tags rows with
# exp='RPL5' -- confirm that 'FVP_nodfc' data is loaded before running this cell.
df = full_data[full_data.exp == 'FVP_nodfc'].copy()
df['time'] = df['time'].astype(int)

# Metrics to plot: everything after the first six columns except totals,
# centers, any DFC-related column and the grouping columns themselves.
toplot = [c for c in df.columns[6:]
          if not c.startswith('total')
          and not c.startswith('center')
          and 'DFC' not in c
          and c not in ('treatment', 'time', 'probe', 'exp')
         ]
# Count of non-null CellNumber entries per exp/treatment/time group.
print(df.groupby(['exp', 'treatment', 'time']).CellNumber.count())

n_cols = 4
n_rows = int(np.ceil(len(toplot) / n_cols))

figsize = (5*n_cols+5, 5*n_rows)

# One ECDF panel per metric: hue encodes time, line style encodes treatment.
fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize)
for col, ax in zip(toplot, axes.flatten()):
    for treatment, style in zip(('FVP2uM', 'DMSO'), ('-', 'dotted')):
        sns.ecdfplot(data=df[df.treatment == treatment], x=col, hue='time', linestyle=style, ax=ax, label=treatment)

# Probe correlation with FC and GC over time (no DFC column is plotted here),
# drawn on an explicit axes and saved from the explicit figure.
fig, axes = plt.subplots()
sns.lineplot(data=df, x='time', y='Correlation_FC_Probe', label='FC_Probe', style='treatment', ax=axes)
sns.lineplot(data=df, x='time', y='Correlation_GC_Probe', label='GC_Probe', style='treatment', ax=axes)
fig.savefig('EU_fvp_nodfc.pdf')

In [None]:
# Select the 'FVP_FISH' experiment rows.
# NOTE(review): the loading cell visible in this notebook only tags rows with
# exp='RPL5' and its regex defines no 'probe' group -- confirm that 'FVP_FISH'
# data with a 'probe' column is loaded before running this cell.
df = full_data[full_data.exp == 'FVP_FISH'].copy()
df['time'] = df['time'].astype(int)

# Metrics to plot: everything after the first six columns except totals,
# centers, medians and the grouping columns themselves.
toplot = [c for c in df.columns[6:]
          if not c.startswith('total')
          and not c.startswith('center')
          and 'median' not in c
          and c not in ('treatment', 'time', 'probe', 'exp')
         ]
# Count of non-null CellNumber entries per exp/treatment/time/probe group.
print(df.groupby(['exp', 'treatment', 'time', 'probe']).CellNumber.count())

n_cols = 4
n_rows = int(np.ceil(len(toplot) / n_cols))

figsize = (5*n_cols+5, 5*n_rows)

# One ECDF panel per metric: hue encodes time, line style encodes treatment.
fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize)
for col, ax in zip(toplot, axes.flatten()):
    for treatment, style in zip(('FVP', 'DMSO'), ('-', 'dotted')):
        sns.ecdfplot(data=df[df.treatment == treatment], x=col, hue='time', linestyle=style, ax=ax, label=treatment)

# Probe correlation against each compartment over time, styled by probe and
# drawn on an explicitly targeted axes.
fig, axes = plt.subplots()
sns.lineplot(data=df, x='time', y='Correlation_FC_Probe', label='FC_Probe', style='probe', ax=axes)
sns.lineplot(data=df, x='time', y='Correlation_DFC_Probe', label='DFC_Probe', style='probe', ax=axes)
sns.lineplot(data=df, x='time', y='Correlation_GC_Probe', label='GC_Probe', style='probe', ax=axes)

## Example RPL5

In [None]:
# Subset of the data to plot: rows tagged exp='RPL5'.
df = full_data[full_data.exp == 'RPL5'].copy()
df['time'] = df['time'].astype(int)

# Subset of metrics to visualize: everything after the first six columns
# except 'Total*' columns, center-of-mass intensities and the grouping
# columns themselves.
toplot = [c for c in df.columns[6:]
          if not c.startswith('Total')
          and 'CenterMassIntensity' not in c
          and c not in ('treatment', 'time', 'probe', 'exp')
         ]

# Number of rows in each treatment/time condition (counted via the `exp` column).
print(df.groupby(['treatment', 'time']).exp.count())

n_cols = 4
n_rows = int(np.ceil(len(toplot) / n_cols))

figsize = (5*n_cols+5, 5*n_rows)

# ECDF of each metric: hue encodes time, line style encodes treatment.
fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize)
for col, ax in zip(toplot, axes.flatten()):
    for treatment, style in zip(('RPL5KD', 'SCR'), ('-', '--')):
        sns.ecdfplot(data=df[df.treatment == treatment], x=col, hue='time', linestyle=style, ax=ax, label=treatment)

# Overlay the probe's correlation with each image channel on one explicitly
# targeted axes; the trailing semicolon suppresses the cell's repr output.
fig, axes = plt.subplots()
sns.lineplot(data=df, x='time', y='Mean_Combined_Correlation_Probe_RPA194', label='FC_Probe', style='treatment', ax=axes)
sns.lineplot(data=df, x='time', y='Mean_Combined_Correlation_NOP56_Probe', label='DFC_Probe', style='treatment', ax=axes)
sns.lineplot(data=df, x='time', y='Mean_Combined_Correlation_NPM1_Probe', label='GC_Probe', style='treatment', ax=axes);

In [None]:
# Select the rows whose probe is 'U8FISH'.
# NOTE(review): the loading regex visible in this notebook defines no 'probe'
# group -- confirm that `full_data` has a 'probe' column before running this cell.
df = full_data[full_data.probe == 'U8FISH'].copy()

# Metrics to plot: everything after the first six columns except totals,
# centers, medians and the grouping columns themselves.
toplot = [c for c in df.columns[6:]
          if not c.startswith('total')
          and not c.startswith('center')
          and 'median' not in c
          and c not in ('treatment', 'time', 'probe', 'exp')
         ]
# Keep every SCR row; keep other treatments only when the probe intensity is
# below 0.002.
df = df[(df.mean_mean_Probe_intens < 0.002) | (df.treatment == 'SCR')]
# Count of non-null CellNumber entries per exp/treatment/probe group.
print(df.groupby(['exp', 'treatment', 'probe']).CellNumber.count())

n_cols = 4
n_rows = int(np.ceil(len(toplot) / n_cols))

figsize = (5*n_cols+5, 5*n_rows)

# One ECDF panel per metric, coloured by treatment.
fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize)
for col, ax in zip(toplot, axes.flatten()):
    sns.ecdfplot(data=df, x=col, hue='treatment', ax=ax)

# Probe intensity distribution of the non-SCR rows, zoomed to [0, 0.0025].
fig, axes = plt.subplots()
sns.ecdfplot(data=df[df.treatment != 'SCR'], x='mean_mean_Probe_intens', hue='treatment', ax=axes)
axes.set_xlim(0, 0.0025)

# DFC rim enrichment distribution per treatment.
fig, axes = plt.subplots()
sns.ecdfplot(data=df, x='dfc_rim_enrichment', hue='treatment', ax=axes)

In [None]:
# Select the rows whose probe is 'U3FISH'.
# NOTE(review): the loading regex visible in this notebook defines no 'probe'
# group -- confirm that `full_data` has a 'probe' column before running this cell.
df = full_data[full_data.probe == 'U3FISH'].copy()

# Metrics to plot: everything after the first six columns except totals,
# centers, medians and the grouping columns themselves.
toplot = [c for c in df.columns[6:]
          if not c.startswith('total')
          and not c.startswith('center')
          and 'median' not in c
          and c not in ('treatment', 'time', 'probe', 'exp')
         ]
# Keep every SCR row; keep other treatments only when the probe intensity is
# below 0.005.
df = df[(df.mean_mean_Probe_intens < 0.005) | (df.treatment == 'SCR')]
# Count of non-null CellNumber entries per exp/treatment/probe group.
print(df.groupby(['exp', 'treatment', 'probe']).CellNumber.count())

n_cols = 4
n_rows = int(np.ceil(len(toplot) / n_cols))

figsize = (5*n_cols+5, 5*n_rows)

# One ECDF panel per metric, coloured by treatment.
fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize)
for col, ax in zip(toplot, axes.flatten()):
    sns.ecdfplot(data=df, x=col, hue='treatment', ax=ax)

# Probe intensity distribution of the non-SCR rows.
fig, axes = plt.subplots()
sns.ecdfplot(data=df[df.treatment != 'SCR'], x='mean_mean_Probe_intens', hue='treatment', ax=axes)

# FC rim enrichment distribution per treatment.
fig, axes = plt.subplots()
sns.ecdfplot(data=df, x='fc_rim_enrichment', hue='treatment', ax=axes)