This notebook extracts cluster data from CellProfiler .csv measurements of cluster-formation movies, in which cells are tracked frame-to-frame over the time course of stress, and constructs cell-by-cell time-course plots.

In [None]:
# load modules

# uncomment for debugging
%load_ext autoreload
%autoreload 2
%matplotlib inline
from IPython.core.debugger import set_trace

import os, sys, inspect
import matplotlib
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
from scipy import stats
import pprint
import re
import time
import seaborn as sns
import warnings
from IPython.core.debugger import set_trace


# Disable future warnings for seaborn
warnings.simplefilter(action='ignore', category=FutureWarning)


# Add source code directory (src) to path to enable module import.
# The directory is first derived from the file name reported for the current
# frame. Inside a notebook kernel that name is not the notebook itself (recent
# ipykernel versions report a path in a temporary directory), so when no
# sibling 'src' directory is found there, fall back to the working directory,
# which the relative data/figure paths in this notebook already rely on.
curr_frame = inspect.getfile(inspect.currentframe())
curr_dir = os.path.dirname(os.path.abspath(curr_frame))
if not os.path.isdir(os.path.join(os.path.dirname(curr_dir), 'src')):
    working_dir = os.getcwd()
    if os.path.isdir(os.path.join(os.path.dirname(working_dir), 'src')):
        curr_dir = working_dir
parent_dir = os.path.dirname(curr_dir)
module_dir = os.path.join(parent_dir, 'src')
# Inserted at position 0 so the project's modules take precedence on import
sys.path.insert(0, module_dir)

import cellprofiler_tools as cpt


In [None]:
# Set up plot export and plotting styles

# Plotting and figure saving params
save_figs = False
save_dir = '../reports/figures/CellProfiler_FociQuant05_LiveCells'
    
# create save figure dir and set up figure/font sizes
if save_figs:
    %matplotlib
    matplotlib.rcParams['figure.figsize'] = 1.6, 1.4
    save_dir_pdf = os.path.join(save_dir, 'pdf')
    if not os.path.exists(save_dir_pdf):
        os.makedirs(save_dir_pdf)
    
    # Set up fonts
    matplotlib.rc("font", family="Arial")

    matplotlib.rcParams['pdf.fonttype'] = 42 # Make fonts editable
    matplotlib.rcParams['axes.linewidth']= 0.5
    matplotlib.rcParams['lines.linewidth'] = 0.5

    SMALL_SIZE = 5
    MEDIUM_SIZE = 6
    BIGGER_SIZE = 7

    plt.rc('font', size=MEDIUM_SIZE)          # controls default text sizes
    plt.rc('axes', titlesize=MEDIUM_SIZE)     # fontsize of the axes title
    plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
    plt.rc('xtick', labelsize=MEDIUM_SIZE)    # fontsize of the tick labels
    plt.rc('ytick', labelsize=MEDIUM_SIZE)    # fontsize of the tick labels
    plt.rc('legend', fontsize=MEDIUM_SIZE)    # legend fontsize
    plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title
    
    
else:
    %matplotlib inline
    matplotlib.rcParams['figure.figsize'] = 8, 6



In [None]:
# Load the CellProfiler measurement tables from CSV files
data_dir = '../data/processed/CellProfiler_FociQuant05_LiveCells_20movies/csv_outputs'

# CellProfiler reports everything in pixels; physical scale of the data:
pixel_size = 0.206 # um per pixel
time_step = 5 # duration of one movie frame, in minutes

# File names of the individual tables
image_file_csv = 'FociQuant05_Image.csv'
er_masks_csv = 'FociQuant05_ER_masks_accepted.csv'
ire1_clust_csv = 'FociQuant05_Clusters_in_ER_masks_masked.csv'

nuclei_all_csv = 'FociQuant05_Nuclei_all.csv'
er_masks_all_csv = 'FociQuant05_ER_masks_all.csv'
nuclei_accepted_csv = 'FociQuant05_Nuclei_accepted.csv'

# Per-image table
image_full_file = os.path.join(data_dir, image_file_csv)
images = cpt.get_data_cp_csv(image_full_file)
#images = cpt.get_data_cp_csv(image_full_file, data_fields=['ImageNumber','FileName_DNA_DAPI'])

# Per-object tables, loaded in the same order as they are named on the left
er_masks, ire1_clust, nuclei_all, er_masks_all, nuclei_accepted = [
    cpt.get_data_cp_csv(os.path.join(data_dir, csv_name))
    for csv_name in (er_masks_csv,
                     ire1_clust_csv,
                     nuclei_all_csv,
                     er_masks_all_csv,
                     nuclei_accepted_csv)
]

print('Loaded')

***

Data loading is finished at this point. Analysis cells follow.

***

In [None]:
# Group cells into single-cell trajectories and build the cleaned-up dataframe
# cells_filt, which drops short trajectories and duplicated trajectories
# (duplications can arise from CellProfiler incorrectly splitting nuclei during
# tracking). Both 'cells' and 'cells_filt' get a new column, 'Track_and_group',
# holding a unique ID for each cell's trajectory.

min_traj_frames = 100 # minimum length, in frames, of a valid trajectory
max_final_clust = 0 # Maximum number of clusters that are allowed to be left
                    # at the end of the trajectory

# Column names used by the filters below
final_age = 'TrackObjects_FinalAge_25'
num_clust = 'Children_Clusters_in_ER_masks_masked_Count'

# Start from a copy of the accepted ER masks: one row per cell per frame
cells = er_masks.copy()
cells.index.name = 'Cell_ID'

#cells = cells.loc[cells['ImageNumber'] <= 213, :] # Should be fixed soon
#print("Warning! Cells gets truncated!")

# Copy per-image metadata onto each cell
for image_prop in ('Metadata_Frame', 'Group_Number'):
    cpt.add_image_prop_to_objects(cells, images, image_prop)

cells['tStress_hrs'] = cells['Metadata_Frame'] * time_step / 60


# Track label comes from the parent nucleus; combined with the group number
# it forms the trajectory ID
cpt.add_parent_prop(cells, nuclei_accepted, 'TrackObjects_Label_25',
                    'Parent_Nuclei_accepted', 'Track_Label')
cells['Track_Label_str'] = 'Track_' + cells['Track_Label'].astype(str)
cells['Track_and_group'] = (cells['Track_Label_str']
                            + '_Group_'
                            + cells['Group_Number'].astype(str))

# Rows that carry a final-age value are used to judge each trajectory
cpt.add_parent_prop(cells, nuclei_accepted, final_age,
                    'Parent_Nuclei_accepted', final_age)
final_frames = cells.loc[cells[final_age].notna()]

# Keep trajectories that are long enough, end with few enough clusters,
# and whose ID occurs only once among these rows
duration_filt = final_frames[final_age].ge(min_traj_frames)
declust_filt = final_frames[num_clust].le(max_final_clust)
unique_filt = ~final_frames['Track_and_group'].duplicated(keep=False)
track_labels_filt = final_frames.loc[duration_filt & declust_filt & unique_filt,
                                     'Track_and_group']
cells_filt = cells.loc[cells['Track_and_group'].isin(track_labels_filt)].copy()

# Plot results
"""
fig, ax = plt.subplots(1,2)
fig.tight_layout(pad=2)
result_name = 'Children_Clusters_in_ER_masks_masked_Count'
ax[0] = sns.lineplot(x='tStress_hrs', y=result_name, data=cells_filt, hue='Track_and_group', 
                  ci=None, legend=None, ax=ax[0])
ax[1] = sns.lineplot(x='tStress_hrs', y=result_name, data=cells_filt, 
                  legend=None, ax=ax[1])

plt.show()
"""

In [None]:
# Filter cells further to contain only trajectories that start and end with no clusters.
# Build a dataframe of trajectories containing start and end points, and add a
# 'Time_Norm' column to cells_filt in which each kept trajectory is rescaled so
# that the frame before the first change in cluster number maps to 0 and the
# frame after the last change maps to 1.

num_clust = 'Children_Clusters_in_ER_masks_masked_Count'
min_clust = 5 # Minimum number of clusters per frame to count the cell as clustering

t0 = time.time()

trajectories = []
cells_filt['Time_Norm'] = np.nan # will be used to store normalized values
for traj_id in cells_filt['Track_and_group'].unique():
    # Row mask for this trajectory, computed once and reused for reads and writes
    cells_in_traj = cells_filt['Track_and_group'] == traj_id
    frames = cells_filt.loc[cells_in_traj, 'Metadata_Frame'].values
    counts = cells_filt.loc[cells_in_traj, num_clust].values

    # Work on a chronologically ordered view so that "first" and "last" do not
    # depend on the row order of the table (no-op if rows are already ordered)
    order = np.argsort(frames, kind='stable')
    frames_sorted = frames[order]
    counts_sorted = counts[order]

    # Only keep trajectories that begin and end with zero clusters
    # but have min_clust clusters in the middle
    if counts_sorted[0] > 0 or counts_sorted[-1] > 0 or counts_sorted.max() < min_clust:
        continue

    # Frame-to-frame changes in number of clusters; changed[i] is True when the
    # count differs between positions i and i+1
    changed = np.diff(counts_sorted).astype(bool)
    if not changed.any():
        continue # nothing to normalize against (only reachable if min_clust <= 0)

    # Positions bracketing the clustering period, translated to frame numbers.
    # The positions index the trajectory, not the movie, so they are mapped to
    # 'Metadata_Frame' values before being compared with frame numbers below.
    first_pos = np.argmax(changed)
    last_pos = len(changed) - np.argmax(changed[::-1])
    first_clust_frame = frames_sorted[first_pos]
    last_clust_frame = frames_sorted[last_pos]

    frame_span = last_clust_frame - first_clust_frame
    if not frame_span > 0:
        continue # duplicated or missing frame numbers; cannot rescale
    frame_interval = 1.0 / frame_span

    # Add rescaled time to cells_filt (same row order as the mask)
    n_time = (frames - first_clust_frame) * frame_interval
    cells_filt.loc[cells_in_traj, 'Time_Norm'] = n_time
    # NOTE(review): the last key uses a hyphen ('Last_clust-frame'), unlike
    # 'First_clust_frame'; kept unchanged so existing users of trj keep working.
    trajectories.append({'Track_and_group' : traj_id,
                         'First_clust_frame' : first_clust_frame,
                         'Last_clust-frame' : last_clust_frame})

# Explicit columns keep trj usable even when no trajectory passed the filters
trj = pd.DataFrame(trajectories,
                   columns=['Track_and_group', 'First_clust_frame', 'Last_clust-frame'])


# Keep only the cells that belong to a trajectory that was rescaled above
cells_filt2 = cells_filt[cells_filt['Track_and_group'].isin(trj['Track_and_group'].unique())]


t1 = time.time()
print(t1-t0)


# Plot results
result_name = 'Children_Clusters_in_ER_masks_masked_Count'

if cells_filt2.empty:
    print('No trajectories passed the filters; nothing to plot')
else:
    fig, ax = plt.subplots()
    fig.tight_layout(pad=2)
    #ax = sns.lineplot(x='tStress_hrs', y=result_name, data=cells_filt2,
    #                  ci=None, legend=None, ax=ax)
    #ax.set_xlim(left=-0.1, right=1.5)

    #ax = sns.lineplot(x='Time_Norm', y=result_name, data=cells_filt2,
    #                  legend=None, ax=ax)

    # Binned means of cluster count against normalized time
    ax = sns.regplot(x='Time_Norm', y=result_name, data=cells_filt2,
                     x_bins=50, fit_reg=False, ax=ax)

    ax.set_xlim(left=-0.1, right=1.5)
    plt.show()



In [None]:
# Label every cluster with its frame and its cell's track, then plot cluster
# area over time per track

result_name = 'Children_Clusters_in_ER_masks_masked_Count'

t0 = time.time() # for optimization

# Frame number comes from the image table, track label from the parent cell
cpt.add_image_prop_to_objects(ire1_clust, images, 'Metadata_Frame')
cpt.add_parent_prop(ire1_clust, cells, 'Track_Label',
                    'Parent_ER_masks_accepted', 'Track_Label')
# NOTE(review): unlike 'Track_and_group' built for cells, this label does not
# include the group number; confirm track labels are unique across groups.
ire1_clust['Track_Label_str'] = 'Track_' + ire1_clust['Track_Label'].astype(str)

t1 = time.time() # for optimization
print(t1 - t0)

# Number of cluster rows
print(ire1_clust.shape[0])

# Plot results: one line per track label
fig, ax = plt.subplots()
fig.tight_layout(pad=2)
#ax = sns.scatterplot(x='Metadata_Frame', y='AreaShape_Area', data=ire1_clust, hue='Track_Label_str',
#                  s=3, legend=None, ax=ax)
ax = sns.lineplot(data=ire1_clust,
                  x='Metadata_Frame',
                  y='AreaShape_Area',
                  hue='Track_Label_str',
                  legend=None,
                  ax=ax)

plt.show()


***
Old or testing cells below 
***

In [None]:
# Plot cell intensities over time:
# left panel  - integrated IRE1 intensity per cell (all cells)
# right panel - number of clusters per cell (only cells that have clusters)

result_name_1 = 'Intensity_IntegratedIntensity_IRE1_mNeonGreen'
result_name_2 = 'Children_Clusters_in_ER_masks_masked_Count'
# Time column of `cells`; defined here so this cell does not depend on a
# later cell having been run first
t_stress_col = 'tStress_hrs'


fig, ax = plt.subplots(1,2)
fig.tight_layout(pad=2)

ax[0] = sns.lineplot(x=t_stress_col, y=result_name_1, data=cells, ax=ax[0])
ax[0].set_title(t_stress_col)
ax[0].set_xlabel(t_stress_col)
# Label the axis with the quantity actually plotted on it
ax[0].set_ylabel(result_name_1)
ax[0].set_ylim(bottom=0)

cells_w_clust = cells.loc[cells[result_name_2] > 0]
ax[1] = sns.lineplot(x=t_stress_col, y=result_name_2, data=cells_w_clust, ax=ax[1], ci=None)

# Save before showing so the exported figure is the one drawn above
if save_figs:
    fig_filename_pdf = os.path.join(save_dir_pdf, 'Cell_Intensity_over_time.pdf')
    fig.savefig(fig_filename_pdf)

plt.show()
print('Done')


In [None]:
# Plot the fraction of cells that contain clusters, per condition
condition = 'tStress_hrs'

# Flag each cell by whether its cluster count is non-zero
#cells['Has_IRE1_clusters'] = cells['Children_IRE1_clusters_Count'].astype('bool')
clust_counts = cells['Children_Clusters_in_ER_masks_masked_Count']
cells['Has_IRE1_clusters'] = clust_counts.astype(bool)

#frac_clust = cpt.bootstrap_cell_prop (cells, 'Has_IRE1_clusters', condition)

fig, ax = plt.subplots()
fig.tight_layout(pad=2)

#ax = sns.barplot(data=frac_clust, color='steelblue', ci="sd")

# The mean of the boolean flag at each time point is the fraction of cells
ax = sns.lineplot(data=cells, x=condition, y='Has_IRE1_clusters')

ax.set(title='Fraction of cells with clusters over time',
       xlabel='Hours of Tm treatment',
       ylabel='Fraction of cells with clusters')
ax.set_ylim(bottom=0)

if save_figs:
    fig_filename_pdf = os.path.join(save_dir_pdf, 'Fraction_cell_with_clusters.pdf')
    plt.savefig(fig_filename_pdf)

plt.show()

In [None]:
# Plot cluster area (in square microns) over time of stress
result_name = 'AreaShape_Area'
#result_name2 = 'Intensity_IntegratedIntensity_IRE1_mNeonGreen'
#result_name = 'Intensity_IntegratedIntensity_IRE1_mNeonGreen'

frame = 'Metadata_Frame'
t_stress_col = 'tStress_hrs'
# Same helper the other cells of this notebook use to copy image metadata
# onto objects
cpt.add_image_prop_to_objects(ire1_clust, images, frame)
# Frame number -> hours of stress
ire1_clust[t_stress_col] = ire1_clust[frame] * time_step / 60


# Convert the area from pixels to square microns
result_name_microns = 'Cluster_area_um2'
pixel_area = pixel_size**2
ire1_clust[result_name_microns] = ire1_clust[result_name] *pixel_area


fig, ax = plt.subplots()
fig.tight_layout(pad=2)

# x is the time column created above, so this cell does not rely on a
# `condition` variable left over from another cell
ax = sns.lineplot(x=t_stress_col, y=result_name_microns, data=ire1_clust, ci=None, ax=ax)

if save_figs:
    fig_filename_pdf = os.path.join(save_dir_pdf, 'Cluster_areas_vs_timepoint.pdf')
    fig.savefig(fig_filename_pdf)

plt.show()

In [None]:
# Link every cluster to its accepted nucleus and copy nuclear geometry onto it

# Step 1: store the accepted-nucleus object number on each entry of nuclei_all
prop = 'ObjectNumber'
rel_col = 'Parent_Nuclei_all'
n = 'Nuclei_Accepted_ObjID'
cpt.add_child_prop_to_parents(nuclei_all, nuclei_accepted, prop, rel_col, n)

# Step 2: hand that ID down the parent chain
# nuclei_all -> er_masks_all -> er_masks -> ire1_clust
for child_df, parent_df, link_col in ((er_masks_all, nuclei_all, 'Parent_Nuclei_all'),
                                      (er_masks, er_masks_all, 'Parent_ER_masks_all'),
                                      (ire1_clust, er_masks, 'Parent_ER_masks_accepted')):
    cpt.add_parent_prop(child_df, parent_df, n, link_col, n)

# Step 3: add nucleus coordinates and radii to clusters, each stored under
# the property name with a '_Nucleus' suffix
props = ['AreaShape_Center_X', 'AreaShape_Center_Y', 'AreaShape_MeanRadius']
props_mod = []
for prop in props:
    result_name = f'{prop}_Nucleus'
    cpt.add_parent_prop(ire1_clust, nuclei_accepted, prop, n, result_name)
    props_mod.append(result_name)
    print(f'Processing property {prop}')
print('Done')

In [None]:
# Compute cluster-to-nucleus distances, plot them over time and compare two
# time points
condition = 'tStress_hrs'
result_1 = 'Dist_to_Nucleus_Edge'
result_2 = 'AreaShape_Area'

# Centre coordinates of each cluster and of its nucleus
c_x = ire1_clust['AreaShape_Center_X']
c_y = ire1_clust['AreaShape_Center_Y']
n_x = ire1_clust['AreaShape_Center_X_Nucleus']
n_y = ire1_clust['AreaShape_Center_Y_Nucleus']

# Euclidean centre-to-centre distance; subtracting the mean nuclear radius
# gives the distance to the nuclear edge
d = np.sqrt(np.square(n_x - c_x) + np.square(n_y - c_y))
ire1_clust['Dist_to_Nucleus_Center'] = d
ire1_clust[result_1] = d - ire1_clust['AreaShape_MeanRadius_Nucleus']

fig, ax = plt.subplots()
fig.tight_layout(pad=2)

#ax = sns.swarmplot(x=condition, y=result_1, data=clust_filt, color=".25", size=1)
ax = sns.lineplot(data=ire1_clust, x=condition, y=result_1)

if save_figs:
    fig_filename_pdf = os.path.join(save_dir_pdf, 'Cluster_to_nucleus_distance.pdf')
    plt.savefig(fig_filename_pdf)

# Welch's t-test (unequal variances) between the two selected condition values
cond1 = 1
cond2 = 2
data1 = ire1_clust.loc[ire1_clust[condition] == cond1, result_1]
data2 = ire1_clust.loc[ire1_clust[condition] == cond2, result_1]

print(stats.ttest_ind(data1, data2, equal_var=False))

In [None]:
# Plot cluster properties per cell: aggregate a cluster property over all
# clusters of each cell and plot the aggregate over time

#prop = 'AreaShape_Compactness'
#prop = 'AreaShape_Area'
prop = 'Intensity_IntegratedIntensity_IRE1_mNeonGreen'
stat='sum'

result_name = 'IRE1_clust_'+prop+'_'+stat
rel_col = 'Parent_ER_masks_accepted'
condition = 'tStress_hrs'

cpt.add_child_prop_to_parents (cells, ire1_clust, prop, rel_col,
                             result_name, statistic=stat)
# Plot only cells for which the aggregate is available
cells_valid = cells.dropna(subset=[result_name])

fig, ax = plt.subplots()
fig.tight_layout(pad=2)

ax = sns.lineplot(x=condition, y=result_name, data=cells_valid, color='steelblue', ci=68, ax=ax)
ax.set_title(result_name)
ax.set_xlabel(condition)
ax.set_ylabel(result_name)
ax.set_ylim(bottom=0)

# Save before plt.show(): once the figure has been shown and closed (as the
# inline backend does), a later savefig would write an empty figure
if save_figs:
    fig_filename_pdf = os.path.join(save_dir_pdf, 'Sum_cluster_intensity_per_cell.pdf')
    fig.savefig(fig_filename_pdf)

plt.show()


In [None]:
# Plot fraction of IRE1 in clusters per cell

prop_parent = 'Intensity_IntegratedIntensity_IRE1_mNeonGreen'
prop_child = 'Intensity_IntegratedIntensity_IRE1_mNeonGreen'
stat='sum'

child_result = 'IRE1_clust_'+prop_child+'_'+stat
rel_col = 'Parent_ER_masks_accepted'
condition = 'tStress_hrs'
fraction_clust = 'Fraction_IRE1_in_clusters'

cpt.add_child_prop_to_parents (cells, ire1_clust, prop_child, rel_col, 
                             child_result, statistic=stat)

cells[fraction_clust] = cells[child_result] / cells[prop_parent]
cells[fraction_clust].fillna(0, inplace=True)

fig, ax = plt.subplots(1,2)
fig.tight_layout(pad=2)

ax[0] = sns.lineplot(x=condition, y=fraction_clust, 
                 data=cells, ax=ax[0])
#ax = sns.swarmplot(x=group_name, y=fraction_clust, data=cells, color=".25")

ax[0].set_title(fraction_clust)
ax[0].set_xlabel(group_name)
ax[0].set_ylabel(fraction_clust)



ax[1] = sns.scatterplot(cells[prop_parent], cells[child_result], ax=ax[1])
plt.show()

if save_figs:
    fig_filename_pdf = os.path.join(save_dir_pdf, 'Fraction_IRE1_in_clusters.pdf')
    plt.savefig(fig_filename_pdf)

In [None]:
# Plot scatterplot of parent vs. child properties for each cell, one colour
# per group. Reuses prop_parent, prop_child, rel_col, child_result and stat
# from the "fraction of IRE1 in clusters" cell.

# Column that splits cells into groups, and groups to leave out of the plot.
# NOTE(review): no earlier cell defines these two names; 'Group_Number' is the
# per-image group column already added to `cells` - confirm it is the intended split.
group_str = 'Group_Number'
excluded_groups = []

cpt.add_child_prop_to_parents (cells, ire1_clust, prop_child, rel_col,
                             child_result, statistic=stat)
cells_valid = cells.dropna(subset=[child_result])

# Fresh single-axes figure; the `ax` left over from the previous cell is an
# array of two axes and cannot be used for legend()/set_xlim()
fig, ax = plt.subplots()
fig.tight_layout(pad=2)

for group in cells_valid[group_str].unique():
    if group in excluded_groups:
        continue
    cells_group = cells_valid.loc[cells_valid[group_str] == group]
    ax.scatter(cells_group[prop_parent], cells_group[child_result],
               s=5, label=str(group))

ax.set_xlabel(prop_parent)
ax.set_ylabel(child_result)
ax.legend(loc="best")

ax.set_xlim(left=0)
plt.show()