This is a testing notebook for processing IRE1 cluster data based on CSV output files from Cell Profiler.

In [16]:
# load modules

# uncomment for debugging
%load_ext autoreload
%autoreload 2
%matplotlib

import os, sys, inspect
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import pprint
import re
import time
import seaborn as sns
import warnings


# Disable future warnings for seaborn
warnings.simplefilter(action='ignore', category=FutureWarning)


# Add source code directory (src) to path to enable module import
curr_frame = inspect.getfile(inspect.currentframe())
curr_dir = os.path.dirname(os.path.abspath(curr_frame))
parent_dir = os.path.dirname(curr_dir)
module_dir = os.path.join(parent_dir, 'src')
os.sys.path.insert(0, module_dir)

import cellprofiler_tools as cpt


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Using matplotlib backend: Qt5Agg


In [11]:
# Load any required data from HDF5 file
# Path to raw HDF5 file
data_dir = '../data/processed/CellProfiler_FociQuant03/csv_outputs'

image_file_csv = 'FociQuant03_Image.csv'
er_masks_csv = 'FociQuant03_ER_masks_accepted.csv'
ire1_clust_csv = 'FociQuant03_IRE1_clusters.csv'


# Load the image file
image_full_file = os.path.join(data_dir, image_file_csv)
images = cpt.get_data_cp_csv(image_full_file, data_fields=['ImageNumber','FileName_DNA_DAPI'])
images = cpt.get_data_cp_csv(image_full_file)
#images = cpt.get_data_cp_csv(image_full_file, data_fields=['ImageNumber','FileName_DNA_DAPI'])

er_masks = cpt.get_data_cp_csv(os.path.join(data_dir, er_masks_csv))
ire1_clust = cpt.get_data_cp_csv(os.path.join(data_dir, ire1_clust_csv))

#print(er_masks)

print(er_masks)


      ImageNumber  ObjectNumber  AreaShape_Area  AreaShape_Center_X  \
0               1             1           13992               203.0   
1               1             2           29278               346.0   
2               1             3           15968               357.0   
3               1             4           13306               177.0   
4               1             5           11964               336.0   
5               1             6           15408               231.0   
6               2             1           15181               334.0   
7               2             2           18866               221.0   
8               2             3           14385               419.0   
9               2             4           24413               153.0   
10              2             5           10197               367.0   
11              2             6           17229               245.0   
12              3             1            6820               346.0   
13    

In [36]:
# Plot fraction of cells with clusters per condition

# Create a dataframe for all cells that are included in the analysis
cells = er_masks.copy()
cells.index.name = 'Cell_ID'


"""
# Locate image title of each cell
mask_ids = []
clust_ids = []
hours_tm = [] # hours of tunicamycin treatment
positions = []
img_names = []


cells['ER_MaskNum'] = mask_ids
cells['IRE1_clust_IDs'] = clust_ids
cells['Hrs_Tm'] = hours_tm
cells['ImgName'] = img_names
cells['Position'] = positions
cells['NumClusters'] = [len(row.IRE1_clust_IDs) for row in cells.itertuples()]
"""

hours_tm = [] # hours of tunicamycin treatment
img_names = []

for index, cell in cells.iterrows():
    img_name = images.loc[images['ImageNumber'] == cell['ImageNumber'], 'FileName_DNA_DAPI']
    hr_tm = images.loc[images['ImageNumber'] == cell['ImageNumber'], 'Metadata_hours_Tm']
    hours_tm.append(hr_tm)
    img_names.append(img_name)
    
cells['Hrs_Tm'] = hours_tm
cells['ImgName'] = img_names

fraction_clust = {}
bootstrap_nreps = 1000
conditions_unique = images['Metadata_hours_Tm'].unique()

print(conditions_unique)
t = cells.at[5,'Hrs_Tm']
print("t: ", type(t))
print(t)


for condition in conditions_unique:
    
    cells_in_cond = cells.loc[cells['Hrs_Tm'] == condition]
    total_cells = len(cells_in_cond)
    
    # Bootstrap the samples to estimate uncertainties
    metric = []
    for i in range(bootstrap_nreps):
        subsamp_cells = np.random.choice(list(cells_in_cond['Children_IRE1_clusters_Count']), total_cells)
        num_cells_with_clust = np.count_nonzero(subsamp_cells)
        fraction_with_clust = num_cells_with_clust / total_cells
        metric.append(fraction_with_clust)
    
    fraction_clust.update({condition : metric})

frac_clust = pd.DataFrame(data=fraction_clust)

ax = sns.barplot(data=frac_clust, color='steelblue', ci="sd")
ax.set_title('Formation and dissolution of IRE1 clusters in stressed cells')
ax.set_xlabel('Hours of Tm treatment')
ax.set_ylabel('Fraction of cells with clusters')
ax.set_ylim(bottom=0)
plt.show()

[ 0  1  2  4  8 20 24 32]
t:  <class 'pandas.core.series.Series'>
0    0
Name: Metadata_hours_Tm, dtype: int64


TypeError: cannot convert the series to <class 'float'>

In [None]:
"""
# Uncomment for a quick timer
start = time.time()
print("Start timer")

end = time.time()
print(end - start)
"""

In [None]:
string = 'C1-AVG_vVB_190416_03_Well03_02hTm_1_MMStack_Pos0.ome.tif'


z = re.match(r".*_(\d+)hTm_.*", string)

print(z.group(1))

