In [13]:
# want to see the images inline
# %matplotlib inline

In [14]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from scipy.stats import gamma
from scipy.optimize import curve_fit
from scipy import special

from configparser import ConfigParser, ExtendedInterpolation

import utils as utils

from tqdm import tqdm_notebook as tqdm

In [15]:
# Read the notebook settings from config.ini (path is relative to the working directory).
# ConfigParser.read() skips files it cannot open and only reports the ones it parsed,
# so an empty result means the settings are missing: stop here with a clear message
# instead of failing later on the first config.get().
config = ConfigParser(interpolation=ExtendedInterpolation())
loaded_configs = config.read('config.ini')
if not loaded_configs:
    raise FileNotFoundError("config.ini could not be read from the current working directory")
# keep showing the list of parsed files as the cell output
loaded_configs

['config.ini']

In [16]:
# Values of the c*_type columns that select which images get processed.
# The '.int' variants ('dpy23.int', 'sdc2.int') are currently left out.
TYPES = [
    'Pha4.ex',
    'ama1.ex',
    'dpy23.ex',
    'mdh1.ex',
    'sdc2.ex',
    'wdr5.2.ex',
]

In [17]:
# Pull the run settings out of the [main] section of the config file:
# four plain string options plus one boolean switch.
FOLDER, DB_FILENAME, TYPE, COLOR = (
    config.get('main', option)
    for option in ('ROOTFOLDER', 'DB_FILENAME', 'TYPE', 'COLOR')
)
# getboolean() parses the option text into a real bool
MAKE_IMAGES = config.getboolean('main', 'MAKE_IMAGES')

In [18]:
# Constant histogram parameters shared by all graphs.
num_bins = 100
# graph [xmin, xmax]
xmin = -0.2
xmax = 3.2
# spacing between neighbouring bin edges
binwidth = (xmax - xmin)/(num_bins - 1)

# np.arange with a float step can return one element more or less than intended
# because of rounding; np.linspace always yields exactly num_bins edges and hits
# xmin and xmax exactly.
bins = np.linspace(xmin, xmax, num_bins)
print ('bins: ', bins.shape)

bins:  (100,)


In [19]:
# important indices
type_columns = ['c0_type', 'c1_type', 'c2_type']
stain_prefix = np.array([['C0-', 'C1-', 'C2-', 'C3-', 'C4-']])
filename_column = 'cropped_image_file'

In [20]:
# read the db and parse images that we want to process
df_path = os.path.join(FOLDER, 'smFISH-database', DB_FILENAME)
df = pd.read_csv(df_path, 
                 sep=',', 
                )

In [21]:
# TODO: maybe it is necessary to fill in other values here, too
# Fix missing entries in the columns that we are planning to use.
# `df[col].fillna(..., inplace=True)` works on a temporary column object
# (chained assignment): recent pandas versions warn about it and, with
# Copy-on-Write, leave `df` unchanged. Assigning the result back always works.
text_columns = ['cropped_image_file', 'c0_type', 'c1_type', 'c2_type']
df[text_columns] = df[text_columns].fillna('')

In [23]:
# Work on a copy of the whole table.
# Alternative kept for reference: restrict to one experiment by file-name prefix
# (NOTE(review): EXPERIMENT is not defined in any visible cell - confirm before enabling).
dff = df.copy() # df[df['cropped_image_file'].apply(lambda x: x.startswith(EXPERIMENT))]
# dff = df[df['cropped_image_file'].apply(lambda x: x.startswith(EXPERIMENT))]

In [24]:
# Preview only the first rows instead of rendering the whole table.
dff.head(10)

Unnamed: 0,c0,c1,c2,c3,c4,c0_lambda,c1_lambda,c2_lambda,c3_lambda,c4_lambda,...,#c0_smfish_adj,#c1_smfish_adj,#c2_smfish_adj,is_male_batch,is_male,is_z_cropped,is_too_bleached,num_z_planes,tx,tx_desc
0,Cy5,GoldFISH,mCherry,GFP,DAPI,670.0,566.0,610.0,507.0,461.0,...,1735.83,-1.00,1408.86,0.0,-1.0,-1.0,-1.0,91.0,-1.0,
1,Cy5,mCherry,GoldFISH,GFP,DAPI,670.0,610.0,566.0,507.0,461.0,...,537.07,838.14,-1.00,0.0,-1.0,1.0,-1.0,81.0,-1.0,
2,Cy5,mCherry,GoldFISH,GFP,DAPI,670.0,610.0,566.0,507.0,461.0,...,1296.55,323.36,-1.00,0.0,-1.0,1.0,-1.0,81.0,-1.0,
3,Cy5,mCherry,GoldFISH,GFP,DAPI,670.0,610.0,566.0,507.0,461.0,...,1681.77,3289.43,-1.00,0.0,-1.0,1.0,-1.0,81.0,-1.0,
4,Cy5,mCherry,GoldFISH,GFP,DAPI,670.0,610.0,566.0,507.0,461.0,...,92.80,1740.47,-1.00,0.0,-1.0,1.0,-1.0,81.0,-1.0,
5,Cy5,mCherry,GoldFISH,GFP,DAPI,670.0,610.0,566.0,507.0,461.0,...,61.99,151.40,-1.00,0.0,-1.0,1.0,-1.0,81.0,-1.0,
6,Cy5,mCherry,GoldFISH,GFP,DAPI,670.0,610.0,566.0,507.0,461.0,...,-1.00,-1.00,-1.00,0.0,-1.0,1.0,-1.0,81.0,-1.0,
7,Cy5,mCherry,GoldFISH,GFP,DAPI,670.0,610.0,566.0,507.0,461.0,...,151.76,3972.97,-1.00,0.0,-1.0,1.0,-1.0,81.0,-1.0,
8,Cy5,mCherry,GoldFISH,GFP,DAPI,670.0,610.0,566.0,507.0,461.0,...,1670.29,396.35,-1.00,0.0,-1.0,1.0,-1.0,81.0,-1.0,
9,Cy5,mCherry,GoldFISH,GFP,DAPI,670.0,610.0,566.0,507.0,461.0,...,216.00,545.59,-1.00,0.0,-1.0,0.0,-1.0,81.0,-1.0,


In [25]:
# Find every (row, channel) cell whose type is one of TYPES and build the
# matching per-channel file name "<prefix><image name without extension>".
# DataFrame.isin is the vectorized replacement for the deprecated
# applymap(lambda x: x in TYPES).
is_wanted = dff[type_columns].isin(TYPES).to_numpy()
row, col = np.where(is_wanted)
n_samples = dff.shape[0]
# The prefix depends only on the channel (column) index, so index the single
# prefix row directly instead of repeating it once per sample.
new_prefix = stain_prefix[0, col]
new_filename = dff[filename_column].values[row]
# b_[:-4] drops the last four characters - presumably a three-letter extension
# such as ".tif"; verify against the database entries.
full_filenames = ["{}{}".format(a_, b_[:-4]) for a_, b_ in zip(new_prefix, new_filename)]

In [27]:
# Map every prefixed image name to the path of its csv file inside FOLDER/csv.
csv_dir = os.path.join(FOLDER, "csv")
dataset = []
with tqdm(total=len(full_filenames)) as pbar:
    for ff in full_filenames:
        dataset.append(os.path.join(csv_dir, ff + ".csv"))
        pbar.update(1)

HBox(children=(IntProgress(value=0, max=13399), HTML(value='')))




In [28]:
# number of (image, channel) file names selected for processing
len(full_filenames)

13399

In [None]:
# center_path = os.path.join(FOLDER, "centers", "all-centers.csv")

In [31]:
# Fit a gamma distribution to the intensities stored in every csv of `dataset`
# and record, per file, the bin edge at which the fitted curve peaks (`center_set`).
# Idea that is not implemented yet: perform the fit multiple times with
# different initial parameters and choose the best one.
# n_fits = 10

center_set = {}

# Loop-invariant pieces: the output folder and the dense x grid used to draw
# the fitted curve do not depend on the file being processed.
folder_path = os.path.join(FOLDER, "histograms-2")
os.makedirs(folder_path, exist_ok=True)
x = np.linspace(xmin, xmax, 1000)

print(f'Processing: {len(dataset)} files') 

pbar = tqdm(total=len(dataset))
try:
    for d in dataset:
        pbar.update(1)
        # files listed in the database but missing on disk are skipped
        if not os.path.exists(d):
            continue

        try:
            # computation

            # load the data of one image
            I = utils.load_data(d, skiprows_=0)

            # histogram of the intensities over the shared bin edges
            binned_values, real_bins = np.histogram(I, bins)
            if not binned_values.any():
                # No value falls inside [xmin, xmax]: the scale guess
                # 1/np.max(binned_values) below would divide by zero.
                continue

            # starting guesses for the gamma fit: loc and scale passed as
            # keywords are initial values for the optimizer, not fixed values
            use_median = np.median(I)
            fit_alpha, fit_loc, fit_beta = gamma.fit(I,
                                                     loc=use_median/2,
                                                     scale=1/np.max(binned_values),
                                                    )

            # normalization factor: area of the histogram, so that the pdf is
            # drawn on the same scale as the raw counts
            factor = np.sum(binned_values*np.diff(real_bins))
            yhat = factor*gamma.pdf(real_bins,
                                    fit_alpha,
                                    fit_loc,
                                    fit_beta,
                                   )
            if np.any(np.isnan(yhat)):
                # the fit did not produce a usable curve
                continue

            # bin edge at which the fitted curve is highest
            peak_center = real_bins[np.argmax(yhat)]

            # yhat has one value per bin edge; drop the last one to match the bins
            error = utils.fitter_meter(binned_values, yhat[:-1])

            # plotting
            title = utils.create_title(d, name_id=-1)
            if MAKE_IMAGES:
                # fitted curve on the dense grid (only needed for the figure)
                y = factor*gamma.pdf(x,
                                     fit_alpha,
                                     fit_loc,
                                     fit_beta)

                # create the canvas
                fig, ax = plt.subplots(figsize=(8, 5))
                fig.suptitle(title)

                ax.hist(I,
                        bins=bins,
                        color=COLOR,
                        # label=TYPE,
                        density=False,
                       )
                ax.plot(x,
                        y,
                        linewidth=5,
                        color='#66A5AD',
                       )
                # vertical line for center
                ax.axvline(x=peak_center,
                           linestyle="--",
                           linewidth=5,
                           color='#66A5AD',
                          )

                # reasonable adjustments to make the data look nicer
                ax.set_xlabel('intensity')
                ax.set_ylabel('# spots')

                info_text = "\n".join([
                    "Total: " + str(I.shape[0]),
                    "Peak: " + '%.2f' % peak_center,
                    "L1: " + '%.2f' % error[0],
                    "L2: " + '%.2f' % error[1],
                ])

                x_limits = [xmin, xmax]
                # tallest bar of the histogram computed above
                ymax = np.max(binned_values)
                y_limits = [0, ymax]

                ax.text(x_limits[1] - (x_limits[1] - x_limits[0])*0.15,
                        y_limits[1]*0.8,
                        info_text,
                        color='black',
                        bbox=dict(facecolor='white', alpha=1))
                ax.set_xlim([xmin, xmax])

                # fig.savefig(os.path.join(folder_path, title + ".pdf"))
                plt.show()
                # release the figure so thousands of files do not pile up in memory
                plt.close(fig)

            # save the peak values for further use
            center_set[title] = peak_center
        except (RuntimeError, TypeError, ValueError) as exc:
            # keep going with the next file, but say which file failed and why
            print(f"Skipping {d}: {type(exc).__name__}: {exc}")
finally:
    # close the progress bar even when the run is interrupted
    pbar.close()
# df_center = pd.DataFrame(list(center_set.items()), columns=['filename', 'center'])
# if (os.path.exists(center_path)):
#     df_center.to_csv(center_path, index=False, header=False, encoding='utf-8', mode = 'a')
# else:
#     df_center.to_csv(center_path, index=False, header=True, encoding='utf-8', mode = 'w' )
#     print (df_center)
    

Processing: 13399 files


HBox(children=(IntProgress(value=0, max=13399), HTML(value='')))

  data = np.loadtxt(file_path, delimiter = '\t', skiprows=skiprows_)
  return m3 / np.power(m2, 1.5)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  r = func(a, **kwargs)
  if (numpy.max(numpy.ravel(numpy.abs(sim[1:] - sim[0]))) <= xatol and
  xr = (1 + rho) * xbar - rho * sim[-1]
  sim[j] = sim[0] + sigma * (sim[j] - sim[0])
  return (self.a <= x) & (x <= self.b)
  return (self.a <= x) & (x <= self.b)
  r = func(a, **kwargs)
  if (numpy.max(numpy.ravel(numpy.abs(sim[1:] - sim[0]))) <= xatol and
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  xr = (1 + rho) * xbar - rho * sim[-1]
  sim[j] = sim[0] + sigma * (sim[j] - sim[0])
  return (self.a <= x) & (x <= self.b)
  return (self.a <= x) & (x <= self.b)
  r = func(a, **kwargs)
  if (numpy.max(numpy.ravel(numpy.abs(sim[1:] - sim[0]))) <= xatol and
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  xr = (1 + rho) * xbar - rho * sim[-1]
  sim[j] = sim[0] + sigma * (sim[j] - sim[0])
  return (self.a <= x) 

  r = func(a, **kwargs)
  if (numpy.max(numpy.ravel(numpy.abs(sim[1:] - sim[0]))) <= xatol and
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  xr = (1 + rho) * xbar - rho * sim[-1]
  sim[j] = sim[0] + sigma * (sim[j] - sim[0])
  return (self.a <= x) & (x <= self.b)
  return (self.a <= x) & (x <= self.b)


KeyboardInterrupt: 

In [None]:
# show the TYPE option read from the [main] section of the config file
TYPE

In [None]:
# NOTE(review): PARAMS is not assigned in any cell visible here; unless it is
# defined elsewhere this raises NameError on a fresh kernel - confirm or remove.
PARAMS