In [None]:
# render matplotlib figures inline, directly below the cell that creates them
%matplotlib inline

In [None]:
# imports
# general 
import os
import glob
from functools import reduce
import re
import csv as csv
# scientific 
import matplotlib.pyplot as plt
import numpy as np
from sklearn import linear_model, datasets
from scipy.stats import norm, gamma
from scipy.optimize import curve_fit
from scipy import special

import pandas as pd

from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
# let pandas display up to 100 columns before truncating wide DataFrames
pd.set_option('display.max_columns', 100)

In [None]:
import utils as utils

In [None]:
# Root folder that contains the z-corrected spots.
# The original external-volume path stays the default; set the SMFISH_FOLDER
# environment variable to run the notebook against a different location
# without editing this cell.
FOLDER = os.environ.get('SMFISH_FOLDER', '/Volumes/MILKYKLIM/2019-05-06-exp/rnai')
EXPERIMENT = 'RNAi'

In [None]:
# some const params for all graphs
# num_bins is the number of bin EDGES, i.e. the histograms get num_bins - 1 bins
num_bins = 100
# graph [xmin, xmax]
xmin = -0.2
xmax = 1.2
binwidth = (xmax - xmin)/(num_bins - 1)

# np.arange with a floating point step can return one element more or less
# than expected because of rounding; np.linspace always yields exactly
# num_bins edges, starting at xmin and ending exactly at xmax.
bins = np.linspace(xmin, xmax, num_bins)
print ('bins: ', bins.shape)

In [None]:
# for testing: fit a gamma distribution to the intensities of a single file
# and overlay the fitted pdf on the density-normalised histogram
filename = 'C0-RNAi_868_cropped_4417'
filepath = os.path.join(FOLDER, 'csv', filename + '.csv')

I = utils.load_data(filepath)

fig = plt.figure(figsize=(8,5))
title = filename
plt.title(title)

plt.xlabel('intensity')
plt.ylabel('# spots')

print("I_min:", min(I), "I_max:", max(I))

I_res = I
fit_alpha, fit_loc, fit_beta = gamma.fit(I_res)
print(fit_alpha, fit_loc, fit_beta)

# plt.hist returns the heights of the bars it has drawn; with density=True
# these are densities, i.e. the units of the y axis of this figure
density_values, _, _ = plt.hist(I, bins=bins, color='pink', density=True)
# plt.text(0.9*xmax, 0.1, "Total: " + str(I.shape[0]), color='black', bbox=dict(facecolor='white', alpha=1))

info_text = "Total: " + str(I.shape[0])

x_limits = [xmin, xmax]
# The text anchor has to be expressed in the units of the plotted histogram.
# Taking the maximum of the raw counts here would push the info box far
# outside of the density-scaled axes.
ymax = np.max(density_values)
y_limits = [0, ymax]

plt.text(x_limits[1] - (x_limits[1] - x_limits[0])*0.1, y_limits[0] + (y_limits[1] - y_limits[0])*0.04, info_text, color='black', bbox=dict(facecolor='white', alpha=1))

# fitted pdf on a fine grid; it is directly comparable to the density histogram
x = np.linspace(xmin, xmax, 1000)
y = gamma.pdf(x, fit_alpha, fit_loc, fit_beta)
plt.plot(x,y)

# position of the maximum of the fitted pdf on the grid
print("peak center:", x[np.argmax(y)])

plt.xlim([xmin, xmax])

# plt.legend(loc = 'upper right')

In [None]:
# possible labels
stain = [
    'DPY-23_EX',
    'MDH-1',
    'WDR-5.2',
    'DPY-23_INT',
]
# stage: only embryos; comment: only empty ones
stage, comment = 'E', ''

In [None]:
# TODO: fix this one
# important indices
# stain_columns = ['C0_stain', 'C1_stain', 'C2_stain', 'C3_stain', 'C4_stain']
# type_columns = ['C0_type', 'C1_type', 'C2_type', 'C3_type', 'C4_type']
# smfish_columns = ['#C0_smfish', '#C1_smfish', '#C2_smfish', '#C3_smfish', '#C4_smfish']
# nuclei_column = ['#nuclei']
# stain_prefix = np.array([['C1-', 'C2-', 'C3-', 'C4-', 'C5-']])
# ext = '.csv'
# filename_column = 'new filename'

In [None]:
# TODO: fix this one
# important indices: names of the database columns and the per-channel
# file prefixes, generated from the channel numbers
type_columns = [f'c{channel}_type' for channel in range(3)]

stain_columns = [f'C{channel}_stain' for channel in range(5)]

# smfish_columns = ['#C0_smfish', '#C1_smfish', '#C2_smfish', '#C3_smfish', '#C4_smfish']
nuclei_column = ['#nuclei']
# kept 2-D (1 x 5) so that it can be indexed with (row, column) pairs
stain_prefix = np.array([[f'C{channel}-' for channel in range(5)]])
ext = '.csv'
filename_column = 'cropped_image_file'

In [None]:
# read the db and parse images that we want to process
db_path = os.path.join(FOLDER, 'smFISH-database/embryo4_cropped.csv')
# the na_values=[''] override stays disabled, as before
df = pd.read_csv(db_path, sep=',')
# transposed preview: one column per database row
df.head().T

In [None]:
# fix missing entries in the columns that we are planning to use
# Calling fillna(..., inplace=True) on a column selected with df[...] is
# chained assignment: it raises a warning on recent pandas versions and does
# not modify df at all once Copy-on-Write is active. Assigning the filled
# columns back to df works on every version and is safe to re-run.
columns_to_fill = ['cropped_image_file', 'c0_type', 'c1_type', 'c2_type']
df[columns_to_fill] = df[columns_to_fill].fillna('')


# TODO: maybe it is necessary to fill in other values here, too

In [None]:
# keep only the rows whose cropped image file name starts with 'RNAi'
is_rnai_file = df['cropped_image_file'].map(lambda name: name.startswith('RNAi'))
dff = df.loc[is_rnai_file]
dff.head()

In [None]:
# all distinct values that occur in the type columns of the selected rows
type_values = dff[type_columns].values
np.unique(type_values)

In [None]:
# label we are looking at: labels[0] is used in the figure titles and as the
# name of the histogram output folder
labels = ['sdc-2'] 
# type values, presumably as found in the database — TODO confirm:
# ['Pha-4_ex', 'ama-1_ex', 'dpy-23_ex', 'dpy-23_int', 'mdh-1_ex', 
#  'sdc-2_ex', 'sdc-2_int', 'wdr-5.2_ex']

In [None]:
# seems to be working
# Find every (image row, channel column) pair whose type equals the target
# and build the matching per-channel file name (without extension).
target_type = 'sdc-2_ex'
# vectorised comparison; DataFrame.applymap is deprecated in recent pandas
row, col = np.where(dff[type_columns].eq(target_type).values)

n_samples = dff.shape[0]
# stain_prefix has a single row, so the prefix only depends on the channel
# column; there is no need to repeat the table once per sample
new_prefix = stain_prefix[0, col]

new_filename = dff[filename_column].values[row]
# splitext drops the extension whatever its length, whereas slicing off the
# last four characters is only right for three-letter extensions
dataset2 = ["{}{}".format(prefix_, os.path.splitext(name_)[0]) for prefix_, name_ in zip(new_prefix, new_filename)]

In [None]:
# resolve every selected dataset name to the path of its csv file
dataset_to_use = dataset2
# if labels[0] == 'MDH-1':
#     dataset_to_use = dataset3

dataset = [os.path.join(FOLDER, "csv", name + ".csv") for name in dataset_to_use]
# list the resolved paths, one per line
for csv_path in dataset:
    print(csv_path)

In [None]:
# csv file that collects the fitted peak centers of the processed files
center_path = os.path.join(FOLDER, "centers", "all-centers.csv")

In [None]:
# colour of the histogram bars
color = '#693D3D'
# NOTE(review): this branch assigns the same colour as the default, so it
# currently has no effect; 'mdh1-1' also looks like a typo (perhaps 'mdh-1')
# — TODO confirm the intended label and colour
if labels[0] == 'mdh1-1':
    color = "#693D3D"

In [None]:
# Fit a gamma distribution to the spot intensities of every selected file,
# save one annotated histogram per file and record the fitted peak position.
#
# have to perform this step multiple times and choose the best one 
# perform n_fits with different initial parameters
# n_fits = 10

center_set = {}

print(f'Processing: {len(dataset)} files') 

# actual plotting 
for csv_path in dataset:
    if not os.path.exists(csv_path):
        # print("doesn't exist")
        continue

    fig = None
    try:
        title = utils.create_title(csv_path, name_id=6)

        # load the data and scale it accordingly
        I = utils.load_data(csv_path, skiprows_=0)
        I_res = I

        # histogram counts; used for the start values of the fit, for the
        # normalisation and for the fit error
        binned_values, real_bins = np.histogram(I, bins)
        use_median = np.median(I_res)
        # inititally there was use_median/2 
        fit_alpha, fit_loc, fit_beta = gamma.fit(I_res, loc=use_median/2, scale=1/np.max(binned_values))
        # normalization factor: area of the count histogram, scales the pdf to counts
        factor = np.sum(binned_values*np.diff(real_bins))

        # fitted curve evaluated at the bin edges
        yhat = gamma.pdf(real_bins, fit_alpha, fit_loc, fit_beta)*factor

        # check the fit BEFORE anything is drawn, so that a failed fit does
        # not leave a half-finished figure behind
        if np.any(np.isnan(yhat)):
            print(f"Skipping {csv_path}: the gamma fit produced NaN values")
            continue

        # bin edge at which the fitted curve is maximal
        peak_center = real_bins[np.argmax(yhat)]

        error = utils.fitter_meter(binned_values, yhat[:-1])

        print("error: L1, L2", error)
        print("peak center:", peak_center)

        # create the canvas
        fig = plt.figure(figsize=(8,5))
        fig.suptitle(title + " / " + labels[0])

        plt.hist(I, bins=bins, color=color, label=labels, density=False)

        x = np.linspace(xmin, xmax, 1000)
        y = gamma.pdf(x, fit_alpha, fit_loc, fit_beta)*factor
        plt.plot(x, y, linewidth=5, color='#66A5AD')

        # vertical line for center
        plt.axvline(x=peak_center, linestyle="--", linewidth=5, color='#66A5AD')

        # reasonable adjustments to make the data look nicer
        plt.xlabel('intensity')
        plt.ylabel('# spots')

        info_text = f"Total: {I.shape[0]}\nPeak: {peak_center:.2f}\nL1: {error[0]:.2f}\nL2: {error[1]:.2f}"

        x_limits = [xmin, xmax]
        # highest bar of the count histogram computed above
        ymax = np.max(binned_values)
        y_limits = [0, ymax]

        plt.text(x_limits[1] - (x_limits[1] - x_limits[0])*0.15, y_limits[1]*0.8, info_text, color='black', bbox=dict(facecolor='white', alpha=1))
        plt.xlim([xmin, xmax])

        # save the peak values for further 
        center_set[title] = peak_center
        folder_path = os.path.join(FOLDER, "histograms", labels[0])
        os.makedirs(folder_path, exist_ok=True)

        plt.savefig(os.path.join(folder_path, title + ".pdf"))
        plt.show()
        # break
    except (RuntimeError, TypeError, ValueError) as err:
        print("There was an exception but we\'ll fix it for you")
        # say which file failed and why instead of hiding the error
        print(f"  {csv_path}: {type(err).__name__}: {err}")
    finally:
        # release the figure in every case, otherwise figures pile up in
        # memory while looping over many files
        if fig is not None:
            plt.close(fig)

df_center = pd.DataFrame(list(center_set.items()), columns=['filename', 'center'])

# make sure the target folder exists before writing
center_dir = os.path.dirname(center_path)
if center_dir:
    os.makedirs(center_dir, exist_ok=True)

# NOTE: re-running this cell appends the same centers to the file again
if os.path.exists(center_path):
    # the file already has a header line, only append the new rows
    df_center.to_csv(center_path, index=False, header=False, encoding='utf-8', mode='a')
else:
    df_center.to_csv(center_path, index=False, header=True, encoding='utf-8', mode='w')
    print (df_center)