# Metrics

## Aim:

`screening-fist/sxfst/scripts/analysis2.sh` uses `screening-fist/sxfst/scripts/data_proc2.py` to generate `data/exp0$i-raw.csv`.

In [14]:
import sys
sys.path.append('../sxfst/scripts')
import os

from tqdm import tqdm
import numpy as np
import pandas as pd

import sxfst
import data_analysis

# Gather the annotation lists stored in the lab directory.
# Any directory entry whose name contains 'txt' is kept; the order is whatever
# os.listdir returns.
adir = '../lab'
txt_names = (name for name in os.listdir(adir) if 'txt' in name)
annotations = [os.path.join(adir, name) for name in txt_names]
annotations

['../lab/workable.txt', '../lab/good.txt', '../lab/hits.txt', '../lab/bad.txt']

## Copied from main loop of `../sxfst/scripts/data_analysis.py`

In [31]:
def get_mm(x, y):
    """Fit ``mm`` to the points ``(x, y)`` and report the fitted parameters.

    NaNs in either input are replaced by ``1e-9`` before fitting. The fit is
    constrained to ``0 <= km <= 1e3`` and ``0 <= vmax <= 8 * max(y)`` and is
    started from ``(km, vmax) = (1e3, max(y))``.

    ``mm``, ``curve_fit`` and ``r_squared`` are taken from the notebook
    namespace (they are imported from ``data_analysis`` in a later cell), so
    that import has to run before this function is called.

    Args:
        x: independent variable handed to ``mm`` (this notebook passes
            compound concentrations).
        y: observed response, one value per entry of ``x``.

    Returns:
        dict with the keys ``'km'``, ``'vmax'`` and ``'rsq'``. If no fit is
        possible, ``km`` and ``vmax`` are both ``np.inf`` and ``rsq`` is
        whatever ``r_squared`` yields for the curve ``mm`` produces from them.
    """
    x = np.nan_to_num(x, nan=1e-9)
    y = np.nan_to_num(y, nan=1e-9)
    y_max = max(y)

    if y_max > 0:
        try:
            (km, vmax), _ = curve_fit(mm, x, y,
                                      bounds=((0, 0),
                                              (1e3, y_max * 8)),
                                      p0=(1e3, y_max),
                                      )
        except RuntimeError:
            # The optimiser did not converge: report a failed fit.
            km, vmax = np.inf, np.inf
    else:
        # With max(y) <= 0 the upper vmax bound (8 * max(y)) is not above the
        # lower bound 0, which curve_fit rejects with a ValueError. Treat a
        # flat or negative response as a failed fit instead of aborting the
        # caller's loop.
        km, vmax = np.inf, np.inf

    yh = mm(x, km=km, vmax=vmax)
    rsq = r_squared(y, yh)
    return {'km':km, 'vmax':vmax, 'rsq':rsq}

In [None]:
%%time

from data_analysis import convolve1d, curve_fit, get_blank_wells, get_experiment,\
                          get_extra_metrics, get_traces, mm,  \
                          plotTraces, plot_mm, r_squared, scale, trace_similarity#, get_mm

root = '../data/raw/'
csvs = [os.path.join(root, i) for i in os.listdir(root)]
df = pd.concat([pd.read_csv(i, low_memory=False) for i in csvs]).reset_index(drop=True)

sigma = 2
o = []

for i in df['protein'].dropna().unique():
    c = 0
    for j in tqdm(df['Cpd'].dropna().unique()):
        test, ctrl = get_experiment(df, i, j)
        test_run_no = test['test_run_no'].unique()
        assert len(test_run_no) == 1, f'{test_run_no, i, j }'
        test_run_no = test_run_no[0]
        ctrl = ctrl.loc[ctrl['test_run_no'] == test_run_no, :]
        if len(test) > 0:
            test_traces = get_traces(test)
            protein_blanks = get_blank_wells(df, test)
            protein_blanks_traces = get_traces(protein_blanks)
            # get most similar at A400 - come back to this
            similarity = sorted(range(len(protein_blanks_traces)),
                                key=lambda idx : abs(protein_blanks_traces.iloc[idx, 400].mean() - \
                                                test_traces.iloc[:,400].mean()))
            protein_blanks_trace = sxfst.data.smooth(protein_blanks_traces.iloc[[similarity[0]],:],
                                                     sigma=sigma)

            test_traces = pd.concat([protein_blanks_trace,
                                      test_traces],
                                    axis=0)

            control_blanks = get_blank_wells(df, ctrl)
            control_blanks_traces = get_traces(control_blanks)
            similarity = sorted(range(len(control_blanks_traces)),
                                key=lambda idx : abs(control_blanks_traces.iloc[idx, 300].mean() - \
                                                test_traces.iloc[:,300].mean()))
            control_blanks_trace = sxfst.data.smooth(control_blanks_traces.iloc[similarity[0],:],
                                                     sigma=sigma)#Series
            control_blanks_trace = control_blanks_trace.sub(control_blanks_trace.iloc[-1].values, axis=1)
            ctrl_traces = get_traces(ctrl)
            ctrl_traces = pd.concat([pd.DataFrame(control_blanks_trace).T,
                                      get_traces(ctrl)],
                                      axis=0)
            ctrl_traces_norm = sxfst.data.norm_traces(ctrl_traces)
            ctrl_traces_norm = ctrl_traces_norm.sub(ctrl_traces_norm.iloc[0,:].values)
            assert sum(ctrl_traces_norm.loc[:,800]) == 0 , f'{ctrl_traces_norm}'
            ctrl_traces_norm_sub = ctrl_traces_norm.sub(control_blanks_trace[0], axis=1)
            ctrl_traces_smooth = sxfst.data.smooth(ctrl_traces_norm, sigma=sigma)
            vols = [0] + test['actual_vol'].to_list()
            concs = np.array([sxfst.data.c2(v1=i,      # vol
                                            c1=10_000, # stock conc - uM
                                            v2=38_000 + i, # total vol nm
                                            ) for i in vols])

            test_traces_norm = sxfst.data.norm_traces(test_traces)
            test_traces_smooth = sxfst.data.smooth(test_traces_norm,
                                                    sigma=sigma,
                                                    axis=1).sub(control_blanks_trace.iloc[:,0],
                                                                axis=1)
            def gradient(df):
                x = convolve1d(test_traces_smooth, [-1,0,1])
                return pd.DataFrame(x, columns=df.columns, index=df.index)

            grad = gradient(test_traces_smooth)
            diff = grad - grad.iloc[0,:]

            response = sxfst.data.response(grad.sub(grad.iloc[0,:].values), a=410, b=439)

            mm_fit = get_mm(concs, response.values) # dict

            extra_metrics = get_extra_metrics(test_traces_smooth, ctrl_traces) # dict
            output_data = {'cpd': j,
                           'protein' : i,
                           **mm_fit,
                           **extra_metrics,
                           }
            odf = pd.DataFrame({0:output_data}).T
            odf = odf.loc[:,['cpd','protein','km','vmax','rsq']]
            o.append(odf)
        #c += 1
        #if c == 4:
        #    break
df = pd.concat(o, axis=0).reset_index(drop=True)
df

100%|████████████| 822/822 [30:48<00:00,  2.25s/it]
100%|████████████| 822/822 [30:56<00:00,  2.26s/it]
100%|████████████| 822/822 [30:31<00:00,  2.23s/it]
 25%|███         | 209/822 [07:45<22:36,  2.21s/it]

In [35]:
# Show the fit table: one row per (compound, protein) pair with its km, vmax and rsq.
df

Unnamed: 0,cpd,protein,km,vmax,rsq
0,S1005,BM3 Heme 1YQO,0.002824,0.0,-1.956221
1,S1021,BM3 Heme 1YQO,0.002715,0.039086,-332.577633
2,S1028,BM3 Heme 1YQO,0.002322,0.0,-2.799982
3,S1039,BM3 Heme 1YQO,0.000764,0.001342,-0.553712
4,S1046,BM3 Heme 1YQO,0.004847,0.079722,-422.243599
...,...,...,...,...,...
4105,S2550,BM3 Heme A82F,0.002792,0.037441,-445.453918
4106,S2555,BM3 Heme A82F,0.001233,0.0,-4.457469
4107,S2560,BM3 Heme A82F,0.003343,0.039233,-281.742218
4108,S2566,BM3 Heme A82F,0.002645,0.035539,-327.109057


In [36]:
# Summarise the fit table.
# NOTE(review): the output below reports count/unique/top/freq for km, vmax and
# rsq, which is what describe() produces for object-dtype columns — these
# columns probably need converting to float before the summary is meaningful.
df.describe()

Unnamed: 0,cpd,protein,km,vmax,rsq
count,4110,4110,4110.0,4110.0,4110.0
unique,822,5,4110.0,4110.0,4110.0
top,S1005,BM3 Heme 1YQO,0.002824,5.808559e-11,-1.956221
freq,5,822,1.0,1.0,1.0
