In [2]:

import pathlib as pl
import os
import warnings
import tensorflow as tf
# Silence all Python warnings for the rest of the session.
warnings.simplefilter("ignore")
# ensure tensorflow doesn't use GPU or too many CPUs
# NOTE(review): CUDA_VISIBLE_DEVICES is set after `import tensorflow`; confirm it
# still takes effect before TensorFlow enumerates its devices.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
# Limit both the intra-op and inter-op thread settings to 5.
tf.config.threading.set_intra_op_parallelism_threads(5)
tf.config.threading.set_inter_op_parallelism_threads(5)
import candas
from candas.learn import ParameterSet, parray
import pandas as pd

# Get the Best Points

This notebook finds the best point on each surface for both the constrained and unconstrained optimisation cases.

First, load and format the data.

In [3]:
# When False, 'r' and 'm' are standardized on their raw scale (transforms are
# replaced with `candas.utils.skip`) using the mean/std of the fitted means.
log_transform = False

path = pl.Path(os.getcwd()).parent
# NOTE: read_pickle executes arbitrary code on load; only use with trusted files.
ps_df = pd.read_pickle(path / 'data' / 'ADVI_ParameterSets_220528.pkl')

# Keep only the lg10_Copies == 8 rows and exclude experiment JG073A.
# A boolean mask is used rather than dropping by index label so that repeated
# index labels cannot cause unrelated rows to be removed.
ps_df = ps_df[ps_df.lg10_Copies == 8]
ps_df = ps_df[~ps_df['Experiment'].str.contains("JG073A")]

ps = ParameterSet.from_wide(ps_df)

if not log_transform:
    # Disable the transforms for both parameters first ...
    for param in ['r', 'm']:
        ps.stdzr.transforms[param] = [candas.utils.skip, candas.utils.skip]
    # ... then set location/scale from the 'mean' metric of each parameter.
    for param in ['r', 'm']:
        is_param_mean = (ps.data['Parameter'] == param) & (ps.data['Metric'] == 'mean')
        mean_values = ps.data.loc[is_param_mean, 'Value']
        ps.stdzr[param] = {'μ': mean_values.mean(), 'σ': mean_values.std()}

# Label each row 'EvaGreen' (EVAGREEN or SYBR reporter) or 'Probe' (anything else).
# The labels are built in one step instead of overwriting a boolean column with
# strings, which is an incompatible-dtype assignment in recent pandas.
uses_dye = ps.data['Reporter'].isin(["EVAGREEN", "SYBR"])
ps.data['EvaGreen'] = uses_dye.map({True: 'EvaGreen', False: 'Probe'})
ps.data['PrimerPairReporter'] = ps.data[['PrimerPair', 'EvaGreen']].agg('-'.join, axis=1)

In [4]:
# Columns identifying a single well; used both for selection and as merge keys.
merge_keys = ['PrimerPairReporter', 'BP', 'GC', 'Experiment', 'Well', 'Target', 'Copies']

# Build one frame per parameter holding its 'mean' metric in a column named
# after the parameter. Selecting with .loc and renaming avoids assigning into a
# filtered slice of ps.data (SettingWithCopy).
dfs = []
for param in ['r', 'm']:
    is_param_mean = (ps.data['Parameter'] == param) & (ps.data['Metric'] == 'mean')
    param_df = ps.data.loc[is_param_mean, merge_keys + ['Value']].rename(columns={'Value': param})
    dfs.append(param_df)

# Join the 'r' and 'm' frames on the shared well identifiers.
df = pd.merge(*dfs, on=merge_keys)

In [5]:
# Remove the per-well identifier columns; only the grouping keys and r/m remain.
df = df.drop(['Experiment', 'Well', 'Target', 'Copies'], axis=1)

In [6]:
# Average the remaining columns over all rows sharing the same
# PrimerPairReporter, BP and GC, keeping the keys as ordinary columns.
df = df.groupby(['PrimerPairReporter', 'BP', 'GC'], as_index=False).mean()

In [7]:
# Wrap the averaged r and m columns in a parray that shares the ParameterSet's standardizer.
param_parray = parray(r=df['r'], m=df['m'], stdzr=ps.stdzr)

In [8]:
# Store the `.z` values of each parameter next to the raw values.
for z_col, name in (('stzd r', 'r'), ('stzd m', 'm')):
    df[z_col] = param_parray[name].z.values()

Load the targets

In [9]:
path = pl.Path(os.getcwd()).parent
# read_csv accepts a path directly, so no manual file handle is needed.
targets = pd.read_csv(path / 'data' / 'JG067 sequence targets.csv')

targets['PrimerPair'] = targets[['FPrimer', 'RPrimer']].agg('-'.join, axis=1)

# A target counts as 'EvaGreen' when neither strand carries a label, otherwise 'Probe'.
# Depending on the pandas version, read_csv may parse the literal text "None" as
# NaN, so both the string "None" and missing values are treated as "no label".
# NOTE(review): this also treats genuinely empty label cells as "no label" — confirm.
label_cols = ['-Strand Label', '+Strand Label']
no_label = targets[label_cols].isna() | (targets[label_cols] == "None")
targets['EvaGreen'] = no_label.all(axis=1).map({True: 'EvaGreen', False: 'Probe'})

targets['PrimerPairReporter'] = targets[['PrimerPair', 'EvaGreen']].agg('-'.join, axis=1)
# Keep a single (first) row per primer pair / reporter combination.
targets = targets.drop_duplicates(subset=['PrimerPairReporter'], keep='first')

In [10]:
# Wrap the target rates as parameter 'r' so they share the ParameterSet's
# standardizer, then store their `.z` values alongside the raw targets.
r_targ_parray = parray(r=targets['Target Rate'], stdzr=ps.stdzr)
targets['Target Rate z'] = r_targ_parray.z.values()

In [11]:
# Attach each primer pair/reporter's target rate (raw and z) to df.
# A single vectorised lookup replaces the per-group loop of boolean masks.
# As in the loop, the first matching target row wins and rows of df without a
# matching target are left as NaN.
target_lookup = (
    targets
    .drop_duplicates(subset=['PrimerPairReporter'], keep='first')
    .set_index('PrimerPairReporter')
)
df['target r'] = df['PrimerPairReporter'].map(target_lookup['Target Rate'])
df['target r z'] = df['PrimerPairReporter'].map(target_lookup['Target Rate z'])

In [12]:
# The same constant target (1e-2) is used for 'm' on every row; it is also
# wrapped in a parray tied to the ParameterSet's standardizer.
m_targ_parray = parray(m=1e-2, stdzr=ps.stdzr)

df['target m'] = 1e-2


In [13]:
# Store the `.z` value of the constant 'm' target in every row of df.
# NOTE(review): assumes `.z.values()` yields something pandas can broadcast to a full column — confirm.
df['target m z'] = m_targ_parray.z.values()

Find the best data points for the unconstrained case:

In [14]:
from Code.expected_improvements import ExpectedImprovement

# Unconstrained case: only 'r' is passed to the ExpectedImprovement helper.
# NOTE(review): presumably this adds the 'error from optimization target z'
# column that the following cells sort on — confirm against the helper.
ei = ExpectedImprovement(['r'])
df = ei.get_error_from_optimization_target(df)


In [15]:
# Order rows from smallest to largest error, then discard any row with a missing value.
df = (
    df
    .sort_values(by='error from optimization target z')
    .dropna()
)

In [16]:
# df is sorted by error, so the first row seen for each PrimerPairReporter is its best point.
best_df = df[~df.duplicated(subset=['PrimerPairReporter'], keep='first')]

In [17]:
# Sanity check: the row kept for each PrimerPairReporter carries that group's minimum error.
group_minima = df.groupby('PrimerPairReporter')['error from optimization target z'].min()
for ppr, best in zip(best_df['PrimerPairReporter'],
                     best_df['error from optimization target z'].to_numpy()):
    assert group_minima[ppr] == best

In [18]:
# Write the unconstrained best points to the sibling `data` directory.
# `path` is reused later when saving the constrained results.
path = pl.Path.cwd().parent.joinpath('data')
best_df.to_csv(path.joinpath('best_df.csv'))

And for the constrained case:

In [19]:
from Code.expected_improvements import ExpectedImprovementConstrained

# Constrained case: both 'r' and 'm' are passed to the helper.
# Note that `df` here is the frame already sorted and NaN-filtered above.
# NOTE(review): presumably this recomputes the 'error from optimization target z'
# column that the following cells sort on — confirm against the helper.
ei = ExpectedImprovementConstrained(['r', 'm'])
df = ei.get_error_from_optimization_target(df)


In [20]:
# Re-order by the recomputed error (ascending) and drop rows containing missing values.
df = df.sort_values(by='error from optimization target z').dropna()

In [21]:
# With df sorted by error, keep only the first (lowest-error) row per PrimerPairReporter.
is_repeat = df.duplicated(subset=['PrimerPairReporter'], keep='first')
best_df = df[~is_repeat]

In [22]:
# Sanity check: each selected row holds the minimum error of its PrimerPairReporter group.
group_minima = df.groupby('PrimerPairReporter')['error from optimization target z'].min()
for ppr, best in zip(best_df['PrimerPairReporter'],
                     best_df['error from optimization target z'].to_numpy()):
    assert group_minima[ppr] == best

In [23]:
# Write the constrained best points next to the unconstrained ones
# (`path` is the data directory defined when those were saved).
best_df.to_csv(path.joinpath('best_df_constrained.csv'))