In [1]:
import pathlib as pl
import os
import pandas as pd
from candas.learn import ParameterSet, ParameterArray

## Data Summary

In this notebook we check which surfaces (primer pair and reporter combinations) appear in the targets list, and how many unique (BP, GC) locations have data on each surface.

Load data

In [2]:
# Load the sequence targets from the `data` folder that sits next to the
# notebook's working directory, then build the surface label used throughout
# this notebook: "<FPrimer>-<RPrimer>-<EvaGreen|Probe>".
path = pl.Path(os.getcwd()).parent
with open(path / 'data' / 'JG067 sequence targets.csv', "rb") as file:
    targets = pd.read_csv(file)
targets['PrimerPair'] = targets[['FPrimer', 'RPrimer']].agg('-'.join, axis=1)

# A target with no label on either strand is reported with EvaGreen; anything
# else uses a probe. Recent pandas releases parse the literal text "None" as a
# missing value when reading a CSV, so both spellings are accepted here;
# otherwise every row would silently be classified as 'Probe'.
# NOTE(review): this also treats genuinely empty label cells as "no label" -
# confirm that is the intended meaning for this file.
minus_unlabelled = targets['-Strand Label'].isna() | (targets['-Strand Label'] == "None")
plus_unlabelled = targets['+Strand Label'].isna() | (targets['+Strand Label'] == "None")

# Map the boolean mask to its string label in one step rather than overwriting
# a boolean column with strings via .loc (deprecated incompatible-dtype setitem).
targets['EvaGreen'] = (minus_unlabelled & plus_unlabelled).map({True: 'EvaGreen', False: 'Probe'})
targets['PrimerPairReporter'] = targets[['PrimerPair', 'EvaGreen']].agg('-'.join, axis=1)
# De-duplication is left disabled: later cells print and iterate over every row of `targets`.
# targets = targets.drop_duplicates(subset=['PrimerPairReporter'], keep='first')

In [3]:
# Load the fitted parameter sets, keep only the rows with lg10_Copies == 8,
# drop experiment JG073A, and attach the same "<PrimerPair>-<EvaGreen|Probe>"
# surface label that is built for the targets table.
path = pl.Path(os.getcwd()).parent
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
ps_df = pd.read_pickle(path / 'data' / 'ADVI_ParameterSets_220528.pkl')
ps_df = ps_df[(ps_df.lg10_Copies == 8)]
ps_df = ps_df.drop(ps_df[ps_df['Experiment'].str.contains("JG073A")].index)
ps = ParameterSet.from_wide(ps_df)

# Reporters EVAGREEN and SYBR are grouped under the 'EvaGreen' label; every
# other reporter is labelled 'Probe'. The mask is mapped straight to its label
# instead of overwriting a boolean column with strings via .loc, which is a
# deprecated incompatible-dtype assignment in recent pandas.
dye_reporter = ps.data['Reporter'].isin(["EVAGREEN", "SYBR"])
ps.data['EvaGreen'] = dye_reporter.map({True: 'EvaGreen', False: 'Probe'})
ps.data['PrimerPairReporter'] = ps.data[['PrimerPair', 'EvaGreen']].agg('-'.join, axis=1)

Get data summary

In [4]:
# Headline counts for the loaded data. A "data point" is one row holding the
# mean of parameter 'r'; a "location" is a distinct (BP, GC, surface) triple.
location_cols = ['BP', 'GC', 'PrimerPairReporter']
rate_means = ps.data[(ps.data['Parameter'] == 'r') & (ps.data['Metric'] == 'mean')]
repeats_per_location = rate_means.value_counts(location_cols)

print('no. of surfaces:', len(ps.data['PrimerPairReporter'].unique()))
print('no. surfaces to be optimized:', len(targets['PrimerPairReporter'].unique()))
print('no. unique locations:', len(ps.data[location_cols].drop_duplicates()))
print('total number data points:', len(rate_means))
print('min number of repeats at a location:', repeats_per_location.min())
print('max number of repeats at a location:', repeats_per_location.max())


no. of surfaces: 34
no. surfaces to be optimized: 16
no. unique locations: 327
total number data points: 592
min number of repeats at a location: 1
max number of repeats at a location: 6


Calculate which surfaces are in the targets list and which aren't

In [5]:
# Compare the surface labels present in the data with those in the targets list.
data_pprs = set(ps.data['PrimerPairReporter'].unique())
target_pprs = set(targets['PrimerPairReporter'].unique())

ppr_not_in_targets = data_pprs.difference(target_pprs)   # data only
ppr_not_in_data = target_pprs.difference(data_pprs)      # targets only
ppr_in_both = data_pprs.intersection(target_pprs)        # shared

In [6]:
# One row per distinct (surface, BP, GC) combination found in the data.
unique_locations = ps.data.loc[:, ['PrimerPairReporter', 'BP', 'GC']].drop_duplicates()

Print list of the surfaces which are NOT in the targets list and how many unique data locations there are on each

In [7]:
# Number of unique locations on each surface that is absent from the targets list.
outside_targets = unique_locations['PrimerPairReporter'].isin(ppr_not_in_targets)
unique_locations.loc[outside_targets].value_counts(['PrimerPairReporter'])


PrimerPairReporter       
FP004-RP004-EvaGreen         28
FP002-RP002x-Probe           12
FP004-RP004x-Probe           12
FP001-RP001-Probe             9
FP001-RP005-Probe             8
FP004-RP004x-EvaGreen         8
FP003-RP008-Probe             5
FP006-RP006-Probe             5
FP005-RP005-Probe             5
FP002-RP002-EvaGreen          4
FP002-RP006-Probe             4
FP057.1.0-RP003x-Probe        3
FP003-RP008x-EvaGreen         3
FP003-RP008-EvaGreen          3
FP002-RP002-Probe             3
FP001-RP001-EvaGreen          2
FP003-RP003-Probe             1
FP057.1.0-RP003x-EvaGreen     1
dtype: int64

Print list of the surfaces which are in the targets list and how many unique data locations there are on each

In [8]:
# Number of unique locations on each surface that also appears in the targets list.
inside_targets = unique_locations['PrimerPairReporter'].isin(ppr_in_both)
unique_locations.loc[inside_targets].value_counts(['PrimerPairReporter'])

PrimerPairReporter   
FP004-RP004-Probe        53
FP001-RP001x-EvaGreen    24
FP001-RP001x-Probe       20
RP001x-FP002-Probe       19
FP002-RP002x-EvaGreen    15
FP005-FP001-EvaGreen     14
FP004-FP005-Probe         8
FP005-FP001-Probe         8
FP005-FP004-EvaGreen      8
RP002x-FP005-Probe        8
RP008x-FP001-EvaGreen     8
RP008x-FP005-Probe        8
FP001-RP004-EvaGreen      7
RP002x-FP004-EvaGreen     6
FP002-RP004-EvaGreen      3
RP002x-FP002-EvaGreen     2
dtype: int64

Print list of surfaces in targets and not in targets

In [9]:
# List the surface labels on each side of the split, in order of first appearance.
ppr_column = unique_locations['PrimerPairReporter']
print('not in targets:', ppr_column[ppr_column.isin(ppr_not_in_targets)].unique())
print('in targets:', ppr_column[ppr_column.isin(ppr_in_both)].unique())

not in targets: ['FP001-RP001-Probe' 'FP002-RP002-Probe' 'FP004-RP004-EvaGreen'
 'FP001-RP001-EvaGreen' 'FP002-RP002-EvaGreen' 'FP001-RP005-Probe'
 'FP005-RP005-Probe' 'FP002-RP006-Probe' 'FP006-RP006-Probe'
 'FP003-RP008-Probe' 'FP002-RP002x-Probe' 'FP004-RP004x-Probe'
 'FP004-RP004x-EvaGreen' 'FP003-RP008-EvaGreen' 'FP003-RP008x-EvaGreen'
 'FP057.1.0-RP003x-EvaGreen' 'FP003-RP003-Probe' 'FP057.1.0-RP003x-Probe']
in targets: ['FP004-RP004-Probe' 'FP001-RP001x-EvaGreen' 'FP002-RP002x-EvaGreen'
 'FP001-RP001x-Probe' 'FP005-FP001-Probe' 'RP001x-FP002-Probe'
 'RP002x-FP005-Probe' 'FP005-FP004-EvaGreen' 'RP002x-FP002-EvaGreen'
 'FP001-RP004-EvaGreen' 'FP002-RP004-EvaGreen' 'FP004-FP005-Probe'
 'RP008x-FP005-Probe' 'FP005-FP001-EvaGreen' 'RP002x-FP004-EvaGreen'
 'RP008x-FP001-EvaGreen']


Print the target values

In [10]:
# Show the target rate requested for every row of the targets table.
target_rate_columns = ['PrimerPairReporter', 'Target Rate']
print(targets.loc[:, target_rate_columns])

       PrimerPairReporter  Target Rate
0   FP001-RP001x-EvaGreen        0.902
1   FP002-RP002x-EvaGreen        0.902
2       FP005-FP001-Probe        0.866
3      RP001x-FP002-Probe        0.951
4      RP002x-FP005-Probe        0.866
5    FP005-FP004-EvaGreen        0.653
6       FP004-RP004-Probe        0.758
7   FP001-RP001x-EvaGreen        0.952
8   RP002x-FP002-EvaGreen        0.952
9    FP001-RP004-EvaGreen        1.050
10   FP002-RP004-EvaGreen        1.050
11      FP004-RP004-Probe        0.577
12      FP004-FP005-Probe        0.486
13  FP001-RP001x-EvaGreen        0.902
14  FP002-RP002x-EvaGreen        0.902
15     RP008x-FP005-Probe        0.806
16   FP005-FP001-EvaGreen        0.963
17     RP001x-FP002-Probe        0.477
18  RP002x-FP004-EvaGreen        0.963
19      FP004-RP004-Probe        0.806
20  RP008x-FP001-EvaGreen        0.912
21   FP005-FP001-EvaGreen        0.912
22     FP001-RP001x-Probe        1.030
23     RP001x-FP002-Probe        0.506


In [11]:
# Surfaces that appear in the targets list (same order as the printed output above).
in_targ = [
    'FP004-RP004-Probe',
    'FP001-RP001x-EvaGreen',
    'FP002-RP002x-EvaGreen',
    'FP001-RP001x-Probe',
    'FP005-FP001-Probe',
    'RP001x-FP002-Probe',
    'RP002x-FP005-Probe',
    'FP005-FP004-EvaGreen',
    'RP002x-FP002-EvaGreen',
    'FP001-RP004-EvaGreen',
    'FP002-RP004-EvaGreen',
    'FP004-FP005-Probe',
    'RP008x-FP005-Probe',
    'FP005-FP001-EvaGreen',
    'RP002x-FP004-EvaGreen',
    'RP008x-FP001-EvaGreen',
]

# Surfaces that have data but are not in the targets list.
out_targ = [
    'FP001-RP001-Probe',
    'FP002-RP002-Probe',
    'FP004-RP004-EvaGreen',
    'FP001-RP001-EvaGreen',
    'FP002-RP002-EvaGreen',
    'FP001-RP005-Probe',
    'FP005-RP005-Probe',
    'FP002-RP006-Probe',
    'FP006-RP006-Probe',
    'FP003-RP008-Probe',
    'FP002-RP002x-Probe',
    'FP004-RP004x-Probe',
    'FP004-RP004x-EvaGreen',
    'FP003-RP008-EvaGreen',
    'FP003-RP008x-EvaGreen',
    'FP057.1.0-RP003x-EvaGreen',
    'FP003-RP003-Probe',
    'FP057.1.0-RP003x-Probe',
]


In [12]:
# Surfaces that have data but are not in the targets list.
out_targ = [
    'FP001-RP001-Probe', 'FP002-RP002-Probe', 'FP004-RP004-EvaGreen',
    'FP001-RP001-EvaGreen', 'FP002-RP002-EvaGreen', 'FP001-RP005-Probe',
    'FP005-RP005-Probe', 'FP002-RP006-Probe', 'FP006-RP006-Probe',
    'FP003-RP008-Probe', 'FP002-RP002x-Probe', 'FP004-RP004x-Probe',
    'FP004-RP004x-EvaGreen', 'FP003-RP008-EvaGreen', 'FP003-RP008x-EvaGreen',
    'FP057.1.0-RP003x-EvaGreen', 'FP003-RP003-Probe', 'FP057.1.0-RP003x-Probe',
]

# All 34 surfaces: the non-target surfaces first, followed by the target surfaces.
surfs1 = out_targ + [
    'FP001-RP001x-EvaGreen', 'FP004-RP004-Probe',
    'FP001-RP001x-Probe', 'FP005-FP001-Probe', 'RP001x-FP002-Probe',
    'RP002x-FP005-Probe', 'FP005-FP004-EvaGreen', 'RP002x-FP002-EvaGreen',
    'FP001-RP004-EvaGreen', 'FP002-RP004-EvaGreen', 'FP004-FP005-Probe',
    'RP008x-FP005-Probe', 'FP005-FP001-EvaGreen', 'RP002x-FP004-EvaGreen',
    'RP008x-FP001-EvaGreen', 'FP002-RP002x-EvaGreen',
]

# All 34 surfaces again, this time with the target surfaces first.
all_surfaces = [
    'FP004-RP004-Probe', 'FP001-RP001x-EvaGreen', 'FP002-RP002x-EvaGreen',
    'FP001-RP001x-Probe', 'FP005-FP001-Probe', 'RP001x-FP002-Probe',
    'RP002x-FP005-Probe', 'FP005-FP004-EvaGreen', 'RP002x-FP002-EvaGreen',
    'FP001-RP004-EvaGreen', 'FP002-RP004-EvaGreen', 'FP004-FP005-Probe',
    'RP008x-FP005-Probe', 'FP005-FP001-EvaGreen', 'RP002x-FP004-EvaGreen',
    'RP008x-FP001-EvaGreen',
] + out_targ

In [13]:
# Sanity check: surfaces left after removing the non-target ones, then the total count.
n_target_surfaces = len(set(all_surfaces).difference(out_targ))
print(n_target_surfaces)
len(surfs1)

16


34

In [14]:
from candas.learn import parray

# Rebuild the table of distinct (BP, GC, surface) locations and wrap it in a
# parray that shares the ParameterSet's standardizer.
location_columns = ['BP', 'GC', 'PrimerPairReporter']
unique_locations = ps.data.loc[:, location_columns].drop_duplicates()
temp_parray = parray(
    BP=unique_locations['BP'],
    GC=unique_locations['GC'],
    PrimerPairReporter=unique_locations['PrimerPairReporter'],
    stdzr=ps.stdzr,
)

In [15]:
import numpy as np
# Euclidean distance of each location from the origin (0, 0) of the BP/GC
# plane, measured on the `.z` values of the parray
# (presumably the standardized scale - TODO confirm against candas).
bp_z = temp_parray['BP'].z.values()
gc_z = temp_parray['GC'].z.values()
unique_locations['centre dist'] = np.sqrt((bp_z - 0) ** 2 + (gc_z - 0) ** 2)

In [16]:
# Keep the `.z` coordinates of each location alongside the raw BP and GC values.
for z_column, dimension in (('BP_z', 'BP'), ('GC_z', 'GC')):
    unique_locations[z_column] = temp_parray[dimension].z.values()

In [17]:
# Row count of the unique-locations table.
unique_locations.shape[0]

327

In [18]:
# Attach the per-location columns (centre distance, z coordinates) to every
# mean-'r' data row, then order the rows by distance from the centre.
rate_mean_rows = ps.data[(ps.data['Parameter'] == 'r') & (ps.data['Metric'] == 'mean')]
sorted_data = (
    rate_mean_rows
    .merge(unique_locations, on=['BP', 'GC', 'PrimerPairReporter'], how='left')
    .sort_values(by='centre dist')
)

In [19]:
# Visit every row of `targets` and select the mean-'r' data rows recorded on
# that row's surface.
# NOTE(review): in the visible part of this cell `targ` and `temp_df` are
# assigned but never used, and `temp_df` is overwritten on each iteration -
# the loop body may continue beyond what is shown here.
for i, target in targets.iterrows():
    ppr = target['PrimerPairReporter']
    targ = target['Target Rate']
    temp_df = ps.data[(ps.data['PrimerPairReporter'] == ppr) & (ps.data['Parameter'] == 'r') & (ps.data['Metric'] == 'mean')]
