In [1]:
import pathlib as pl
import os
import pandas as pd
import numpy as np

import copy
import pickle
import warnings



# from scipy.special import logit, expit
# from scipy.stats import norm, lognorm, chi2, ncx2, rv_continuous, multivariate_normal
from uncertainties import unumpy as unp
from collections import namedtuple




## Data Summary

In this notebook we check which surfaces (primer-pair/reporter combinations) in the data also appear in the target list, and how many unique (BP, GC) locations have been measured on each one.

Load data

In [52]:
# Data files are resolved relative to the directory the notebook was launched from.
path = pl.Path(os.getcwd())
targets = pd.read_csv(path / 'data' / 'JG067 sequence targets.csv')

# Cast the strand-label columns to str so that missing labels become the literal
# string "nan", which is what the reporter test below compares against.
targets = targets.astype({'-Strand Label': str, '+Strand Label': str})
targets['PrimerPair'] = targets[['FPrimer', 'RPrimer']].agg('-'.join, axis=1)

# A target with no label on either strand is treated as an EvaGreen (dye) reaction;
# anything carrying a strand label is treated as a probe reaction.
is_evagreen = (targets['-Strand Label'] == "nan") & (targets['+Strand Label'] == "nan")
print(is_evagreen.rename('EvaGreen'))

# Build the string column in one step instead of overwriting a boolean column with
# strings, which is an incompatible-dtype assignment in recent pandas versions.
targets['EvaGreen'] = np.where(is_evagreen, 'EvaGreen', 'Probe')

# One row per surface: primer pair + reporter type uniquely identifies a surface.
targets['PrimerPairReporter'] = targets[['PrimerPair', 'EvaGreen']].agg('-'.join, axis=1)
targets = targets.drop_duplicates(subset=['PrimerPairReporter'], keep='first')

0      True
1      True
2     False
3     False
4     False
5      True
6     False
7      True
8      True
9      True
10     True
11    False
12    False
13     True
14     True
15    False
16     True
17    False
18     True
19    False
20     True
21     True
22    False
23    False
Name: EvaGreen, dtype: bool


In [53]:
# Render the full de-duplicated target table (all rows and columns, no truncation).
targets_table = targets.to_string()
print(targets_table)

    Unnamed: 0  Sequence Name  Target Rate FPrimer RPrimer -Strand Label +Strand Label  Min BP       CAN Name                          CAN UUID    PrimerPair  EvaGreen     PrimerPairReporter
0            0  S067_8a718d_α        0.902   FP001  RP001x           nan           nan      10  Penalized XOR  8a718d80064949059a599ea046a959c8  FP001-RP001x  EvaGreen  FP001-RP001x-EvaGreen
1            1  S067_8a718d_β        0.902   FP002  RP002x           nan           nan      10  Penalized XOR  8a718d80064949059a599ea046a959c8  FP002-RP002x  EvaGreen  FP002-RP002x-EvaGreen
2            2  S067_8a718d_a        0.866   FP005   FP001           nan            L0      40  Penalized XOR  8a718d80064949059a599ea046a959c8   FP005-FP001     Probe      FP005-FP001-Probe
3            3  S067_8a718d_b        0.951  RP001x   FP002            L1            L1      70  Penalized XOR  8a718d80064949059a599ea046a959c8  RP001x-FP002     Probe     RP001x-FP002-Probe
4            4  S067_8a718d_c        0.866  R

In [2]:
def logit(x):
    """Log-odds transform, ``ln(x / (1 - x))``, mapping (0, 1) onto the real line.

    Used as the forward transform for fraction-like parameters (``ρ`` and ``GC``
    in :attr:`Standardizer.transforms`). Works on scalars, NumPy arrays and
    pandas objects; values outside (0, 1) yield ``nan``/``±inf`` as NumPy does.
    """
    # log(x) - log1p(-x) is the same quantity as log(x / (1 - x)) but keeps
    # precision when x is close to 0.
    return np.log(x) - np.log1p(np.negative(x))
def expit(x):
    """Logistic sigmoid, ``1 / (1 + exp(-x))``; the inverse of the log-odds transform.

    Used as the reverse transform for fraction-like parameters (``ρ`` and ``GC``
    in :attr:`Standardizer.transforms`). Works on scalars, NumPy arrays and
    pandas objects.
    """
    # exp(-log(1 + exp(-x))) == 1 / (1 + exp(-x)); going through logaddexp avoids
    # overflow in exp(-x) for large negative x.
    return np.exp(-np.logaddexp(0, np.negative(x)))
def skip(x):
    """Identity transform: return the argument unchanged.

    Serves as both the forward and reverse transform for parameters that need
    no transformation.
    """
    return x


class Standardizer(dict):
    r"""Container for dict of mean (μ) and standard deviation (σ) for every parameter.

    :class:`Standardizer` objects allow transformation and normalization of datasets. The main methods are :meth:`stdz`,
    which attempts to coerce the values of a given variable to a standard normal distribution (`z-scores`), and its
    complement :meth:`unstdz`. The steps are

    .. math::
        \mathbf{\text{data}} \rightarrow \text{transform} \rightarrow \text{mean-center} \rightarrow \text{scale}
        \rightarrow \mathbf{\text{zdata}}

    For example, reaction `rate` must clearly be strictly positive, so we use a `log` transformation so that it behaves
    as a normally-distributed random variable. We then mean-center and scale this transformed value to obtain `z-scores`
    indicating how similar a given estimate is to all the other estimates we've observed. `Standardizer` stores the
    transforms and population mean and standard deviation for every parameter, allowing us to convert back and forth
    between natural space (:math:`rate`), transformed space (:math:`\text{ln}\; rate`), and standardized space
    (:math:`\left( \text{ln}\; rate  - \mu_{\text{ln}\; rate} \right)/\sigma_{\text{ln}\; rate}`).


    Notes
    -----
    :class:`Standardizer` is just a `dictionary <https://docs.python.org/3/tutorial/datastructures.html#dictionaries>`_
    with some extra methods and defaults, so standard dictionary methods like :meth:`dict.update` still work.

    Each value is itself a dict with keys ``'μ'`` and ``'σ'``. Instances built from :attr:`defaults` receive deep
    copies, so editing an instance never alters the class-level defaults.

    """

    # TODO: Standardizer: Make required_parameters and required_descriptors optional definition at init
    # Keys that must be present for an instance to pass `validate`.
    _required_parameters = ['τ', 'ρ', 'r', 'K', 'm', 'lg10_Copies', 'BP', 'GC']
    _required_descriptors = ['Parameter', 'lg10_Copies', 'BP', 'GC']

    # TODO: Standardizer: make `defaults` optional definition at init
    # Fallback μ/σ (in transformed space) used when a parameter is not estimated from data.
    defaults = {
        'ρ': {'μ': -1.056, 'σ': 0.398},
        'τ': {'μ': 3.34, 'σ': 0.1501},
        'K': {'μ': -0.0368, 'σ': 0.351},
        'm': {'μ': -5.30, 'σ': 0.582},
        'offset': {'μ': 0.214, 'σ': 0.0725},
        'lg10_Copies': {'μ': 5, 'σ': 2},
        'BP': {'μ': 4.48, 'σ': 0.75},
        'GC': {'μ': -0.282, 'σ': 1},
        'r': {'μ': -0.307, 'σ': 0.158},
        'F0_lg': {'μ': -0.762, 'σ': 1.258},
        'bkg_F': {'μ': 0.200, 'σ': 0.0580},
        'bkg_Cycle': {'μ': 38.2, 'σ': 23.0}
    }

    # TODO: Standardizer: make `transforms` and `pymc_transforms` definable via string options
    # TODO: Standardizer: make transform suggestions based on provided data? e.g., all>0 -> log/exp
    # Maps parameter name -> [forward transform, reverse transform]. Parameters that are
    # not listed fall back to the identity pair [skip, skip].
    # NOTE(review): `pymc_transforms` is looked up by `transform`/`untransform` when
    # `pymc3=True`, but no such attribute is defined in this class, so that option
    # raises AttributeError unless the attribute is supplied elsewhere.
    transforms = {
        'r': [np.log, np.exp],
        'ρ': [logit, expit],
        'τ': [np.log, np.exp],
        'τ_': [np.log, np.exp],
        'K': [np.log, np.exp],
        'm': [np.log, np.exp],
        'offset': [skip, skip],
        'lg10_Copies': [skip, skip],
        'BP': [np.log, np.exp],
        'GC': [logit, expit]
    }


    def __init__(self, **kwargs):
        self.validate(kwargs)
        super().__init__(**kwargs)

    @classmethod
    def validate(cls, dct: dict):
        """Ensures provided dictionary has all required attributes"""
        assert_is_subset('Parameters', cls._required_parameters, dct.keys())

    @classmethod
    def default(cls):
        """Initializes Standardizer with (a deep copy of) the default values"""
        return cls(**copy.deepcopy(cls.defaults))

    def reset(self):
        """Revert to defaults, discarding any keys that are not part of the defaults

        Returns
        -------
        Standardizer
            The instance itself, to allow chaining.
        """
        # Snapshot the keys first: deleting while iterating over a live view of the
        # dict raises "RuntimeError: dictionary changed size during iteration".
        for k in list(self.keys()):
            if k not in self.defaults:
                del self[k]
        # Deep copy so later edits to this instance cannot leak into the class defaults.
        self.update(copy.deepcopy(self.defaults))
        return self

    def save(self, filename: str):
        """Save to pickle file"""
        with open(filename, 'wb') as buff:
            pickle.dump(self, buff)

    @classmethod
    def load(cls, filename: str):
        """Load from pickle file

        Warnings
        --------
        Unpickling can execute arbitrary code; only load files from trusted sources.
        """
        with open(filename, 'rb') as buff:
            dct = pickle.load(buff)
        return cls(**dct)

    @classmethod
    def from_DataFrame(cls, df: pd.DataFrame):
        """Construct from DataFrame

        Parameters
        ----------
        df: pd.DataFrame
            Long-form data with a 'Parameter' column plus the columns read by :meth:`transform_series`
            ('Value' and 'lg10_Copies'). If a 'Metric' column is present, only rows with Metric == 'mean' are used.

        Returns
        -------
        Standardizer
            Defaults, overridden by the μ/σ of the transformed values of each parameter found in `df`.

        Raises
        ------
        ValueError
            If 'Parameter' is not a column, or a 'Metric' column exists but never takes the value 'mean'.
        """
        assert_in('"Parameter"', 'Parameter', df.columns)
        if 'Metric' in df.columns:
            if 'mean' in df['Metric'].unique():
                df = df[df['Metric'] == 'mean']
            else:
                raise ValueError('If DataFrame contains column "Metric", "mean" must be present in that column')
        # Transform each parameter's values, then summarise per parameter. String
        # aggregation names are used because passing np.mean/np.std to `agg` is
        # deprecated in recent pandas (they are aliases of 'mean'/'std' there).
        dct = (df
               .groupby('Parameter')
               .apply(cls.transform_series)
               .groupby('Parameter')
               .agg(['mean', 'std'])
               .rename(columns={"mean": "μ", "std": "σ"})
               .T
               .to_dict()
               )
        return cls(**{**copy.deepcopy(cls.defaults), **dct})

    @classmethod
    def transform(cls, name: str, x: float, lg10_Copies=5., pymc3=False) -> float:
        """Apply appropriate forward transformation to parameter

        Parameters
        ----------
        x: float
            Value to be transformed
        name: str
            Name of parameter
        lg10_Copies: float, default 5.
            Corresponding log10 concentration; only necessary for τ
        pymc3: bool, optional
            Whether to use pymc3's transforms

        Returns
        -------
        float
        """
        _transforms = cls.transforms if not pymc3 else cls.pymc_transforms
        ftransform = _transforms.get(name, [skip, skip])[0]
        if name == 'τ':
            assert lg10_Copies is not None, 'Concentration must be supplied to transform τ'
            # Shift τ by log2(10) per decade of concentration away from 10^5 copies
            # before applying the transform.
            x = x + np.log2(10) * (lg10_Copies - 5)
        return ftransform(x)

    @classmethod
    def untransform(cls, name: str, x: float, lg10_Copies=5., pymc3=False) -> float:
        """Apply appropriate reverse transformation to parameter

        Parameters
        ----------
        x: float
            Value to be transformed
        name: str
            Name of parameter
        lg10_Copies: float, default 5.
            Corresponding log10 concentration; only necessary for τ
        pymc3: bool, optional
            Whether to use pymc3's transforms

        Returns
        -------
        float
        """
        _transforms = cls.transforms if not pymc3 else cls.pymc_transforms
        rtransform = _transforms.get(name, [skip, skip])[1]
        x_ = rtransform(x)
        if name == 'τ':
            assert lg10_Copies is not None, 'Concentration must be supplied to transform τ'
            # Undo the concentration shift applied in `transform`.
            x_ = x_ - np.log2(10) * (lg10_Copies - 5)
        return x_

    @classmethod
    def transform_series(cls, series: pd.Series, val_column='Value') -> float:
        """Apply appropriate transform to parameter in series

        Parameters
        ----------
        series: pd.Series
            Series containing value to be transformed. Series.name must be the name of the parameter and series must
             also contain a 'lg10_Copies' attribute.
        val_column: str, default 'Value'
            Name of series column containing value to be transformed.

        Returns
        -------
            float
        """
        assert series.name is not None
        return cls.transform(str(series.name), series[val_column], series.lg10_Copies)

    def stdz(self, name: str, x: float, lg10_Copies=5., pymc3=False) -> float:
        """Transforms, mean-centers, and scales parameter

        Parameters
        ----------
        x: float
        name: str
            Name of parameter
        lg10_Copies: float, default 5.
            Corresponding log10 concentration; only necessary for τ
        pymc3: bool, optional
            Whether to use pymc3's transforms

        Returns
        -------
        float
        """
        x_ = self.transform(name, x, lg10_Copies, pymc3)
        # Unknown parameters are neither centred nor scaled (μ = 0, σ = 1).
        μ = self.get(name, {'μ': 0})['μ']
        σ = self.get(name, {'σ': 1})['σ']
        return (x_ - μ) / σ

    def stdz_series(self, series: pd.Series, val_column='Value') -> float:
        """Transform, mean-center, and scale parameter in series

        Parameters
        ----------
        series: pd.Series
            Series containing value to be transformed.
            Series.name must be the name of the parameter and series must also contain a 'lg10_Copies' attribute.
        val_column: str, default 'Value'
            Name of series column containing value to be transformed.

        Returns
        -------
            float
        """
        assert series.name is not None
        return self.stdz(str(series.name), series[val_column], series.lg10_Copies, pymc3=False)

    def unstdz(self, name: str, z: float, lg10_Copies=5., pymc3=False) -> float:
        """Un-scales, un-centers, and un-transforms parameter

        Parameters
        ----------
        z: float
        name: str
            Name of parameter
        lg10_Copies: float, default 5.
            Corresponding log10 concentration; only necessary for τ
        pymc3: bool, optional
            Whether to use pymc3's transforms

        Returns
        -------
        float
        """
        # Unknown parameters are neither centred nor scaled (μ = 0, σ = 1).
        μ = self.get(name, {'μ': 0})['μ']
        σ = self.get(name, {'σ': 1})['σ']
        x_ = z * σ + μ
        return self.untransform(name, x_, lg10_Copies, pymc3)


In [18]:
from typing import Iterable

def assert_in(name: str, arg, lst: Iterable):
    """Check membership of ``arg`` in ``lst``.

    Returns ``None`` when ``arg`` is found; otherwise raises ``ValueError``
    whose message names the offending argument (``name``) and the allowed values.
    """
    if arg in lst:
        return
    raise ValueError(f'{name} must be one of {lst}')


def assert_is_subset(name: str, subset: Iterable, superset: Iterable):
    """Raises error if any required value is absent from the superset

    Parameters
    ----------
    name: str
        Label for the collection being checked; used in the error message.
    subset: Iterable
        Values that are required.
    superset: Iterable
        Values that are actually available.

    Raises
    ------
    ValueError
        If one or more items of `subset` are not in `superset`. The message lists
        the missing items (sorted, for a deterministic message).
    """
    missing = set(subset).difference(superset)
    if missing:
        # The message is built inline so that a failed check always surfaces as the
        # intended ValueError rather than depending on an external formatting helper.
        names = sorted(str(item) for item in missing)
        verb = 'is' if len(names) == 1 else 'are'
        raise ValueError(f"{', '.join(names)} {verb} missing from {name}")

class ParameterSet:
    """Container for parameter estimates that enforces data integrity.

    :class:`ParameterSet` is a container for a tidy dataframe (:attr:`data`) and a :class:`Standardizer`, allowing
    simple access to standardized data (:attr:`zdata`) and wide-form views of the data (:attr:`wide`/:attr:`zwide`).
    Ensures data integrity by enforcing a set of :attr:`required_columns` and a set of :attr:`required_parameters`.

    Notes
    -----
    :class:`ParameterSet` objects are created by the :meth:`VIResult.summarize` and :meth:`MCMCResult.summarize`
    methods, and form the basis of the :class:`GP` class.

    The dataframe passed in is stored by reference (not copied), so in-place edits made through :attr:`data`
    modify the caller's dataframe and do not re-run validation or refresh :attr:`stdzr`.

    Parameters
    ----------
    data: pd.DataFrame
        Long-form ("tidy") data containing at least :attr:`required_columns`, with every entry of
        :attr:`required_parameters` present in the 'Parameter' column and 'mean' present in the 'Metric' column.

    Attributes
    ----------
    required_columns: list of str
        Columns every dataframe must contain.
    required_parameters: list of str
        Values that must appear in the 'Parameter' column.
    """

    # TODO: ParameterSet: Make required_parameters and required_descriptors optional definition at init
    # TODO: Allow specification of stdz-able columns at init.
    required_columns = ['Target', 'BP', 'GC', 'lg10_Copies', 'Parameter', 'Metric', 'Value']
    required_parameters = ['τ', 'F0_lg', 'ρ', 'r', 'K', 'm']

    def __init__(self, data: pd.DataFrame):
        self.validate(data)
        self._data = data
        self._stdzr = Standardizer.from_DataFrame(data)

    # def _repr_html_(self):
    #     return self.data._repr_html_()

    @property
    def stdzr(self):
        """Standardizer: Container for dict of mean (μ) and standard deviation (σ) for every parameter."""
        return self._stdzr

    @stdzr.setter
    def stdzr(self, new_stdzr: 'Standardizer'):
        assert isinstance(new_stdzr, Standardizer)
        self._stdzr = new_stdzr

    @property
    def data(self) -> pd.DataFrame:
        """pandas.DataFrame: Underlying dataframe"""
        return self._data

    @data.setter
    def data(self, df: pd.DataFrame):
        """Ensure new dataframe passes integrity checks, then rebuild the standardizer from it"""
        self.validate(df)
        self._data = df
        self._stdzr = Standardizer.from_DataFrame(df)

    @staticmethod
    def _pivot_wide(long_df: pd.DataFrame) -> pd.DataFrame:
        """Pivot a long-form frame so each value of 'Parameter' becomes its own column of 'Value's."""
        idx_columns = [col for col in long_df.columns if col not in ['Parameter', 'Value']]
        return (long_df
                .pivot(index=idx_columns, columns='Parameter', values='Value')
                .reset_index()
                .rename_axis(columns=None)
                )

    @property
    def wide(self) -> pd.DataFrame:
        """Wide-form copy of data"""
        return self._pivot_wide(self.data)

    @property
    def zdata(self) -> pd.DataFrame:
        """Long-form copy of standardized data"""
        return self.standardized

    @property
    def zwide(self) -> pd.DataFrame:
        """Wide-form copy of standardized data"""
        return self._pivot_wide(self.zdata)

    @property
    def standardized(self):
        """A copy of the instance's dataframe  with key parameters transformed and standardized.

        In addition to values in the ``Value`` column corresponding to the keys in :attr:`stdzr`, columns
        ``'BP'``, ``'GC'``, and ``'lg10_Copies'`` are also manipulated.
        """
        df_ = self.data.copy()
        # Standardize 'Value' one parameter at a time (each parameter has its own
        # transform and μ/σ), then stitch the pieces back together by index label.
        # Aligning on the frame's own index avoids relying on the auto-generated
        # 'level_1' column name, which only exists when the index is unnamed.
        pieces = [
            self.stdzr.stdz(str(param), grp['Value'], grp['lg10_Copies'], pymc3=False)
            for param, grp in df_.groupby('Parameter')
        ]
        df_['Value'] = pd.concat(pieces).reindex(df_.index)
        # 'Value' must be computed first: it needs the raw (un-standardized) lg10_Copies.
        for col in ['BP', 'GC', 'lg10_Copies']:
            df_[col] = self.stdzr.stdz(col, df_[col])
        return df_

    @classmethod
    def validate(cls, df: pd.DataFrame):
        """Ensures provided DataFrame has all required attributes

        Raises
        ------
        AssertionError
            If `df` is not a DataFrame or its 'Metric' column never takes the value 'mean'.
        ValueError
            If required columns or required parameters are missing.
        """
        assert isinstance(df, pd.DataFrame)
        assert_is_subset('Columns', cls.required_columns, df.columns)
        assert_is_subset('Parameters', cls.required_parameters, df.Parameter.unique())
        assert 'mean' in df['Metric'].unique(), '"Metric" column must contain value "mean"'

    @property
    def valid(self):
        """Integrity check

        Returns
        -------
        bool
            True if the current dataframe passes :meth:`validate`, False if validation fails.
        """
        try:
            self.validate(self.data)
        except (AssertionError, ValueError):
            # `validate` signals problems by raising; report them as a boolean here.
            return False
        return True

    @classmethod
    def read_pickle(cls, *args, **kwargs):
        """Imports from pickle file

        Arguments are forwarded to :func:`pandas.read_pickle`. Unpickling can execute arbitrary code; only read
        files from trusted sources.

        Returns
        -------
        ParameterSet
        """
        df = pd.read_pickle(*args, **kwargs)
        return cls(df)

    @classmethod
    def read_csv(cls, *args, **kwargs):
        """Imports from comma delimited file

        Arguments are forwarded to :func:`pandas.read_csv`.

        Returns
        -------
        ParameterSet
        """
        df = pd.read_csv(*args, **kwargs)
        return cls(df)

    @classmethod
    def read_table(cls, *args, **kwargs):
        """Imports from generic delimited file

        Arguments are forwarded to :func:`pandas.read_table`.

        Returns
        -------
        ParameterSet
        """
        df = pd.read_table(*args, **kwargs)
        return cls(df)

    def neaten(self):
        """Rearranges columns in a sensible order: extra columns first, then the required columns"""
        other_columns = [col for col in self.data.columns if col not in self.required_columns]
        # Goes through the `data` setter, so the reordered frame is re-validated.
        self.data = self.data[other_columns + self.required_columns]

    @classmethod
    def from_wide(cls, wide, params=None):
        """Reshapes wide-form data to long-form, then instantiates class

        Parameters
        ----------
        wide: pd.DataFrame
            One column per parameter; every other column is treated as metadata.
        params: list of str, optional
            Parameter columns to melt. Defaults to :attr:`required_parameters`.
        """
        params = cls.required_parameters if params is None else params  # Might be broken
        meta = [col for col in wide.columns if col not in params]
        tidy = wide.melt(id_vars=meta, value_vars=params, var_name='Parameter', value_name='Value')
        return cls(tidy)

    def save(self, filename: str):
        """Pickles data in wide-form to save space"""
        self.wide.to_pickle(filename)

    @classmethod
    def load(cls, filename: str, params=None):
        """Un-pickles wide-form data, then reshapes as long-form

        Unpickling can execute arbitrary code; only load files from trusted sources.
        """
        wide = pd.read_pickle(filename)
        return cls.from_wide(wide, params)


In [19]:
# Data files are resolved relative to the directory the notebook was launched from.
path = pl.Path(os.getcwd())
# NOTE: unpickling can execute arbitrary code; only load trusted files.
ps_df = pd.read_pickle(path / 'data' / 'ADVI_ParameterSets_220528.pkl')

# Keep only reactions run at 10^8 copies.
ps_df = ps_df[ps_df.lg10_Copies == 8]
# Exclude experiment JG073A with a boolean mask. Dropping by index label would also
# remove unrelated rows that happen to share a label, and `na=False` keeps rows
# with a missing experiment name instead of failing on them.
ps_df = ps_df[~ps_df['Experiment'].str.contains("JG073A", na=False)]

ps = ParameterSet.from_wide(ps_df)

# Intercalating-dye reporters are grouped as 'EvaGreen'; everything else is 'Probe'.
# Built in one step rather than overwriting a boolean column with strings.
is_evagreen = ps.data['Reporter'].isin(["EVAGREEN", "SYBR"])
ps.data['EvaGreen'] = np.where(is_evagreen, 'EvaGreen', 'Probe')
ps.data['PrimerPairReporter'] = ps.data[['PrimerPair', 'EvaGreen']].agg('-'.join, axis=1)

Get data summary

In [54]:
# One row per fitted reaction: the mean estimate of the rate parameter `r`.
r_means = ps.data[(ps.data['Parameter'] == 'r') & (ps.data['Metric'] == 'mean')]
# Number of reactions at each (BP, GC, surface) location.
repeats_per_location = r_means.value_counts(['BP', 'GC', 'PrimerPairReporter'])
n_locations = len(ps.data[['BP', 'GC', 'PrimerPairReporter']].drop_duplicates())

print('no. of surfaces:', len(ps.data['PrimerPairReporter'].unique()))
print('no. surfaces to be optimized:', len(targets['PrimerPairReporter'].unique()))
print('no. unique locations:', n_locations)
print('total number data points:', len(r_means))
print('min number of repeats at a location:', repeats_per_location.min())
print('max number of repeats at a location:', repeats_per_location.max())


no. of surfaces: 34
no. surfaces to be optimized: 16
no. unique locations: 327
total number data points: 592
min number of repeats at a location: 1
max number of repeats at a location: 6


Calculate which surfaces are in the targets list and which aren't

In [55]:
# Surfaces (primer pair + reporter type) that remain in the de-duplicated target table.
target_surface_column = targets['PrimerPairReporter']
print(target_surface_column)

0     FP001-RP001x-EvaGreen
1     FP002-RP002x-EvaGreen
2         FP005-FP001-Probe
3        RP001x-FP002-Probe
4        RP002x-FP005-Probe
5      FP005-FP004-EvaGreen
6         FP004-RP004-Probe
8     RP002x-FP002-EvaGreen
9      FP001-RP004-EvaGreen
10     FP002-RP004-EvaGreen
12        FP004-FP005-Probe
15       RP008x-FP005-Probe
16     FP005-FP001-EvaGreen
18    RP002x-FP004-EvaGreen
20    RP008x-FP001-EvaGreen
22       FP001-RP001x-Probe
Name: PrimerPairReporter, dtype: object


In [56]:
# Compare the surfaces present in the measured data with those in the target list.
data_surfaces = set(ps.data['PrimerPairReporter'].unique())
target_surfaces = set(targets['PrimerPairReporter'].unique())

ppr_not_in_targets = data_surfaces.difference(target_surfaces)  # measured, but not a target
ppr_not_in_data = target_surfaces.difference(data_surfaces)     # a target, but never measured
ppr_in_both = data_surfaces.intersection(target_surfaces)       # measured and a target

In [57]:
# A "location" is a distinct (surface, BP, GC) combination, regardless of how many
# reactions or parameter rows were recorded there.
location_columns = ['PrimerPairReporter', 'BP', 'GC']
unique_locations = ps.data.loc[:, location_columns].drop_duplicates()

Print list of the surfaces which are NOT in the targets list and how many unique data locations there are on each

In [58]:
# Unique locations per surface, restricted to surfaces that are not in the target list.
is_off_target = unique_locations['PrimerPairReporter'].isin(ppr_not_in_targets)
unique_locations[is_off_target].value_counts(['PrimerPairReporter'])


PrimerPairReporter       
FP004-RP004-EvaGreen         28
FP002-RP002x-Probe           12
FP004-RP004x-Probe           12
FP001-RP001-Probe             9
FP001-RP005-Probe             8
FP004-RP004x-EvaGreen         8
FP003-RP008-Probe             5
FP006-RP006-Probe             5
FP005-RP005-Probe             5
FP002-RP002-EvaGreen          4
FP002-RP006-Probe             4
FP057.1.0-RP003x-Probe        3
FP003-RP008x-EvaGreen         3
FP003-RP008-EvaGreen          3
FP002-RP002-Probe             3
FP001-RP001-EvaGreen          2
FP003-RP003-Probe             1
FP057.1.0-RP003x-EvaGreen     1
Name: count, dtype: int64

Print list of the surfaces which are in the targets list and how many unique data locations there are on each

In [59]:
# Unique locations per surface, restricted to surfaces that are in the target list.
is_on_target = unique_locations['PrimerPairReporter'].isin(ppr_in_both)
unique_locations[is_on_target].value_counts(['PrimerPairReporter'])

PrimerPairReporter   
FP004-RP004-Probe        53
FP001-RP001x-EvaGreen    24
FP001-RP001x-Probe       20
RP001x-FP002-Probe       19
FP002-RP002x-EvaGreen    15
FP005-FP001-EvaGreen     14
FP004-FP005-Probe         8
FP005-FP001-Probe         8
FP005-FP004-EvaGreen      8
RP002x-FP005-Probe        8
RP008x-FP001-EvaGreen     8
RP008x-FP005-Probe        8
FP001-RP004-EvaGreen      7
RP002x-FP004-EvaGreen     6
FP002-RP004-EvaGreen      3
RP002x-FP002-EvaGreen     2
Name: count, dtype: int64

Print list of surfaces in targets and not in targets

In [25]:
# Names of the measured surfaces, split by whether they appear in the target list.
surface_names = unique_locations['PrimerPairReporter']
off_target_names = surface_names[surface_names.isin(ppr_not_in_targets)].unique()
on_target_names = surface_names[surface_names.isin(ppr_in_both)].unique()

print('not in targets:', off_target_names)
print('in targets:', on_target_names)

not in targets: ['FP001-RP001-Probe' 'FP002-RP002-Probe' 'FP004-RP004-EvaGreen'
 'FP001-RP001-EvaGreen' 'FP002-RP002-EvaGreen' 'FP001-RP005-Probe'
 'FP005-RP005-Probe' 'FP002-RP006-Probe' 'FP006-RP006-Probe'
 'FP003-RP008-Probe' 'FP001-RP001x-EvaGreen' 'FP002-RP002x-EvaGreen'
 'FP004-RP004x-Probe' 'FP004-RP004x-EvaGreen' 'FP003-RP008-EvaGreen'
 'FP003-RP008x-EvaGreen' 'FP057.1.0-RP003x-EvaGreen' 'FP003-RP003-Probe'
 'FP057.1.0-RP003x-Probe' 'FP005-FP004-EvaGreen' 'RP002x-FP002-EvaGreen'
 'FP001-RP004-EvaGreen' 'FP002-RP004-EvaGreen' 'FP005-FP001-EvaGreen'
 'RP002x-FP004-EvaGreen' 'RP008x-FP001-EvaGreen']
in targets: ['FP004-RP004-Probe' 'FP002-RP002x-Probe' 'FP001-RP001x-Probe'
 'FP005-FP001-Probe' 'RP001x-FP002-Probe' 'RP002x-FP005-Probe'
 'FP004-FP005-Probe' 'RP008x-FP005-Probe']


Unnamed: 0,Experiment,Well,Reporter,Copies,lg10_Copies,Target,FPrimer,RPrimer,PrimerPair,BP,GC,Metric,Parameter,Value,EvaGreen,PrimerPairReporter
0,JG034A,1,HEX,100000000.0,8.0,S044.12,FP004,RP004,FP004-RP004,88,0.431818,mean,τ,15.672488,Probe,FP004-RP004-Probe
1,JG034A,1,HEX,100000000.0,8.0,S044.12,FP004,RP004,FP004-RP004,88,0.431818,sd,τ,0.012916,Probe,FP004-RP004-Probe
2,JG034A,2,HEX,100000000.0,8.0,S044.12,FP004,RP004,FP004-RP004,88,0.431818,mean,τ,15.665333,Probe,FP004-RP004-Probe
3,JG034A,2,HEX,100000000.0,8.0,S044.12,FP004,RP004,FP004-RP004,88,0.431818,sd,τ,0.010921,Probe,FP004-RP004-Probe
4,JG034A,4,FAM,100000000.0,8.0,S044.13,FP004,RP004,FP004-RP004,88,0.431818,mean,τ,14.257499,Probe,FP004-RP004-Probe
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7099,JG067M,213,FAM,100000000.0,8.0,S067_b12a22_c_LVM_3,FP004,RP004,FP004-RP004,309,0.359223,sd,m,0.000254,Probe,FP004-RP004-Probe
7100,JG067M,214,HEX,100000000.0,8.0,S067_b12a22_d_LVM_3,FP004,FP005,FP004-FP005,476,0.348739,mean,m,0.009874,Probe,FP004-FP005-Probe
7101,JG067M,214,HEX,100000000.0,8.0,S067_b12a22_d_LVM_3,FP004,FP005,FP004-FP005,476,0.348739,sd,m,0.000283,Probe,FP004-FP005-Probe
7102,JG067M,215,EVAGREEN,100000000.0,8.0,S067_b12a22_α_LVM_3,FP001,RP001x,FP001-RP001x,49,0.510204,mean,m,0.000274,EvaGreen,FP001-RP001x-EvaGreen


In [63]:
# One row per reaction (mean estimate of `r`), so repeats at a location are kept.
is_r_mean = (ps.data['Parameter'] == 'r') & (ps.data['Metric'] == 'mean')
r_only = ps.data[is_r_mean]
all_data = r_only[['PrimerPairReporter', 'BP', 'GC']]

# Data points per surface for surfaces that are not in the target list.
off_target_rows = all_data['PrimerPairReporter'].isin(ppr_not_in_targets)
all_data[off_target_rows].value_counts(['PrimerPairReporter'])

PrimerPairReporter       
FP004-RP004-EvaGreen         72
FP002-RP002x-Probe           42
FP001-RP001-Probe            26
FP004-RP004x-Probe           24
FP001-RP005-Probe            16
FP004-RP004x-EvaGreen        16
FP003-RP008-Probe            10
FP002-RP002-EvaGreen         10
FP006-RP006-Probe            10
FP002-RP006-Probe            10
FP005-RP005-Probe            10
FP001-RP001-EvaGreen          6
FP003-RP008x-EvaGreen         6
FP002-RP002-Probe             6
FP057.1.0-RP003x-Probe        6
FP003-RP008-EvaGreen          5
FP003-RP003-Probe             3
FP057.1.0-RP003x-EvaGreen     2
Name: count, dtype: int64

In [64]:
# Data points per surface for surfaces that are in the target list.
on_target_rows = all_data['PrimerPairReporter'].isin(ppr_in_both)
all_data[on_target_rows].value_counts(['PrimerPairReporter'])

PrimerPairReporter   
FP004-RP004-Probe        87
FP001-RP001x-EvaGreen    55
FP002-RP002x-EvaGreen    38
FP001-RP001x-Probe       32
RP001x-FP002-Probe       20
FP005-FP001-EvaGreen     14
FP004-FP005-Probe         8
FP005-FP001-Probe         8
FP005-FP004-EvaGreen      8
RP002x-FP005-Probe        8
RP008x-FP001-EvaGreen     8
RP008x-FP005-Probe        8
FP001-RP004-EvaGreen      7
RP002x-FP004-EvaGreen     6
FP002-RP004-EvaGreen      3
RP002x-FP002-EvaGreen     2
Name: count, dtype: int64

Print the target values

In [None]:
# Target rate requested for each surface.
target_rate_table = targets.loc[:, ['PrimerPairReporter', 'Target Rate']]
print(target_rate_table)

In [None]:
# Surfaces that appear in the target list (hard-coded).
in_targ = [
    'FP004-RP004-Probe',
    'FP001-RP001x-EvaGreen',
    'FP002-RP002x-EvaGreen',
    'FP001-RP001x-Probe',
    'FP005-FP001-Probe',
    'RP001x-FP002-Probe',
    'RP002x-FP005-Probe',
    'FP005-FP004-EvaGreen',
    'RP002x-FP002-EvaGreen',
    'FP001-RP004-EvaGreen',
    'FP002-RP004-EvaGreen',
    'FP004-FP005-Probe',
    'RP008x-FP005-Probe',
    'FP005-FP001-EvaGreen',
    'RP002x-FP004-EvaGreen',
    'RP008x-FP001-EvaGreen',
]

# Surfaces that have data but are not in the target list (hard-coded).
out_targ = [
    'FP001-RP001-Probe',
    'FP002-RP002-Probe',
    'FP004-RP004-EvaGreen',
    'FP001-RP001-EvaGreen',
    'FP002-RP002-EvaGreen',
    'FP001-RP005-Probe',
    'FP005-RP005-Probe',
    'FP002-RP006-Probe',
    'FP006-RP006-Probe',
    'FP003-RP008-Probe',
    'FP002-RP002x-Probe',
    'FP004-RP004x-Probe',
    'FP004-RP004x-EvaGreen',
    'FP003-RP008-EvaGreen',
    'FP003-RP008x-EvaGreen',
    'FP057.1.0-RP003x-EvaGreen',
    'FP003-RP003-Probe',
    'FP057.1.0-RP003x-Probe',
]


In [None]:
# Surfaces that have data but are not in the target list (hard-coded).
out_targ = [
    'FP001-RP001-Probe', 'FP002-RP002-Probe', 'FP004-RP004-EvaGreen',
    'FP001-RP001-EvaGreen', 'FP002-RP002-EvaGreen', 'FP001-RP005-Probe',
    'FP005-RP005-Probe', 'FP002-RP006-Probe', 'FP006-RP006-Probe',
    'FP003-RP008-Probe', 'FP002-RP002x-Probe', 'FP004-RP004x-Probe',
    'FP004-RP004x-EvaGreen', 'FP003-RP008-EvaGreen', 'FP003-RP008x-EvaGreen',
    'FP057.1.0-RP003x-EvaGreen', 'FP003-RP003-Probe', 'FP057.1.0-RP003x-Probe',
]

# surfs1: the non-target surfaces first, followed by the target surfaces.
surfs1 = out_targ + [
    'FP001-RP001x-EvaGreen', 'FP004-RP004-Probe',
    'FP001-RP001x-Probe', 'FP005-FP001-Probe', 'RP001x-FP002-Probe',
    'RP002x-FP005-Probe', 'FP005-FP004-EvaGreen', 'RP002x-FP002-EvaGreen',
    'FP001-RP004-EvaGreen', 'FP002-RP004-EvaGreen', 'FP004-FP005-Probe',
    'RP008x-FP005-Probe', 'FP005-FP001-EvaGreen', 'RP002x-FP004-EvaGreen',
    'RP008x-FP001-EvaGreen', 'FP002-RP002x-EvaGreen',
]

# all_surfaces: the target surfaces first, followed by the non-target surfaces.
all_surfaces = [
    'FP004-RP004-Probe', 'FP001-RP001x-EvaGreen', 'FP002-RP002x-EvaGreen',
    'FP001-RP001x-Probe', 'FP005-FP001-Probe', 'RP001x-FP002-Probe',
    'RP002x-FP005-Probe', 'FP005-FP004-EvaGreen', 'RP002x-FP002-EvaGreen',
    'FP001-RP004-EvaGreen', 'FP002-RP004-EvaGreen', 'FP004-FP005-Probe',
    'RP008x-FP005-Probe', 'FP005-FP001-EvaGreen', 'RP002x-FP004-EvaGreen',
    'RP008x-FP001-EvaGreen',
] + out_targ

In [None]:
# Sanity check: number of surfaces left once the non-target ones are removed,
# followed by the length of the combined list.
target_only_surfaces = set(all_surfaces).difference(out_targ)
print(len(target_only_surfaces))
len(surfs1)

In [None]:
from candas.learn import parray

# Rebuild the table of distinct (BP, GC, surface) locations and wrap its columns in a
# parray that shares the ParameterSet's standardizer.
location_columns = ['BP', 'GC', 'PrimerPairReporter']
unique_locations = ps.data.loc[:, location_columns].drop_duplicates()
temp_parray = parray(
    BP=unique_locations['BP'],
    GC=unique_locations['GC'],
    PrimerPairReporter=unique_locations['PrimerPairReporter'],
    stdzr=ps.stdzr,
)

In [None]:
import numpy as np
# Euclidean distance of each location from the origin of the standardized (BP, GC) plane.
origin = 0
bp_offset = temp_parray['BP'].z.values() - origin
gc_offset = temp_parray['GC'].z.values() - origin
unique_locations['centre dist'] = np.sqrt(bp_offset ** 2 + gc_offset ** 2)

In [None]:
# Keep the standardized coordinates alongside the raw ones.
for axis_name in ('BP', 'GC'):
    unique_locations[f'{axis_name}_z'] = temp_parray[axis_name].z.values()

In [None]:
# Number of distinct (BP, GC, surface) locations.
unique_locations.shape[0]

In [None]:
# Attach each reaction's location metadata (including 'centre dist') and order the
# reactions from closest to furthest from the centre.
r_mean_rows = ps.data[(ps.data['Parameter'] == 'r') & (ps.data['Metric'] == 'mean')]
sorted_data = (
    r_mean_rows
    .merge(unique_locations, on=['BP', 'GC', 'PrimerPairReporter'], how='left')
    .sort_values(by='centre dist')
)

In [None]:
for i, target in targets.iterrows():
    ppr = target['PrimerPairReporter']
    targ = target['Target Rate']
    temp_df = ps.data[(ps.data['PrimerPairReporter'] == ppr) & (ps.data['Parameter'] == 'r') & (ps.data['Metric'] == 'mean')]
