# Olink validation data

Validation measurements are available for both:

- `ProDoc` (N=29) samples
- `CircaFlow` (N=101) samples

In [None]:
from collections import namedtuple
from pathlib import Path

import pandas as pd

import config

## Set default paths and collection

In [None]:
# Root folder of the raw data, taken from the `config` module.
DATA_FOLDER = Path(config.data)

# Registries of the files this notebook reads (`inputs`) and writes
# (`outputs`); both start empty and are filled cell by cell.
inputs, outputs = {}, {}

## Define Measurement

In [None]:
# Describes how a table is keyed (`idx`: names of the index columns) and which
# column holds the measured value (`measure`).
# The typename has to match the name the class is bound to: otherwise the repr
# shows a misspelled class name and instances cannot be pickled, because
# pickle looks the class up by its typename.
Measurement = namedtuple('Measurement', 'idx measure')
# Olink tables are keyed by `SampleID` and `Assay`; values are in `NPX`.
measure_olink = Measurement(['SampleID', 'Assay'], 'NPX')
measure_olink

## Load Olink validation data

In [None]:
# Olink validation measurements; the path is registered as an input and the
# table is indexed by the key columns defined in `measure_olink.idx`.
inputs['olink'] = DATA_FOLDER.joinpath("Validation Results", "ProDoc_Olink_bridged_QC_long.tsv")
olink = (
    pd.read_table(inputs['olink'], sep='\t', low_memory=False)
    .set_index(measure_olink.idx)
)
olink

## Contains duplicated bridging samples

In [None]:
# Flag every index entry that occurs more than once; keep=False marks all
# occurrences of a repeated key, not only the second and later ones.
duplicated = olink.index.duplicated(keep=False)
# Sort by the last index level first so repeated entries end up next to each
# other, then append `Project` to the index
# (presumably what tells the repeated entries apart - TODO confirm).
olink_bridge = (
    olink.loc[duplicated]
    .sort_index(level=-1)
    .set_index('Project', append=True)
)
olink_bridge.head(20)

In [None]:
# Store the bridging samples as pickle (the registered output) and as an
# Excel file with the same stem.
bridges_pkl = config.data_processed / 'bridges.pkl'
outputs['bridging_samples'] = bridges_pkl
olink_bridge.to_pickle(bridges_pkl)
olink_bridge.to_excel(bridges_pkl.with_suffix('.xlsx'))

## Metadata for Olink features

- `UniProt` ID of `OlinkID`
- limit of detection (`LOD`)

In [None]:
# Metadata table for the Olink features; the path is registered as an input.
inputs['metadata'] = DATA_FOLDER.joinpath("Validation Results", "metadata.tsv")
metadata = pd.read_table(inputs['metadata'])
metadata

## Sample name to ID mapping  - find subcohorts

In [None]:
# Mapping table read from Excel, with the `SampleID` column as its index.
inputs['id_map'] = DATA_FOLDER.joinpath("Validation Results", "id.xlsx")
id_map = pd.read_excel(inputs['id_map'], index_col='SampleID')
id_map

In [None]:
# Count how often each 4-character prefix of the `CBMRID` occurs and print
# the full table (to_string avoids truncation of the output).
prefix_counts = id_map["CBMRID"].str.slice(stop=4).value_counts()
print(prefix_counts.to_string())

## Select cohorts

In [None]:
def _select_idx(query: str,
                expected: int,
                id_map: pd.DataFrame = id_map,
                id_col: str = 'CBMRID'):
    """Return the IDs in `id_col` that match `query` and check their number.

    Parameters
    ----------
    query : str
        Pattern searched for in `id_col` with `Series.str.contains`, i.e. it
        is interpreted as a regular expression.
    expected : int
        Number of matching IDs that has to be found.
    id_map : pd.DataFrame
        Table holding the IDs. The default is the `id_map` object that exists
        at the time this function is defined.
    id_col : str
        Column of `id_map` that is searched and whose values are returned.

    Returns
    -------
    list
        Values of `id_col` for all matching rows, in row order.

    Raises
    ------
    AssertionError
        If the number of matching IDs differs from `expected`.
    """
    # na=False: rows without an ID count as "no match". Without it missing
    # values produce NaN in the mask, which cannot be used for boolean
    # indexing.
    mask = id_map[id_col].str.contains(query, na=False)
    idx = id_map.loc[mask, id_col].to_list()
    if len(idx) != expected:
        # Raised explicitly instead of using `assert`, so the check is not
        # stripped when Python runs with -O; the exception type is unchanged.
        raise AssertionError(
            f"Expected {expected} samples matching {query!r} in column "
            f"{id_col!r}, not {len(idx)}")
    return idx

In [None]:
# IDs whose `CBMRID` contains 'ProD'; the helper enforces that exactly 29
# are found.
idx_prodoc = _select_idx(
    query='ProD',
    expected=29,
)
# idx_prodoc  # uncomment to inspect the selected IDs

In [None]:
# IDs whose `CBMRID` contains 'Cflow'; the helper enforces that exactly 101
# are found.
idx_circaflow = _select_idx(
    query='Cflow',
    expected=101,
)
# idx_circaflow  # uncomment to inspect the selected IDs

In [None]:
# Pick the measurement column for the selected ProDoc IDs and pivot the last
# index level into columns (long -> wide format).
olink_prodoc_val = (
    olink
    .loc[idx_prodoc, measure_olink.measure]
    .unstack(level=-1)
)
olink_prodoc_val.describe()

In [None]:
# Save the wide ProDoc table: the pickle path is the registered output, the
# Excel copy uses the same stem.
stem = 'olink_prodoc_val'
outputs[stem] = config.data_processed / f'{stem}.pkl'
olink_prodoc_val.to_pickle(outputs[stem])
olink_prodoc_val.to_excel(outputs[stem].with_suffix('.xlsx'))

In [None]:
# Pick the measurement column for the selected CircaFlow IDs and pivot the
# last index level into columns (long -> wide format).
olink_cflow = (
    olink
    .loc[idx_circaflow, measure_olink.measure]
    .unstack(level=-1)
)
olink_cflow.describe()

Integrate update from Rasmus (last three non-matching IDs)

In [None]:
# Additional Olink measurements delivered as a separate file; read and
# indexed the same way as the main Olink table.
inputs['olink_update'] = DATA_FOLDER.joinpath("Validation Results", "update_olink_221204.tsv")
olink_update = (
    pd.read_table(inputs['olink_update'], sep='\t', low_memory=False)
    .set_index(measure_olink.idx)
)

# Wide format of the update: the last index level is moved into the columns.
olink_cflow_update = olink_update[measure_olink.measure].unstack()
olink_cflow_update

In [None]:
# Look up the samples of the update file in the CircaFlow table.
# NOTE(review): this only displays the selected rows; `olink_cflow` is not
# modified here, so the values of `olink_cflow_update` are not merged in this
# cell - confirm that no explicit integration step is needed before saving.
olink_cflow.loc[olink_cflow_update.index]

In [None]:
# Save the wide CircaFlow table: here the Excel path is the registered
# output, the pickle copy uses the same stem.
stem = 'olink_cflow'
cflow_xlsx = config.data_processed / f'{stem}.xlsx'
outputs[stem] = cflow_xlsx
olink_cflow.to_excel(cflow_xlsx)
olink_cflow.to_pickle(cflow_xlsx.with_suffix('.pkl'))

Log all input and selected output files 

In [None]:
# Display the registry of input files (key -> path) collected in this notebook.
inputs

In [None]:
# Display the registry of output files (key -> path) collected in this notebook.
outputs