# Explorative Analysis

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

import seaborn

import njab.plotting
from njab.sklearn import run_pca, StandardScaler

import config
import njab

# Set parameters

In [None]:
# Notebook parameters.
OLINK = config.fname_pkl_prodoc_olink  # not referenced in the cells shown here
TARGET = 'project'  # sub-folder name used as fallback for the report folder
FOLDER = Path(config.folder_reports).joinpath('bridges')  # output directory

In [None]:
# Keep track of the input files used by this notebook.
inputs = {'bridging_samples': config.data_processed / 'bridges.pkl'}
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
olink_bridge = pd.read_pickle(inputs['bridging_samples'])
olink_bridge.sample(n=10)  # show ten randomly drawn entries

In [None]:
# Bring the index levels into the order Project, SampleID, Assay.
olink_bridge = olink_bridge.reorder_levels(
    order=['Project', 'SampleID', 'Assay'])
olink_bridge.sample(n=2)  # show two randomly drawn entries

In [None]:
# Resolve the output folder and make sure it exists.
# A `Path` instance is always truthy, so the fallback can only trigger when
# FOLDER was overridden with None or an empty string (presumably by a
# parameterised notebook run - TODO confirm). Such an override may also be a
# plain string, hence the explicit conversion to `Path` before `.mkdir`.
FOLDER = Path(FOLDER) if FOLDER else Path(config.folder_reports) / TARGET
FOLDER.mkdir(exist_ok=True, parents=True)
FOLDER

## Compare subsets

In [None]:
# Keep only the NPX values and move the last index level into the columns,
# yielding a wide table.
olink_bridge = olink_bridge['NPX'].unstack(level=-1)
olink_bridge

# Differences between two batches

- create a dummy variable to indicate the batch each sample belongs to

In [None]:
# Batch indicator: 1 for every entry of the index by default ...
badge_tag = pd.Series(data=1, index=olink_bridge.index, name='batch')
# ... and 0 for the entries selected by the label '20202249'.
badge_tag.loc['20202249'] = 0
badge_tag

In [None]:
# Boolean version of the batch indicator (1 -> True, 0 -> False).
happend = badge_tag.astype(dtype=bool)

## Olink - uncontrolled

In [None]:
# Refer to the wide NPX table under a shorter name (same object, no copy).
olink = olink_bridge
olink

In [None]:
# The analysis below expects a complete table: fail if any value is missing.
assert not olink.isna().any().any()
# olink.loc[:, olink.isna().any()].describe()

In [None]:
# Compare the two batches and order the result ascending by t-test p-value.
ana_diff_olink = (
    njab.stats.groups_comparision.diff_analysis(
        olink,
        happend,
        event_names=('2nd batch', '1st batch'),
    )
    .sort_values(by=('ttest', 'p-val'))
)
# Persist the full table in the report folder.
ana_diff_olink.to_excel(FOLDER / "DA_batches.xlsx")

ana_diff_olink.head(n=20)

# PCA

## Missing values handling

In [None]:
def info_missing(df):
    N, M = olink.shape
    msg = "{} missing features out of {} measurments, corresponding to {:.3f}%"
    msg = msg.format(df.isna().sum().sum(), N * M,
                     df.isna().sum().sum() / (N * M) * 100)
    print(msg)
    return msg


# Print the missing-value summary for `olink`; the returned message is discarded.
_ = info_missing(olink)

## PCA on scaled data

- missing values set to zero

In [None]:
# Scale the table first, then replace the remaining missing values by zero.
olink_scaled = (
    StandardScaler()
    .fit_transform(olink)
    .fillna(value=0)
)

# n_components=None is passed through to run_pca
# (presumably keeps all components - TODO confirm).
PCs, pca = run_pca(olink_scaled, n_components=None)
PCs.head(10).iloc[:, :10]  # first ten rows and first ten columns

In [None]:
# Variable with the largest absolute loading on the first principal component.
# Assuming `pca` is a fitted scikit-learn PCA (TODO confirm against run_pca):
# `components_` has shape (n_components, n_features), so the eigenvector of
# the first PC is the first *row*; its arg max is a feature position and can
# be used to index the columns.
olink.columns[np.argmax(np.abs(pca.components_[0]))]

In [None]:
# Explained variance ratio per principal component and its running total.
exp_var_olink = pd.DataFrame(
    {'explained variance': pca.explained_variance_ratio_})
exp_var_olink["explained variance (cummulated)"] = (
    exp_var_olink['explained variance'].cumsum())
exp_var_olink = exp_var_olink.rename_axis(index='PC')
# Plot both columns and store the figure in the report folder.
ax = exp_var_olink.plot()
fig = ax.get_figure()
njab.plotting.savefig(fig, name=FOLDER / '1_PCs_distribution')

In [None]:
# Scatter plot of the first against the second column of `PCs`,
# coloured by the batch indicator.
ax = seaborn.scatterplot(x=PCs.iloc[:, 0], y=PCs.iloc[:, 1], hue=badge_tag)
fig = ax.get_figure()
# Store the figure in the report folder.
njab.plotting.savefig(fig, name=FOLDER / '1_PC1_vs_PC2.pdf')