# Explorative Analysis

In [None]:
import numpy as np
import pandas as pd

import pingouin as pg

import src.stats
from src.sklearn import run_pca, StandardScaler

import config

## Set parameters

In [None]:
# Column of `clinic` used as the outcome that splits patients into two groups.
TARGET = 'dead_wi_90_f_infl_sample'

In [None]:
# Load the pickled clinical and Olink tables from the paths set in `config`.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
clinic = pd.read_pickle(config.fname_pkl_clinic)
olink = pd.read_pickle(config.fname_pkl_olink)

In [None]:
# Contingency table of `DiagnosisPlace` versus `dead`.
pd.crosstab(clinic.DiagnosisPlace, clinic.dead)

FirstAdmission is also right-censored

In [None]:
# Time between diagnosis and first admission; missing admission dates are
# replaced by the study end date before taking the difference.
time_from_diagnose_to_first_admission = (
    clinic["DateFirstAdmission"]
    .fillna(config.STUDY_ENDDATE)
    .sub(clinic["DateDiagnose"])
)
time_from_diagnose_to_first_admission.describe()

Who died without having a first admission date?

In [None]:
# Mask of patients flagged as dead who have no first-admission date.
dead_wo_adm = clinic["DateFirstAdmission"].isna() & clinic['dead']
# Index labels of those patients, printed once and shown in detail below.
idx_dead_wo_adm = dead_wo_adm[dead_wo_adm].index
print('Dead without admission to hospital:', *idx_dead_wo_adm)
clinic.loc[
    dead_wo_adm,
    ["DateFirstAdmission", "DateDiagnose", "Admissions"],
]

## Differences between groups defined by target

In [None]:
# Display the clinical table for a visual check.
clinic

In [None]:
# Number of patients per value of the target column.
clinic[TARGET].value_counts()

In [None]:
# Contingency table of the target versus the decompensation column.
# NOTE(review): the key is spelled 'DecomensatedAtDiagnosis' — confirm it matches the column name in `clinic`.
pd.crosstab(clinic[TARGET], clinic["DecomensatedAtDiagnosis"])

In [None]:
# Boolean version of the target column, used below as a mask to split patients into two groups.
happend = clinic[TARGET].astype(bool)

### Continuous

In [None]:
# Compare `Age` between the two target groups with pingouin's t-test.
var = 'Age'
# Cross-check with scipy (noted by the author to give the same results as pingouin):
# import scipy.stats
# scipy.stats.ttest_ind(clinic.loc[happend, var], clinic.loc[~happend, var], equal_var=False)
pg.ttest(clinic.loc[happend, var], clinic.loc[~happend, var])

In [None]:
# Differential analysis of the continuous clinical variables between the two
# target groups, displayed in order of increasing t-test p-value.
vars_cont = config.clinic_data.vars_cont
ana_differential = src.stats.diff_analysis(clinic[vars_cont], happend, event_names=('died', 'alive'))
ana_differential.sort_values(by=('ttest', 'p-val'))

### Binary

In [None]:
# Summary statistics of the variables listed in `config.clinic_data.vars_binary`.
clinic[config.clinic_data.vars_binary].describe()

Might focus on discriminative power of
  - DecompensatedAtDiagnosis 
  - alcohol consumption
  
but the more acute diseases, such as heart disease and cancer, seem to be distinctive

In [None]:
# One binomial test per binary variable, comparing the two target groups.
# NOTE(review): the first entry of `vars_binary` is skipped; the reason is not visible here.
diff_binomial = [
    src.stats.binomtest(clinic[var], happend)
    for var in config.clinic_data.vars_binary[1:]
]
pd.concat(diff_binomial).sort_values(by=('binomial test', 'pvalue'))

## Olink

In [None]:
# Summary statistics of the Olink columns that contain at least one missing value.
olink.loc[:, olink.isna().any()].describe()

In [None]:
# Differential analysis of all Olink features between the two target groups,
# displayed in order of increasing t-test p-value.
ana_diff_olink = src.stats.diff_analysis(
    olink,
    happend,
    event_names=('died', 'alive'),
)
ana_diff_olink.sort_values(by=('ttest', 'p-val'))

## PCA 

### Missing values handling

In [None]:
def info_missing(df):
    """Report how many cells of ``df`` are missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect.

    Returns
    -------
    str
        The message that is also printed: number of missing cells, total
        number of cells, and the percentage of cells that are missing.
    """
    # The size is taken from `df` itself so the report is correct for any frame.
    N, M = df.shape
    n_total = N * M
    n_missing = int(df.isna().sum().sum())
    # A frame without cells has nothing missing; avoid dividing by zero.
    pct_missing = n_missing / n_total * 100 if n_total else 0.0
    msg = "{} missing features out of {} measurments, corresponding to {:.3f}%"
    msg = msg.format(n_missing, n_total, pct_missing)
    print(msg)
    return msg

# Print the missing-value summary for the Olink data; the returned message is discarded.
_ = info_missing(olink)

### PCA on scaled data 

- missing values are set to zero after scaling (i.e. to the feature mean, if the scaler centers the data)

In [None]:
# Scale the Olink features first, then replace the remaining NaNs by zero.
olink_scaled = (
    StandardScaler()
    .fit_transform(olink)
    .fillna(0)
)

# n_components=None: no explicit limit on the number of components.
PCs, pca = run_pca(olink_scaled, n_components=None)
PCs

In [None]:
# Variable with the largest absolute loading on the first principal component.
# Assumes `pca` follows scikit-learn's layout, where `components_` has shape
# (n_components, n_features): the first PC is row 0, with one entry per feature.
olink.columns[np.argmax(np.abs(pca.components_[0]))]

In [None]:
# Explained variance ratio per component and its running total, plotted per PC.
exp_var_olink = pd.DataFrame({'explained variance': pca.explained_variance_ratio_})
exp_var_olink["explained variance (cummulated)"] = exp_var_olink['explained variance'].cumsum()
exp_var_olink = exp_var_olink.rename_axis('PC')
ax = exp_var_olink.plot()

### Logistic Regression

In [None]:
import sklearn
# Submodules that are accessed as attributes of `sklearn` in the cells below;
# a bare `import sklearn` does not guarantee that they are loaded.
import sklearn.linear_model
import sklearn.utils.class_weight
from sklearn.metrics import auc, precision_recall_curve, roc_curve

from src.sklearn.scoring import ConfusionMatrix

# Outcome labels and the first five principal components as features.
y_true = clinic[TARGET]
X = PCs.iloc[:,:5]
y_true.value_counts()

#### With weights

In [None]:
# Per-sample weights from the 'balanced' class-weight scheme.
weights = sklearn.utils.class_weight.compute_sample_weight('balanced', y_true)

# Fit the logistic regression on the PCs using those sample weights.
log_reg = sklearn.linear_model.LogisticRegression().fit(
    X=X,
    y=y_true,
    sample_weight=weights,
)

In [None]:
# ref_score: share of the most frequent target value relative to all rows of `clinic`.
# model_score: unweighted score of the fitted model on the training data.
scores = {
    'ref_score': (y_true.value_counts() / len(clinic)).max(),
    'model_score': log_reg.score(X, y_true, sample_weight=None),
}

scores

In [None]:
# Predicted labels on the same data the model was fitted on.
y_pred = log_reg.predict(X)

# Confusion matrix of true versus predicted labels, shown as a DataFrame.
ConfusionMatrix(y_true, y_pred).as_dataframe

In [None]:
# One row per patient: target, predicted label and the `dead` flag as integer.
pivot = (
    y_true.to_frame()
    .assign(pred=y_pred)
    .join(clinic['dead'].astype(int))
)
# Only the first two rows of the summary (count and mean).
pivot.describe().head(2)

In [None]:
# Sum of `pred` for each combination of target value (rows) and `dead` (columns).
pd.pivot_table(pivot, values='pred', index=TARGET, columns='dead', aggfunc='sum')

In [None]:
# Sum of `dead` for each combination of target value (rows) and `pred` (columns).
pd.pivot_table(pivot, values='dead', index=TARGET, columns='pred', aggfunc='sum')

In [None]:
# More detailed view: per (pred, target) group, the number of patients and how many of them are flagged dead.
pivot.groupby(['pred', TARGET]).agg({'dead': ['count', 'sum']})

#### Without weights, but adapting cutoff

In [None]:
# Refit the same estimator object without sample weights; this overwrites the weighted fit from above.
log_reg = log_reg.fit(X=X, y=y_true, sample_weight=None)

# Column 1 of predict_proba, i.e. the second class in `log_reg.classes_` (presumably the positive label — confirm).
y_prob = log_reg.predict_proba(X)[:,1]
# Label as 1 when the probability exceeds 0.21.
# NOTE(review): the cutoff 0.21 is hard-coded; its origin is not visible here.
y_pred = pd.Series((y_prob > 0.21), index=PCs.index).astype(int)

# TODO: augment this with whether the patient has died by now, to see who is "wrongly classified".
ConfusionMatrix(y_true, y_pred).as_dataframe

In [None]:
# One row per patient: cutoff-based prediction, target and the `dead` flag as integer.
pivot = (
    y_pred.to_frame('pred')
    .join(y_true)
    .join(clinic['dead'].astype(int))
)
# Only the first two rows of the summary (count and mean).
pivot.describe().head(2)

How many of those who have been predicted to die will actually die?

In [None]:
# Sum of `pred` for each combination of target value (rows) and `dead` (columns).
pd.pivot_table(pivot, values='pred', index=TARGET, columns='dead', aggfunc='sum')

In [None]:
# More detailed view: per (pred, target) group, the number of patients and how many of them are flagged dead.
pivot.groupby(['pred', TARGET]).agg({'dead': ['count', 'sum']})

In [None]:
# ROC curve of the unweighted model. `roc_curve` returns one threshold per
# (fpr, tpr) point in matching order, so the arrays are stored as returned to
# keep every cutoff next to the point it produces.
fpr, tpr, cutoffs = roc_curve(y_true, y_prob)
roc = pd.DataFrame([fpr, tpr, cutoffs], index='fpr tpr cutoffs'.split())
ax = roc.T.plot('fpr', 'tpr', ylabel='tpr')

In [None]:
# Precision-recall curve of the unweighted model. `precision` and `recall` have
# one entry more than the thresholds (the final point has no threshold), so the
# cutoffs are padded with NaN at the end and kept in their returned order; each
# cutoff then sits next to its own precision/recall pair.
precision, recall, cutoffs = precision_recall_curve(y_true, y_prob)
prc = pd.DataFrame(
    [precision, recall, np.append(cutoffs, np.nan)],
    index='precision recall cutoffs'.split(),
)
prc

In [None]:
# Plot precision against recall from the table built above.
ax = prc.T.plot('recall', 'precision', ylabel='precision')