# Compare data splits

- compare two data splits by a binary target variable

In [None]:
import logging
from pathlib import Path
import pandas as pd
import sweetviz


import config


def find_val_ids(df: pd.DataFrame, val_ids: str = None, val_ids_query: str = None, sep=',') -> list:
    """Return the index labels that form the validation (test) split.

    The IDs are taken from ``val_ids`` if it is given; otherwise they are
    looked up in the index of ``df`` using ``val_ids_query``.

    Parameters
    ----------
    df:
        Data whose index holds the sample IDs.
    val_ids:
        IDs as a single string separated by ``sep``, e.g. ``'ID1,ID2'``.
        If empty or ``None``, ``val_ids_query`` is used instead.
    val_ids_query:
        Substring to search for in the index labels of ``df``. Every label
        containing it is selected. Only used when ``val_ids`` is empty.
    sep:
        Separator used to split ``val_ids``.

    Returns
    -------
    list
        The selected IDs.

    Raises
    ------
    ValueError
        If neither ``val_ids`` nor ``val_ids_query`` is provided, or if
        ``val_ids`` is provided but is not a string.
    """
    if val_ids:
        if not isinstance(val_ids, str):
            raise ValueError("Provide IDs in csv format as str: 'ID1,ID2'")
        return val_ids.split(sep)

    if not val_ids_query:
        raise ValueError("Provide a query string.")

    logging.warning(f"Querying index using: {val_ids_query}")
    # Use the query that was passed in (previously hard-coded to 'Cflow',
    # which silently ignored the caller's argument).
    val_ids = df.filter(like=val_ids_query, axis=0).index.to_list()
    logging.warning(f"Found {len(val_ids)} Test-IDs")
    return val_ids

## Parameters

In [None]:
# Input files (paths come from the project config); both are read with pd.read_pickle below.
fname_pkl_clinic = config.fname_pkl_all_clinic_num
fname_pkl_olink = config.fname_pkl_all_olink

# Name of the target column: rows with a missing value in it are dropped,
# and it is passed to sweetviz as `target_feat`.
TARGET = 'hasLiverAdm180'
# Output folder; if left empty it defaults to <config.folder_reports>/<TARGET>.
FOLDER = ''
# NOTE(review): not referenced anywhere in the visible part of this notebook - confirm it is still needed.
feat_set_to_consider: str = 'OLINK_AND_CLINIC'
# Explicit test IDs as one comma-separated string ('ID1,ID2'); if empty, VAL_IDS_query is used.
VAL_IDS = ''
# Passed to find_val_ids as `val_ids_query` (only relevant when VAL_IDS is empty).
VAL_IDS_query = "Cflow"
# File name (without extension) of the HTML report written into FOLDER.
name_report = 'train_val_comparison'

In [None]:
# Resolve the output folder: use the user-supplied FOLDER if set, otherwise
# fall back to <config.folder_reports>/<TARGET>.
FOLDER = Path(FOLDER) if FOLDER else Path(config.folder_reports) / TARGET
# Create the folder in both cases, including missing parent folders, so that
# writing the report later does not fail on a fresh checkout or custom path.
FOLDER.mkdir(parents=True, exist_ok=True)
FOLDER

## Read data

In [None]:
# Load the clinical table first, then attach the columns of the OLINK table
# (DataFrame.join aligns both tables on their index).
data = pd.read_pickle(fname_pkl_clinic)
data = data.join(pd.read_pickle(fname_pkl_olink))
data

# cols = njab.pandas.get_colums_accessor(clinic)

In [None]:
# Resolve the IDs of the test split: taken from VAL_IDS if it is set,
# otherwise looked up in the index of `data` via VAL_IDS_query.
test_ids = find_val_ids(data, val_ids=VAL_IDS, val_ids_query=VAL_IDS_query)
# test_ids  # uncomment to display the resolved IDs

Split the data into a training and a test set, then retain only the entries with a non-missing target.

In [None]:
# Rows whose index label is listed in `test_ids` form the test split;
# every remaining row belongs to the training split.
test_split = data.loc[test_ids]
train_split = data.drop(labels=test_ids, axis=0)
(train_split.shape, test_split.shape)

In [None]:
# Keep only the rows with an observed value in the TARGET column, in both splits.
train_split, test_split = (
    split.dropna(subset=[TARGET]) for split in (train_split, test_split)
)
(train_split.shape, test_split.shape)

In [None]:
# def find_unique(df:pd.DataFrame) -> pd.Index:
#     drop_cols = df.describe(include='all').loc['unique'] == 1
#     drop_cols = df.columns[drop_cols]
#     return drop_cols

# drop_cols = find_unique(test_split)
# test_split[drop_cols].describe(include='all') if not test_split[drop_cols].empty else "None"

In [None]:
# drop_cols = find_unique(train_split)
# train_split[drop_cols].describe(include='all') if not train_split[drop_cols].empty else "None"

In [None]:
# test_split = test_split.drop(drop_cols, axis=1)
# train_split = train_split.drop(drop_cols, axis=1)

## Create Report

In [None]:
# Build the sweetviz comparison of the two splits with TARGET as target
# feature; the pairwise analysis is switched off.
sweetviz_report = sweetviz.compare(
    [train_split, 'training data'],
    [test_split, 'test data'],
    target_feat=TARGET,
    pairwise_analysis='off',
)
# Write the HTML report into the output folder.
report_file = FOLDER / f'{name_report}.html'
sweetviz_report.show_html(filepath=report_file)