In [None]:
import sys
import pandas as pd
sys.path.append('../')
from pepq.io import load_json
from pepq.data import build_data


# Read the JSON inputs first, then the DockQ CSV tables that accompany them:
# the core56 test case, the main training set, and the extra "train_do" set.
core = load_json('../data/testcase/core56.json')
train = load_json('../data/train/train.json')
train_do = load_json('../data/train/train_do.json')

dockq_core = pd.read_csv('../data/testcase/dockq_core56.csv')
dockq_train = pd.read_csv('../data/train/dockq_train.csv')
dockq_train_do = pd.read_csv('../data/train/dockq_train_do.csv')

# Pass each loaded input and its DockQ table through build_data. `core` and
# `train` are rebound here from the loaded JSON to the built dataset.
core = build_data(core, dockq_core)
train = build_data(train, dockq_train)
train_docking_only = build_data(train_do, dockq_train_do)

# NOTE(review): the third caption says "dropout" while the variable is named
# `train_docking_only` -- confirm which reading of "do" is intended.
for _caption, _dataset in (
    ("Core dataset shape:", core),
    ("Train dataset shape:", train),
    ("Train dropout dataset shape:", train_docking_only),
):
    print(_caption, _dataset.shape)

# Stack the extra set under the main training data; ignore_index=True
# renumbers the rows of the combined result.
combined_parts = [train, train_docking_only]
train = pd.concat(combined_parts, ignore_index=True)
print("Combined train dataset shape:", train.shape)

In [None]:
from pepq.eda import DockEDA, train_default_rf_importance

# Prepare the EDA object on the core set step by step: missing-value summary,
# basic statistics, then correlations (requested with with_target_only=True).
eda = DockEDA(core, target_col="dockq", secondary_target_col="label")
eda = eda.compute_missing_summary()
eda = eda.compute_basic_stats()
eda = eda.compute_correlations(with_target_only=True)

# Feature importances from the default RF helper in regression mode,
# registered on the EDA object so the overview figure can use them.
imp = train_default_rf_importance(eda, task="regression")
eda.set_feature_importance(imp)

# Overview figure restricted to top_k=10; the plot call stays the last
# expression so the cell displays whatever it returns.
overview_options = dict(
    top_k=10,
    standardize=True,
    annot_corr=False,
    figsize=(9.0, 7.0),
)
eda.plot_overview_nature(**overview_options)

In [None]:

# Plots grouped or coloured by the "label" column.
eda.plot_label_violins()
eda.plot_distributions(hue="label")
eda.plot_pca(color="label")

# Pairwise regression panels over the features returned by
# `_top_k_features_by_target_corr(6)`, drawn with sample=1500.
# NOTE(review): this reaches into a private DockEDA method; prefer a public
# accessor if the class offers one.
pairwise_features = eda._top_k_features_by_target_corr(6)
eda.plot_pairwise(
    features=pairwise_features,
    kind="reg",
    hue="label",
    sample=1500,
)

# Correlation clustermap, then iptm vs. composite_ptm hexbin with c="dockq".
eda.plot_corr_clustermap()
eda.plot_hexbin(x="iptm", y="composite_ptm", c="dockq")

In [None]:
from pepq.eda import DockEDA, train_default_rf_importance

# Second overview figure: the same analysis as the earlier overview cell,
# narrowed to top_k=6.
#
# The `eda` object built in that earlier cell was constructed from the same
# `core` data with the same arguments and already carries the missing-value
# summary, basic stats, target correlations and the RF importances (`imp`).
# Reusing it instead of rebuilding everything avoids a redundant random-forest
# fit and guarantees both overview figures are drawn from the same importances.
#
# Requires the earlier overview cell to have been executed (normal
# Restart & Run All order).
eda.plot_overview_nature(top_k=6, standardize=True, annot_corr=False,
                         figsize=(9.0, 7.0))
