Here, we run a simple test. We compare accuracy predicting a source, given just that source's data, to predicting a source, given the entire compiled dataset. This is a demonstration of the "Generalization across assays, cell types, NP types, etc." section of the project report.

We do a class-balanced (stratified K-fold) on the source in question, and train two models for each fold -- one with only access to the source's dataset, and one with access to the entire dataset.

In [153]:
import numpy as np
import pandas as pd

In [154]:
# Load in the dataset
df = pd.read_csv('compiled_datasets/nanotox_features_dataset.csv')
X = df.values[:, 1:-1]
y = df['Cell Viability'].values

In [156]:
# Generate the train/test splits
from sklearn.model_selection import StratifiedKFold

source_values = df['Source'].values
source_to_comp = 'Subramanian, 2021'
rs = 999

source_mask = np.where(source_values == source_to_comp)[0]
oo_source_mask = np.where(source_values != source_to_comp)[0]

skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=rs)
within_source_splits = [
    (source_mask[trn], source_mask[tst])
    for trn, tst in skf.split(np.zeros_like(source_mask), y[source_mask])
]
oo_source_splits = [
    (np.concatenate((oo_source_mask, trn)), tst)
    for trn, tst in within_source_splits
]

In [158]:
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier

# Train the model on just the source in question
model = RandomForestClassifier(class_weight='balanced')

{
    name: (np.mean(vals), np.std(vals))
    for name, vals in
    cross_validate(
        estimator=model,
        X=X, y=y, cv=within_source_splits,
        n_jobs=-1,
        scoring=['accuracy', 'balanced_accuracy', 'roc_auc', 'recall']
    ).items()
}

{'fit_time': (0.18665568033854166, 0.004147575493459726),
 'score_time': (0.029611984888712566, 0.0007170995242080531),
 'test_accuracy': (0.8680847220340514, 0.035855387345060746),
 'test_balanced_accuracy': (0.7139877031181379, 0.06586925987457957),
 'test_roc_auc': (0.9436492727006561, 0.0168263267092698),
 'test_recall': (0.4683794466403162, 0.11880741790663352)}

In [159]:
# Train the model using the entire dataset as background
{
    name: (np.mean(vals), np.std(vals))
    for name, vals in
    cross_validate(
        estimator=model,
        X=X, y=y, cv=oo_source_splits,
        n_jobs=-1,
        scoring=['accuracy', 'balanced_accuracy', 'roc_auc', 'recall']
    ).items()
}

{'fit_time': (1.2057866255442302, 0.01910808896375511),
 'score_time': (0.03050971031188965, 0.00027944556075204486),
 'test_accuracy': (0.8735943639073297, 0.028221035620313837),
 'test_balanced_accuracy': (0.7517029209575794, 0.051358046821068835),
 'test_roc_auc': (0.9502311844209078, 0.022102898546548722),
 'test_recall': (0.5573122529644269, 0.1015946121141344)}