In [None]:
from os.path import expanduser

import datamol as dm
import lightgbm as lgb
import numpy as np
import pandas as pd
from molfeat.trans import MoleculeTransformer
from molfeat.trans.fp import FPVecTransformer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from openadmet_models.anvil.anvil_workflow import AnvilSpecification

### Convert target data to categorical

In [77]:
# Configure database and threshold
db = "ache_chembl"
conc_thresh = 30  # uM
pchembl_thresh = -np.log10(conc_thresh * 1e-6)

# Construct data frame
df = pd.read_parquet(expanduser(f"~/sandbox/databases/{db}/source/{db}.parquet"))
df["is_active"] = df["pchembl_value"] > pchembl_thresh

# Stats
print("Conc:\t\t{:.1f} uM".format(conc_thresh))
print("Threshold:\t{:.2f}".format(pchembl_thresh))
print("N negatives:\t{}".format(np.sum(df["pchembl_value"] <= pchembl_thresh)))
print(
    "frac negatives:\t{:.3f}%".format(
        100 * np.sum(df["pchembl_value"] <= log_thresh) / len(df)
    )
)

Conc:		30.0 uM
Threshold:	4.52
N negatives:	503
frac negatives:	8.281%


### "Manual" classification

In [81]:
X = df["PARENT_SMILES"].values
y = df["is_active"].values

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [None]:
vec_featurizer = FPVecTransformer("ecfp:4", dtype=np.float32)
trans = MoleculeTransformer(
    vec_featurizer,
    n_jobs=-1,
    dtype=np.float32,
    parallel_kwargs={"progress": True},
    verbose=False,
)

with dm.without_rdkit_log():
    X_train_feat, _ = trans(X_train, ignore_errors=True)
    X_test_feat, _ = trans(X_test, ignore_errors=True)

X_train_feat = np.squeeze(X_train_feat)
X_test_feat = np.squeeze(X_test_feat)


In [94]:
# Create LightGBM dataset
train_data = lgb.Dataset(X_train_feat, label=y_train)
test_data = lgb.Dataset(X_test_feat, label=y_test, reference=train_data)

# Set model parameters
params = {
    "objective": "binary",
    "metric": "binary_error",
    "boosting_type": "gbdt",
    "learning_rate": 0.1,
    "num_leaves": 31,
    "verbose": -1,
}

# Train model
model = lgb.train(
    params,
    train_data,
    valid_sets=[test_data],
    num_boost_round=100,
)

# Predict on test set
y_pred = (model.predict(X_test_feat) > 0.5).astype(int)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
null = np.sum(y_test) / len(y_test)

print(f"Accuracy: {accuracy:.4f}")
print(f"Null accuracy: {null:.4f}")

Accuracy: 0.9263
Null accuracy: 0.9236


### Run basic anvil (regression)

In [None]:
spec = AnvilSpecification.from_recipe("anvil_recipe.yaml")
wf = spec.to_workflow()
wf.run(debug=False, output_dir="output/")