In [15]:
import pandas as pd
import numpy as np
import catboost as cb
import re

data = pd.read_csv("datasetCELL.csv", index_col=None)
data2 = pd.read_csv("datasetCELL2.csv", index_col=None)

def clean_col(s):
    return s.astype(str).str.lower().str.strip()

for col in data.select_dtypes(include=['object', 'category']).columns:
    data[col] = clean_col(data[col])


organic_families = [
    'liposome', 'thermosensitive liposome', 'cationic liposome',
    'solid lipid nanoparticles', 'polymer-based nanoparticles',
    'albumin nanoparticles', 'dendrimer', 'micelle',
    'extracellular vesicles'
]



inorganic_families = ['magnetic liposome', 'gold nanoparticles','magnetic nanoparticles', 'inorganic nanoparticles']

organic_families2 = ['polymer-based nanoparticles']




data = data.rename(columns={'EE%': 'EE'})
data['group'] = data2['group'].astype(str)
print(data['group'].isnull().sum())

for col in data.columns:
    if data[col].dtype == 'object':
        data[col] = data[col].str.lower().str.strip()

data = data.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
data = data[data['prep'].isin(organic_families)]

print(data['prep'].unique())


print(data.columns)
print(data.drugcarrierratio)
data['group']

0
['liposome' 'thermosensitive liposome' 'cationic liposome'
 'solid lipid nanoparticles' 'polymer-based nanoparticles'
 'albumin nanoparticles' 'dendrimer' 'micelle' 'extracellular vesicles']
Index(['prep', 'comp', 'PEG', 'PEGRatio', 'surfacecoating', 'ligandtype',
       'ligand', 'mechanism', 'receptortype', 'receptor', 'Pgp',
       'Pgpinhibitor', 'size', 'PDI', 'zeta', 'EE', 'DL', 'animalmodel',
       'route', 'liganddensitymol', 'liganddensitynum', 'encapsulation',
       'external', 'ID', 'drugcarrierratio', 'modratiounmod', 'charge',
       'exact_mass', 'xlogp', 'tpsa', 'atom_stereo_count', 'bond_stereo_count',
       'h_bond_donor_count', 'h_bond_acceptor_count', 'rotatable_bond_count',
       'heavy_atom_count', 'complexity', 'charge1', 'exact_mass1', 'xlogp1',
       'tpsa1', 'atom_stereo_count1', 'bond_stereo_count1',
       'h_bond_donor_count1', 'h_bond_acceptor_count1',
       'rotatable_bond_count1', 'heavy_atom_count1', 'complexity1', 'group'],
      dtype='object')

0         1
1         1
2         2
3         4
4         4
       ... 
643    1373
646    1376
647    1376
649    1405
650    1405
Name: group, Length: 390, dtype: object

In [16]:
from GradientBoost import cv_regression_catboost, prep_dataset, tune_catboost_with_optuna, cv_regression_lgbm, ensemble
import re

data = data.dropna(subset=['ID'])
data = data[data['ID'] > 0].copy()
data['logID'] = np.log(data['ID'])
data['logDCR'] = np.log(data['drugcarrierratio'])
#data['binID'] = (data['ID'] > 0.05).astype(int)
data = data[np.isfinite(data['logID'])].copy()
target_col = 'logID'
group = 'group'

data = data.dropna(subset=[target_col])

#data['is_dual_ligand'] = data['ligand'].str.contains(r'and|/|\+', case=False)

def peptide_charge(x):
    x = str(x).upper()
    count = x.count('R') + x.count('K')
    if count >= 5: return 'high_cationic'
    if count >= 2: return 'medium_cationic'
    return 'neutral'
    
#data['ligand_charge'] = data['ligand'].apply(peptide_charge)


def peptide_length(x):
    seq = re.findall(r'[A-Z]{2,}', x.replace(' ', '').upper())
    if not seq:
        return 0
    return sum(len(s) for s in seq)

#data['ligand_peptide_length'] = data['ligand'].fillna('').apply(peptide_length)
#data['has_PEG'] = data['ligand'].str.contains('PEG', case=False)


In [22]:
feat_cols = ['prep', 'comp', 'PEG', 'PEGRatio', 'surfacecoating', 'ligandtype',
       'ligand', 'mechanism', 'receptortype', 'receptor', 
       'size', 'PDI', 'zeta', 'EE', 'DL', 'animalmodel',
       'route', 'liganddensitymol',
       'external', 
       'exact_mass', 'xlogp', 'tpsa', 'atom_stereo_count', 
       'h_bond_donor_count', 'h_bond_acceptor_count', 'rotatable_bond_count',
       'heavy_atom_count', 'complexity', 'xlogp1']
cat_cols = [f for f in feat_cols if data[f].dtype == 'object']


efflux_unc = pd.read_csv("efflux_uncertain.csv")
efflux = pd.read_csv("efflux.csv")
morgan = pd.read_csv("morgan.csv")

efflux = efflux_unc.drop(['Name','Original SMILES'], axis=1)
eff_rel = [
    'PGP_substrate',      # P-glycoprotein - PRIMARY BBB efflux pump
    'BCRP_substrate',     # BCRP/ABCG2 - Major BBB efflux pump  
    'PGP_inhibitor',      # P-gp inhibition can improve brain delivery
    'BCRP_inhibitor',     # BCRP inhibition can improve brain delivery
    'MRP1_substrate',     # MRP1 - Secondary BBB efflux pump
    'MRP1_inhibitor'      # MRP1 inhibition effect
]
#morg = [str(x) for x in range(2048)]
#morg2 = [str(x)+" morgan" for x in range(2048)]

data[eff_rel] = efflux[eff_rel]
#data = pd.concat([data,morgan[morg_rel]], axis=1)

feat_cols = ['prep', 'comp', 'PEG', 'PEGRatio', 'surfacecoating', 'ligandtype',
       'ligand', 'mechanism', 'receptortype', 'receptor', 'Pgp',
       'size', 'PDI', 'zeta', 'EE', 'DL', 'animalmodel',
       'route', 'liganddensitymol',
       'external', 
       'exact_mass', 'xlogp', 'tpsa', 'atom_stereo_count', 
       'h_bond_donor_count', 'h_bond_acceptor_count', 'rotatable_bond_count',
       'heavy_atom_count', 'complexity', 'xlogp1']

feat_cols = feat_cols + eff_rel

cat_df = prep_dataset(data, target_col=target_col, one_hot=False, feat=feat_cols,cat=cat_cols, group_col=group)
lgbm_df = prep_dataset(data, target_col=target_col, one_hot=True, feat=feat_cols,cat=cat_cols, group_col=group)
lgbm_df = lgbm_df.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
lgbm_df = lgbm_df.loc[:, ~lgbm_df.columns.duplicated()].copy()
lgbm_feats = [c for c in lgbm_df.columns if c != group and c!=target_col]

#print(lgbm_df.columns)
cat_df.to_csv('the_supreme.csv')

print(len(data['encapsulation'].unique()))

110


In [23]:
from variance_decomposition import decompose_variance

result = decompose_variance(
        df            = lgbm_df,
        target_col    = target_col,
        group_col     = group,
        feature_cols  = lgbm_feats,
        min_group_size = 2,
        save_plot     = True,
        save_excel    = False,
    )


  Excluding 46 groups with <2 samples (46 rows removed).

  Variance Decomposition Setup
  Samples:  338
  Groups:   112
  Features: 350
  Target:   logID
  Method:   Linear Mixed Effects (REML)

  [1/3] Decomposing TARGET variance...
        ICC = 0.414 (41.4% between-group)
        ðŸ“Š MODERATE ICC: logID has meaningful study variation.
           Group-aware modeling recommended.

  [2/3] Computing ICC for 350 features...
        5/350 done...
        10/350 done...
        15/350 done...
        20/350 done...
        25/350 done...
        30/350 done...
        35/350 done...
        40/350 done...
        45/350 done...
        55/350 done...
        60/350 done...
        65/350 done...
        70/350 done...
        75/350 done...
        80/350 done...
        85/350 done...
        90/350 done...
        95/350 done...
        105/350 done...
        110/350 done...
        115/350 done...
        120/350 done...
        130/350 done...
        135/350 done...
        140/3

FileNotFoundError: [Errno 2] No such file or directory: '/mnt/user-data/outputs/variance_decomposition_plot.png'

In [10]:
from recycling_v14 import TrainConfig, run_training_from_df

lgbm_feats = [c for c in lgbm_df.columns if c != group and c!=target_col]
cfg = TrainConfig(
        epochs=4000, patience=25, post_graph_patience=60,
        rebuild_adj_start=60, min_study_size=5,
        num_iterations=10, mask_rate=0.15,
        mice_max_iter=10, mice_epochs=100,
        max_missing_pct=0.80,
    )

out = run_training_from_df(lgbm_df, target_col, group, lgbm_feats, n_splits=5, cfg=cfg, ablate=True)
print("\n=== Summary ===")
print(f"\n  Hybrid  RMSE : {out['avg_rmse']:.4f} Â± {out['std_rmse']:.4f}  "
              f"RÂ²={out['avg_r2']:.4f}")
print(f"  CatBoost  RMSE : {out['avg_rmse_cb_only']:.4f}  "
      f"RÂ²={out['avg_r2_cb_only']:.4f}")

smoketest()

  [Data] Dropping 3 columns >80% missing: ['DL', 'liganddensitymol', 'xlogp1']

Missingness summary (347 cont features, 0 cat features):
  PEG: 3.1%
  PEGRatio: 63.3%
  Pgp: 45.3%
  size: 17.4%
  PDI: 48.7%
  zeta: 32.6%
  EE: 61.5%
  exact_mass: 33.1%
  xlogp: 39.3%
  tpsa: 33.1%
  atom_stereo_count: 33.1%
  h_bond_donor_count: 33.1%
  h_bond_acceptor_count: 33.1%
  rotatable_bond_count: 33.1%
  heavy_atom_count: 33.1%
  complexity: 33.1%

[Fold 1] Large=12 | Test seen_large=0 unseen=77
  [Fold 1] Fitting MICE imputer...
  [Early stop epoch 50] best_val=1.1102
[Fold 1/5] RMSE=2.8832 MAE=2.4894 RÂ²=-0.2649 | Phase1=3.3378

[Fold 2] Large=11 | Test seen_large=0 unseen=77
  [Fold 2] Fitting MICE imputer...
  [Epoch 60] Graph built (11 studies).
  [Graph] 1/11 isolated studies â€” self-loop only.
  [Early stop epoch 170] best_val=1.2596
[Fold 2/5] RMSE=3.2195 MAE=2.6884 RÂ²=-0.7554 | Phase1=3.2091

[Fold 3] Large=11 | Test seen_large=0 unseen=77
  [Fold 3] Fitting MICE imputer...
  [Epoch

KeyError: 'avg_rmse_cb_only'

In [19]:
results_cat = cv_regression_catboost(
    df=cat_df,
    target_col=target_col,
    cat_cols=cat_cols,
    feature_cols=feat_cols+[group],
    use_lmem_cleaning=False,
    cv_type="kfold",
    binary=False,
    group_col=None,
    n_splits=5,
    iterat=6000, learn=0.04987715801619947, dep=7,rand=42, leaf_reg=6.4038007118811135, rand_str=0.15167836674283808, bag_temp=0.3798466046212996, min_d=10, boot="Bayesian", 
    
)


[CatBoost CV] Fold 1
trained!
diddy: r2=0.9113508916150095
predicted!
R2=0.471, RMSE=1.772, MAE=1.365, Rho=0.674, pval=1.850e-11

[CatBoost CV] Fold 2
trained!
diddy: r2=0.928994591132654
predicted!
R2=0.477, RMSE=1.882, MAE=1.405, Rho=0.675, pval=1.663e-11

[CatBoost CV] Fold 3
trained!
diddy: r2=0.886125792900302
predicted!
R2=0.576, RMSE=1.609, MAE=1.179, Rho=0.765, pval=5.922e-16

[CatBoost CV] Fold 4
trained!
diddy: r2=0.9006405499666664
predicted!
R2=0.516, RMSE=1.598, MAE=1.236, Rho=0.741, pval=1.282e-14

[CatBoost CV] Fold 5
trained!
diddy: r2=0.9157698202654032
predicted!
R2=0.413, RMSE=1.449, MAE=1.077, Rho=0.647, pval=2.695e-10

[CatBoost CV] Overall:
R2 mean=0.491 Â± 0.054
RMSE mean=1.662 Â± 0.150
MAE  mean=1.252 Â± 0.120
Rho  mean=0.700 Â± 0.045
AUROC  mean=nan Â± nan


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


In [28]:
models = results_cat['models']
fi = np.mean([model.get_feature_importance() for model in models], axis=0)
print(len(fi), len(feat_cols))

# Build dataframe
fi_df = (
    pd.DataFrame({
        "Feature": feat_cols+[group],
        "Importance": fi
    })
    .sort_values("Importance", ascending=False)
    .reset_index(drop=True)
)

fi_df

37 36


Unnamed: 0,Feature,Importance
0,size,11.824538
1,group,10.168439
2,PEGRatio,8.214317
3,animalmodel,7.417422
4,zeta,6.292335
5,comp,5.263342
6,prep,4.984151
7,xlogp,4.127446
8,PDI,3.921358
9,mechanism,3.48798


In [103]:
#0.04987715801619947
results_ens = ensemble(
    catdf = cat_df,
    lgbmdf = lgbm_df,
    target_col=target_col,
    cat_cols=cat_cols,
    group_col=None,
    n_splits=5,
    #Catboost
    iterat=2920, learn1=0.04987715801619947, dep=7, rand1=42,
    leaf_reg=6.4038007118811135, rand_str=0.15167836674283808, bag_temp=0.3798466046212996, min_d=10, boot="Bayesian",
    #LGBM
    n_est=2977,learn2=0.11127804361710134,max_d=9,sub=0.7287063461374701,col=0.47137369806209434,rand2=42,
    #Verbosity
    verbose=True)
    



results_lgbm = cv_regression_lgbm(
    df=lgbm_df,              # e.g. from prep_dataset(..., one_hot=True)
    target_col=target_col,
    cv_type="kfold",
    group_col=None,
    n_splits=5,
    n_est=977,learn=0.11127804361710134,max_d=9,sub=0.7287063461374701,col=0.47137369806209434,rand=42
)

r2 = np.mean(results_cat['r2_scores'])

print(data[target_col].max() - data[target_col].min())


[CV] Using KFold

[Ensemble CV] Fold 1


ValueError: pandas dtypes must be int, float or bool.
Fields with bad pandas dtypes: group: object

In [11]:
from GradientBoost import cv_gpboost_native_compat

family = [
 'polymer-based nanoparticles']
cat_df_cp = cat_df[cat_df['prep'].isin(family)]


results_gp = cv_gpboost_native_compat(
    df=cat_df,
    target_col=target_col,
    cat_cols=cat_cols,
    cv_type="kfold",
    group_col=group,
    binary=False,
    n_splits=5,
    feature_cols=None,
    # training params
    num_boost_round=40000,
    learning_rate=0.05,
    max_depth=16,
    num_leaves=None,
    min_data_in_leaf=10,
    lambda_l2=10,
    feature_fraction=1,
    bagging_fraction=1,
    seed=42,
    early_stopping_rounds=100,
    approx="vecchia",
    # LMEM cleaning
    use_lmem_cleaning=False,
    lmem_fixed_effects=None,
    lmem_reml=True,
    lmem_verbose=False,
    # likelihood
    likelihood="gaussian",
    verbose=True,
)

data['logID'].max() - data['logID'].min()


[GPBoost CV] Fold 1




R2=0.446, RMSE=1.813, MAE=1.362, Rho=0.639, pval=4.130e-10

[GPBoost CV] Fold 2




R2=0.476, RMSE=1.885, MAE=1.424, Rho=0.709, pval=5.372e-13

[GPBoost CV] Fold 3




R2=0.524, RMSE=1.705, MAE=1.257, Rho=0.720, pval=1.612e-13

[GPBoost CV] Fold 4




R2=0.535, RMSE=1.566, MAE=1.128, Rho=0.737, pval=2.268e-14

[GPBoost CV] Fold 5




R2=0.279, RMSE=1.606, MAE=1.128, Rho=0.572, pval=6.557e-08

[GPBoost CV] Overall:
R2   mean=0.452 Â± 0.092
RMSE mean=1.715 Â± 0.121
MAE  mean=1.260 Â± 0.120
Rho  mean=0.675 Â± 0.061


np.float64(12.91603370896966)

In [57]:
from dataset_handling import augment_with_noise
cont_cols = ['size','zeta','EE','DL','PDI']

data_noised = augment_with_noise(
    cat_df,
    cont_cols,
    n_copies=2,
    rel_noise=None,
    abs_noise=None,
    clip_bounds=None,
    base_weight=0.4,
    noise_weight=1,
    random_state=42,
)[0]

data_noised

Unnamed: 0,logID,group,prep,comp,PEG,PEGRatio,surfacecoating,ligandtype,ligand,mechanism,...,exact_mass,xlogp,tpsa,atom_stereo_count,h_bond_donor_count,h_bond_acceptor_count,rotatable_bond_count,heavy_atom_count,complexity,xlogp1
0,-6.645391,1,liposome,pc,0.0,0.0,none,none,,passive transport,...,,,,,,,,,,
1,-6.377127,1,liposome,dppc,0.0,0.0,none,none,,passive transport,...,,,,,,,,,,
2,-6.032287,2,liposome,pc,0.0,0.0,none,none,,passive transport,...,,,,,,,,,,
3,-3.963316,4,liposome,pc,0.0,0.0,none,glycosol,mannose derivative,transporter mediated,...,,,,,,,,,,
4,-4.509860,4,liposome,pc,0.0,0.0,none,glycosol,fucose\nderivative (l-type),transporter mediated,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1147,-4.039856,1373,solid lipid nanoparticles,gynasan 114/glycerol/lipoid s75,1.0,,none,none,,passive transport,...,172.040341,,36.5,0.0,1.0,3.0,1.0,11.0,125.0,
1148,-3.480598,1376,polymer-based nanoparticles,pcl,0.0,,none,none,,passive transport,...,348.208930,7.6,37.3,0.0,1.0,2.0,3.0,26.0,551.0,
1149,-2.818691,1376,polymer-based nanoparticles,pcl-peg,1.0,5.0,none,none,,passive transport,...,348.208930,7.6,37.3,0.0,1.0,2.0,3.0,26.0,551.0,
1150,-4.480988,1405,albumin nanoparticles,bovine serum albumin,0.0,,none,none,,passive transport,...,170.105528,-0.3,63.4,1.0,1.0,2.0,3.0,12.0,203.0,


In [None]:
#### for i in range(len(data['ligand'])):
    print(str(data['ligand_peptide_length'][i]) + data['ligand'][i])

In [None]:
cat_df = prep_dataset(data, target_col=target_col, one_hot=False, feat=feat_cols,cat=cat_cols)

study = tune_catboost_with_optuna(
    cat_df=cat_df,
    target_col=target_col,
    cat_cols=cat_cols,
    feat_cols=feat_cols,
    n_trials=50,
    n_splits=5,
)
best_params = study.best_params

results_cat_best = cv_regression_catboost(
    df=cat_df,
    target_col=target_col,
    cat_cols=cat_cols,
    feature_cols=feat_cols,
    use_lmem_cleaning=False,
    cv_type="kfold",
    n_splits=5,
    **best_params,
    rand=42,  # you can still fix a seed if it's not in best_params
)
r2_best = np.mean(results_cat_best["r2_scores"])
print("Final CV r2 with tuned params:", r2_best)

In [None]:
from GradientBoost import tune_lgbm_with_optuna
'''
study = tune_lgbm_with_optuna(
    lgbm_df=lgbm_df,
    target_col=target_col,
    n_trials=50,
    cv_type="kfold",
    group_col=None,
    n_splits=5,
)

best_params = study.best_params

# 2. Final CV run with best hyperparameters
results_lgbm_best = cv_regression_lgbm(
    df=lgbm_df,
    target_col=target_col,
    cv_type="kfold",
    group_col=None,
    n_splits=5,
    rand=42,          # keep seed fixed
    **best_params,    # n_est, learn, max_d, sub, col
)

r2_best = np.mean(results_lgbm_best["r2_scores"])
print("Final CV r2 with tuned params:", r2_best)'''

In [None]:
from GradientBoost import tune_ens_with_optuna

study=tune_ens_with_optuna(
    cat_df=cat_df,
    lgbm_df=lgbm_df,
    target_col=target_col,
    cat_cols=cat_cols, 
    n_trials=50,
    cv_type="kfold",
    group_col=None,
    n_splits=5,
)

In [None]:
models = results_cat['models']
fi = np.mean([model.get_feature_importance() for model in models], axis=0)

# Build dataframe
fi_df = (
    pd.DataFrame({
        "Feature": feat_cols,
        "Importance": fi
    })
    .sort_values("Importance", ascending=False)
    .reset_index(drop=True)
)

fi_df


In [None]:
dat = pd.read_excel("Brain-Targeted-Nanomedicines-data.xlsx")
data = dat.drop('highest amount in brain/tumor', axis=1)
data = data.rename(columns={'type of the preparation': 'prep', 'core composition of the preparation': 'comp', 'PEGylation or not': 'PEG', 'PEGylation ratio': 'PEGRatio','surface coating': 'surface-coating',  'type of targeting ligand': 'ligand-type', 'targeting ligand': 'ligand', 'transport mechanism': 'mechanism', 'type of receptor/transporter': 'receptor-type', 'name of receptor/transporter': 'receptor', 'p-gp inhibition?': 'Pgp', 'p-gp inhibitor': 'Pgp-inhibitor', 'size ï¼ˆnmï¼‰': 'size', 'zeta potantialï¼ˆmVï¼‰': 'zeta', 'encapsulation efficiencyï¼ˆ%ï¼‰': 'EE%', ' DLï¼ˆg/gï¼‰': 'DL', 'animal model': 'animal-model', 'administration route': 'route', 'target ligand densityï¼ˆmolï¼Œ%ï¼‰': 'ligand-density-mol', 'target ligand densityï¼ˆnumber per nanoparticleï¼‰': 'ligand-density-num', 'Encapsulation': 'encapsulation', 'laser/magnectic field/radiation': 'external', 'delivering efficiency (% ID)': 'ID%', 'ratio of preparation/free drug': 'delivery-efficiency', 'ratio of modified/unmodified preparation': 'mod-ratio-unmod'})

data.to_csv('datacols.csv', index=False)



In [None]:
import pandas as pd
import numpy as np
import re

data = pd.read_csv('datacols.csv', index_col=None)

#data = data.drop(['number', 'peak-amt'], axis=1)
newest_no = 0

print(data.columns)


s = pd.Series(data['number'])
data['number'] = s.ffill().tolist()

data['group'] = data['number']
print(data['group'])

def clean_to_numeric(series):
    """
    Converts a pandas Series to numeric by removing units, text,
    and non-numeric characters. Returns floats with NaN for invalid entries.
    """
    # Convert to string
    s = series.astype(str)

    # Remove everything except digits, decimal, minus, +, exponent notation
    s = s.str.replace(r"[^0-9eE+\-\.]", "", regex=True)

    # Convert to numeric
    return pd.to_numeric(s, errors="coerce")

data['size'] = clean_to_numeric(data['size'])
data['zeta'] = clean_to_numeric(data['zeta'])


data.to_csv('datasetCELL2.csv', index=False)
data['size'].dtype, data['zeta'].dtype