# Imports

In [1]:
import polaris as po
import numpy as np
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


# Data Processing

In [2]:
benchmark = po.load_benchmark("polaris/pkis1-kit-wt-mut-c-1")

[32m2024-06-21 13:53:11.766[0m | [1mINFO    [0m | [36mpolaris._artifact[0m:[36m_validate_version[0m:[36m66[0m - [1mThe version of Polaris that was used to create the artifact (0.0.0) is different from the currently installed version of Polaris (dev).[0m
[32m2024-06-21 13:53:11.771[0m | [1mINFO    [0m | [36mpolaris._artifact[0m:[36m_validate_version[0m:[36m66[0m - [1mThe version of Polaris that was used to create the artifact (0.0.0) is different from the currently installed version of Polaris (dev).[0m


In [3]:
train = pd.read_csv("train_df.csv")
train_y = np.stack([train[target] for target in benchmark.target_cols], axis=1)
columns_to_drop = benchmark.target_cols + ["SMILES", "KIT_(T6701_mutant)", "KIT_(V560G_mutant)", "KIT"]
train = train.drop(columns=columns_to_drop)
train_x = np.stack([train[col] for col in train.columns if col not in benchmark.target_cols], axis=1)
mask = ~np.any(np.isnan(train_y), axis=1)

In [4]:
train_x.shape

(277, 2238)

In [5]:
test = pd.read_csv("test_df.csv")
test_y = np.stack([test[target] for target in benchmark.target_cols], axis=1)
test = test.drop(columns=columns_to_drop)
test_x = np.stack([test[col] for col in test.columns if col not in benchmark.target_cols], axis=1)

# LightGBM Model

In [6]:
from lightgbm import LGBMRegressor
from sklearn.multioutput import MultiOutputRegressor

hyper_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': ['l1', 'l2'],
    'learning_rate': 0.005,  # A compromise between original and tuned
    'feature_fraction': 0.95,  # Slightly higher than original based on tuning
    'bagging_fraction': 0.8,  # A compromise between original and tuned
    'bagging_freq': 10,  # Kept from original
    'verbose': 2,
    'max_depth': 8,  # Kept from both
    'num_leaves': 63,  # From tuned parameters
    'min_child_samples': 10,  # From tuned parameters
    'max_bin': 512,  # Kept from original
    'n_estimators': 5000,  # Increased from tuned, decreased from original
}

# Create the base LightGBM model
base_model = LGBMRegressor(**hyper_params)

# Wrap it with MultiOutputRegressor for multi-output regression
model = MultiOutputRegressor(base_model)

# Now you can fit the model
# model.fit(X_train, y_train)
model.fit(train_x[mask], train_y[mask])

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.885722
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.736811
[LightGBM] [Debug] init for col-wise cost 0.014806 seconds, init for row-wise cost 0.013816 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015080 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 2392
[LightGBM] [Info] Number of data points in the train set: 276, number of used features: 711
[LightGBM] [Info] Start training from score 0.025362
[LightGBM] [Debug] Re-bagging, using 223 data to train
[LightGBM] [Debug] Trained a tree with leaves = 3 and depth = 2
[LightGBM] [Debug] Trained a tree with leaves = 5 and depth = 4
[LightGBM] [Debug] Trained a tree with leaves = 7 and depth = 6
[LightGBM] [Debug] Trained a tree with leaves

In [7]:
y_pred = np.exp(model.predict(test_x))
y_prob = np.stack([1-y_pred, y_pred], axis=2)
y_pred = y_pred > 0.5



In [8]:
y_pred = {k: y_pred[:, idx] for idx, k in enumerate(benchmark.target_cols)}
y_prob = {k: y_prob[:, idx, 1] for idx, k in enumerate(benchmark.target_cols)}

In [9]:
results = benchmark.evaluate(y_pred=y_pred, y_prob=y_prob)
results

Test set,Target label,Metric,Score
test,CLASS_KIT_(T6701_mutant),accuracy,0.1609195402
test,CLASS_KIT_(V560G_mutant),accuracy,0.1379310345
test,CLASS_KIT,accuracy,0.3793103448
test,CLASS_KIT_(T6701_mutant),f1,0.2772277228
test,CLASS_KIT_(V560G_mutant),f1,0.2424242424
test,CLASS_KIT,f1,0.55
test,CLASS_KIT_(T6701_mutant),roc_auc,0.8091976517
test,CLASS_KIT_(V560G_mutant),roc_auc,0.7711111111
test,CLASS_KIT,roc_auc,0.8047138047
test,CLASS_KIT_(T6701_mutant),pr_auc,0.7109660113

0,1
slug,polaris
external_id,org_2gtoaJIVrgRqiIR8Qm5BnpFCbxu
type,organization

Test set,Target label,Metric,Score
test,CLASS_KIT_(T6701_mutant),accuracy,0.1609195402
test,CLASS_KIT_(V560G_mutant),accuracy,0.1379310345
test,CLASS_KIT,accuracy,0.3793103448
test,CLASS_KIT_(T6701_mutant),f1,0.2772277228
test,CLASS_KIT_(V560G_mutant),f1,0.2424242424
test,CLASS_KIT,f1,0.55
test,CLASS_KIT_(T6701_mutant),roc_auc,0.8091976517
test,CLASS_KIT_(V560G_mutant),roc_auc,0.7711111111
test,CLASS_KIT,roc_auc,0.8047138047
test,CLASS_KIT_(T6701_mutant),pr_auc,0.7109660113


In [11]:
# Please use the `ML4DD-team#` template for the name
results.name = "ML4DD-team8"

# Short description of your method
results.description = "PhysProp_ECFP_MACCS_fingerprints_with_LightGBM"

# A link to your code, e.g. Github or Google Colab.
results.github_url = "https://github.com/Peterdes/ml4dd"

# A link to a short (<1 page) write-up of your method
# e.g. in Google Docs or Notion
results.paper_url = "https://docs.google.com/document/d/1yFwwPyQZabT8KVgU5EOFDVZ28S68k-WRpCmtNHe0ocM/edit?usp=sharing"

# Specify the username of all your team members
results.contributors = ["ninaad", "piotrsuwara", "bodak", "justinewilliams"]

In [12]:
results.upload_to_hub(owner="ninaad")

[32m2024-06-21 13:56:26.426[0m | [32m[1mSUCCESS [0m | [36mpolaris.hub.client[0m:[36mupload_results[0m:[36m492[0m - [32m[1mYour result has been successfully uploaded to the Hub. View it here: https://polarishub.io/benchmarks/polaris/pkis1-kit-wt-mut-c-1/HArnlkq73nnX04Kd9dtcZ[0m


{'id': 'HArnlkq73nnX04Kd9dtcZ',
 'createdAt': '2024-06-21T17:56:26.156Z',
 'deletedAt': None,
 'name': 'ML4DD-team8',
 'slug': 'ml4dd-team8',
 'description': 'PhysProp_ECFP_MACCS_fingerprints_with_LightGBM',
 'tags': [],
 'userAttributes': {},
 'access': 'private',
 'isCertified': False,
 'polarisVersion': 'dev',
 'ownerId': 'cikP0K5Rkrgdn0BAiLka0',
 'creatorId': 'cikP0K5Rkrgdn0BAiLka0',
 'benchmarkId': 'DZzlykxvBwlSA9uERL17A',
 'results': [{'scores': {'f1': 0.55,
    'mcc': 0,
    'pr_auc': 0.7038769824674294,
    'roc_auc': 0.8047138047138047,
    'accuracy': 0.3793103448275862,
    'cohen_kappa': 0},
   'testSet': 'test',
   'targetLabel': 'CLASS_KIT'},
  {'scores': {'f1': 0.27722772277227725,
    'mcc': 0,
    'pr_auc': 0.710966011273893,
    'roc_auc': 0.8091976516634051,
    'accuracy': 0.16091954022988506,
    'cohen_kappa': 0},
   'testSet': 'test',
   'targetLabel': 'CLASS_KIT_(T6701_mutant)'},
  {'scores': {'f1': 0.24242424242424243,
    'mcc': 0,
    'pr_auc': 0.564734371630

# Hyperparameter Tuning

In [None]:
from lightgbm import LGBMRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold

# Define the parameter space
param_dist = {
    'estimator__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'estimator__n_estimators': [100, 300, 500, 1000],
    'estimator__num_leaves': [31, 63, 127],
    'estimator__max_depth': [-1, 4, 8, 12],  # -1 means no limit
    'estimator__feature_fraction': [0.7, 0.8, 0.9, 1.0],
    'estimator__bagging_fraction': [0.7, 0.8, 0.9, 1.0],
    'estimator__min_child_samples': [10, 20, 30]
}

# Create the base model
base_model = LGBMRegressor(
    objective='regression',
    metric=['l1','l2'],
    verbose=0,
)

# Wrap it with MultiOutputRegressor
model = MultiOutputRegressor(base_model)

# Set up KFold cross-validation
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Create the RandomizedSearchCV object
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=100,  # number of parameter settings that are sampled
    cv=cv,
    scoring='neg_mean_squared_error',
    n_jobs=-1,  # use all available cores
    verbose=2,
    random_state=42
)

# Fit the RandomizedSearchCV object to the data
random_search.fit(train_x[mask], train_y[mask])

# Print the best parameters and score
print("Best parameters found: ", random_search.best_params_)
print("Best cross-validation score: ", -random_search.best_score_)

# Use the best model found
best_model = random_search.best_estimator_