In [None]:
import pandas as pd
import polaris as po
import numpy as np

In [None]:
# Identifier of the Polaris benchmark used throughout this notebook.
benchmark_name = "polaris/pkis1-kit-wt-mut-c-1"
benchmark = po.load_benchmark(benchmark_name)

# Tabular view of the benchmark's underlying dataset, shown as the cell output.
df = benchmark.dataset.table
df

In [None]:
# Load the train/test tables from CSV files in the working directory.
# NOTE(review): these files are produced outside this notebook — presumably
# featurised versions of the benchmark split; confirm their provenance.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('train_df.csv', 'test_df.csv'))

In [None]:
# The three 'KIT*' columns used as targets by the regression cells.
# NOTE(review): these are not the 'CLASS_KIT*' label columns — confirm what
# scale/transformation the CSV stores them in.
kit_value_cols = ['KIT_(T6701_mutant)', 'KIT_(V560G_mutant)', 'KIT']

y_train = train_df[kit_value_cols]
y_test = test_df[kit_value_cols]

In [None]:
# Everything that is not a model input: the SMILES string, the three 'KIT*'
# value columns and their three 'CLASS_KIT*' counterparts.
non_feature_cols = [
    'SMILES',
    'KIT_(T6701_mutant)', 'KIT_(V560G_mutant)', 'KIT',
    'CLASS_KIT_(T6701_mutant)', 'CLASS_KIT_(V560G_mutant)', 'CLASS_KIT',
]

# Feature matrices as plain NumPy arrays (all remaining columns).
X_train = train_df.drop(columns=non_feature_cols).values
X_test = test_df.drop(columns=non_feature_cols).values

In [None]:
# Use the benchmark's own train/test split.
train, test = benchmark.get_train_test_split()

# Arrange the training targets as a 2-D array with one column per benchmark
# target, in the order given by benchmark.target_cols.
train_targets = train.y
ys = np.stack([train_targets[col] for col in benchmark.target_cols], axis=1)

# True for rows in which none of the targets is NaN.
mask = ~np.isnan(ys).any(axis=1)

# Number of fully labelled training rows.
mask.sum()

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_recall_curve, auc

# Initialize models.
# Every estimator that accepts a seed gets the same random_state so that a
# Restart & Run All reproduces the same scores: decision trees break split
# ties randomly, and SVC(probability=True) shuffles data for its internal
# probability calibration. KNeighborsClassifier takes no seed.
models = {
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42),
    'Support Vector Classifier': SVC(probability=True, random_state=42),
    'K-Neighbors Classifier': KNeighborsClassifier()
}

# Function to calculate PR-AUC
def calculate_pr_auc(y_true, y_scores):
    """Return the area under the precision-recall curve.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, passed straight to ``precision_recall_curve``.
    y_scores : array-like
        Scores for the positive class, passed straight to
        ``precision_recall_curve``.

    Returns
    -------
    float
        ``auc`` evaluated with recall on the x-axis and precision on the y-axis.
    """
    curve = precision_recall_curve(y_true, y_scores)
    precision_values, recall_values = curve[0], curve[1]
    return auc(recall_values, precision_values)

# Evaluate each model on every classification target.
# `results` maps model name -> {target: PR-AUC on the held-out test set};
# `cv_results` has the same layout for 5-fold cross-validated predictions on
# the training set.
results = {}
cv_results = {}
target_variables = ['CLASS_KIT_(T6701_mutant)', 'CLASS_KIT_(V560G_mutant)', 'CLASS_KIT']

for name, model in models.items():
    model_results = {}
    model_cv_results = {}
    for target in target_variables:
        # Read the class labels from train_df/test_df: y_train/y_test only
        # contain the 'KIT*' columns, so indexing them with a 'CLASS_KIT*'
        # name raises KeyError.
        train_labelled = train_df[target].notna().values
        test_labelled = test_df[target].notna().values

        # Keep only rows that have a label for this target; a missing label
        # can be neither fitted nor scored.
        X_train_target = X_train[train_labelled]
        y_train_target = train_df[target].values[train_labelled]
        X_test_target = X_test[test_labelled]
        y_test_target = test_df[target].values[test_labelled]

        # Cross-validated positive-class probabilities on the training set
        # (cross_val_predict fits clones, so `model` itself is left untouched)
        y_scores = cross_val_predict(
            model, X_train_target, y_train_target, cv=5, method='predict_proba'
        )[:, 1]
        model_cv_results[target] = calculate_pr_auc(y_train_target, y_scores)

        # Fit the model on the entire (labelled) training set
        model.fit(X_train_target, y_train_target)

        # Predict positive-class probabilities on the test set
        y_test_scores = model.predict_proba(X_test_target)[:, 1]

        # Calculate and store the test PR-AUC
        model_results[target] = calculate_pr_auc(y_test_target, y_test_scores)

    results[name] = model_results
    cv_results[name] = model_cv_results

# One row per model, one column per target
results_df = pd.DataFrame(results).T
cv_results_df = pd.DataFrame(cv_results).T

# Show cross-validated and held-out scores side by side as the cell output
pd.concat({'cv_pr_auc': cv_results_df, 'test_pr_auc': results_df}, axis=1)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

def tanimoto_dist(a, b):
    """Tanimoto distance between two 1-D vectors.

    Computes ``1 - a.b / (sum(a) + sum(b) - a.b)``. For binary (0/1)
    vectors this is the Jaccard distance.

    Parameters
    ----------
    a, b : array-like
        1-D vectors of equal length.

    Returns
    -------
    float
        The distance. When the denominator is zero (for non-negative inputs
        this means both vectors are all-zero) the vectors are treated as
        identical and 0.0 is returned instead of the NaN that 0/0 would give.
    """
    dotprod = np.dot(a, b)
    denominator = np.sum(a) + np.sum(b) - dotprod
    if denominator == 0:
        # 0/0 would yield NaN, which poisons nearest-neighbour comparisons.
        return 0.0
    return 1.0 - (dotprod / denominator)

# Initialize KNN classifier with Tanimoto distance
knn_clf = KNeighborsClassifier(n_neighbors=5, metric=tanimoto_dist)

# Train on the benchmark's class labels. `ys` holds one column per entry of
# benchmark.target_cols (same order as the dict construction below), and
# `mask` keeps the rows where every label is present. `y_train` is not used
# here because it holds the 'KIT*' value columns, not the class labels.
# NOTE(review): assumes train_df rows are in the same order as the benchmark
# training split (`mask` already relies on this) — confirm.
knn_labels = ys[mask].astype(int)
knn_clf.fit(X_train[mask], knn_labels)

# Predict class labels on the test data: one column per target
knn_pred = knn_clf.predict(X_test)

# For multi-output targets predict_proba returns one array per target;
# stack them along axis 1 so the target index comes second
knn_prob = np.stack(knn_clf.predict_proba(X_test), axis=1)

# Key predictions by target name; index 1 on the last axis is the second class
y_pred = {k: knn_pred[:, idx] for idx, k in enumerate(benchmark.target_cols)}
y_prob = {k: knn_prob[:, idx, 1] for idx, k in enumerate(benchmark.target_cols)}

benchmark.evaluate(y_pred=y_pred, y_prob=y_prob)

In [None]:
from xgboost import XGBRegressor

# Multi-target XGBoost regressor, seeded for reproducibility
xgb_reg = XGBRegressor(random_state=42)

# Fit on the training rows that have every target present
xgb_reg.fit(X_train[mask], y_train[mask])

# Raw regression outputs for the test data, one column per target
xgb_scores = xgb_reg.predict(X_test)

# Pair each score with its complement on a new trailing axis; only the
# second channel (the score itself) is reported below.
# NOTE(review): no inverse transformation is applied here and regression
# outputs are not bounded to [0, 1] — confirm these are valid probabilities.
xgb_channels = np.stack([1 - xgb_scores, xgb_scores], axis=2)

# Hard labels: threshold the raw scores at 0.5
xgb_labels = xgb_scores > 0.5

# Key predictions by benchmark target name
y_pred = {col: xgb_labels[:, pos] for pos, col in enumerate(benchmark.target_cols)}
y_prob = {col: xgb_channels[:, pos, 1] for pos, col in enumerate(benchmark.target_cols)}

results = benchmark.evaluate(y_pred=y_pred, y_prob=y_prob)
results

In [None]:
from catboost import CatBoostRegressor
from sklearn.multioutput import MultiOutputRegressor

# CatBoost settings shared by every per-target regressor
hyper_params = dict(
    task_type='CPU',  # Use 'GPU' if you have GPU available
    loss_function='RMSE',
    eval_metric='RMSE',
    learning_rate=0.001,
    iterations=10000,
    depth=8,
    l2_leaf_reg=3,
    rsm=0.9,  # equivalent to feature_fraction
    subsample=0.7,  # equivalent to bagging_fraction
    bagging_temperature=1,
    border_count=512,  # equivalent to max_bin
    verbose=2,
    thread_count=-1,  # use all available CPU cores
)

# One CatBoost regressor per target, via scikit-learn's multi-output wrapper
model = MultiOutputRegressor(CatBoostRegressor(**hyper_params))

# Fit on the training rows that have every target present
model.fit(X_train[mask], y_train[mask])

# Exponentiate the raw predictions before using them as scores.
# NOTE(review): np.exp suggests the targets are stored on a log scale, yet
# the XGBRegressor cell uses the same targets without it — confirm which is
# intended.
raw_pred = model.predict(X_test)
cat_scores = np.exp(raw_pred)

# Pair each score with its complement on a new trailing axis; only the
# second channel (the score itself) is reported below
cat_channels = np.stack([1 - cat_scores, cat_scores], axis=2)

# Hard labels: threshold the scores at 0.5
cat_labels = cat_scores > 0.5

# Key predictions by benchmark target name
y_pred = {col: cat_labels[:, pos] for pos, col in enumerate(benchmark.target_cols)}
y_prob = {col: cat_channels[:, pos, 1] for pos, col in enumerate(benchmark.target_cols)}

results = benchmark.evaluate(y_pred=y_pred, y_prob=y_prob)
results