In [3]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV
import joblib
from xgboost import XGBRegressor
from scipy.stats import randint, uniform

# Load the data
input_csv_path = 'compounds_with_predictions2.csv'  # Replace with your actual CSV file path
df = pd.read_csv(input_csv_path)

# Convert MACCS fingerprints from comma-separated string to a list of integers
def maccs_to_array(maccs_str):
    try:
        # Convert the string to a list of integers and discard the first bit (bit 0)
        return np.array(list(map(int, maccs_str.split(',')))[1:], dtype=int)
    except ValueError:
        return np.zeros(166, dtype=int)  # Default to an array of zeros if there's an error

# Apply the function to convert the MACCS fingerprint strings
df['MACCS_fingerprint'] = df['MACCS_fingerprint'].apply(maccs_to_array)

# Convert MACCS fingerprints to a feature matrix
X = np.array(df['MACCS_fingerprint'].tolist())

# Extract pIC50 values (target variable y)
y = df['pIC50'].values

# Use all features after discarding the first bit
X_filtered = X

# Initialize MinMaxScaler for scaling target variable
scaler_y = MinMaxScaler()
y_transformed = scaler_y.fit_transform(y.reshape(-1, 1)).flatten()

# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X_filtered, y_transformed, test_size=0.2, random_state=42)

# Initialize StandardScaler for feature scaling
scaler_X = StandardScaler()
X_train_scaled = scaler_X.fit_transform(X_train)
X_test_scaled = scaler_X.transform(X_test)

# Initialize the XGBRegressor
xgb = XGBRegressor(random_state=42, use_label_encoder=False, eval_metric='rmse')

# Define the parameter grid for RandomizedSearchCV
param_distributions = {
    'n_estimators': randint(100, 1000),  # Start from a higher number to avoid very low values
    'learning_rate': uniform(0.01, 0.3),
    'max_depth': randint(4, 10),  # Slightly narrower range
    'subsample': uniform(0.7, 0.3),
    'colsample_bytree': uniform(0.7, 0.3),
    'reg_alpha': uniform(0.0, 1.0),  # L1 regularization
    'reg_lambda': uniform(0.0, 1.0)  # L2 regularization
}

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_distributions,
    n_iter=200,  # Number of parameter settings sampled
    cv=15,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    random_state=42,
    verbose=2
)

# Perform random search to find the best hyperparameters
random_search.fit(X_train_scaled, y_train)

# Retrieve the best model
best_model = random_search.best_estimator_

# Predict transformed pIC50 values for the test set
y_pred_transformed = best_model.predict(X_test_scaled)
y_pred = scaler_y.inverse_transform(y_pred_transformed.reshape(-1, 1)).flatten()
y_test_original = scaler_y.inverse_transform(y_test.reshape(-1, 1)).flatten()

# Evaluate the model on the test data
mse_test = mean_squared_error(y_test_original, y_pred)
rmse_test = np.sqrt(mse_test)
mae_test = mean_absolute_error(y_test_original, y_pred)
r2_test = r2_score(y_test_original, y_pred)

# Predict transformed pIC50 values for the training set
y_train_pred_transformed = best_model.predict(X_train_scaled)
y_train_pred = scaler_y.inverse_transform(y_train_pred_transformed.reshape(-1, 1)).flatten()

# Evaluate the model on the training data
mse_train = mean_squared_error(y_train, y_train_pred_transformed)
rmse_train = np.sqrt(mse_train)
mae_train = mean_absolute_error(y_train, y_train_pred_transformed)
r2_train = r2_score(y_train, y_train_pred_transformed)

# Print evaluation metrics for test and training sets
print(f"Test set - Mean Squared Error (MSE): {mse_test:.4f}")
print(f"Test set - Root Mean Squared Error (RMSE): {rmse_test:.4f}")
print(f"Test set - Mean Absolute Error (MAE): {mae_test:.4f}")
print(f"Test set - R-squared (R2): {r2_test:.4f}")

print(f"Training set - Mean Squared Error (MSE): {mse_train:.4f}")
print(f"Training set - Root Mean Squared Error (RMSE): {rmse_train:.4f}")
print(f"Training set - Mean Absolute Error (MAE): {mae_train:.4f}")
print(f"Training set - R-squared (R2): {r2_train:.4f}")


# Save the best model and scalers for future use
joblib.dump(best_model, 'xgb_model_filtered.pkl')
joblib.dump(scaler_y, 'y_scaler_filtered.pkl')
joblib.dump(scaler_X, 'X_scaler_filtered.pkl')


Fitting 15 folds for each of 200 candidates, totalling 3000 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Test set - Mean Squared Error (MSE): 0.4994
Test set - Root Mean Squared Error (RMSE): 0.7067
Test set - Mean Absolute Error (MAE): 0.5194
Test set - R-squared (R2): 0.6005
Training set - Mean Squared Error (MSE): 0.0024
Training set - Root Mean Squared Error (RMSE): 0.0492
Training set - Mean Absolute Error (MAE): 0.0324
Training set - R-squared (R2): 0.9429


['X_scaler_filtered.pkl']

In [5]:
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import MACCSkeys
import joblib

# Load the saved XGBoost model and scalers
best_model = joblib.load('xgb_model_filtered.pkl')  # Load XGBoost model
scaler_y = joblib.load('y_scaler_filtered.pkl')  # Load y-scaler
scaler_X = joblib.load('X_scaler_filtered.pkl')  # Load X-scaler

# Define a function to convert SMILES to MACCS fingerprints and discard the first bit
def smiles_to_maccs(smiles):
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        return np.zeros(166, dtype=int)  # Return a default array of 166 bits if SMILES is invalid
    maccs_fingerprint = MACCSkeys.GenMACCSKeys(molecule)
    return np.array(list(maccs_fingerprint)[1:], dtype=int)  # Convert to 166-bit fingerprint (discarding the first bit)

# Example list of new SMILES strings
smiles_list = [
    'C=C[C@@](C)(O)C[C@@H](C)[C@H]4CCC3C2CCC1C(=C)[C@@H](O)CC[C@]1(C)C2CC[C@]3(C)C4'
]

# Convert SMILES strings to MACCS fingerprints
fingerprints = np.array([smiles_to_maccs(smiles) for smiles in smiles_list])

# Ensure the selection filter aligns with the training step
# std_devs = np.std(fingerprints, axis=0)
# sd_cutoff = 0.1
# selected_features = std_devs > sd_cutoff  # Use the same cutoff used during training

# Filter and scale the new data
# fingerprints_filtered = fingerprints[:, selected_features]  # Apply the same filter used during training
fingerprints_filtered = fingerprints  # No filtering applied
X_new_scaled = scaler_X.transform(fingerprints_filtered)

# Predict using the loaded model
y_pred_transformed = best_model.predict(X_new_scaled)
y_pred = scaler_y.inverse_transform(y_pred_transformed.reshape(-1, 1)).flatten()

# Print predictions
for smiles, prediction in zip(smiles_list, y_pred):
    print(f"SMILES: {smiles}, Predicted pIC50: {prediction:.4f}")
    predicted_IC50 = 10 ** (-prediction) * 1000000
    print(f"Predicted IC50: {predicted_IC50:.4f} uM")


SMILES: C=C[C@@](C)(O)C[C@@H](C)[C@H]4CCC3C2CCC1C(=C)[C@@H](O)CC[C@]1(C)C2CC[C@]3(C)C4, Predicted pIC50: 5.2616
Predicted IC50: 5.4755 uM


In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV
import joblib
from xgboost import XGBRegressor
from scipy.stats import randint, uniform

# Load the data
input_csv_path = 'output_with_maccs_fingerprints.csv'  # Replace with your actual CSV file path
df = pd.read_csv(input_csv_path)

# Convert MACCS fingerprints from comma-separated string to a list of integers
def maccs_to_array(maccs_str):
    try:
        return np.array(list(map(int, maccs_str.split(','))), dtype=int)
    except ValueError:
        return np.zeros(167, dtype=int)  # Default to an array of zeros if there's an error

df['MACCS_fingerprint'] = df['MACCS_fingerprint'].apply(maccs_to_array)

# Convert MACCS fingerprints to a feature matrix
X = np.array(df['MACCS_fingerprint'].tolist())

# Extract pIC50 values (target variable y)
y = df['pIC50'].values

# Calculate standard deviation for each descriptor (each column in X)
#std_devs = np.std(X, axis=0)

# Set the SD cut-off value
#sd_cutoff = 0.1

# Select descriptors with SD > 0.1
#selected_features = std_devs > sd_cutoff
#X_filtered = X[:, selected_features]
X_filtered = X

# Initialize MinMaxScaler for scaling target variable
scaler_y = MinMaxScaler()
y_transformed = scaler_y.fit_transform(y.reshape(-1, 1)).flatten()

# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X_filtered, y_transformed, test_size=0.2, random_state=42)

# Initialize StandardScaler for feature scaling
scaler_X = StandardScaler()
X_train_scaled = scaler_X.fit_transform(X_train)
X_test_scaled = scaler_X.transform(X_test)

# Initialize the XGBRegressor
xgb = XGBRegressor(random_state=42, use_label_encoder=False, eval_metric='rmse')

# Define the parameter grid for RandomizedSearchCV
param_distributions = {
    'n_estimators': randint(100, 1000),  # Start from a higher number to avoid very low values
    'learning_rate': uniform(0.01, 0.3),
    'max_depth': randint(4, 10),  # Slightly narrower range
    'subsample': uniform(0.7, 0.3),
    'colsample_bytree': uniform(0.7, 0.3),
    'reg_alpha': uniform(0.0, 1.0),  # L1 regularization
    'reg_lambda': uniform(0.0, 1.0)  # L2 regularization
}

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_distributions,
    n_iter=100,  # Number of parameter settings sampled
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    random_state=42,
    verbose=2
)

# Perform random search to find the best hyperparameters
random_search.fit(X_train_scaled, y_train)

# Retrieve the best model
best_model = random_search.best_estimator_

# Predict transformed pIC50 values for the test set
y_pred_transformed = best_model.predict(X_test_scaled)
y_pred = scaler_y.inverse_transform(y_pred_transformed.reshape(-1, 1)).flatten()
y_test_original = scaler_y.inverse_transform(y_test.reshape(-1, 1)).flatten()

# Evaluate the model on the test data
mse_test = mean_squared_error(y_test_original, y_pred)
rmse_test = np.sqrt(mse_test)
mae_test = mean_absolute_error(y_test_original, y_pred)
r2_test = r2_score(y_test_original, y_pred)

# Predict transformed pIC50 values for the training set
y_train_pred_transformed = best_model.predict(X_train_scaled)
y_train_pred = scaler_y.inverse_transform(y_train_pred_transformed.reshape(-1, 1)).flatten()

# Evaluate the model on the training data
mse_train = mean_squared_error(y_train, y_train_pred_transformed)
rmse_train = np.sqrt(mse_train)
mae_train = mean_absolute_error(y_train, y_train_pred_transformed)
r2_train = r2_score(y_train, y_train_pred_transformed)

# Print evaluation metrics for test and training sets
print(f"Test set - Mean Squared Error (MSE): {mse_test:.4f}")
print(f"Test set - Root Mean Squared Error (RMSE): {rmse_test:.4f}")
print(f"Test set - Mean Absolute Error (MAE): {mae_test:.4f}")
print(f"Test set - R-squared (R2): {r2_test:.4f}")

print(f"Training set - Mean Squared Error (MSE): {mse_train:.4f}")
print(f"Training set - Root Mean Squared Error (RMSE): {rmse_train:.4f}")
print(f"Training set - Mean Absolute Error (MAE): {mae_train:.4f}")
print(f"Training set - R-squared (R2): {r2_train:.4f}")

# Save the best model and scalers for future use
joblib.dump(best_model, 'xgb_model_filtered.pkl')
joblib.dump(scaler_y, 'y_scaler_filtered.pkl')
joblib.dump(scaler_X, 'X_scaler_filtered.pkl')


Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [6]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import MACCSkeys
import joblib

# Load the saved XGBoost model and scalers
best_model = joblib.load('xgb_model_filtered.pkl')  # Load XGBoost model
scaler_y = joblib.load('y_scaler_filtered.pkl')  # Load y-scaler
scaler_X = joblib.load('X_scaler_filtered.pkl')  # Load X-scaler

# Define a function to convert SMILES to MACCS fingerprints
def smiles_to_maccs(smiles):
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        return np.zeros(167, dtype=int)  # Return a default array if SMILES is invalid
    maccs_fingerprint = MACCSkeys.GenMACCSKeys(molecule)
    # Skip the first bit as it does not correspond to a MACCS key
    return np.array(list(maccs_fingerprint)[1:], dtype=int)  # Convert to 166-bit fingerprint

# Example list of new SMILES strings
smiles_list = [
    'C=C[C@@](C)(O)C[C@@H](C)[C@H]3CCC4C2CCC1C(=C)[C@@H](O)CC[C@]1(C)C2CC[C@]34C',  # Example SMILES
    'C=C2C[C@]13C[C@](C)(C)C[C@H](O)[C@]1(C)CCC2O3',  # Example SMILES
    'OC(=O)C1=CC=C(C=C1)N1C(=O)\\C(=C\\C2=CC=C(O2)C2=CC=C(C=C2)[N+]([O-])=O)C=C1C1=CC=CC=C1',
    'NC(=O)C1=CC2=C(C=C1)N(CC1=CC=C3C=CC=CC3=C1)C(=O)C2=O'
    # Add more SMILES strings as needed
]

# Convert SMILES strings to MACCS fingerprints
fingerprints = np.array([smiles_to_maccs(smiles) for smiles in smiles_list])

# Ensure the selection filter aligns with the training step
#std_devs = np.std(fingerprints, axis=0)
#sd_cutoff = 0.1
#selected_features = std_devs > sd_cutoff  # Same cutoff used in training

# Filter and scale the new data
#fingerprints_filtered = fingerprints[:, selected_features]  # Apply the same filter used during training
fingerprints_filtered = fingerprints[1:]
X_new_scaled = scaler_X.transform(fingerprints_filtered)

# Predict using the loaded model
y_pred_transformed = best_model.predict(X_new_scaled)
y_pred = scaler_y.inverse_transform(y_pred_transformed.reshape(-1, 1)).flatten()

# Print predictions
for smiles, prediction in zip(smiles_list, y_pred):
    print(f"SMILES: {smiles}, Predicted pIC50: {prediction:.4f}")
    predicted_IC50 = 10 ** (-prediction) * 1000000
    print(f"Predicted IC50: {predicted_IC50:.4f} nM")


ValueError: X has 166 features, but StandardScaler is expecting 167 features as input.

In [1]:
!pip install optuna

Defaulting to user installation because normal site-packages is not writeable
Collecting optuna
  Using cached optuna-4.0.0-py3-none-any.whl (362 kB)
Collecting colorlog
  Using cached colorlog-6.8.2-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.8.2 optuna-4.0.0
You should consider upgrading via the '/cm/local/apps/python37/bin/python3 -m pip install --upgrade pip' command.[0m


In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.ensemble import StackingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.neural_network import MLPRegressor
import joblib
from xgboost import XGBRegressor
from scipy.stats import randint, uniform
import optuna

# Load the data
input_csv_path = 'output_with_maccs_fingerprints.csv'
df = pd.read_csv(input_csv_path)

# Convert MACCS fingerprints from comma-separated string to a list of integers
def maccs_to_array(maccs_str):
    try:
        return np.array(list(map(int, maccs_str.split(',')))[1:], dtype=int)
    except ValueError:
        return np.zeros(166, dtype=int)

# Apply the function to convert the MACCS fingerprint strings
df['MACCS_fingerprint'] = df['MACCS_fingerprint'].apply(maccs_to_array)

# Convert MACCS fingerprints to a feature matrix
X = np.array(df['MACCS_fingerprint'].tolist())

# Extract pIC50 values (target variable y)
y = df['pIC50'].values

# Initialize MinMaxScaler for scaling target variable
scaler_y = MinMaxScaler()
y_transformed = scaler_y.fit_transform(y.reshape(-1, 1)).flatten()

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size=0.2, random_state=42)

# Initialize StandardScaler for feature scaling
scaler_X = StandardScaler()
X_train_scaled = scaler_X.fit_transform(X_train)
X_test_scaled = scaler_X.transform(X_test)

# Initialize base models with hyperparameter tuning
def objective(trial):
    xgb = XGBRegressor(
        n_estimators=trial.suggest_int('n_estimators', 100, 1000),
        learning_rate=trial.suggest_uniform('learning_rate', 0.01, 0.3),
        max_depth=trial.suggest_int('max_depth', 4, 10),
        subsample=trial.suggest_uniform('subsample', 0.7, 0.3),
        colsample_bytree=trial.suggest_uniform('colsample_bytree', 0.7, 0.3),
        reg_alpha=trial.suggest_uniform('reg_alpha', 0.0, 1.0),
        reg_lambda=trial.suggest_uniform('reg_lambda', 0.0, 1.0),
        random_state=42,
        use_label_encoder=False,
        eval_metric='rmse'
    )
    
    rf = RandomForestRegressor(
        n_estimators=trial.suggest_int('rf_n_estimators', 50, 500),
        max_depth=trial.suggest_int('rf_max_depth', 5, 20),
        min_samples_split=trial.suggest_int('rf_min_samples_split', 2, 10),
        min_samples_leaf=trial.suggest_int('rf_min_samples_leaf', 1, 10),
        bootstrap=trial.suggest_categorical('rf_bootstrap', [True, False]),
        random_state=42
    )
    
    mlp = MLPRegressor(
        hidden_layer_sizes=trial.suggest_categorical('mlp_hidden_layer_sizes', [(50,), (100,), (50, 50), (100, 50)]),
        activation=trial.suggest_categorical('mlp_activation', ['tanh', 'relu']),
        solver=trial.suggest_categorical('mlp_solver', ['lbfgs', 'adam']),
        alpha=trial.suggest_loguniform('mlp_alpha', 0.0001, 0.1),
        learning_rate=trial.suggest_categorical('mlp_learning_rate', ['constant', 'adaptive']),
        max_iter=500,
        random_state=42
    )
    
    stacking_model = StackingRegressor(
        estimators=[('xgb', xgb), ('rf', rf), ('lasso', Lasso()), ('mlp', mlp)],
        final_estimator=LinearRegression()
    )
    
    # Perform cross-validation
    scores = cross_val_score(stacking_model, X_train_scaled, y_train, cv=5, scoring='neg_mean_squared_error')
    return np.mean(scores)

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

# Best parameters from Optuna
print("Best parameters:")
print(study.best_params)

# Train the stacking model with the best parameters
xgb = XGBRegressor(**study.best_params)
rf = RandomForestRegressor(**study.best_params)
mlp = MLPRegressor(**study.best_params)

stacking_model = StackingRegressor(
    estimators=[('xgb', xgb), ('rf', rf), ('lasso', Lasso()), ('mlp', mlp)],
    final_estimator=LinearRegression()
)

stacking_model.fit(X_train_scaled, y_train)

# Predict transformed pIC50 values for the test set
y_pred_transformed_stack = stacking_model.predict(X_test_scaled)
y_pred_stack = scaler_y.inverse_transform(y_pred_transformed_stack.reshape(-1, 1)).flatten()
y_test_original = scaler_y.inverse_transform(y_test.reshape(-1, 1)).flatten()

# Predict transformed pIC50 values for the training set
y_train_pred_transformed_stack = stacking_model.predict(X_train_scaled)
y_train_pred_stack = scaler_y.inverse_transform(y_train_pred_transformed_stack.reshape(-1, 1)).flatten()

# Evaluate the stacking model on the test data
mse_test_stack = mean_squared_error(y_test_original, y_pred_stack)
rmse_test_stack = np.sqrt(mse_test_stack)
mae_test_stack = mean_absolute_error(y_test_original, y_pred_stack)
r2_test_stack = r2_score(y_test_original, y_pred_stack)

# Evaluate the stacking model on the training data
mse_train_stack = mean_squared_error(y_train, y_train_pred_transformed_stack)
rmse_train_stack = np.sqrt(mse_train_stack)
mae_train_stack = mean_absolute_error(y_train, y_train_pred_transformed_stack)
r2_train_stack = r2_score(y_train, y_train_pred_transformed_stack)

# Print evaluation metrics for stacking model on test and training sets
print(f"\nStacking Model - Test set - Mean Squared Error (MSE): {mse_test_stack:.4f}")
print(f"Stacking Model - Test set - Root Mean Squared Error (RMSE): {rmse_test_stack:.4f}")
print(f"Stacking Model - Test set - Mean Absolute Error (MAE): {mae_test_stack:.4f}")
print(f"Stacking Model - Test set - R-squared (R2): {r2_test_stack:.4f}")

print(f"\nStacking Model - Training set - Mean Squared Error (MSE): {mse_train_stack:.4f}")
print(f"Stacking Model - Training set - Root Mean Squared Error (RMSE): {rmse_train_stack:.4f}")
print(f"Stacking Model - Training set - Mean Absolute Error (MAE): {mae_train_stack:.4f}")
print(f"Stacking Model - Training set - R-squared (R2): {r2_train_stack:.4f}")

# Save the stacking model and scalers for future use
joblib.dump(stacking_model, 'stacking_model.pkl')
joblib.dump(scaler_y, 'y_scaler_filtered.pkl')
joblib.dump(scaler_X, 'X_scaler_filtered.pkl')


  from .autonotebook import tqdm as notebook_tqdm
[I 2024-09-02 21:15:48,468] A new study created in memory with name: no-name-948f0699-893d-432f-a537-ac97ca0ab8e2
[W 2024-09-02 21:15:48,473] Trial 0 failed with parameters: {'n_estimators': 568, 'learning_rate': 0.12237251203315812, 'max_depth': 10} because of the following error: ValueError('The `low` value must be smaller than or equal to the `high` value (low=0.7, high=0.3).').
Traceback (most recent call last):
  File "/home/aya/.local/lib/python3.7/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "/var/tmp/pbs.29301.head-node/ipykernel_2249780/976509243.py", line 52, in objective
    subsample=trial.suggest_uniform('subsample', 0.7, 0.3),
  File "/home/aya/.local/lib/python3.7/site-packages/optuna/_deprecated.py", line 113, in wrapper
    return func(*args, **kwargs)
  File "/home/aya/.local/lib/python3.7/site-packages/optuna/trial/_trial.py", line 185, in suggest_uniform
 

ValueError: The `low` value must be smaller than or equal to the `high` value (low=0.7, high=0.3).

In [3]:
!pip install scikit-optimize

Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-optimize
  Using cached scikit_optimize-0.10.2-py2.py3-none-any.whl (107 kB)
Collecting pyaml>=16.9
  Downloading pyaml-23.5.8-py3-none-any.whl (17 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-23.5.8 scikit-optimize-0.10.2
You should consider upgrading via the '/cm/local/apps/python37/bin/python3 -m pip install --upgrade pip' command.[0m


In [16]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingRegressor, RandomForestRegressor
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.neural_network import MLPRegressor
import joblib
from xgboost import XGBRegressor
from skopt import BayesSearchCV

# Load the data
input_csv_path = 'output_with_maccs_fingerprints.csv'
df = pd.read_csv(input_csv_path)

# Convert MACCS fingerprints from comma-separated string to a list of integers
def maccs_to_array(maccs_str):
    try:
        return np.array(list(map(int, maccs_str.split(',')))[1:], dtype=int)
    except ValueError:
        return np.zeros(166, dtype=int)

df['MACCS_fingerprint'] = df['MACCS_fingerprint'].apply(maccs_to_array)

# Convert MACCS fingerprints to a feature matrix
X = np.array(df['MACCS_fingerprint'].tolist())

# Extract pIC50 values (target variable y)
y = df['pIC50'].values

# Initialize MinMaxScaler for scaling target variable
scaler_y = MinMaxScaler()
y_transformed = scaler_y.fit_transform(y.reshape(-1, 1)).flatten()

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size=0.2, random_state=42)

# Initialize StandardScaler for feature scaling
scaler_X = StandardScaler()
X_train_scaled = scaler_X.fit_transform(X_train)
X_test_scaled = scaler_X.transform(X_test)

# Initialize base models
xgb = XGBRegressor(random_state=42, use_label_encoder=False, eval_metric='rmse')
rf = RandomForestRegressor(random_state=42)
lasso = Lasso(alpha=0.1)
mlp = MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)

# Define parameter space for Bayesian Optimization
param_space = {
    'xgb__n_estimators': (100, 1000),
    'xgb__learning_rate': (0.01, 0.3, 'uniform'),
    'xgb__max_depth': (4, 10),
    'xgb__subsample': (0.7, 0.9, 'uniform'),
    'xgb__colsample_bytree': (0.7, 0.9, 'uniform'),
    'xgb__reg_alpha': (0.0, 1.0, 'uniform'),
    'xgb__reg_lambda': (0.0, 1.0, 'uniform')
}

bayes_search = BayesSearchCV(
    estimator=xgb,
    search_spaces=param_space,
    n_iter=50,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    random_state=42,
    verbose=2
)

# Perform Bayesian optimization
bayes_search.fit(X_train_scaled, y_train)
best_xgb = bayes_search.best_estimator_

# Define the stacking model with the best XGBRegressor
stacking_model = StackingRegressor(
    estimators=[('xgb', best_xgb), ('rf', rf), ('lasso', lasso), ('mlp', mlp)],
    final_estimator=LinearRegression()
)

# Train the stacking model
stacking_model.fit(X_train_scaled, y_train)

# Print the parameters of the models
print("XGBoost Parameters:")
print(best_xgb.get_params())

print("\nRandomForest Parameters:")
print(rf.get_params())

print("\nLasso Parameters:")
print(lasso.get_params())

print("\nMLP Parameters:")
print(mlp.get_params())

print("\nStacking Model:")
print(stacking_model.named_estimators_)

# Predict transformed pIC50 values for the test set using stacking model
y_pred_transformed_stack = stacking_model.predict(X_test_scaled)
y_pred_stack = scaler_y.inverse_transform(y_pred_transformed_stack.reshape(-1, 1)).flatten()
y_test_original = scaler_y.inverse_transform(y_test.reshape(-1, 1)).flatten()

# Predict transformed pIC50 values for the training set using stacking model
y_train_pred_transformed_stack = stacking_model.predict(X_train_scaled)
y_train_pred_stack = scaler_y.inverse_transform(y_train_pred_transformed_stack.reshape(-1, 1)).flatten()

# Evaluate the stacking model on the test data
mse_test_stack = mean_squared_error(y_test_original, y_pred_stack)
rmse_test_stack = np.sqrt(mse_test_stack)
mae_test_stack = mean_absolute_error(y_test_original, y_pred_stack)
r2_test_stack = r2_score(y_test_original, y_pred_stack)

# Evaluate the stacking model on the training data
mse_train_stack = mean_squared_error(y_train, y_train_pred_transformed_stack)
rmse_train_stack = np.sqrt(mse_train_stack)
mae_train_stack = mean_absolute_error(y_train, y_train_pred_transformed_stack)
r2_train_stack = r2_score(y_train, y_train_pred_transformed_stack)

# Print evaluation metrics for stacking model on test and training sets
print(f"\nStacking Model - Test set - Mean Squared Error (MSE): {mse_test_stack:.4f}")
print(f"Stacking Model - Test set - Root Mean Squared Error (RMSE): {rmse_test_stack:.4f}")
print(f"Stacking Model - Test set - Mean Absolute Error (MAE): {mae_test_stack:.4f}")
print(f"Stacking Model - Test set - R-squared (R2): {r2_test_stack:.4f}")

print(f"\nStacking Model - Training set - Mean Squared Error (MSE): {mse_train_stack:.4f}")
print(f"Stacking Model - Training set - Root Mean Squared Error (RMSE): {rmse_train_stack:.4f}")
print(f"Stacking Model - Training set - Mean Absolute Error (MAE): {mae_train_stack:.4f}")
print(f"Stacking Model - Training set - R-squared (R2): {r2_train_stack:.4f}")

# Save the stacking model and scalers for future use
joblib.dump(stacking_model, 'stacking_model.pkl')
joblib.dump(scaler_y, 'y_scaler_filtered.pkl')
joblib.dump(scaler_X, 'X_scaler_filtered.pkl')


ModuleNotFoundError: No module named 'importlib.metadata'

In [14]:
!pip install importlib-metadata


Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/cm/local/apps/python37/bin/python3 -m pip install --upgrade pip' command.[0m


In [2]:
from rdkit import Chem
from rdkit.Chem import MACCSkeys
import pandas as pd
import numpy as np

def calculate_tanimoto_similarity(fp1, fp2):
    """Calculates the Tanimoto similarity between two fingerprints."""
    return float(fp1 & fp2) / float(fp1 | fp2)

def cluster_compounds(smiles_list, similarity_threshold=0.8):
    """
    Clusters compounds based on Tanimoto similarity.
    
    Args:
    - smiles_list: A list of SMILES strings representing the compounds.
    - similarity_threshold: The Tanimoto similarity threshold for clustering.
    
    Returns:
    - clusters: A list of clusters, each containing indices of similar compounds.
    """
    # Calculate MACCS fingerprints
    fingerprints = [MACCSkeys.GenMACCSKeys(Chem.MolFromSmiles(smiles)) for smiles in smiles_list]
    
    # Calculate pairwise Tanimoto similarity matrix
    similarity_matrix = np.zeros((len(fingerprints), len(fingerprints)))
    for i in range(len(fingerprints)):
        for j in range(i + 1, len(fingerprints)):
            similarity_matrix[i, j] = calculate_tanimoto_similarity(fingerprints[i], fingerprints[j])
            similarity_matrix[j, i] = similarity_matrix[i, j]
    
    # Cluster based on similarity threshold
    clusters = []
    for i in range(len(fingerprints)):
        found_cluster = False
        for cluster in clusters:
            if any(similarity_matrix[i, j] >= similarity_threshold for j in cluster):
                cluster.append(i)
                found_cluster = True
                break
        if not found_cluster:
            clusters.append([i])
    
    return clusters

def calculate_modi(smiles_list, activity_labels, similarity_threshold=0.8):
    """
    Calculates the Modelability Index (MODI) for a dataset.
    
    Args:
    - smiles_list: A list of SMILES strings representing the compounds.
    - activity_labels: A list of activity labels (1 for active, 0 for inactive).
    - similarity_threshold: The Tanimoto similarity threshold for clustering.
    
    Returns:
    - modi: The Modelability Index for the dataset.
    """
    clusters = cluster_compounds(smiles_list, similarity_threshold)
    
    pure_compounds = 0
    total_compounds = len(smiles_list)
    
    for cluster in clusters:
        cluster_labels = [activity_labels[i] for i in cluster]
        if len(set(cluster_labels)) == 1:
            pure_compounds += len(cluster)
    
    modi = pure_compounds / total_compounds
    return modi

def predict_activity(modi, cutoff=0.65):
    """
    Predicts if a dataset is likely to be active or inactive based on MODI.
    
    Args:
    - modi: The Modelability Index of the dataset.
    - cutoff: The cutoff value for determining activity.
    
    Returns:
    - str: 'Active' if the MODI is greater than the cutoff, 'Inactive' otherwise.
    """
    return 'Active' if modi > cutoff else 'Inactive'

# Read SMILES and activity from Excel file
file_path = '/home/aya/output_with_maccs_fingerprints.csv'  # Change to your file path
df = pd.read_csv(file_path)

# Convert activity labels to numeric (1 for active, 0 for inactive)
df['Activity'] = df['Activity'].map({'active': 1, 'inactive': 0})

# Extract SMILES and activity lists
smiles_list = df['SMILES'].tolist()
activity_labels = df['Activity'].tolist()

# Calculate MODI
modi = calculate_modi(smiles_list, activity_labels)
print(f"MODI: {modi}")

# Predict activity for each compound based on MODI
df['Predicted Activity'] = predict_activity(modi)

# Save the modified DataFrame to a new CSV file
df.to_csv('compounds_with_predictions.csv', index=False)

print("Predictions saved to 'compounds_with_predictions.csv'")


KeyError: 'Activity'

In [6]:
from rdkit import Chem
from rdkit.Chem import MACCSkeys, DataStructs
import pandas as pd
import numpy as np

def calculate_tanimoto_similarity(fp1, fp2):
    """Calculates the Tanimoto similarity between two fingerprints using RDKit."""
    return DataStructs.FingerprintSimilarity(fp1, fp2)

def cluster_compounds(smiles_list, similarity_threshold=0.8):
    """
    Clusters compounds based on Tanimoto similarity.
    
    Args:
    - smiles_list: A list of SMILES strings representing the compounds.
    - similarity_threshold: The Tanimoto similarity threshold for clustering.
    
    Returns:
    - clusters: A list of clusters, each containing indices of similar compounds.
    """
    # Calculate MACCS fingerprints
    fingerprints = [MACCSkeys.GenMACCSKeys(Chem.MolFromSmiles(smiles)) for smiles in smiles_list]
    
    # Calculate pairwise Tanimoto similarity matrix
    similarity_matrix = np.zeros((len(fingerprints), len(fingerprints)))
    for i in range(len(fingerprints)):
        for j in range(i + 1, len(fingerprints)):
            similarity = calculate_tanimoto_similarity(fingerprints[i], fingerprints[j])
            similarity_matrix[i, j] = similarity
            similarity_matrix[j, i] = similarity
    
    # Cluster based on similarity threshold
    clusters = []
    for i in range(len(fingerprints)):
        found_cluster = False
        for cluster in clusters:
            if any(similarity_matrix[i, j] >= similarity_threshold for j in cluster):
                cluster.append(i)
                found_cluster = True
                break
        if not found_cluster:
            clusters.append([i])
    
    return clusters

def calculate_modi(smiles_list, similarity_threshold=0.8):
    """
    Calculates the Modelability Index (MODI) for a dataset.
    
    Args:
    - smiles_list: A list of SMILES strings representing the compounds.
    - similarity_threshold: The Tanimoto similarity threshold for clustering.
    
    Returns:
    - modi: The Modelability Index for the dataset.
    """
    clusters = cluster_compounds(smiles_list, similarity_threshold)
    
    total_compounds = len(smiles_list)
    pure_compounds = 0
    
    # Check if all compounds in the cluster have the same activity
    for cluster in clusters:
        if len(cluster) > 1:
            pure_compounds += len(cluster)
    
    modi = pure_compounds / total_compounds
    return modi

def predict_activity(modi, cutoff=0.65):
    """
    Predicts if a dataset is likely to be active or inactive based on MODI.
    
    Args:
    - modi: The Modelability Index of the dataset.
    - cutoff: The cutoff value for determining activity.
    
    Returns:
    - str: 'Active' if the MODI is greater than the cutoff, 'Inactive' otherwise.
    """
    return 'Active' if modi > cutoff else 'Inactive'

# Read SMILES from CSV file
file_path = '/home/aya/output_with_maccs_fingerprints.csv'  # Change to your file path
df = pd.read_csv(file_path)

# Extract SMILES list
smiles_list = df['smiles'].tolist()

# Calculate MODI
modi = calculate_modi(smiles_list)
print(f"MODI: {modi}")

# Predict activity for each compound based on MODI
df['Predicted Activity'] = predict_activity(modi)

# Save the modified DataFrame to a new CSV file
df.to_csv('compounds_with_predictions.csv', index=False)

print("Predictions saved to 'compounds_with_predictions.csv'")


MODI: 0.6952247191011236
Predictions saved to 'compounds_with_predictions.csv'


In [9]:
from rdkit import Chem
from rdkit.Chem import MACCSkeys, DataStructs
import pandas as pd
import numpy as np

def calculate_tanimoto_similarity(fp1, fp2):
    """Calculates the Tanimoto similarity between two fingerprints using RDKit."""
    return DataStructs.FingerprintSimilarity(fp1, fp2)

def cluster_compounds(smiles_list, similarity_threshold=0.8):
    """
    Clusters compounds based on Tanimoto similarity.
    
    Args:
    - smiles_list: A list of SMILES strings representing the compounds.
    - similarity_threshold: The Tanimoto similarity threshold for clustering.
    
    Returns:
    - clusters: A list of clusters, each containing indices of similar compounds.
    """
    # Calculate MACCS fingerprints
    fingerprints = [MACCSkeys.GenMACCSKeys(Chem.MolFromSmiles(smiles)) for smiles in smiles_list]
    
    # Calculate pairwise Tanimoto similarity matrix
    similarity_matrix = np.zeros((len(fingerprints), len(fingerprints)))
    for i in range(len(fingerprints)):
        for j in range(i + 1, len(fingerprints)):
            similarity = calculate_tanimoto_similarity(fingerprints[i], fingerprints[j])
            similarity_matrix[i, j] = similarity
            similarity_matrix[j, i] = similarity
    
    # Cluster based on similarity threshold
    clusters = []
    for i in range(len(fingerprints)):
        found_cluster = False
        for cluster in clusters:
            if any(similarity_matrix[i, j] >= similarity_threshold for j in cluster):
                cluster.append(i)
                found_cluster = True
                break
        if not found_cluster:
            clusters.append([i])
    
    return clusters

def calculate_modi(smiles_list, similarity_threshold=0.8):
    """
    Calculates the Modelability Index (MODI) for a dataset.
    
    Args:
    - smiles_list: A list of SMILES strings representing the compounds.
    - similarity_threshold: The Tanimoto similarity threshold for clustering.
    
    Returns:
    - modi: The Modelability Index for the dataset.
    """
    clusters = cluster_compounds(smiles_list, similarity_threshold)
    
    total_compounds = len(smiles_list)
    pure_compounds = 0
    
    # Check if all compounds in the cluster have the same activity
    for cluster in clusters:
        if len(cluster) > 1:
            pure_compounds += len(cluster)
    
    modi = pure_compounds / total_compounds
    return modi

def classify_compound(ic50):
    """
    Classifies a compound as 'Active' or 'Inactive' based on IC50 value.
    
    Args:
    - ic50: The IC50 value of the compound.
    
    Returns:
    - str: 'Active' if IC50 is between 0.5 μM and 10 μM, 'Inactive' otherwise.
    """
    if 0.5 <= ic50 <= 10:
        return 'Active'
    else:
        return 'Inactive'

def predict_activity_based_on_modi(modi, cutoff=0.65):
    """
    Predicts if a dataset is likely to be active or inactive based on MODI.
    
    Args:
    - modi: The Modelability Index of the dataset.
    - cutoff: The cutoff value for determining activity.
    
    Returns:
    - str: 'Active' if the MODI is greater than the cutoff, 'Inactive' otherwise.
    """
    return 'Active' if modi > cutoff else 'Inactive'

# Read SMILES and IC50 from CSV file
file_path = '/home/aya/output_with_maccs_fingerprints.csv'  # Change to your file path
df = pd.read_csv(file_path)

# Extract SMILES and IC50 lists
smiles_list = df['smiles'].tolist()
ic50_list = df['IC50'].tolist()

# Classify compounds based on IC50 values
df['Classified Activity'] = [classify_compound(ic50) for ic50 in ic50_list]

# Calculate MODI
modi = calculate_modi(smiles_list)
print(f"MODI: {modi}")

# Predict activity for each compound based on MODI
df['Predicted Activity'] = predict_activity_based_on_modi(modi)

# Save the modified DataFrame to a new CSV file
df.to_csv('compounds_with_predictions.csv', index=False)

print("Predictions saved to 'compounds_with_predictions.csv'")


MODI: 0.6952247191011236
Predictions saved to 'compounds_with_predictions.csv'


In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV
import joblib
from xgboost import XGBRegressor
from scipy.stats import randint, uniform

# Load the data
input_csv_path = 'output_with_maccs_fingerprints.csv'  # Replace with your actual CSV file path
df = pd.read_csv(input_csv_path)

# Convert MACCS fingerprints from comma-separated string to a list of integers
def maccs_to_array(maccs_str):
    try:
        # Convert the string to a list of integers and discard the first bit (bit 0)
        return np.array(list(map(int, maccs_str.split(',')))[1:], dtype=int)
    except ValueError:
        return np.zeros(166, dtype=int)  # Default to an array of zeros if there's an error

# Apply the function to convert the MACCS fingerprint strings
df['MACCS_fingerprint'] = df['MACCS_fingerprint'].apply(maccs_to_array)

# Convert MACCS fingerprints to a feature matrix
X = np.array(df['MACCS_fingerprint'].tolist())

# Extract pIC50 values (target variable y)
y = df['pIC50'].values

# Initialize MinMaxScaler for scaling target variable
scaler_y = MinMaxScaler()
y_transformed = scaler_y.fit_transform(y.reshape(-1, 1)).flatten()

# Split the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size=0.2, random_state=42)

# Initialize StandardScaler for feature scaling
scaler_X = StandardScaler()
X_train_scaled = scaler_X.fit_transform(X_train)
X_test_scaled = scaler_X.transform(X_test)

# Initialize the XGBRegressor
xgb = XGBRegressor(random_state=42, use_label_encoder=False, eval_metric='rmse')

# Define the parameter grid for RandomizedSearchCV
param_distributions = {
    'n_estimators': randint(100, 1000),  # Start from a higher number to avoid very low values
    'learning_rate': uniform(0.01, 0.3),
    'max_depth': randint(4, 10),  # Slightly narrower range
    'subsample': uniform(0.7, 0.3),
    'colsample_bytree': uniform(0.7, 0.3),
    'reg_alpha': uniform(0.0, 1.0),  # L1 regularization
    'reg_lambda': uniform(0.0, 1.0)  # L2 regularization
}

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_distributions,
    n_iter=200,  # Number of parameter settings sampled
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    random_state=42,
    verbose=2
)

# Perform random search to find the best hyperparameters
random_search.fit(X_train_scaled, y_train)

# Retrieve the best model
best_model = random_search.best_estimator_

# Predict transformed pIC50 values for the test set
y_pred_transformed = best_model.predict(X_test_scaled)
y_pred = scaler_y.inverse_transform(y_pred_transformed.reshape(-1, 1)).flatten()
y_test_original = scaler_y.inverse_transform(y_test.reshape(-1, 1)).flatten()

# Evaluate the model on the test data
mse_test = mean_squared_error(y_test_original, y_pred)
rmse_test = np.sqrt(mse_test)
mae_test = mean_absolute_error(y_test_original, y_pred)
r2_test = r2_score(y_test_original, y_pred)

# Predict transformed pIC50 values for the entire dataset
X_scaled = scaler_X.transform(X)
y_pred_full_transformed = best_model.predict(X_scaled)
y_pred_full = scaler_y.inverse_transform(y_pred_full_transformed.reshape(-1, 1)).flatten()

# Evaluate the model on the full dataset
mse_full = mean_squared_error(y, y_pred_full)
rmse_full = np.sqrt(mse_full)
mae_full = mean_absolute_error(y, y_pred_full)
r2_full = r2_score(y, y_pred_full)

# Print evaluation metrics for test and full datasets
print(f"Test set - Mean Squared Error (MSE): {mse_test:.4f}")
print(f"Test set - Root Mean Squared Error (RMSE): {rmse_test:.4f}")
print(f"Test set - Mean Absolute Error (MAE): {mae_test:.4f}")
print(f"Test set - R-squared (R2): {r2_test:.4f}")

print(f"Full dataset - Mean Squared Error (MSE): {mse_full:.4f}")
print(f"Full dataset - Root Mean Squared Error (RMSE): {rmse_full:.4f}")
print(f"Full dataset - Mean Absolute Error (MAE): {mae_full:.4f}")
print(f"Full dataset - R-squared (R2): {r2_full:.4f}")

# Save the best model and scalers for future use
joblib.dump(best_model, 'xgb_model_filtered.pkl')
joblib.dump(scaler_y, 'y_scaler_filtered.pkl')
joblib.dump(scaler_X, 'X_scaler_filtered.pkl')


Fitting 5 folds for each of 200 candidates, totalling 1000 fits
Test set - Mean Squared Error (MSE): 0.4976
Test set - Root Mean Squared Error (RMSE): 0.7054
Test set - Mean Absolute Error (MAE): 0.5066
Test set - R-squared (R2): 0.6019
Full dataset - Mean Squared Error (MSE): 0.1322
Full dataset - Root Mean Squared Error (RMSE): 0.3636
Full dataset - Mean Absolute Error (MAE): 0.1967
Full dataset - R-squared (R2): 0.8786


['X_scaler_filtered.pkl']

In [3]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.metrics import accuracy_score, recall_score, precision_score, matthews_corrcoef
from sklearn.model_selection import train_test_split, RandomizedSearchCV
import joblib
from xgboost import XGBClassifier
from scipy.stats import randint, uniform

# Load the data
input_csv_path = 'output_with_maccs_fingerprints.csv'  # Replace with your actual CSV file path
df = pd.read_csv(input_csv_path)

# Convert MACCS fingerprints from comma-separated string to a list of integers
def maccs_to_array(maccs_str):
    try:
        # Convert the string to a list of integers and discard the first bit (bit 0)
        return np.array(list(map(int, maccs_str.split(',')))[1:], dtype=int)
    except ValueError:
        return np.zeros(166, dtype=int)  # Default to an array of zeros if there's an error

# Apply the function to convert the MACCS fingerprint strings
df['MACCS_fingerprint'] = df['MACCS_fingerprint'].apply(maccs_to_array)

# Convert MACCS fingerprints to a feature matrix
X = np.array(df['MACCS_fingerprint'].tolist())

# Extract pIC50 values (target variable y)
y = df['pIC50'].values

# Define a threshold to convert regression targets to binary labels
threshold = 5.0  # Example threshold for binary classification
y_binary = (y > threshold).astype(int)  # Convert to binary labels

# Initialize MinMaxScaler for scaling target variable
scaler_y = MinMaxScaler()
y_transformed = scaler_y.fit_transform(y.reshape(-1, 1)).flatten()

# Split the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.2, random_state=42)

# Initialize StandardScaler for feature scaling
scaler_X = StandardScaler()
X_train_scaled = scaler_X.fit_transform(X_train)
X_test_scaled = scaler_X.transform(X_test)

# Initialize the XGBClassifier
xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')

# Define the parameter grid for RandomizedSearchCV
param_distributions = {
    'n_estimators': randint(100, 1000),  # Start from a higher number to avoid very low values
    'learning_rate': uniform(0.01, 0.3),
    'max_depth': randint(4, 10),  # Slightly narrower range
    'subsample': uniform(0.7, 0.3),
    'colsample_bytree': uniform(0.7, 0.3),
    'reg_alpha': uniform(0.0, 1.0),  # L1 regularization
    'reg_lambda': uniform(0.0, 1.0)  # L2 regularization
}

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_distributions,
    n_iter=200,  # Number of parameter settings sampled
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
    verbose=2
)

# Perform random search to find the best hyperparameters
random_search.fit(X_train_scaled, y_train)

# Retrieve the best model
best_model = random_search.best_estimator_

# Predict binary labels for the test set
y_pred = best_model.predict(X_test_scaled)

# Evaluate the model on the test data
accuracy = accuracy_score(y_test, y_pred)
sensitivity = recall_score(y_test, y_pred)  # Recall for positive class
specificity = recall_score(y_test, y_pred, pos_label=0)  # Recall for negative class
mcc = matthews_corrcoef(y_test, y_pred)

# Predict binary labels for the entire dataset
X_scaled = scaler_X.transform(X)
y_pred_full = best_model.predict(X_scaled)

# Evaluate the model on the full dataset
accuracy_full = accuracy_score(y_binary, y_pred_full)
sensitivity_full = recall_score(y_binary, y_pred_full)
specificity_full = recall_score(y_binary, y_pred_full, pos_label=0)
mcc_full = matthews_corrcoef(y_binary, y_pred_full)

# Print evaluation metrics for test and full datasets
print(f"Test set - Accuracy (Ac): {accuracy:.4f}")
print(f"Test set - Sensitivity (Sn): {sensitivity:.4f}")
print(f"Test set - Specificity (Sc): {specificity:.4f}")
print(f"Test set - Matthews Correlation Coefficient (MCC): {mcc:.4f}")

print(f"Full dataset - Accuracy (Ac): {accuracy_full:.4f}")
print(f"Full dataset - Sensitivity (Sn): {sensitivity_full:.4f}")
print(f"Full dataset - Specificity (Sc): {specificity_full:.4f}")
print(f"Full dataset - Matthews Correlation Coefficient (MCC): {mcc_full:.4f}")

# Save the best model and scalers for future use
joblib.dump(best_model, 'xgb_model_filtered.pkl')
joblib.dump(scaler_y, 'y_scaler_filtered.pkl')
joblib.dump(scaler_X, 'X_scaler_filtered.pkl')


Fitting 5 folds for each of 200 candidates, totalling 1000 fits
Test set - Accuracy (Ac): 0.8182
Test set - Sensitivity (Sn): 0.9099
Test set - Specificity (Sc): 0.5000
Test set - Matthews Correlation Coefficient (MCC): 0.4429
Full dataset - Accuracy (Ac): 0.9537
Full dataset - Sensitivity (Sn): 0.9800
Full dataset - Specificity (Sc): 0.8650
Full dataset - Matthews Correlation Coefficient (MCC): 0.8664


['X_scaler_filtered.pkl']

In [4]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.metrics import accuracy_score, recall_score, matthews_corrcoef
from sklearn.model_selection import train_test_split, RandomizedSearchCV
import joblib
from xgboost import XGBClassifier
from scipy.stats import randint, uniform

# Load the data
input_csv_path = 'output_with_maccs_fingerprints.csv'  # Replace with your actual CSV file path
df = pd.read_csv(input_csv_path)

# Convert MACCS fingerprints from comma-separated string to a list of integers
def maccs_to_array(maccs_str):
    try:
        # Convert the string to a list of integers and discard the first bit (bit 0)
        return np.array(list(map(int, maccs_str.split(',')))[1:], dtype=int)
    except ValueError:
        return np.zeros(166, dtype=int)  # Default to an array of zeros if there's an error

# Apply the function to convert the MACCS fingerprint strings
df['MACCS_fingerprint'] = df['MACCS_fingerprint'].apply(maccs_to_array)

# Convert MACCS fingerprints to a feature matrix
X = np.array(df['MACCS_fingerprint'].tolist())

# Extract pIC50 values (target variable y)
y = df['pIC50'].values

# Define a threshold to convert regression targets to binary labels
threshold = 5.0  # Example threshold for binary classification
y_binary = (y > threshold).astype(int)  # Convert to binary labels

# Initialize MinMaxScaler for scaling target variable
scaler_y = MinMaxScaler()
y_transformed = scaler_y.fit_transform(y.reshape(-1, 1)).flatten()

# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.2, random_state=42)

# Initialize StandardScaler for feature scaling
scaler_X = StandardScaler()
X_train_scaled = scaler_X.fit_transform(X_train)
X_test_scaled = scaler_X.transform(X_test)

# Initialize the XGBClassifier
xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')

# Define the parameter grid for RandomizedSearchCV
param_distributions = {
    'n_estimators': randint(100, 1000),  # Start from a higher number to avoid very low values
    'learning_rate': uniform(0.01, 0.3),
    'max_depth': randint(4, 10),  # Slightly narrower range
    'subsample': uniform(0.7, 0.3),
    'colsample_bytree': uniform(0.7, 0.3),
    'reg_alpha': uniform(0.0, 1.0),  # L1 regularization
    'reg_lambda': uniform(0.0, 1.0)  # L2 regularization
}

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_distributions,
    n_iter=200,  # Number of parameter settings sampled
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
    verbose=2
)

# Perform random search to find the best hyperparameters
random_search.fit(X_train_scaled, y_train)

# Retrieve the best model
best_model = random_search.best_estimator_

# Predict binary labels for the test set
y_pred = best_model.predict(X_test_scaled)

# Evaluate the model on the test data
accuracy = accuracy_score(y_test, y_pred)
sensitivity = recall_score(y_test, y_pred)  # Recall for positive class
specificity = recall_score(y_test, y_pred, pos_label=0)  # Recall for negative class
mcc = matthews_corrcoef(y_test, y_pred)

# Predict binary labels for the entire dataset
X_scaled = scaler_X.transform(X)
y_pred_full = best_model.predict(X_scaled)

# Evaluate the model on the full dataset
accuracy_full = accuracy_score(y_binary, y_pred_full)
sensitivity_full = recall_score(y_binary, y_pred_full)
specificity_full = recall_score(y_binary, y_pred_full, pos_label=0)
mcc_full = matthews_corrcoef(y_binary, y_pred_full)

# Print evaluation metrics for test and full datasets
print(f"Test set - Accuracy (Ac): {accuracy:.4f}")
print(f"Test set - Sensitivity (Sn): {sensitivity:.4f}")
print(f"Test set - Specificity (Sc): {specificity:.4f}")
print(f"Test set - Matthews Correlation Coefficient (MCC): {mcc:.4f}")

print(f"Full dataset - Accuracy (Ac): {accuracy_full:.4f}")
print(f"Full dataset - Sensitivity (Sn): {sensitivity_full:.4f}")
print(f"Full dataset - Specificity (Sc): {specificity_full:.4f}")
print(f"Full dataset - Matthews Correlation Coefficient (MCC): {mcc_full:.4f}")

# Save the best model and scalers for future use
joblib.dump(best_model, 'xgb_model_filtered.pkl')
joblib.dump(scaler_y, 'y_scaler_filtered.pkl')
joblib.dump(scaler_X, 'X_scaler_filtered.pkl')


Fitting 5 folds for each of 200 candidates, totalling 1000 fits
[CV] END colsample_bytree=0.8173181822719722, learning_rate=0.0646708263364187, max_depth=7, n_estimators=101, reg_alpha=0.4251558744912447, reg_lambda=0.20794166286818883, subsample=0.8703100983459974; total time=   0.9s
[CV] END colsample_bytree=0.890021126953127, learning_rate=0.2714381770563153, max_depth=7, n_estimators=719, reg_alpha=0.18657005888603584, reg_lambda=0.8925589984899778, subsample=0.8618026725746952; total time=   3.7s
[CV] END colsample_bytree=0.8155293185805775, learning_rate=0.26534100145505707, max_depth=6, n_estimators=227, reg_alpha=0.3726868670940493, reg_lambda=0.3946914668094722, subsample=0.9532639422178933; total time=   0.8s
[CV] END colsample_bytree=0.8594063894704443, learning_rate=0.17219053648303195, max_depth=7, n_estimators=755, reg_alpha=0.9758520794625346, reg_lambda=0.5163003483011953, subsample=0.7968869418823737; total time=   2.5s
[CV] END colsample_bytree=0.919494326589892, lear

['X_scaler_filtered.pkl']

[CV] END colsample_bytree=0.7069187275124247, learning_rate=0.16743239807751675, max_depth=5, n_estimators=575, reg_alpha=0.9737555188414592, reg_lambda=0.23277134043030423, subsample=0.7271819303598462; total time=   1.5s
[CV] END colsample_bytree=0.8057706569002506, learning_rate=0.1014343774474087, max_depth=9, n_estimators=555, reg_alpha=0.534089419375442, reg_lambda=0.4848299713589832, subsample=0.9077308098670811; total time=   2.4s
[CV] END colsample_bytree=0.7527775758032036, learning_rate=0.015422609084656261, max_depth=9, n_estimators=376, reg_alpha=0.1788227092213288, reg_lambda=0.3664687845828599, subsample=0.9232511569169686; total time=   2.6s
[CV] END colsample_bytree=0.7293502481953004, learning_rate=0.1574847625350497, max_depth=7, n_estimators=388, reg_alpha=0.4646738129396114, reg_lambda=0.6497736826427634, subsample=0.7144176772591101; total time=   1.8s
[CV] END colsample_bytree=0.72738600303584, learning_rate=0.10579409127712446, max_depth=9, n_estimators=834, reg