In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV
import joblib
from xgboost import XGBRegressor
from scipy.stats import randint, uniform

# Load the data
input_csv_path = 'compounds_with_predictions2.csv'  # Replace with your actual CSV file path
df = pd.read_csv(input_csv_path)

# Convert MACCS fingerprints from comma-separated string to a list of integers
def maccs_to_array(maccs_str):
    try:
        # Convert the string to a list of integers and discard the first bit (bit 0)
        return np.array(list(map(int, maccs_str.split(',')))[1:], dtype=int)
    except ValueError:
        return np.zeros(166, dtype=int)  # Default to an array of zeros if there's an error

# Apply the function to convert the MACCS fingerprint strings
df['MACCS_fingerprint'] = df['MACCS_fingerprint'].apply(maccs_to_array)

# Convert MACCS fingerprints to a feature matrix
X = np.array(df['MACCS_fingerprint'].tolist())

# Extract pIC50 values (target variable y)
y = df['pIC50'].values

# Use all features after discarding the first bit
X_filtered = X

# Initialize MinMaxScaler for scaling target variable
scaler_y = MinMaxScaler()
y_transformed = scaler_y.fit_transform(y.reshape(-1, 1)).flatten()

# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X_filtered, y_transformed, test_size=0.2, random_state=42)

# Initialize StandardScaler for feature scaling
scaler_X = StandardScaler()
X_train_scaled = scaler_X.fit_transform(X_train)
X_test_scaled = scaler_X.transform(X_test)

# Initialize the XGBRegressor
xgb = XGBRegressor(random_state=42, use_label_encoder=True, eval_metric='rmse')

# Define the parameter grid for RandomizedSearchCV
param_distributions = {
    'n_estimators': randint(100, 1000),  # Start from a higher number to avoid very low values
    'learning_rate': uniform(0.01, 0.3),
    'max_depth': randint(4, 10),  # Slightly narrower range
    'subsample': uniform(0.7, 0.3),
    'colsample_bytree': uniform(0.7, 0.3),
    'reg_alpha': uniform(0.0, 1.0),  # L1 regularization
    'reg_lambda': uniform(0.0, 1.0)  # L2 regularization
}

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_distributions,
    n_iter=100,  # Number of parameter settings sampled
    cv=4,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    random_state=42,
    verbose=2
)

# Perform random search to find the best hyperparameters
random_search.fit(X_train_scaled, y_train)

# Retrieve the best model
best_model = random_search.best_estimator_

# Predict transformed pIC50 values for the test set
y_pred_transformed = best_model.predict(X_test_scaled)
y_pred = scaler_y.inverse_transform(y_pred_transformed.reshape(-1, 1)).flatten()
y_test_original = scaler_y.inverse_transform(y_test.reshape(-1, 1)).flatten()

# Evaluate the model on the test data
mse_test = mean_squared_error(y_test_original, y_pred)
rmse_test = np.sqrt(mse_test)
mae_test = mean_absolute_error(y_test_original, y_pred)
r2_test = r2_score(y_test_original, y_pred)

# Predict transformed pIC50 values for the training set
y_train_pred_transformed = best_model.predict(X_train_scaled)
y_train_pred = scaler_y.inverse_transform(y_train_pred_transformed.reshape(-1, 1)).flatten()

# Evaluate the model on the training data
mse_train = mean_squared_error(y_train, y_train_pred_transformed)
rmse_train = np.sqrt(mse_train)
mae_train = mean_absolute_error(y_train, y_train_pred_transformed)
r2_train = r2_score(y_train, y_train_pred_transformed)

# Print evaluation metrics for test and training sets
print(f"Test set - Mean Squared Error (MSE): {mse_test:.4f}")
print(f"Test set - Root Mean Squared Error (RMSE): {rmse_test:.4f}")
print(f"Test set - Mean Absolute Error (MAE): {mae_test:.4f}")
print(f"Test set - R-squared (R2): {r2_test:.4f}")

print(f"Training set - Mean Squared Error (MSE): {mse_train:.4f}")
print(f"Training set - Root Mean Squared Error (RMSE): {rmse_train:.4f}")
print(f"Training set - Mean Absolute Error (MAE): {mae_train:.4f}")
print(f"Training set - R-squared (R2): {r2_train:.4f}")


# Save the best model and scalers for future use
joblib.dump(best_model, 'xgb_model_filtered1.pkl')
joblib.dump(scaler_y, 'y_scaler_filtered1.pkl')
joblib.dump(scaler_X, 'X_scaler_filtered1.pkl')

Fitting 4 folds for each of 100 candidates, totalling 400 fits
Test set - Mean Squared Error (MSE): 0.4903
Test set - Root Mean Squared Error (RMSE): 0.7002
Test set - Mean Absolute Error (MAE): 0.5114
Test set - R-squared (R2): 0.6078
Training set - Mean Squared Error (MSE): 0.0025
Training set - Root Mean Squared Error (RMSE): 0.0497
Training set - Mean Absolute Error (MAE): 0.0337
Training set - R-squared (R2): 0.9417


['X_scaler_filtered1.pkl']

In [2]:
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import MACCSkeys
import joblib

# Load the saved XGBoost model and scalers
best_model = joblib.load('xgb_model_filtered1.pkl')  # Load XGBoost model
scaler_y = joblib.load('y_scaler_filtered1.pkl')  # Load y-scaler
scaler_X = joblib.load('X_scaler_filtered1.pkl')  # Load X-scaler

# Define a function to convert SMILES to MACCS fingerprints and discard the first bit
def smiles_to_maccs(smiles):
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        return np.zeros(166, dtype=int)  # Return a default array of 166 bits if SMILES is invalid
    maccs_fingerprint = MACCSkeys.GenMACCSKeys(molecule)
    return np.array(list(maccs_fingerprint)[1:], dtype=int)  # Convert to 166-bit fingerprint (discarding the first bit)

# Example list of new SMILES strings
smiles_list = [
    'CC(C)CCC[C@](C)(O)C4CCC3C/2CCC1C[C@@H](O)CC[C@]1(C)C2=C\C[C@@]34C'
]

# Convert SMILES strings to MACCS fingerprints
fingerprints = np.array([smiles_to_maccs(smiles) for smiles in smiles_list])

# Ensure the selection filter aligns with the training step
# std_devs = np.std(fingerprints, axis=0)
# sd_cutoff = 0.1
# selected_features = std_devs > sd_cutoff  # Use the same cutoff used during training

# Filter and scale the new data
# fingerprints_filtered = fingerprints[:, selected_features]  # Apply the same filter used during training
fingerprints_filtered = fingerprints  # No filtering applied
X_new_scaled = scaler_X.transform(fingerprints_filtered)

# Predict using the loaded model
y_pred_transformed = best_model.predict(X_new_scaled)
y_pred = scaler_y.inverse_transform(y_pred_transformed.reshape(-1, 1)).flatten()

# Print predictions
for smiles, prediction in zip(smiles_list, y_pred):
    print(f"SMILES: {smiles}, Predicted pIC50: {prediction:.4f}")
    predicted_IC50 = 10 ** (-prediction) * 1000000
    print(f"Predicted IC50: {predicted_IC50:.4f} uM")

SMILES: CC(C)CCC[C@](C)(O)C4CCC3C/2CCC1C[C@@H](O)CC[C@]1(C)C2=C\C[C@@]34C, Predicted pIC50: 5.1985
Predicted IC50: 6.3320 uM
