In [6]:
import numpy as np  
from rdkit import Chem  
from rdkit.Chem import AllChem 
from tdc.benchmark_group import admet_group
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

group = admet_group(path = 'data/')
predictions_list = []

def smiles_to_fingerprints(smiles_list, n_bits=2048):  
    fingerprints = []  
    for smiles in smiles_list:  
        mol = Chem.MolFromSmiles(smiles)  
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=n_bits)  
        fingerprints.append(np.array(fp))  
    return np.vstack(fingerprints)  

#for seed in [1, 2, 3, 4, 5]:
benchmark = group.get('Caco2_Wang') 
# all benchmark names in a benchmark group are stored in group.dataset_names
predictions = {}
name = benchmark['name']
train_val, test = benchmark['train_val'], benchmark['test']
train, valid = group.get_train_valid_split(benchmark = name, split_type = 'default', seed = 1)
#print(valid)
x_train, y_train = train['Drug'], train['Y']
x_test, y_test = test['Drug'], test['Y']

x_train = smiles_to_fingerprints(x_train)
x_test = smiles_to_fingerprints(x_test)
print(x_train)

Found local copy...
generating training, validation splits...
100%|██████████| 728/728 [00:00<00:00, 3112.03it/s]


[[0 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [7]:

#Train SVM Model
svm_model = SVC(kernel='linear')  # You can adjust the kernel and other parameters
svm_model.fit(x_train, y_train)

y_pred_test = svm_model.predict(x_test)


ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.

In [None]:
predictions[name] = y_pred_test
predictions_list.append(predictions)

results = group.evaluate_many(predictions_list)
# {'caco2_wang': [6.328, 0.101]}

In [None]:
# Step 1: Import Necessary Libraries
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Step 2: Load Dataset
# Assuming you have a CSV file from TDC with a column 'smiles' for molecular structures
dataset = pd.read_csv('path_to_your_dataset.csv')

# Step 3: Preprocess Data (if necessary)
# This step depends on your dataset's specifics

# Step 4: Generate Molecular Fingerprints
def generate_fingerprints(smiles):
    molecule = Chem.MolFromSmiles(smiles)
    return AllChem.GetMorganFingerprintAsBitVect(molecule, radius=2, nBits=2048)

dataset['fingerprints'] = dataset['smiles'].apply(generate_fingerprints)

# Step 5: Prepare Data for Machine Learning
X = list(dataset['fingerprints'])
y = dataset['target_variable']  # Replace 'target_variable' with the name of your target column
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Train SVM Model
svm_model = SVC(kernel='linear')  # You can adjust the kernel and other parameters
svm_model.fit(X_train, y_train)

# Step 7: Evaluate Model
y_pred = svm_model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

# Step 8: Predictions
# Use svm_model.predict(new_data) to make predictions on new, unseen data