In [None]:
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import SaltRemover
from mordred import Calculator, descriptors
from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect
from rdkit.Chem import MACCSkeys, MolFromSmiles
import sklearn.model_selection
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score, f1_score, balanced_accuracy_score, fbeta_score
import pickle

In [None]:
# Read the raw JNK3 data file into a pandas DataFrame
dataset = pd.read_csv(filepath_or_buffer = "datasets/GSK3_JNK3/JNK3/all.txt")

In [None]:
# Function to obtain the Canonical SMILES from a SMILES string
def get_canonical_smiles(smiles):
    """Return the canonical SMILES string for the molecule described by ``smiles``.

    Parameters
    ----------
    smiles : str
        SMILES string to convert.

    Returns
    -------
    str
        The SMILES string RDKit writes back for the parsed molecule.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles`` into a molecule.
    """
    # Convert the SMILES string to a molecule
    mol = Chem.MolFromSmiles(smiles)

    # MolFromSmiles signals a parse failure by returning None; fail here with a
    # readable message instead of an opaque error inside MolToSmiles.
    if mol is None:
        raise ValueError(f"Could not parse SMILES string: {smiles!r}")

    # Convert the molecule to a canonical SMILES string
    return Chem.MolToSmiles(mol)

# Calculate canonical smiles for every SMILES string in the JNK3 DataFrame
canonical_dataset = [get_canonical_smiles(smiles) for smiles in dataset["smiles"]]

# Save the canonical smiles in a column called canonical_smiles in the dataframe.
# DataFrame.insert raises if the column already exists, so on a re-run of this cell
# the existing column is overwritten instead of inserted again.
if "canonical_smiles" in dataset.columns:
    dataset["canonical_smiles"] = canonical_dataset
else:
    dataset.insert(loc = 0, column = "canonical_smiles", value = canonical_dataset)

In [None]:
# Salt and solvent removal
def salt_solvent_remover(smiles, remover = None):
    """Strip salts/solvents from the molecule described by ``smiles``.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule to clean.
    remover : SaltRemover.SaltRemover, optional
        Remover to use. When omitted, a remover with RDKit's default definitions
        (https://github.com/rdkit/rdkit/blob/master/Data/Salts.txt) is created.
        Pass a shared instance when processing many molecules so it is built once.

    Returns
    -------
    str
        SMILES string of the stripped molecule.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles`` into a molecule.
    """
    if remover is None:
        remover = SaltRemover.SaltRemover()

    # Convert the SMILES string to a molecule
    mol = Chem.MolFromSmiles(smiles)

    # MolFromSmiles returns None on a parse failure; report it explicitly
    if mol is None:
        raise ValueError(f"Could not parse SMILES string: {smiles!r}")

    # Remove salts and solvents from the molecule
    stripped_mol = remover.StripMol(mol)

    # Convert the stripped molecule to the stripped SMILES
    return Chem.MolToSmiles(stripped_mol)

# Remove salt and solvent from every SMILES string in the JNK3 DataFrame,
# reusing a single remover instead of building one per molecule
shared_remover = SaltRemover.SaltRemover()
stripped_dataset = [salt_solvent_remover(smiles, shared_remover) for smiles in dataset["canonical_smiles"]]

# Save the stripped SMILES in a new column called stripped_smiles in the dataframe
dataset["stripped_smiles"] = stripped_dataset

In [None]:
# Keep one row per distinct stripped SMILES (drop_duplicates keeps the first occurrence by default)
dataset_without_duplicates = dataset.drop_duplicates(subset = "stripped_smiles")
dataset = dataset_without_duplicates

In [None]:
# Write the cleaned DataFrame to disk without the index column
dataset.to_csv(path_or_buf = "dataset_JNK3.csv", index = False)

In [None]:
# Read dataset_JNK3.csv back into a fresh DataFrame
dataset = pd.read_csv(filepath_or_buffer = "dataset_JNK3.csv")

In [None]:
# Compute a 1024-bit Morgan fingerprint (radius 2) for each stripped SMILES
# and store every bit vector as a numpy array
smiles = dataset['stripped_smiles'].values
fps = [
    np.array(GetMorganFingerprintAsBitVect(MolFromSmiles(smi), 2, nBits=1024))
    for smi in smiles
]

In [None]:
# Name the fingerprint columns ecfp0 ... ecfp1023, one per bit
columns = [f"ecfp{bit}" for bit in range(1024)]

In [None]:
# Build a DataFrame with one fingerprint per row and one named column per bit
fingerprints = pd.DataFrame(data = fps, columns = columns)
fingerprints

In [None]:
# Concatenate the smiles with their corresponding fingerprint.
# The result must be bound to `dataset`, since that is the frame saved and used for training.
# Fingerprint columns left over from a previous run of this cell are dropped first so that
# re-running the cell does not duplicate them.
dataset = pd.concat(
    [dataset.drop(columns = fingerprints.columns, errors = "ignore"), fingerprints],
    axis = 1,
)
dataset

In [None]:
# Write the dataset to disk without the index column
dataset.to_csv(path_or_buf = "dataset_fingerprints_JNK3.csv", index = False)

## Model training and evaluation

In [None]:
# Read dataset_fingerprints_JNK3.csv back into a fresh DataFrame
dataset = pd.read_csv(filepath_or_buffer = "dataset_fingerprints_JNK3.csv")

In [None]:
# Remove the three SMILES text columns, which are not used to train the models
dataset = dataset.drop(columns = ['stripped_smiles', 'canonical_smiles', 'smiles'])

In [None]:
# y holds the real value of the target (the 'jnk3' column)
y = dataset['jnk3']
# X holds every remaining column (the fingerprints)
X = dataset.drop(columns = 'jnk3')

### Split the dataset

In [None]:
# Hold out 30% of the samples as the test subset; the split is stratified on y
# and seeded so it is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.30,
    stratify = y,
    random_state = 0,
)

## Random Forest Model


In [None]:
n_estimators = [10, 50, 70, 100, 150, 200]
max_depth = [None, 5, 7, 10, 13, 15]

# Train one random forest per (n_estimators, max_depth) pair and evaluate it on the test split.
# Rows are collected in a list and turned into a DataFrame once at the end, and the metrics are
# kept as numbers so that they sort by value.
rf_rows = []
for n in n_estimators:
    for d in max_depth:
        model = RandomForestClassifier(n_estimators = n, max_depth = d, n_jobs = 4, random_state = 0, class_weight = "balanced")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Calculate the performance metrics. The f1/fbeta functions are called through
        # sklearn.metrics and never assigned to `f1_score` / `fbeta_score`, so the imported
        # functions of those names are not overwritten.
        rf_rows.append({
            'n_estimators': n,
            # An unlimited depth is recorded as the text "None" so the cell is not blank in the CSV
            'max_depth': "None" if d is None else d,
            'sensitivity': recall_score(y_test, y_pred),
            'specificity': recall_score(y_test, y_pred, pos_label = 0),
            'precision': precision_score(y_test, y_pred),
            'f1_score': sklearn.metrics.f1_score(y_test, y_pred),
            'balanced_accuracy': balanced_accuracy_score(y_test, y_pred),
            'fbeta_score': sklearn.metrics.fbeta_score(y_test, y_pred, beta = 2),
        })

# Dataframe with the metrics for each set of parameters
df_RF = pd.DataFrame(rf_rows, columns = ['n_estimators', 'max_depth', 'sensitivity', 'specificity', 'precision', 'f1_score', 'balanced_accuracy', 'fbeta_score'])

In [None]:
# Sort the performance metrics by fbeta_score in descending order (best parameter set first).
# The column is passed through pd.to_numeric for the comparison, so scores are ordered by value
# even when they are stored as text.
df_RF = df_RF.sort_values(by = "fbeta_score", ascending = False, key = pd.to_numeric)
df_RF.to_csv("FINGERPRINT_JNK3_RFresults.csv", index = False)

### Save models trained with train data

In [None]:
# Fit a random forest (150 trees, max depth 10) on the training split and pickle it
model = RandomForestClassifier(n_estimators = 150, max_depth = 10, n_jobs = 10, random_state = 0, class_weight = "balanced")
model.fit(X_train, y_train)
# The with-block closes the file once the model has been written
with open('FP_modelRF_JNK3_150_10.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Fit a random forest (50 trees, max depth 10) on the training split and pickle it
model = RandomForestClassifier(n_estimators = 50, max_depth = 10, n_jobs = 10, random_state = 0, class_weight = "balanced")
model.fit(X_train, y_train)
# The with-block closes the file once the model has been written
with open('FP_modelRF_JNK3_50_10.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Fit a random forest (70 trees, max depth 10) on the training split and pickle it
model = RandomForestClassifier(n_estimators = 70, max_depth = 10, n_jobs = 10, random_state = 0, class_weight = "balanced")
model.fit(X_train, y_train)
# The with-block closes the file once the model has been written
with open('FP_modelRF_JNK3_70_10.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

## C-Support Vector Classification Model

In [None]:
C_range = [1, 0.01, 0.001]

# Train one SVC per value of C and evaluate it on the test split.
# Rows are collected in a list and turned into a DataFrame once at the end, and the metrics are
# kept as numbers so that they sort by value.
svc_rows = []
for c in C_range:
    model = SVC(C = c, probability = True, class_weight = "balanced",  random_state = 0)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Progress indicator: the value of C that has just been fitted
    print(c)

    # Calculate the performance metrics. The f1/fbeta functions are called through
    # sklearn.metrics and never assigned to `f1_score` / `fbeta_score`, so the imported
    # functions of those names are not overwritten.
    svc_rows.append({
        'C': c,
        'sensitivity': recall_score(y_test, y_pred),
        'specificity': recall_score(y_test, y_pred, pos_label = 0),
        'precision': precision_score(y_test, y_pred),
        'f1_score': sklearn.metrics.f1_score(y_test, y_pred),
        'balanced_accuracy': balanced_accuracy_score(y_test, y_pred),
        'fbeta_score': sklearn.metrics.fbeta_score(y_test, y_pred, beta = 2),
    })

# Dataframe with the metrics for each set of parameters
df_SVC = pd.DataFrame(svc_rows, columns = ['C', 'sensitivity', 'specificity', 'precision', 'f1_score', 'balanced_accuracy', 'fbeta_score'])

In [None]:
# Sort the performance metrics by fbeta_score in descending order (best parameter set first).
# The column is passed through pd.to_numeric for the comparison, so scores are ordered by value
# even when they are stored as text.
df_SVC = df_SVC.sort_values(by = "fbeta_score", ascending = False, key = pd.to_numeric)
df_SVC.to_csv("FINGERPRINT_JNK3_SVCresults.csv", index = False)

### Save models trained with train data

In [None]:
# Fit an SVC with C = 1 on the training split and pickle it
model = SVC(C = 1, class_weight = "balanced",  random_state = 0)
model.fit(X_train, y_train)
# The with-block closes the file once the model has been written
with open('FP_modelSVC_JNK3_1.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

## Save models trained with all data

### Save Random Forest Models

In [None]:
# Fit a random forest (150 trees, max depth 10) on the full dataset (X, y) and pickle it
model = RandomForestClassifier(n_estimators = 150, max_depth = 10, n_jobs = 10, random_state = 0, class_weight = "balanced")
model.fit(X, y)
# The with-block closes the file once the model has been written
with open('FP_modelALLRF_JNK3_150_10.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Fit a random forest (50 trees, max depth 10) on the full dataset (X, y) and pickle it
model = RandomForestClassifier(n_estimators = 50, max_depth = 10, n_jobs = 10, random_state = 0, class_weight = "balanced")
model.fit(X, y)
# The with-block closes the file once the model has been written
with open('FP_modelALLRF_JNK3_50_10.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Fit a random forest (70 trees, max depth 10) on the full dataset (X, y) and pickle it
model = RandomForestClassifier(n_estimators = 70, max_depth = 10, n_jobs = 10, random_state = 0, class_weight = "balanced")
model.fit(X, y)
# The with-block closes the file once the model has been written
with open('FP_modelALLRF_JNK3_70_10.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

### Save SVC Models

In [None]:
# Fit an SVC with C = 1 on the full dataset (X, y) and pickle it
model = SVC(C = 1, class_weight = "balanced",  random_state = 0)
model.fit(X, y)
# The with-block closes the file once the model has been written
with open('FP_modelALLSVC_JNK3_1.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)