## Calling packages and assigning variables
Here i call the necessary packages as well as assigning variables. Of note are the paths to collected data which will need to be changed for replication in another system

In [1]:
# !pip install -e git+https://github.com/garydoranjr/misvm.git#egg=misvm

In [2]:
import pandas as pd
import misvm
import rdkit
import numpy as np
import pickle
from tpot import TPOTClassifier
from rdkit import Chem
from rdkit.Chem import MACCSkeys
from mordred import Calculator, descriptors
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold


data = pd.read_csv("/home/samuel/honours_redo/1-data/selected_molecules.csv")
metabolite_data = pd.read_csv("/home/samuel/honours_redo/publication_work/biotransformer_output_phaseII.csv").append(pd.read_csv("/home/samuel/honours_redo/publication_work/biotransformer_output_cyp1.csv"))

# validation_data = pd.read_csv()
# validation_metabolite_data = pd.read_csv()

# Cleaning Biotransformer data
This section is to ensure that metabolites label the correct parent molecule as the root parent and not the direct one. Simply it replaces the "parent molecule" if that molecule is a metabolite and records the parent of said molecule.

In [3]:
def normalize_smiles(smi):
    try:
        smi_norm = Chem.MolToSmiles(Chem.MolFromSmiles(smi))
        return smi_norm
    except:
        return np.nan

metabolite_data['smiles'] = metabolite_data['SMILES'].apply(lambda x: normalize_smiles(x))
metabolite_data = metabolite_data.dropna(axis=0,subset=['smiles'])

def parent_finder(smi):
    for parent in data['smiles']:
        try:
            if Chem.MolToSmiles(Chem.MolFromSmiles(smi)) == Chem.MolToSmiles(Chem.MolFromSmiles(parent)):
                return parent
        except:
            continue
    return "No parent found"

metabolite_data['parent smiles'] = metabolite_data['Precursor SMILES'].apply(lambda x:parent_finder(x))

RDKit ERROR: [03:52:42] Explicit valence for atom # 31 N, 4, is greater than permitted
RDKit ERROR: [03:52:42] Explicit valence for atom # 8 N, 4, is greater than permitted
RDKit ERROR: [03:52:42] Explicit valence for atom # 10 N, 4, is greater than permitted
RDKit ERROR: [03:52:42] Explicit valence for atom # 10 N, 4, is greater than permitted
RDKit ERROR: [03:52:42] Explicit valence for atom # 10 N, 4, is greater than permitted


## Defining functions
These functions are used to execute the described actions on the data

In [4]:
def build_mil_model(training_data,grouping_data,fingerprint,MIL,name):
    i=0;print(i); i+=1
    if not os.path.isfile("saved_models/"+name):
            [bags,labels] = generate_bags(training_data,grouping_data,fingerprint);print(i); i+=1
            model = MIL;print(i); i+=1
            model.fit(bags,labels);print(i); i+=1
            print(i); i+=1
            pickle.dump(model, open("saved_models/"+name, 'wb'));print(i); i+=1
    else:
        print(name,"is already built")

def build_ml_model(training_data,fingerprint,ML,name):
    if not os.path.isfile("saved_models/"+name):
        [instances,labels] = get_ml_fingerprint(training_data, fingerprint)
        tpot_optimisation = ML
        tpot_optimisation.fit(instances,labels)
        model = tpot_optimisation.fitted_pipeline_
        pickle.dump(model, open("saved_models/"+name, 'wb'))  
    else:
        print(name,"is already built")

def test_mil_model(testing_data,grouping_data,fingerprint,name):
    loaded_model = pickle.load(open("saved_models/"+name, 'rb'))
    [bags,true_labels] = generate_bags(testing_data,grouping_data,fingerprint)
    predictions = loaded_model.predict(bags)
    predicted_labels = list(map(pos_or_neg,predictions))
    df = pd.DataFrame({
        'predicted' : predictions,
        'predicted labal' : predicted_labels,
        'true label' : true_labels
    })
    df.to_csv("saved_tests/"+name.split(".")[0]+".csv",index=False )

def test_ml_model(testing_data,fingerprint,name):
    loaded_model = pickle.load(open("saved_models/"+name, 'rb'))
    [instances,true_labels] = get_ml_fingerprint(testing_data, fingerprint)
    predictions = loaded_model.predict(instances)
    predicted_labels = list(map(pos_or_neg,predictions))
    df = pd.DataFrame({
        'predicted' : predictions,
        'predicted labal' : predicted_labels,
        'true label' : true_labels
    })
    df.to_csv("saved_tests/"+name.split(".")[0]+".csv",index=False )

def generate_bags(df,bt_df, funct):
        bags = []; labels = []
        for smile in df['smiles'].to_list():
            wk_data =   bt_df[bt_df["parent smiles"]==smile]
            if not wk_data.empty:
                wk_data = get_mil_fingerprint(wk_data, funct)
                bags        += [np.array(wk_data['fp_list'].to_list())]
                labels      += [df.loc[df['smiles'] == smile, 'Ames'].item()]
        for i,x in enumerate(labels):
            if x == 0:
                labels[i] = -1
        bags = np.array(bags)
        labels = np.array(labels)
        return [bags, labels]

def get_mil_fingerprint(df, function):
    fn = function
    df1 = df.copy()
    df1['fp_list'] = df1['smiles'].apply(lambda x: list(fn(Chem.MolFromSmiles(x))))
    return df1

def pos_or_neg(x):
    if x>0:
        return 1
    else:
        return -1
    
def number_check(x):
    try:
        return float(x)
    except:
        return "broken"

def get_ml_fingerprint(df, function):
    if function != "Morgan":
        fn = function
        df1 = df.copy()
        df1['fp_list'] = df1['smiles'].apply(lambda x: list(fn(Chem.MolFromSmiles(x))))
        df1 = df1.dropna(axis = 1, how = 'any')

        df2 = pd.DataFrame(df1['fp_list'].to_list())
        df2 = df2.applymap(number_check).dropna(axis =1, how = "any")
        df2 = df2.drop(columns=df2.columns[(df2 == 'broken').any()])
        df1['fp_list'] = df2.values.tolist()
        X = np.array(df1['fp_list'].to_list())
        Y = np.array(df1['Ames'].to_list())
        return [X,Y]
    else:
        radius = 3
        df1 = df.copy()
        df1['fp_list'] = df1['smiles'].apply(lambda x: list(AllChem.GetMorganFingerprint(Chem.MolFromSmiles(x),radius).ToBinary()))
        df2 = pd.DataFrame(df1['fp_list'].to_list())
        df2 = df2.applymap(number_check).dropna(axis =1, how = "any")
        df2 = df2.drop(columns=df2.columns[(df2 == 'broken').any()])
        df1['fp_list'] = df2.values.tolist()
        X = np.array(df1['fp_list'].to_list())
        Y = np.array(df1['Ames'].to_list())
        return [X,Y]

## Building models
Here the above functions are used to build models. This section can be altered to build additional models if desired

In [5]:
def develop_models(training_data,training_data_metabolites,testing_data,testing_data_metabolites,suffix=""):
    tested_mils =  [["MICA", misvm.MICA(max_iters=50,verbose=False)],     
                ["MISVM", misvm.MISVM(kernel='linear', C=1.0, max_iters=50,verbose=False)],
                ['SIL', misvm.SIL(verbose=False)],
                ['NSK', misvm.NSK(verbose=False)],
                ['sMIL', misvm.sMIL(verbose=False)]]

    i = 0
    
    # Iterate over the two used fingerprints
    for fp in [["MACCS",MACCSkeys.GenMACCSKeys],["RDFP",Chem.RDKFingerprint]]:
        # Iterate over the used MILs
        for mil in tested_mils:
            # Build and test MIL model
            build_mil_model(training_data=training_data,grouping_data=training_data_metabolites,name=fp[0]+"_"+mil[0]+suffix+".sav",MIL=mil[1],fingerprint=fp[1])
            test_mil_model(testing_data=testing_data,grouping_data=testing_data_metabolites,name=fp[0]+"_"+mil[0]+suffix+".sav",fingerprint=fp[1])
        # Build and test TPOT model
        build_ml_model(training_data=training_data, ML = TPOTClassifier(generations=10, population_size=100, cv=5, random_state=42, verbosity=2),fingerprint=fp[1],name=fp[0]+suffix+'_tpot.sav')
        test_ml_model(testing_data=testing_data,name=fp[0]+suffix+'_tpot.sav',fingerprint=fp[1])

        # build_ml_model(training_data=training_data, ML = TPOTClassifier(generations=10, population_size=100, cv=5, random_state=42, verbosity=2),fingerprint=fp[1],name=fp[0]+"_tpot"+suffix+'.sav')
        # test_ml_model(testing_data=testing_data,name=fp[0]+"_tpot"+suffix+'.sav',fingerprint=fp[1])

In [6]:
## For precaution i save the indexes of the models into each crossvalidation. This is also usefull incase the code times out part way through, in addition to a set random state

if os.path.isfile("train_test.csv"):
    print("Crossvalidation indices already determined in file: train_test.csv")
    print('To redo them please delete these files and for a different set please choose another random stat or "none" within the code block above')
else:
    skf = StratifiedKFold(n_splits=10, random_state=871923, shuffle=True)
    First = True
    for itr in range(1,11):
        for fold, [train_index, test_index] in enumerate(skf.split(data,data['Ames'])):
            fold +=1
            if First:
                test_df = pd.DataFrame({"index":test_index})
                train_df = pd.DataFrame({"index":train_index})
                test_df["group"] = "test"; train_df["group"] = "train"
                test_df["fold"] = fold; train_df["fold"] = fold
                test_df["iteration"] = itr; train_df["iteration"] = itr
                train_test = test_df.append(train_df)
                First = False
            else:
                test_df = pd.DataFrame({"index":test_index})
                train_df = pd.DataFrame({"index":train_index})
                test_df["group"] = "test"; train_df["group"] = "train"
                test_df["fold"] = fold; train_df["fold"] = fold
                test_df["iteration"] = itr; train_df["iteration"] = itr
                train_test = train_test.append(test_df).append(train_df)
    train_test.to_csv("train_test.csv",index=False)

Crossvalidation indices already determined in file: train_test.csv
To redo them please delete these files and for a different set please choose another random stat or "none" within the code block above


In [7]:
train_test = pd.read_csv("train_test.csv")
for iteration in train_test["iteration"].unique():  
    for fold in train_test["fold"].unique():
        train_index = train_test[(train_test["group"] == "train") & (train_test["fold"] == fold) & (train_test["iteration"] == iteration)]["index"].to_list()
        test_index = train_test[(train_test["group"] == "test") & (train_test["fold"] == fold) & (train_test["iteration"] == iteration)]["index"].to_list()
        train = data.iloc[train_index]
        test =  data.iloc[test_index]
        develop_models(train,metabolite_data,test,metabolite_data,suffix='_kfold'+str(fold))
        print("Done Fold", fold)
    print("Done Iteration", iteration)

0
MACCS_MICA_kfold1.sav is already built


  bags = np.array(bags)


0
MACCS_MISVM_kfold1.sav is already built
0
MACCS_SIL_kfold1.sav is already built
0
MACCS_NSK_kfold1.sav is already built
0
MACCS_sMIL_kfold1.sav is already built
MACCS_kfold1_tpot.sav is already built
0


  bags = np.array(bags)


1
2


## Model Validation
Here the model results are assessed

In [None]:
# develop_models(data,bt_data,validation_data,validation_metabolite_data,suffix="_validation")

## Model Analysis
Here the results of each fold are calculated as well as deviation within crossvalidation

In [None]:
def confusion_matrix(df):
    TP = len(df[(df["predicted label"] == 1) & (df["true label"] == 1)])
    TN = len(df[(df["predicted label"] == 0) & (df["true label"] == 0)])
    FP = len(df[(df["predicted label"] == 1) & (df["true label"] == 0)])
    FN = len(df[(df["predicted label"] == 0) & (df["true label"] == 1)])
    return [TP,TN,FP,FN]

In [None]:
rslt_list = []

for filename in os.listdir("/home/samuel/honours_redo/publication_work/saved_tests"):
    [TP,TN,FP,FN] = confusion_matrix(pd.read_csv(filename))
    fingerprint = filename.split("_")[0]
    model = filename.split("_")[0]
    fold = filename.split("_")[-1].split(".")[0]
    rslt_list += {"fingerprint":fingerprint, "model":model, "fold":fold, "TP":TP, "TN":TN, "FP":FP, "FN":FN}

rslt_df = pd.Dataframe(rslt_list)
rslt_df