In [1]:
import pandas as pd
import rdkit
import numpy as np
import pickle
import os
import sys
from rdkit import Chem
from rdkit.Chem import MACCSkeys,AllChem
from MIL_functions import data_encoding

In [2]:
##  Encode the selected molecules once and cache the result as a compressed pickle.
##  If the cache file already exists nothing is recomputed (saves time and no changes are expected).
if os.path.isfile("data/encoded/encoded_data.dat"):
    print("Data already encoded")

else:
    ##          Step 1: Load data into dataframes
    data = pd.read_csv("data/raw/selected_molecules.csv")
    metabolite_data = pd.concat([
        pd.read_csv("data/raw/biotransformer_output_cyp1.csv"),
        pd.read_csv("data/raw/biotransformer_output_phaseII.csv"),
    ])

    ##          Step 2: Normalizing metabolite smiles and matching to parent (approx 220 secs)
    metabolite_data['smiles'] = metabolite_data['SMILES'].apply(data_encoding.normalize_smiles)
    metabolite_data['Precursor smiles'] = metabolite_data['Precursor SMILES'].apply(data_encoding.normalize_smiles)
    ##  .copy() makes the filtered frame independent, so the column assignment below
    ##  cannot trigger pandas' SettingWithCopyWarning.
    metabolite_data = metabolite_data.dropna(axis=0, subset=['smiles']).copy();   print("b")
    ##  NOTE(review): parent_finder is fed the raw 'Precursor SMILES' column, while the Hansen cell
    ##  passes the normalized 'Precursor smiles' column to faster_parent_finder - confirm which is intended.
    metabolite_data['parent smiles'] = metabolite_data['Precursor SMILES'].apply(data_encoding.parent_finder);   print("c")

    ##          Step 3: Pre calculating encoding for molecules, requires evaluation of lists on loading csv (approx 110 secs)
    ##  Each entry: (output column, fingerprint function, bag with metabolites?, progress marker to print or None).
    ##  The order is the order in which the columns are added to `data`.
    encoding_plan = [
        ("MACCS",      MACCSkeys.GenMACCSKeys, False, "d"),
        ("RDKF",       Chem.RDKFingerprint,    False, "e"),
        ("MACCS_MIL",  MACCSkeys.GenMACCSKeys, True,  "f"),
        ("RDKF_MIL",   Chem.RDKFingerprint,    True,  "g"),
        ("Morgan",     'Morgan',               False, None),
        ("Morgan_MIL", 'Morgan',               True,  None),
    ]
    for column, function, use_metabolite_bag, marker in encoding_plan:
        if use_metabolite_bag:
            ##  MIL encoding: one bag per parent molecule, built from the metabolite table
            data[column] = data.apply(
                lambda row: data_encoding.bag_parent(smiles = row['smiles'], met_df = metabolite_data, function = function),
                axis=1,
            )
        else:
            ##  Plain encoding of the molecule itself
            data[column] = data_encoding.get_ml_encoding(df = data, function = function)
        if marker is not None:
            print(marker)

    ##          Step 4: Saved to a pickle, rather than a csv this stores the lists and is much faster to load (~10x)
    ##  The "Molecule" column is dropped when present; errors="ignore" makes a missing column a no-op
    ##  without hiding unrelated exceptions the way a bare `except:` would.
    data = data.drop(columns=["Molecule"], errors="ignore")
    data_encoding.create_compressed_pickle(data=data,path="data/encoded/encoded_data.dat")

Data already encoded


In [1]:
##  Encode the Hansen molecules incrementally: columns already present in the cached pickle are kept,
##  missing ones are computed and the pickle is re-saved after each new column (checkpointing).
##  NOTE: this cell needs `os`, `pd`, `Chem`, `MACCSkeys` and `data_encoding` from the imports cell;
##  running it on a fresh kernel without that cell fails with "NameError: name 'pd' is not defined".
hansen_pickle_path = "data/encoded/encoded_data_hansen.dat"

done = False
data = None
if os.path.isfile(hansen_pickle_path):
    ##  Load the cache once; the same frame is reused below when some columns are still missing
    data = data_encoding.load_compressed_pickle(hansen_pickle_path)
    ##  NOTE(review): 'PaDEL' and 'PaDEL_MIL' are required here but are not computed anywhere in this
    ##  cell, so this check only passes if another step adds them - confirm where they are produced.
    if all(col in data.columns for col in ['MACCS','RDKF','PaDEL','MACCS_MIL','RDKF_MIL','PaDEL_MIL','Morgan','Morgan_MIL']):
        done = True
        print("All data already encoded")
    else:
        print('Some data already encoded')

if not done:
    ##          Step 1: Load data into dataframes
    if data is None:
        ##  No cache yet: start from the raw molecule table
        data = pd.read_csv("data/raw/hansen_raw/Hansen_all_mols.csv")
    metabolite_data = pd.concat([
        pd.read_csv("data/raw/hansen_raw/biotransformer_hansen_output_cyp1.csv"),
        pd.read_csv("data/raw/hansen_raw/biotransformer_hansen_output_phaseII.csv"),
    ])

    ##          Step 2: Normalizing metabolite smiles and matching to parent (approx 220 secs)
    metabolite_data['smiles'] = metabolite_data['SMILES'].apply(data_encoding.normalize_smiles)
    metabolite_data['Precursor smiles'] = metabolite_data['Precursor SMILES'].apply(data_encoding.normalize_smiles)
    ##  .copy() makes the filtered frame independent, so the column assignment below
    ##  cannot trigger pandas' SettingWithCopyWarning.
    metabolite_data = metabolite_data.dropna(axis=0, subset=['smiles']).copy()
    metabolite_data['parent smiles'] = metabolite_data['Precursor smiles'].apply(data_encoding.faster_parent_finder)

    ##          Step 3: Pre calculating encoding for molecules, requires evaluation of lists on loading csv (approx 110 secs)
    ##  Each entry: (output column, fingerprint function, bag with metabolites?).
    ##  The order is the order in which missing columns are computed and checkpointed.
    encoding_plan = [
        ("MACCS",      MACCSkeys.GenMACCSKeys, False),
        ("RDKF",       Chem.RDKFingerprint,    False),
        ("MACCS_MIL",  MACCSkeys.GenMACCSKeys, True),
        ("RDKF_MIL",   Chem.RDKFingerprint,    True),
        ("Morgan",     'Morgan',               False),
        ("Morgan_MIL", 'Morgan',               True),
    ]
    for column, function, use_metabolite_bag in encoding_plan:
        if column in data.columns:
            continue    ##  already present in the cached frame
        if use_metabolite_bag:
            ##  MIL encoding: one bag per parent molecule, built from the metabolite table
            data[column] = data.apply(
                lambda row: data_encoding.bag_parent(smiles = row['smiles'], met_df = metabolite_data, function = function),
                axis=1,
            )
        else:
            ##  Plain encoding of the molecule itself
            data[column] = data_encoding.get_ml_encoding(df = data, function = function)
        ##  Checkpoint after every new column so an interrupted run can resume from here
        data_encoding.create_compressed_pickle(data=data,path=hansen_pickle_path)

    ##  Final save: guarantees the pickle exists even when no column had to be computed
    data_encoding.create_compressed_pickle(data=data,path=hansen_pickle_path)

NameError: name 'pd' is not defined