## Define inputs

In [11]:
# Kinetic parameter to predict. Allowed values (case-insensitive): "kcat", "Km", "Ki".
parameter = 'Km'
# Normalise to lower case: the value is interpolated into file names and the
# checkpoint directory path further down in this notebook.
parameter = parameter.strip().lower()
if parameter not in ('kcat', 'km', 'ki'):
    # Fail here, next to the input, instead of deep inside the prediction step.
    raise ValueError(
        f'Unknown parameter {parameter!r}; allowed values are "kcat", "Km" or "Ki".'
    )

use_cpu = 1 # set to 0 if you have GPU enabled

uniprot_id = "P35557" 
# If you do not have a UniProt ID, enter any name instead (e.g. "enzyme1").
# Enzyme sequence in one-letter amino-acid code (only the 20 standard residues are accepted).
sequence = 'MLDDRARMEAAKKEKVEQILAEFQLQEEDLKKVMRRMQKEMDRGLRLETHEEASVKMLPTYVRSTPEGSEVGDFLSLDLGGTNFRVMLVKVGEGEEGQWSVKTKHQMYSIPEDAMTGTAEMLFDYISECISDFLDKHQMKHKKLPLGFTFSFPVRHEDIDKGILLNWTKGFKASGAEGNNVVGLLRDAIKRRGDFEMDVVAMVNDTVATMISCYYEDHQCEVGMIVGTGCNACYMEEMQNVELVEGDEGRMCVNTEWGAFGDSGELDEFLLEYDRLVDESSANPGQQLYEKLIGGKYMGELVRLVLLRLVDENLLFHGEASEQLRTRGAFETRFVSQVESDTGDRKQIYNILSTLGLRPSTTDCDIVRRACESVSTRAAHMCSAGLAGVINRMRESRSEDVMRITVGVDGSVYKLHPSFKERFHASVRRLTPSCEITFIESEEGSGRGAALVSAVACKKACMLGQ'
# Substrate structure as a SMILES string.
SMILES = "C([C@@H]1[C@H]([C@@H]([C@H](C(O1)O)O)O)O)O" 

## Navigate to the cell below and click "Run -> Run Selected Cell" to get the prediction

The result will be printed in the right column.

In [16]:
import time
import os
import pandas as pd
import numpy as np
from IPython.display import Image, display
from rdkit import Chem
from IPython.display import display, Latex, Math

def create_csv_sh(parameter, uni, seq, smi):
  """Validate the inputs and write the prediction input CSV and `predict.sh`.

  Two files are written to the current working directory:
  `{uni}_{parameter}_input.csv` (a header plus one data row) and `predict.sh`
  (the shell script that runs the record-creation and prediction commands).
  Nothing is written if validation fails.

  Args:
    parameter: Kinetic parameter name; used in file names and in the
      checkpoint directory path. For 'kcat' the '.'-separated SMILES
      components are sorted.
    uni: Enzyme identifier (UniProt ID or any name); used in file names.
    seq: Enzyme sequence; must be non-empty and contain only the 20
      standard one-letter amino-acid codes (upper case).
    smi: Substrate SMILES string.

  Returns:
    Tuple `(seq, smi)` where `smi` is the SMILES as returned by
    `Chem.MolToSmiles` (with components sorted for 'kcat').

  Raises:
    ValueError: If the SMILES cannot be parsed or the sequence is invalid.
  """
  # MolFromSmiles signals an unparsable SMILES by returning None, so check
  # for that explicitly instead of relying on a later call blowing up.
  try:
    mol = Chem.MolFromSmiles(smi)
  except Exception as exc:
    raise ValueError('Invalid SMILES input! Correct your input!') from exc
  if mol is None:
    raise ValueError('Invalid SMILES input! Correct your input!')
  smi = Chem.MolToSmiles(mol)

  valid_aas = set('ACDEFGHIKLMNPQRSTVWY')
  if not seq or any(aa not in valid_aas for aa in seq):
    raise ValueError('Invalid Enzyme sequence input! Correct your input!')

  if parameter=='kcat':
    # Sort the '.'-separated components so their order is deterministic.
    smi = '.'.join(sorted(smi.split('.')))

  with open(f'{uni}_{parameter}_input.csv', 'w') as csv_file:
    csv_file.write('name,sequence,SMILES,pdbpath\n')
    csv_file.write(f'{uni},{seq},{smi},{uni}.pdb\n')

  # Doubled braces emit literal ${VAR} shell expansions from the f-string.
  with open('predict.sh', 'w') as script_file:
    script_file.write(f'''
TEST_FILE_PREFIX={uni}_{parameter}
RECORDS_FILE=${{TEST_FILE_PREFIX}}.json
CHECKPOINT_DIR=./production_models/{parameter}/

python ./scripts/create_pdbrecords.py --data_file ${{TEST_FILE_PREFIX}}_input.csv --out_file ${{RECORDS_FILE}}
python predict.py --test_path ${{TEST_FILE_PREFIX}}_input.csv --preds_path ${{TEST_FILE_PREFIX}}_output.csv --checkpoint_dir $CHECKPOINT_DIR --uncertainty_method mve --smiles_column SMILES --individual_ensemble_predictions --protein_records_path $RECORDS_FILE
''')

  print('Input success!')
  # Report the length of the argument, not of a notebook-level global.
  print('Enzyme sequence length:', len(seq))
  print('Substrate structure:')

  return seq, smi

# Write the input CSV and predict.sh. Guard against a None result so that a
# validation failure stops the cell with a clear error instead of an
# unrelated tuple-unpacking TypeError.
prepared_inputs = create_csv_sh(parameter, uniprot_id, sequence, SMILES)
if prepared_inputs is None:
    raise ValueError('Input validation failed; correct the inputs above and re-run this cell.')
seq, smi = prepared_inputs

print('Predicting.. This will take a while..\n')

def get_predictions(parameter, uniprot_id):
  """Read `{uniprot_id}_{parameter}_output.csv` and display the prediction.

  Uses the first row of the CSV. Displays the predicted value on a linear
  scale (10 ** log10 prediction) with its unit, the log10 prediction, and
  three standard deviations: total (square root of the `*_mve_uncal_var`
  column), epistemic (standard deviation of the individual `model_` columns)
  and aleatoric (square root of total variance minus epistemic variance).

  Args:
    parameter: 'kcat', 'km' or 'ki' (matched case-insensitively); also used
      verbatim in the CSV file name.
    uniprot_id: Enzyme identifier used in the CSV file name.

  Returns:
    None. Results are rendered with IPython `display`.

  Raises:
    ValueError: If `parameter` is not a supported value or the CSV has no rows.
  """
  # (target column, LaTeX symbol, unit) per supported parameter.
  labels = {
    'kcat': ('log10kcat_max', 'k_{cat}', ' s^{-1}'),
    'km': ('log10km_mean', 'K_{m}', ' mM'),
    'ki': ('log10ki_mean', 'K_{i}', ' mM'),
  }
  key = str(parameter).lower()
  if key not in labels:
    # Previously any unknown value was silently reported as K_i.
    raise ValueError(f'Unknown parameter {parameter!r}; expected one of {sorted(labels)}.')
  target_col, parameter_print, unit = labels[key]
  parameter_print_log = 'log_{10}(' + parameter_print + ')'

  df = pd.read_csv(f'{uniprot_id}_{parameter}_output.csv')
  if df.empty:
    raise ValueError(f'{uniprot_id}_{parameter}_output.csv contains no predictions.')

  unc_col = f'{target_col}_mve_uncal_var'
  model_cols = [col for col in df.columns if col.startswith(target_col) and 'model_' in col]

  prediction = df[target_col].iloc[0]
  prediction_linear = np.power(10, prediction)

  total_var = df[unc_col].iloc[0]
  model_outs = np.array([df[col].iloc[0] for col in model_cols])
  # Without per-model columns the epistemic part is undefined (NaN).
  epi_var = np.var(model_outs) if model_outs.size else float('nan')
  # Clamp at zero: the difference can come out negative, and the square
  # root of a negative variance would be displayed as NaN.
  alea_var = max(total_var - epi_var, 0.0)

  unc = np.sqrt(total_var)
  epi_unc = np.sqrt(epi_var)
  alea_unc = np.sqrt(alea_var)

  display(Math((parameter_print + f' = {prediction_linear:.5f}'+ unit)))
  print('\n')
  display(Math((parameter_print_log + f' = {prediction:.5f}')))
  display(Math(('SD_{total}'+f' = {unc:.5f}')))
  display(Math(('SD_{aleatoric}'+f' = {alea_unc:.5f}')))
  display(Math(('SD_{epistemic}'+f' = {epi_unc:.5f}')))

# Run the generated predict.sh with PROTEIN_EMBED_USE_CPU exported as 1 or 0.
# The script is invoked through `sh` so it does not need the execute bit
# (a freshly created predict.sh is not executable).
protein_embed_use_cpu = 1 if use_cpu else 0
exit_status = os.system(
    f"export PROTEIN_EMBED_USE_CPU={protein_embed_use_cpu}; sh ./predict.sh >/dev/null 2>&1"
)
if exit_status != 0:
    # Output is discarded above, so without this check a failed run would
    # either crash later on a missing CSV or show results from an older run.
    raise RuntimeError(
        f'predict.sh failed (exit status {exit_status}); '
        'run "sh ./predict.sh" in a terminal to see the error output.'
    )

get_predictions(parameter,uniprot_id)

Input success!
Enzyme sequence length: 465
Substrate structure:
Predicting.. This will take a while..



<IPython.core.display.Math object>





<IPython.core.display.Math object>

<IPython.core.display.Math object>

<IPython.core.display.Math object>

<IPython.core.display.Math object>