In [4]:
# Mount Google Drive at /content/drive; the data files read later in this
# notebook are addressed by paths under that mount point (Colab-only import).
from google.colab import drive
drive.mount('/content/drive')
# pandas loads the tab-separated tables; RDKit's Chem parses SMILES and
# generates InChIKeys.
import pandas as pd
import numpy as np
from rdkit import Chem

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [25]:
def network_inchikey(array):
    """Return the first block of the InChIKey for one row of a network table.

    The first 14 characters of an InChIKey are its first layer, which
    excludes stereochemical information.

    Parameters
    ----------
    array : sequence
        One row of a network dataframe; the SMILES string of the molecule
        must be its 2nd element (index 1).

    Returns
    -------
    str
        The first 14 characters of the molecule's InChIKey.

    Raises
    ------
    ValueError
        If RDKit cannot parse the SMILES string.
    """
    smiles = array[1]
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; passing that
        # on to MolToInchiKey would fail with an argument-type error that does
        # not say which SMILES was at fault.
        raise ValueError(f"Could not parse SMILES {smiles!r}")
    return Chem.inchi.MolToInchiKey(mol)[:14]  # keep the first layer only

In [26]:
def inchikey(Smiles):
    """Return the first block of the InChIKey for a single SMILES string.

    The first 14 characters of an InChIKey are its first layer, which
    excludes stereochemical information.

    Parameters
    ----------
    Smiles : str
        SMILES representation of the molecule.

    Returns
    -------
    str
        The first 14 characters of the molecule's InChIKey.

    Raises
    ------
    ValueError
        If RDKit cannot parse the SMILES string.
    """
    mol = Chem.MolFromSmiles(Smiles)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; passing that
        # on to MolToInchiKey would fail with an argument-type error that does
        # not say which SMILES was at fault.
        raise ValueError(f"Could not parse SMILES {Smiles!r}")
    return Chem.inchi.MolToInchiKey(mol)[:14]  # keep the first layer only

In [29]:
# (output column name, product file name) for every reaction network, in the
# order the columns appear in the result table.
_NETWORK_PRODUCT_FILES = (
    ('Formose', 'formose_output.tsv'),
    ('FormoseAmm', 'formose_amm_output.tsv'),
    ('Glucose', 'glucose_degradation_output.tsv'),
    ('GlucoseAmm', 'glucose_amm_output.tsv'),
    ('PyruvicAcid', 'pyruvic_output.tsv'),
    ('HCN', 'hcn_output.tsv'),
    ('Maillard', 'maillard_output.tsv'),
    ('UreyMiller', 'urey_miller_output.tsv'),
)


def _load_network_products(path):
    """Read one network product table and add its first-layer 'INCHIKEY' column."""
    frame = pd.read_csv(path, sep='\t')
    frame['INCHIKEY'] = frame.apply(network_inchikey, axis=1, raw=True, result_type='expand')
    return frame


def _first_row_by_key(keys):
    """Map every key to the position of its first occurrence in ``keys``."""
    first_row = {}
    for position, key in enumerate(keys):
        first_row.setdefault(key, position)  # keep the earliest row only
    return first_row


def find_smiles(molecules, smiles_list, data_root='/content/drive/MyDrive/BMSIS /MinimalDirectory'):
    """Report which reaction networks contain each target molecule.

    Molecules are compared by the first layer of their InChIKey. For every
    target the result records whether that key occurs in the nucleoside
    analogue library and, per network, the 'Generation' value of the first
    row with that key.

    Parameters
    ----------
    molecules : list of str
        Names of the molecules to search for.
    smiles_list : list of str
        SMILES representation of each molecule, in the same order as
        ``molecules``.
    data_root : str, optional
        Directory containing ``OriginalData/OriginalNetworkData/Products``
        and ``ProcessedData``. Defaults to the project folder on the mounted
        Google Drive.

    Returns
    -------
    pandas.DataFrame
        One row per target with the columns 'Molecule', 'Smiles', 'INCHIKEY',
        'Match?' (the string 'True' or 'False') and one column per network
        holding the generation or the string 'Not in network'.

    Raises
    ------
    ValueError
        If ``molecules`` and ``smiles_list`` differ in length.
    """
    # Validate before the expensive loading step; previously a mismatch only
    # surfaced when the result table was assembled at the very end.
    if len(molecules) != len(smiles_list):
        raise ValueError(
            'molecules and smiles_list must have the same length '
            f'(got {len(molecules)} and {len(smiles_list)})'
        )

    products_dir = f'{data_root}/OriginalData/OriginalNetworkData/Products'
    network_frames = []
    for step, (_, file_name) in enumerate(_NETWORK_PRODUCT_FILES, start=1):
        network_frames.append(_load_network_products(f'{products_dir}/{file_name}'))
        print(step)  # progress marker, one per loaded network
    nucleoside_data = pd.read_csv(f'{data_root}/ProcessedData/Nucleoside_Stereoisomers.tsv', sep='\t')
    print('Checkpoint 1')

    # Index each network once so every target lookup is a dictionary access
    # rather than a scan over all rows of the network.
    first_rows = [_first_row_by_key(frame['INCHIKEY']) for frame in network_frames]

    inchikey_list = [inchikey(smiles) for smiles in smiles_list]

    if inchikey_list:
        library_keys = set(nucleoside_data['INCHIKEY'])
    else:
        library_keys = set()
    # 'Match?' holds the strings 'True'/'False' (not booleans), as before.
    in_nalibrary = ['True' if key in library_keys else 'False' for key in inchikey_list]

    network_columns = {}
    for (network_name, _), frame, rows in zip(_NETWORK_PRODUCT_FILES, network_frames, first_rows):
        column = []
        for key in inchikey_list:
            row = rows.get(key)
            if row is None:
                column.append('Not in network')
            else:
                # Generation of the first row in which the molecule appears.
                column.append(frame['Generation'][row])
        network_columns[network_name] = column

    output_df = pd.DataFrame(network_columns)
    output_df.insert(0, 'Match?', in_nalibrary)
    output_df.insert(0, 'INCHIKEY', inchikey_list)
    output_df.insert(0, 'Smiles', smiles_list)
    output_df.insert(0, 'Molecule', molecules)
    # Optional export, left disabled:
    #output_df.to_csv('/content/drive/MyDrive/BMSIS /MinimalDirectory/ProcessedData/SmilesInNetwork.tsv', index=False, sep='\t')
    return output_df

In [1]:
%%time
# Names of the target molecules; smiles_list below holds the matching SMILES
# string at the same position.
molecules = [
    'Deoxyribose',
    'Ribose',
    'Threose',
    'Xylose',
    '2,6-diaminopurine',
    'Inosine',
    'Pyrimidone',
    'Xanthine',
    'Hypoxanthine',
    'Xanthosine',
    'Pentose aminoxazoline',
    'Tetrose aminoxazoline',
]
smiles_list = [
    'C(CC(C(CO)O)O)=O',                          # Deoxyribose
    'C(C(C(C(CO)O)O)O)=O',                       # Ribose
    'C(C(C(CO)O)O)=O',                           # Threose
    'C1C(C(C(C(O)O1)O)O)O',                      # Xylose
    'C1=NC2=NC(=NC(=C2N1)N)N',                   # 2,6-diaminopurine
    'C1=NC2=C(C(=O)N1)N=CN2C3C(C(C(O3)CO)O)O',   # Inosine
    'O=C1NC=NC=C1',                              # Pyrimidone
    'C1=NC2=C(N1)C(=O)NC(=O)N2',                 # Xanthine
    'C1=NC2=C(N1)C(=O)NC=N2',                    # Hypoxanthine
    'C1=NC2=C(N1C3C(C(C(O3)CO)O)O)NC(=O)NC2=O',  # Xanthosine
    'C(C1C(C2C(O1)N=C(O2)N)O)O',                 # Pentose aminoxazoline
    'C1C(C2C(O1)N=C(O2)N)O',                     # Tetrose aminoxazoline
]
# Look the targets up in every network and in the nucleoside library.
a = find_smiles(molecules, smiles_list)