In [None]:
# from google.colab import drive
# drive.mount('/content/drive')
import pandas as pd
from rdkit import Chem

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def inchikey(Smiles):
    """Return the first layer of the InChIKey for a molecule given as SMILES.

    Only the first 14 characters of the InChIKey are kept, i.e. the first
    layer, which excludes stereochemical information.

    Args:
        Smiles: SMILES string describing the molecule.

    Returns:
        The first 14 characters of the molecule's InChIKey.

    Raises:
        ValueError: if RDKit cannot parse ``Smiles`` into a molecule.
    """
    mol = Chem.MolFromSmiles(Smiles)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None rather than
        # raising; fail here with the offending input instead of letting the
        # None reach MolToInchiKey and surface as an opaque binding error.
        raise ValueError(f'Could not parse SMILES: {Smiles!r}')
    return Chem.inchi.MolToInchiKey(mol)[:14]  # first layer only

In [None]:
def prep_analogues(path):
  """Build a dataframe of nucleoside analogues with the placeholder Cl substituted.

  Reads a tab-separated library with a 'SMILES' column. For every entry, each
  'Cl' in the SMILES string is replaced first with an OH moiety ('O[H]') and
  then with an NH2 moiety ('N([H])[H]'), so each input row yields two output
  rows, in that order.

  Args:
    path: filepath of the input nucleoside analogue library (TSV).

  Returns:
    DataFrame with columns 'INCHIKEY' (first InChIKey layer of the substituted
    molecule, as returned by inchikey) and 'SMILES' (the substituted SMILES).
  """
  input_df = pd.read_csv(path, sep='\t')
  # Substituents that take the place of the placeholder Cl atom: OH, then NH2.
  substituents = ('O[H]', 'N([H])[H]')

  inchikeys = []
  smiles = []
  # Iterate over the column values positionally; looking entries up with
  # input_df['SMILES'][i] is label-based and only works for a 0..n-1 index.
  for i, placeholder_smiles in enumerate(input_df['SMILES'].tolist()):
    if i%10000 == 0:
      print(i)  # coarse progress indicator for large libraries
    for substituent in substituents:
      analogue = placeholder_smiles.replace('Cl', substituent)
      inchikeys.append(inchikey(analogue))
      smiles.append(analogue)

  output_df = pd.DataFrame({'INCHIKEY':inchikeys, 'SMILES':smiles})
  return output_df

In [None]:
def prep_and_removedegen(CHNO_path, CHO_path, output_path='/content/drive/MyDrive/BMSIS /MinimalDirectory/Nucleoside_Stereoisomers.tsv'):
  """Collect the unique InChIKeys across two analogue libraries with their SMILES.

  Both libraries are expanded with prep_analogues, then the rows are grouped by
  InChIKey: every distinct InChIKey appears once (in first-seen order, CHNO
  library first), paired with the set of distinct SMILES strings that produced
  it. The result is written to ``output_path`` as a TSV and also returned.

  Args:
    CHNO_path: filepath of the first input nucleoside analogue library.
    CHO_path: filepath of the second input nucleoside analogue library.
    output_path: destination of the TSV; defaults to the original Drive path.

  Returns:
    DataFrame with columns 'INCHIKEY' (unique keys) and 'SMILES' (a Python set
    of SMILES strings per key).
  """
  CHNO_prepped = prep_analogues(CHNO_path)
  print('Checkpoint 1')
  CHO_prepped = prep_analogues(CHO_path)
  print('Checkpoint 2')
  all_inchikeys = list(CHNO_prepped['INCHIKEY']) + list(CHO_prepped['INCHIKEY'])
  all_smiles = list(CHNO_prepped['SMILES']) + list(CHO_prepped['SMILES'])

  # dicts preserve insertion order, so keys come out in first-seen order while
  # membership tests stay O(1); the previous list-based `in` / `.index` lookups
  # made this loop quadratic in the number of rows.
  smiles_by_inchikey = {}
  for i, (key, smi) in enumerate(zip(all_inchikeys, all_smiles)):
    if i%10000 == 0:
      print(i)  # coarse progress indicator
    # Collecting into a set removes duplicate SMILES for the same InChIKey.
    smiles_by_inchikey.setdefault(key, set()).add(smi)

  output_df = pd.DataFrame({
      'INCHIKEY': list(smiles_by_inchikey.keys()),
      'SMILES': list(smiles_by_inchikey.values()),
  })
  output_df.to_csv(output_path, index=False, sep='\t')
  return output_df

In [None]:
#df = prep_and_removedegen('/content/drive/MyDrive/BMSIS /MinimalDirectory/OriginalData/OriginalNucleosideAnalogueData/CHNO_Smiles.tsv', '/content/drive/MyDrive/BMSIS /MinimalDirectory/OriginalData/OriginalNucleosideAnalogueData/CHO_Smiles.tsv')

0
Checkpoint 1
0
Checkpoint 2
0
