In [1]:
import os
import re
import pandas as pd
from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem import AllChem, MACCSkeys, rdMolDescriptors
from rdkit.Chem.rdMolDescriptors import GetMorganFingerprintAsBitVect
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import Descriptors
import numpy as np
from sklearn.model_selection import train_test_split
import pubchempy as pcp
import requests
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

base_dir = '/Users/samarjosyula/Desktop/PROJECTS/pgccInhibitorDrugDiscovery'
data_path = os.path.join(base_dir, "data")
if not os.path.exists(data_path): raise Exception("Path not found..")
result_path = os.path.join(base_dir, "results")
if not os.path.exists(result_path): os.makedirs(result_path, exist_ok=True)

descNames = [desc[0] for desc in Descriptors.descList]
calc = MoleculeDescriptors.MolecularDescriptorCalculator(descNames)

In [3]:
# create compound library
compoundData_Ma = pd.read_excel(
    f'{data_path}/LiteratureData/Ma_compoundLibrary.xlsx',
    sheet_name=1
).rename(columns={'Item Name':'CompoundName', 'CAS Number':'casID'}).filter(items=['CompoundName', 'SMILES', 'Formula', 'casID'])

compoundData_Zhou = pd.read_excel(
    f'{data_path}/LiteratureData/Zhou1_compoundLibrary.xlsx',
    header=1
).rename(columns={'Compounds Name':'CompoundName', 'Item #':'ItemID'}).filter(items=['CompoundName', 'ItemID', 'Source', 'Target'])

compoundData = pd.concat(
    [
        compoundData_Ma,
        compoundData_Zhou
    ], ignore_index=True, copy=True
)
compoundData.to_csv(f'{data_path}/compoundLibrary.csv')

In [4]:
def parseDatabases(file=f'{data_path}/SupplementaryData/CaymanStructureDefinitions.sdf', verbose=False,):
    suppl = Chem.SDMolSupplier(file)

    # parse data from cayman database
    caymanData = pd.DataFrame(columns=['CompoundName', 'SMILES'])
    for mol in suppl:
        if mol is not None and mol.HasProp('_Name'):
            try:
                name = mol.GetProp('Item name')
                ItemID = mol.GetProp('Item number')
                casID = mol.GetProp('CAS Number')
                formula = mol.GetProp('Formula')
                smi = Chem.MolToSmiles(mol)
                caymanData = pd.concat([caymanData, pd.DataFrame([{'CompoundName':name, 'caymanID':ItemID, 'casID':casID, 'Formula':formula, 'SMILES':smi}])], ignore_index=True)
            except:
                continue
    caymanData.drop_duplicates(subset='CompoundName', inplace=True)
    if verbose: print('..parsed cayman database..')

    # parse data from selleck database
    selleck = pd.read_excel(
        f'{data_path}/SupplementaryData/SelleckCompoundLibrary.xlsx', 
        sheet_name=1
    )
    selleckdf = selleck.rename(columns={'Cat':'selleckID', 'Name':'CompoundName', 'CAS Number':'casID'}).filter(items=['CompoundName', 'casID', 'selleckID', 'Formula', 'SMILES'])
    if verbose: print('..parsed selleck database..\n')

    return caymanData, selleckdf

# caymanData, selleckData = parseDatabases(verbose=True)
# caymanData.to_csv(f'{data_path}/SupplementaryData/caymanData.csv')
# selleckData.to_csv(f'{data_path}/SupplementaryData/selleckData.csv')

caymanData = pd.read_csv(f'{data_path}/SupplementaryData/caymanData.csv')
selleckData = pd.read_csv(f'{data_path}/SupplementaryData/selleckData.csv')

# merge datasets
allSuppData = caymanData.merge(selleckData, how='outer', on='SMILES')
allSuppData['ItemID'] = allSuppData['caymanID'].fillna(allSuppData['selleckID'])
allSuppData['CompoundName'] = allSuppData['CompoundName_x'].fillna(allSuppData['CompoundName_y'])
allSuppData['Formula'] = allSuppData['Formula_x'].fillna(allSuppData['Formula_y'])
mergedSuppData = allSuppData.filter(items=['CompoundName', 'SMILES', 'Formula', 'casID', 'ItemID'])

mergedSuppData['CompoundName'] = mergedSuppData['CompoundName'].astype(str).str.strip()
mergedSuppData['SMILES'] = mergedSuppData['SMILES'].astype(str).str.strip()
mergedSuppData['ItemID'] = mergedSuppData['ItemID'].astype(str).str.strip()
mergedSuppData.to_csv(f'{data_path}/SupplementaryData/mergedSuppData.csv')
mergedSuppData

Unnamed: 0,CompoundName,SMILES,Formula,ItemID
0,Prostaglandin A1,CCCCC[C@H](O)/C=C/[C@H]1C=CC(=O)[C@@H]1CCCCCCC...,C20H32O4,10010.0
1,Ac-YVAD-CMK,CC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@H](C(=O)N[...,C24H33ClN4O8,10014.0
2,Ac-YVAD-CHO,CC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@H](C(=O)N[...,C23H32N4O8,10016.0
3,DDMS,CS(=O)(=O)NC(=O)CCCCCCCCCC=C(Br)Br,C13H23Br2NO3S,10018.0
4,Prostaglandin A1 ethyl ester,CCCCC[C@H](O)/C=C/[C@H]1C=CC(=O)[C@@H]1CCCCCCC...,C22H36O4,10020.0
...,...,...,...,...
22205,Plerixafor (AMD3100),C1CNCCNCCCN(CCNC1)CC2=CC=C(CN3CCCNCCNCCCNCC3)C=C2,C28H54N8,S8030
22206,Sodium Hyaluronate,[Na+].CC(=O)NC1C(C(O)C(CO)OC1OC2C(O)C(O)C(O[*]...,C14H20NNaO11R,S5650
22207,Fosinopril Sodium,[Na+].CCC(=O)OC(O[P](=O)(CCCCC1=CC=CC=C1)CC(=O...,C30H45NO7P.Na,S2095
22208,Strontium Ranelate,O=C1CN(CC(=O)O[Sr]O1)C2=C(C#N)C3=C(S2)C(=O)O[S...,C12H6N2O8S.2Sr,S2050


In [5]:
# supplement data
def supplementData(input, verbose=False):
    compoundData = pd.DataFrame(input.copy())

    # supplement smiles and molecular formula
    for idx, row in compoundData.iterrows():
        match = pd.DataFrame()
        updated = False
        if pd.notna(row.get('ItemID')):
            match = caymanData[caymanData['caymanID'] == row['ItemID']] 
            if match.empty: # try match from selleck if couldn't match from cayman
                match = selleckData[selleckData['selleckID'] == row['ItemID']] 
            if len(match) > 1: # more than one match found
                pass 
            if match.empty: # no matches
                match = pd.DataFrame()
                updated=False
            else: updated = True
        elif (pd.isna(row.get('SMILES')) or pd.isna(row.get('Formula')) and pd.notna(row.get('casID'))):
            match = caymanData[caymanData['casID'] == row['casID']]
            updated = True

        if updated and not match.empty:
            nextRow = match.iloc[0]

            # validate SMILES
            mol = Chem.MolFromSmiles(nextRow['SMILES'])
            if mol:
                smi = Chem.MolToSmiles(mol, canonical=True, isomericSmiles=True)
                smi = smi.encode('ascii', errors='ignore').decode()
                nextRow['SMILES'] = smi
            else:
                nextRow['SMILES'] = None

            compoundData.loc[
                idx, ['Formula', 'SMILES']
            ] = [nextRow['Formula'], nextRow['SMILES']]
    if verbose: print('..supplemented SMILES and formula..')
    compoundData = compoundData.drop(
        columns=['casID', 'ItemID'], errors='ignore'
    )
    return compoundData

compoundData = supplementData(compoundData, verbose=True)
compoundData

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nextRow['SMILES'] = smi
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nextRow['SMILES'] = smi
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nextRow['SMILES'] = smi
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nextRow['SMILES'] = smi
A value is trying to be set on a copy of a slice from a DataFrame

See t

..supplemented SMILES and formula..


Unnamed: 0,CompoundName,SMILES,Formula,Source,Target
0,Angiotensin II,,C50H71N13O12,,
1,Levetiracetam,CCC(C(=O)N)N1CCCC1=O,C8H14N2O2,,
2,Daptomycin,CCCCCCCCCC(=O)NC(CC1=CNC2=CC=CC=C21)C(=O)NC(CC...,C72H101N17O26,,
3,Lansoprazole,CC1=C(C=CN=C1CS(=O)C2=NC3=CC=CC=C3N2)OCC(F)(F)F,C16H14F3N3O2S,,
4,Adapalene,COC1=C(C=C(C=C1)C2=CC3=C(C=C2)C=C(C=C3)C(=O)O)...,C28H28O3,,
...,...,...,...,...,...
2896,Apcin,Cc1ncc([N+](=O)[O-])n1CCOC(=O)NC(Nc1ncccn1)C(C...,C13H14Cl3N7O4,Cayman,APC inhibitor
2897,M2I-1,CC(C)CN(CC(C)C)c1ccc(C=C2C(=O)NC(=S)NC2=O)cc1[...,C19H24N4O4S,Cayman,APC inhibitor
2898,CUDC-101,C#Cc1cccc(Nc2ncnc3cc(OC)c(OCCCCCCC(=O)NO)cc23)c1,C24H26N4O4,Cayman,"HDACi, EGFRi"
2899,Belinostat,O=C(/C=C/c1cccc(S(=O)(=O)Nc2ccccc2)c1)NO,C15H14N2O4S,Cayman,HDACi


In [6]:
compoundLib = compoundData.drop_duplicates(subset='SMILES')
print(compoundLib['SMILES'].isna().sum())
compoundLib

1


Unnamed: 0,CompoundName,SMILES,Formula,Source,Target
0,Angiotensin II,,C50H71N13O12,,
1,Levetiracetam,CCC(C(=O)N)N1CCCC1=O,C8H14N2O2,,
2,Daptomycin,CCCCCCCCCC(=O)NC(CC1=CNC2=CC=CC=C21)C(=O)NC(CC...,C72H101N17O26,,
3,Lansoprazole,CC1=C(C=CN=C1CS(=O)C2=NC3=CC=CC=C3N2)OCC(F)(F)F,C16H14F3N3O2S,,
4,Adapalene,COC1=C(C=C(C=C1)C2=CC3=C(C=C2)C=C(C=C3)C(=O)O)...,C28H28O3,,
...,...,...,...,...,...
2896,Apcin,Cc1ncc([N+](=O)[O-])n1CCOC(=O)NC(Nc1ncccn1)C(C...,C13H14Cl3N7O4,Cayman,APC inhibitor
2897,M2I-1,CC(C)CN(CC(C)C)c1ccc(C=C2C(=O)NC(=S)NC2=O)cc1[...,C19H24N4O4S,Cayman,APC inhibitor
2898,CUDC-101,C#Cc1cccc(Nc2ncnc3cc(OC)c(OCCCCCCC(=O)NO)cc23)c1,C24H26N4O4,Cayman,"HDACi, EGFRi"
2899,Belinostat,O=C(/C=C/c1cccc(S(=O)(=O)Nc2ccccc2)c1)NO,C15H14N2O4S,Cayman,HDACi


In [None]:
# Get Inhibition Data
def getInhibitionData(pubchemReqs=True, verbose=False):
    inhibitorScreens = pd.read_excel(f'{data_path}/LiteratureData/Zhou3_PGCCinhibitorScreening.xlsx', header=1)
    inhibitorScreens = inhibitorScreens.iloc[:, [9,10,11,12,13,14,15,16]].reset_index(drop=True,)
    inhibitorScreens.columns = [
        'Treatment',
        'numPGCC : VARI068',
        'numPGCC : SUM149',
        'numPGCC : SUM159',
        'numPGCC : MDA-MB231',
        'Mean',
        '95% Cl',
        'p-value'
    ]
    inhibitorScreens = inhibitorScreens[inhibitorScreens['Treatment'] != 'Control'].reset_index(drop=True)
    if verbose: print(f'..configured anti-PGCC screening data..(shape={inhibitorScreens.shape})\n')

    unmatched = []
    if verbose: print('..configuring smiles..')
    for idx, row in inhibitorScreens.iterrows():
        name = str(row['Treatment']).strip()
        
        if verbose: print(f'Mol{idx+1}: {name}')

        smi, target = None, None
        filtered = mergedSuppData.loc[mergedSuppData['CompoundName'].str.lower()==name.lower()]

        # troubleshooting
        cleanedName = None
        if filtered.empty: # remove parenthesis
            noParens = re.sub(r'\(.*?\)', '', name).strip()
            filtered = mergedSuppData.loc[mergedSuppData['CompoundName'].str.lower()==noParens.lower()]
            cleanedName = noParens

        if filtered.empty: # only inside parenthesis
            onlyParens = re.findall(r'\((.*?)\)', name)
            if onlyParens: 
                onlyParens = onlyParens[0].strip()
                filtered = mergedSuppData.loc[mergedSuppData['CompoundName'].str.lower()==onlyParens.lower()]
                cleanedName = onlyParens
        
        if filtered.empty: # only first word
            firstWord = cleanedName.split(' ')[0]
            filtered = mergedSuppData.loc[mergedSuppData['CompoundName'].str.lower()==firstWord.lower()]
            cleanedName = firstWord

        if filtered.empty: # substring match?? very loose..
            filtered = mergedSuppData.loc[mergedSuppData['CompoundName'].str.lower().str.contains(cleanedName.lower())]
            cleanedName = firstWord 

        smi = None
        if not filtered.empty: 
            smiles = []
            for fidx, frow in filtered.iterrows():
                curr = str(frow['SMILES'])
                try: 
                    mol = Chem.MolFromSmiles(curr)
                    if mol: smiles.append(Chem.MolToSmiles(mol, canonical=True, isomericSmiles=True))
                except: continue
            if smiles: smi = max(smiles, key=len)
        if smi is None or pd.isna(smi): unmatched.append(name)
        
        if verbose: print(f' > smi={smi}')
        inhibitorScreens.at[idx, 'SMILES'] = smi

    if verbose: print(f'\n..acquired SMILES (n={len(inhibitorScreens["SMILES"]) - inhibitorScreens["SMILES"].isna().sum()})..')
    if verbose: print(f'>unmatched: (n={len(unmatched)}):\n{unmatched}')

    # find SMILES for unmatched compounds
    if pubchemReqs:
        if verbose: print(f'\n..finding unmatched smiles from pubchempy')
        unmatchedCompounds = pd.DataFrame({'Treatment':unmatched}) 
        smilesList = []
        unmatched.clear()
        for idx, row in unmatchedCompounds.iterrows():
            if verbose: print(f'Mol{idx+1}: {row["Treatment"]}:')
            name = re.sub(r'\(.*?\)', '', row['Treatment']).strip()
            # name = row['Treatment']
            try: 
                res = pcp.get_compounds(name, namespace='name', as_dataframe=True)
                if not res.empty:
                    smi = list(res['canonical_smiles'])[0]
                    smilesList.append(smi)        
                else:
                    try: 
                        res = pcp.get_compounds(row['Treatment'], namespace='name', as_dataframe=True)
                        if not res.empty:
                            smi = list(res['canonical_smiles'])[0]
                            smilesList.append(smi)        
                        else:
                            smi = None
                            smilesList.append(None)
                            unmatched.append(row['Treatment'])
                    except Exception as e1: 
                        smi = None
                        smilesList.append(None)
                        unmatched.append(row['Treatment'])
            except Exception as e:
                smi = None
                smilesList.append(None)
                unmatched.append(row['Treatment'])
            if verbose: print(f' > smi={smi}')
        unmatchedCompounds['SMILES'] = smilesList

        inhibitorScreens = inhibitorScreens.merge(
            unmatchedCompounds[['Treatment', 'SMILES']], 
            how='left', 
            on='Treatment'
        )

        if 'SMILES_x' in inhibitorScreens.columns and 'SMILES_y' in inhibitorScreens.columns:
            inhibitorScreens['SMILES'] = inhibitorScreens['SMILES_x'].combine_first(inhibitorScreens['SMILES_y'])
            inhibitorScreens = inhibitorScreens.drop(columns=['SMILES_x', 'SMILES_y'])

        if verbose: print(f'\n..acquired {len(unmatchedCompounds)-unmatchedCompounds["SMILES"].isna().sum()} from pubchempy..')
        if verbose: print(f'>unmatched: (n={len(unmatched)}):\n{unmatched}\n')
    elif verbose: print(f'Unmatched compounds (n={len(unmatched)}):\n{unmatched}\n')

    # begin to gather data
    aidsFromLib = compoundLib.copy().filter(items=['CompoundName', 'SMILES', 'CID', 'AID'])
    inhibitorData = inhibitorScreens.filter(items=['Treatment', 'SMILES', 'p-value'])

    # calculate labels and combine data
    labels = pd.DataFrame(columns=['CompoundName', 'SMILES', 'p-value', 'anti-PGCC label'])
    if verbose: print(f'..matching p-vals and smiles (n={len(inhibitorData)})..')
    for idx, row in inhibitorData.iterrows():
        name = row['Treatment']
        smi = row['SMILES']

        if verbose: print(f'Mol{idx+1}: {name}')
        if pd.isna(smi) or smi is None: 
            if verbose: print(' - NaN SMILES (skipping)..')
            continue
        
        cids = []
        try:
            cid = pcp.get_cids(
                str(smi), 
                namespace='smiles', 
                domain='compound', 
                list_return='flat'
            )
            cids = cid
        except Exception as e:
            if verbose: print(f" - CID lookup failed for {smi}: {e}")

        p = str(row['p-value'])
        if str(p).startswith('<'): val = float(p[1:])
        else: val = float(p)
        
        label = None
        if 0 < val < 0.05: label = 1
        elif val >= 0.05: label = 0
        else: label, p = None, None

        labels = pd.concat([
            labels,
            pd.DataFrame({'CompoundName':name, 
                          'SMILES':smi, 
                          'p-value':p, 
                          'anti-PGCC label':label
                        }, index=[0])
        ], ignore_index=True)

    if verbose: print(f'\n..configured PGCC inhibition labels (shape:{labels.shape})..')
    return labels

inhibitionLabels = getInhibitionData(verbose=True, pubchemReqs=True)
inhibitionLabels.to_csv(f'{result_path}/inhibitionLabels.csv')
inhibitionLabels

In [8]:
inhibitionLabels = pd.read_csv(f'{result_path}/inhibitionLabels.csv')
inhibitionLabels['anti-PGCC label'].isna().sum()

0

In [9]:
from rdkit.DataStructs import ConvertToNumpyArray
import numpy as np

features_path = os.path.join(result_path, 'features')
os.makedirs(features_path, exist_ok=True)

smilabs = inhibitionLabels.copy()
smilabs = smilabs.filter(items=['SMILES', 'anti-PGCC label']).rename(columns={'anti-PGCC label':'label'})

rdkitRows = []
maccsRows = []
ecfp4Rows = []
metaRows = []

for idx, row in smilabs.iterrows():
    smi = row['SMILES']
    label = row['label']
    mol = Chem.MolFromSmiles(smi)
    if not mol: continue

    rdkit_desc = calc.CalcDescriptors(mol)

    maccs = np.zeros((167,), dtype=int)
    ConvertToNumpyArray(MACCSkeys.GenMACCSKeys(mol), maccs)

    ecfp4 = np.zeros((2048,), dtype=int)
    ConvertToNumpyArray(AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048), ecfp4)

    rdkitRows.append([smi, label] + list(rdkit_desc))
    maccsRows.append([smi, label] + list(maccs))
    ecfp4Rows.append([smi, label] + list(ecfp4))
    metaRows.append([smi, label] + list(maccs) + list(ecfp4))

rdkitDF = pd.DataFrame(rdkitRows, columns= ['SMILES', 'label'] + descNames)
maccsDF = pd.DataFrame(maccsRows, columns= ['SMILES', 'label'] + [f'MACCS_{i}' for i in range(167)])
ecfp4DF = pd.DataFrame(ecfp4Rows, columns= ['SMILES', 'label'] + [f'ECFP4_{i}' for i in range(2048)])
metaFeatures = pd.DataFrame(metaRows, columns= ['SMILES', 'label'] + [f'MACCS_{i}' for i in range(167)] + [f'ECFP4_{i}' for i in range(2048)])

rdkitDF.to_csv(f'{features_path}/rdkit.csv')
maccsDF.to_csv(f'{features_path}/maccs.csv')
ecfp4DF.to_csv(f'{features_path}/ecfp4.csv')
metaFeatures.to_csv(f'{features_path}/metaFigerprints.csv')

In [41]:
# ...create scaffold datasets for LLM4SD...

# create paths to feed into LLM4SD
LLM4SD_path = os.path.join(base_dir, 'LLM4SD_antiPGCC') # all data pertaining fed into LLM4SD
os.makedirs(LLM4SD_path, exist_ok=True)
scaffoldDatasets_path = os.path.join(LLM4SD_path, 'scaffold_datasets') # dataset directory
os.makedirs(scaffoldDatasets_path, exist_ok=True)

featuresDict = {
    'rdkit' : rdkitDF, 
    'maccs' : maccsDF, 
    'ecfp4' : ecfp4DF, 
    'metaFingerprints' : metaFeatures
}
rdkitSubCategories = {}

# create train/test/valid sets
def splitSets(input: pd.DataFrame, name, path):
    df = input.copy()

    train, temp = train_test_split(df, test_size=0.2, random_state=15, stratify=df['label'], shuffle=True)
    valid, test = train_test_split(temp, test_size=0.5, random_state=15, stratify=temp['label'], shuffle=True)

    pd.DataFrame(train).to_csv(f'{path}/{name}_train.csv', index=False)
    pd.DataFrame(test).to_csv(f'{path}/{name}_test.csv', index=False)
    pd.DataFrame(valid).to_csv(f'{path}/{name}_valid.csv', index=False)

for name, data in featuresDict.items():
    df = pd.DataFrame(data, copy=True)
    currPath = os.path.join(scaffoldDatasets_path, name)
    os.makedirs(currPath, exist_ok=True)

    if name=='rdkit': # split rdkit into further subcategories
        allRdkit = df.copy()
        rdkitSubCategories['all'] = allRdkit
        splitSets(input=allRdkit, name='all', path=currPath)

        smilab = allRdkit.filter(items=['SMILES', 'label'])

        # E-State Descriptors
        estate = [desc for desc in list(allRdkit.columns) if 'EState' in desc]
        eStateDesc = pd.concat([smilab, allRdkit.filter(items=estate)], axis=1).reset_index(drop=True)
        allRdkit = allRdkit.drop(columns=estate)
        rdkitSubCategories['E-State'] = eStateDesc
        splitSets(input=eStateDesc, name='E-State', path=currPath)

        # Functional Group Counts
        funcGroup = [desc for desc in list(allRdkit.columns) if desc.startswith('fr_')]
        frDesc = pd.concat([smilab, allRdkit.filter(items=funcGroup)], axis=1)
        allRdkit = allRdkit.drop(columns=funcGroup)
        rdkitSubCategories['Functional Group'] = frDesc
        splitSets(input=frDesc, name='functionalGroupCount', path=currPath)

        # Molecular Topology Descriptors
        topology = [desc for desc in list(allRdkit.columns) if desc.lower() in ['balabanj', 'bertzct', 'hallkieralpha', 'ipc', 'avgipc']]
        topology += [desc for desc in list(allRdkit.columns) if desc.startswith('Chi') or desc.startswith('Kappa')]
        topDesc = pd.concat([smilab, allRdkit.filter(items=topology)], axis=1)
        allRdkit = allRdkit.drop(columns=topology)
        rdkitSubCategories['Molecular Topology'] = topDesc
        splitSets(input=topDesc, name='molecularTopology', path=currPath)

        # Fingerprint Based Descriptors
        fing = [desc for desc in list(allRdkit.columns) if desc.startswith('Fp') or desc.startswith('BCUT2D')]
        fingDesc = pd.concat([smilab, allRdkit.filter(items=fing)], axis=1)
        allRdkit = allRdkit.drop(columns=fing)
        rdkitSubCategories['Fingerprint Based'] = fingDesc
        splitSets(input=fingDesc, name='fingerprintBased', path=currPath)

        # Surface Area Descriptors
        sa = [desc for desc in list(allRdkit.columns) if any(x in desc.lower() for x in ['peoe', 'smr', 'slogp']) or desc.lower() == 'labuteasa']
        saDesc = pd.concat([smilab, allRdkit.filter(items=sa)], axis=1)
        allRdkit = allRdkit.drop(columns=sa)
        rdkitSubCategories['Surface Area'] = saDesc
        splitSets(input=saDesc, name='surfaceArea', path=currPath)

        # structural descriptors and counts
        sdc = [
            desc for desc in list(allRdkit.columns)
            if (
                desc.lower().startswith('n') 
                and desc.lower() not in ['numvalenceelectrons', 'numradicalelectrons']
            ) or desc.lower() in ['fractioncsp3', 'ringcount']
        ]
        sdcDesc = pd.concat([smilab, allRdkit.filter(items=sdc)], axis=1)
        allRdkit = allRdkit.drop(columns=sdc)
        rdkitSubCategories['Structural'] = sdcDesc
        splitSets(input=sdcDesc, name='structural', path=currPath)

        # physiochemical descriptors
        pcDesc = allRdkit
        rdkitSubCategories['Physiochemical'] = pcDesc
        splitSets(input=pcDesc, name='physiochemical', path=currPath)

        featuresDict['rdkit'] = rdkitSubCategories

    else: splitSets(input=df, name=name, path=currPath)

In [45]:
rdkitSub = featuresDict['rdkit']
print(f'rdkit subcategories:\n{list(rdkitSub.keys())}\n')

allDf = pd.DataFrame(rdkitSub['all'])
numAllDesc = allDf.shape[1] - 2

numSubDesc = 0
i = 1
for name, data in rdkitSub.items():
    df = pd.DataFrame(data, copy=True).drop(columns=['SMILES', 'label'])
    numDesc = df.shape[1]
    if name!='all': numSubDesc += numDesc

    print(f'({i})...{name.upper()}...')
    print(f'n_descriptors=[{numDesc}]')
    print(f"Descriptors:\n{list(df.columns)}")

    i+=1
    print()

print(f'numAllDesc:{numAllDesc}\nnumSubDesc:{numSubDesc}')

rdkit subcategories:
['all', 'E-State', 'Functional Group', 'Molecular Topology', 'Fingerprint Based', 'Surface Area', 'Structural', 'Physiochemical']

(1)...ALL...
n_descriptors=[209]
Descriptors:
['MaxAbsEStateIndex', 'MaxEStateIndex', 'MinAbsEStateIndex', 'MinEStateIndex', 'qed', 'MolWt', 'HeavyAtomMolWt', 'ExactMolWt', 'NumValenceElectrons', 'NumRadicalElectrons', 'MaxPartialCharge', 'MinPartialCharge', 'MaxAbsPartialCharge', 'MinAbsPartialCharge', 'FpDensityMorgan1', 'FpDensityMorgan2', 'FpDensityMorgan3', 'BCUT2D_MWHI', 'BCUT2D_MWLOW', 'BCUT2D_CHGHI', 'BCUT2D_CHGLO', 'BCUT2D_LOGPHI', 'BCUT2D_LOGPLOW', 'BCUT2D_MRHI', 'BCUT2D_MRLOW', 'AvgIpc', 'BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 'HallKierAlpha', 'Ipc', 'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA', 'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14', 'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 

In [11]:
# create descriptors and split into train/test/valid sets
# split up into maccs, ecfp4, rdkit, and combined fingerprints
# place into LLM4Sd directory to parse into src/tools

# RUN ON GOOGLE COLLAB
# YOU DO NOT HAVE THE RAM FOR THIS PROJECT

# create run.ipynb, and maybe individual run files, that iterate through the classes in src/tools