# KEGG Compounds Processing

In [None]:
import pandas as pd
from Bio.KEGG.Compound import parse
from rdkit import Chem

In [None]:
# Parse the KEGG flat file and turn every compound record into one table row.
# Each record's instance attributes become the columns of that row.
with open('data/compounds.kegg', 'r') as file:
    data = [vars(record) for record in parse(file)]
df = pd.DataFrame(data)

In [None]:
# Get SMILES with removed stereochemistry for every compound from the .mol file
suppl = Chem.SDMolSupplier('data/compounds.mol')
ids_list = []
smiles_list = []
for mol in suppl:
    # The supplier yields None for records RDKit could not parse; skip those.
    if mol is None:
        continue
    # The ENTRY property carries the KEGG id, optionally prefixed with 'cpd:'.
    ids_list.append(mol.GetProp('ENTRY').replace('cpd:', ''))
    # MolToSmiles writes isomeric (stereo-aware) SMILES by default, so
    # stereochemistry has to be switched off explicitly to match the intent above.
    smiles_list.append(Chem.MolToSmiles(mol, isomericSmiles=False))
id_to_smiles_map = dict(zip(ids_list, smiles_list))
# Compounds whose entry id has no structure in the map end up with NaN here.
df['smiles'] = df['entry'].map(id_to_smiles_map)

In [None]:
# Replace empty strings and empty lists with pd.NA so they count as missing values
def _blank_to_na(value):
    """Return pd.NA for '' or an empty list; return any other value unchanged."""
    is_blank = value == '' or (isinstance(value, list) and len(value) == 0)
    return pd.NA if is_blank else value


df = df.map(_blank_to_na)
df.info()

Only 6578 compounds from KEGG have associated pathways

In [None]:
# Keep only compounds that have both a pathway annotation and a SMILES string
required_columns = ['pathway', 'smiles']
df.dropna(subset=required_columns, inplace=True)
# Remove the structures, mass and formula columns
dropped_columns = ['structures', 'mass', 'formula']
df.drop(columns=dropped_columns, inplace=True)
# Discard SMILES containing a literal '*' (presumably wildcard / undefined atoms)
has_star = df['smiles'].str.contains("\\*")
df = df[~has_star]
df.info()

In [None]:
# Write the filtered compound table as a tab-separated file, without the index column
df.to_csv('data/compounds-parsed.tsv', index=False, sep='\t')

# Commercial IL Processing

In [None]:
# Combine the raw product tables of both vendors into a single frame.
raw_files = ['data/proionic-raw.tsv', 'data/iolitec-raw.tsv']
# ignore_index=True gives every row a unique label; without it both tables keep
# their own 0..n-1 index and the combined frame contains duplicate labels.
df = pd.concat(
    [pd.read_csv(path, delimiter='\t') for path in raw_files],
    ignore_index=True,
)
df

In [None]:
# Strip purity suffixes of the form ", >98%" / ", 95%" from the product names,
# then trim surrounding whitespace
purity_pattern = r",\s*[><]?\d+%"
names_without_purity = df["name"].str.replace(purity_pattern, "", regex=True)
df["name"] = names_without_purity.str.strip()
df

In [None]:
# Drop rows that have neither a name nor a CAS number; rows with at least one are kept
identifier_columns = ['name', 'cas']
df = df.dropna(how='all', subset=identifier_columns)

In [None]:
import pubchempy as pcp

In [None]:
def get_smiles(name, cas):
    """Look up a compound on PubChem and return its isomeric SMILES.

    The CAS number is tried first and the name second; both are searched in
    PubChem's 'name' namespace. Missing identifiers (None, NaN or blank
    strings) are skipped, so a row with only a name is still looked up by
    name instead of failing on the missing CAS number.

    Args:
        name: Compound name, or a missing value.
        cas: CAS registry number, or a missing value.

    Returns:
        The isomeric SMILES of the first PubChem match, or None when no
        usable identifier is given or nothing is found.

    Raises:
        Whatever ``pcp.get_compounds`` raises for a failed request.
    """
    for identifier in (cas, name):
        if identifier is None:
            continue
        # NaN (how pandas represents an empty cell) is the only float that is
        # not equal to itself.
        if isinstance(identifier, float) and identifier != identifier:
            continue
        if isinstance(identifier, str) and not identifier.strip():
            continue

        compounds = pcp.get_compounds(identifier, 'name')
        if compounds:
            return compounds[0].isomeric_smiles

    return None

In [None]:
# Query PubChem for every product. The lookup is best effort: a failed request
# is reported and recorded as None so the list stays aligned with the rows of df.
smiles_list = []
for name, cas in zip(df['name'], df['cas']):
    try:
        smi = get_smiles(name, cas)
    except Exception as err:
        # `except Exception` (not a bare `except`) keeps Ctrl-C / kernel
        # interrupts working during this long-running network loop.
        print(f"SMILES lookup failed for {name!r} (CAS {cas!r}): {err}")
        smi = None
    smiles_list.append(smi)

In [None]:
# Store the looked-up SMILES as a column; the list is assigned positionally,
# one entry per row of df.
df['smiles'] = smiles_list
df

In [None]:
# Drop products for which no SMILES was found.
df.dropna(subset=['smiles'], inplace=True)
# Keep only SMILES that contain a '.', i.e. more than one disconnected fragment.
# A literal (non-regex) match avoids the invalid '\.' escape sequence, and
# .copy() makes the result an independent frame that is safe to modify later.
df = df[df['smiles'].str.contains('.', regex=False)].copy()
df

In [None]:
from molvs import standardize_smiles
# Run every SMILES through MolVS standardization and write the result back
standardized = df['smiles'].apply(standardize_smiles)
df['smiles'] = standardized
df

In [None]:
from rdkit import Chem
def split_il_smiles(smiles):
    """Split a salt SMILES into its cation and anion fragments.

    The SMILES is split on '.', and each fragment is classified by its total
    formal charge. If several fragments carry the same sign, the last one wins.

    Args:
        smiles: Multi-fragment SMILES string, e.g. 'cation.anion'.

    Returns:
        A ``(cation_smiles, anion_smiles)`` tuple, or ``(None, None)`` when
        the input is not a non-empty string, a fragment cannot be parsed, a
        fragment is neutral, or either the cation or the anion is missing.
    """
    if not isinstance(smiles, str) or not smiles:
        return None, None

    # Start from None so that a SMILES with only cations (or only anions)
    # yields (None, None) instead of raising UnboundLocalError.
    cation_smiles = None
    anion_smiles = None
    for fragment in smiles.split('.'):
        mol = Chem.MolFromSmiles(fragment)
        if mol is None:
            # RDKit could not parse the fragment.
            return None, None
        charge = Chem.GetFormalCharge(mol)
        if charge > 0:
            cation_smiles = fragment
        elif charge < 0:
            anion_smiles = fragment
        else:
            # A neutral fragment means this is not a plain cation/anion pair.
            return None, None

    if cation_smiles is None or anion_smiles is None:
        return None, None
    return cation_smiles, anion_smiles

In [None]:
# Split every SMILES into its ions and store them as two columns.
# Building the columns from plain lists is positional (independent of the
# index), also works for an empty frame, and avoids creating one pd.Series
# per row as `.apply(pd.Series)` does.
ion_pairs = [split_il_smiles(smi) for smi in df['smiles']]
df['cation'] = [cation for cation, _ in ion_pairs]
df['anion'] = [anion for _, anion in ion_pairs]
df

In [None]:
# Discard rows where neither a cation nor an anion was found, then keep
# only the output columns in their final order
output_columns = ['smiles','cation','anion','name','cas', 'url']
rows_with_ions = df.dropna(subset=['cation', 'anion'], how='all')
df = rows_with_ions[output_columns]
df

In [None]:
# Write the commercial ionic-liquid table as a tab-separated file, without the index column
df.to_csv('data/commercial-il.tsv', index=False, sep='\t')