In [1]:
import pandas as pd  # Import pandas for data manipulation
from rdkit import Chem  # Import RDKit for chemical informatics
from rdkit.Chem import rdMolDescriptors  # Import molecular descriptor functions from RDKit

# GABAA_processing

In [2]:
# Read the GABAA compound table into a DataFrame, decoding the file as GB2312
data = pd.read_csv(
    filepath_or_buffer='../data/GABAA.csv',
    encoding='gb2312',
)

In [3]:
# Atomic numbers that contains_metal treats as metals by default:
# Li, Na, Al, K, Ca, Mn, Fe, Cu, Zn, Mo, Ag, Sn, Ta, W, Pt, Au, Hg, Pb, Bi.
# The list is not exhaustive (Mg = 12, for example, is not part of it).
_METAL_ATOMIC_NUMS = frozenset(
    {3, 11, 13, 19, 20, 25, 26, 29, 30, 42, 47, 50, 73, 74, 78, 79, 80, 82, 83}
)


def contains_metal(smiles, metal_atomic_nums=None):
    """Return True if the molecule described by a SMILES string contains a metal atom.

    Parameters
    ----------
    smiles : str
        SMILES string to inspect. Non-string values (e.g. the NaN pandas uses
        for an empty CSV cell, or None) and empty strings are treated as
        "no metal found" instead of being passed to RDKit.
    metal_atomic_nums : iterable of int, optional
        Atomic numbers to treat as metals. Defaults to _METAL_ATOMIC_NUMS.

    Returns
    -------
    bool
        True if at least one atom has an atomic number in the metal set.
        False otherwise, including when the SMILES is missing or cannot be
        parsed by RDKit.
    """
    # Guard first: Chem.MolFromSmiles raises on non-string input such as NaN.
    if not isinstance(smiles, str) or not smiles:
        return False

    mol = Chem.MolFromSmiles(smiles)  # None when RDKit cannot parse the SMILES
    if mol is None:
        return False

    metals = (
        _METAL_ATOMIC_NUMS if metal_atomic_nums is None else frozenset(metal_atomic_nums)
    )
    return any(atom.GetAtomicNum() in metals for atom in mol.GetAtoms())

In [4]:
# Keep only the rows whose SMILES contains no metal atom.
# Rows whose SMILES cannot be parsed are not removed here, because
# contains_metal returns False for them.
# .copy() makes the result an independent DataFrame, so adding columns to it
# later does not raise pandas' SettingWithCopyWarning.
filtered_df = data[~data['smiles'].apply(contains_metal)].copy()

In [5]:
# Calculate the exact molecular weight for each molecule.
# Each SMILES is parsed once; missing (non-string) or unparseable entries
# become None instead of raising or being parsed twice.
gabaa_mols = filtered_df['smiles'].apply(
    lambda smi: Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
)

# .assign builds a new DataFrame rather than writing into a filtered slice,
# which avoids SettingWithCopyWarning. Molecules that could not be parsed get
# NaN, and the column is always float64.
filtered_df = filtered_df.assign(
    molecular_weight=gabaa_mols.apply(
        lambda mol: rdMolDescriptors.CalcExactMolWt(mol) if mol is not None else None
    ).astype('float64')
)

In [6]:
# Retain molecules whose molecular weight lies in the closed range [30, 1000] Da.
# Rows with a missing weight fail the range check and are dropped as well.
filtered_df_2 = filtered_df[filtered_df['molecular_weight'].between(30, 1000)]

In [7]:
# Show the weight-filtered GABAA table as this cell's output
filtered_df_2

Unnamed: 0,name,origin,label,smiles,class,molecular_weight
0,gamma-Aminobutyric acid,10.3390/molecules24152678,GABAA agonist,C(CC(=O)O)CN,1,103.063329
1,Menthol,10.1111/bph.12602,GABAA agonist,CC1CCC(C(C1)O)C(C)C,1,156.151415
2,phenobarbital,10.1002/ana.24967,GABAA agonist,CCC1(C(=O)NC(=O)NC1=O)C2=CC=CC=C2,1,232.084792
3,isoguvacine oxide,10.1002/chir.530070608,GABAA agonist,C1CNCC2C1(O2)C(=O)O,1,143.058243
4,Clomethiazole,10.1016/s0014-2999(02)02233-1,GABAA agonist,CC1=C(CCCl)SC=N1,1,161.006598
...,...,...,...,...,...,...
483,chlorogenic acid,10.1021/jf0303971,GABAA inhibition,C1C(C(C(CC1(C(=O)O)O)OC(=O)C=CC2=CC(=C(C=C2)O)...,0,354.095082
484,maltol,10.1021/jf0303971,GABAA inhibition,CC1=C(C(=O)C=CO1)O,0,126.031694
485,Theobromine,10.1021/jf0303971,GABAA inhibition,CN1C=NC2=C1C(=O)NC(=O)N2C,0,180.064725
486,"2,3,5-trimethylpyrazine",10.1021/jf0303971,GABAA inhibition,CN1C2=C(C(=O)N(C1=O)C)NC=N2,0,180.064725


In [16]:
# Write the filtered GABAA table to 'GABAA.csv' in the current working directory.
# NOTE(review): the input was read from '../data/GABAA.csv', so this only
# overwrites the source file when the notebook runs from that directory.
# Confirm the intended destination.
# NOTE(review): the row index is written as an extra leading column (pandas
# default). That is presumably the origin of the 'Unnamed: 0' column seen on load.
filtered_df_2.to_csv(path_or_buf='GABAA.csv')

# plant_processing

In [8]:
# Read the plant volatile-compound table into a DataFrame, decoding the file as GB18030
plant = pd.read_csv(
    filepath_or_buffer='plant.csv',
    encoding='gb18030',
)

In [9]:
# Keep only the plant rows whose SMILES contains no metal atom.
# Rows whose SMILES cannot be parsed are not removed here, because
# contains_metal returns False for them.
# .copy() makes the result an independent DataFrame, so adding columns to it
# later does not raise pandas' SettingWithCopyWarning.
filtered_plant = plant[~plant['SMILES'].apply(contains_metal)].copy()

In [10]:

# Calculate the exact molecular weight for each molecule in the plant dataset.
# Each SMILES is parsed once; missing (non-string) or unparseable entries
# become None instead of raising or being parsed twice.
plant_mols = filtered_plant['SMILES'].apply(
    lambda smi: Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
)

# .assign builds a new DataFrame rather than writing into a filtered slice,
# which avoids SettingWithCopyWarning. Molecules that could not be parsed get
# NaN, and the column is always float64.
filtered_plant = filtered_plant.assign(
    molecular_weight=plant_mols.apply(
        lambda mol: rdMolDescriptors.CalcExactMolWt(mol) if mol is not None else None
    ).astype('float64')
)

In [11]:
# Retain plant molecules whose molecular weight lies in the closed range [30, 1000] Da.
# Rows with a missing weight fail the range check and are dropped as well.
filtered_plant_2 = filtered_plant[filtered_plant['molecular_weight'].between(30, 1000)]

In [12]:
# Show the weight-filtered plant table as this cell's output
filtered_plant_2

Unnamed: 0,Scientific_name,Chinese_name,Extraction_site,Volatile_compounds,Compound_content,Analysis_method,Cite,SMILES,molecular_weight
0,Lamium album,短柄野芝麻,flower,"6,10,14-Trimethylpentadecan-2-one",0.102,GC-MS,"Morteza-Semnani K, Saeedi M, Akbarzadeh M. Che...",CC(C)CCCC(C)CCCC(C)CCCC(=O)C,268.276616
1,Lamium album,短柄野芝麻,flower,Diacetone alcohol,0.091,GC-MS,"Morteza-Semnani K, Saeedi M, Akbarzadeh M. Che...",CC(=O)CC(C)(C)O,116.083730
2,Lamium album,短柄野芝麻,flower,1-Pentanol,0.041,GC-MS,"Morteza-Semnani K, Saeedi M, Akbarzadeh M. Che...",CCCCCO,88.088815
3,Lamium album,短柄野芝麻,flower,Tetracosane,0.039,GC-MS,"Morteza-Semnani K, Saeedi M, Akbarzadeh M. Che...",CCCCCCCCCCCCCCCCCCCCCCCC,338.391252
4,Lamium album,短柄野芝麻,flower,Tricosane,0.037,GC-MS,"Morteza-Semnani K, Saeedi M, Akbarzadeh M. Che...",CCCCCCCCCCCCCCCCCCCCCCC,324.375602
...,...,...,...,...,...,...,...,...,...
2389,Glechoma hederacea,欧活血丹,overground part,Deoxysericealactone,0.0137,GC-MS,"Chou S T, Lai C C, Lai C P, et al. Chemical co...",CC1=C2CC(C(CC2OC1=O)(C)C=C)C(=C)C(=O)OC,276.136159
2390,Glechoma hederacea,欧活血丹,overground part,"2-Butanone, 4-(2,6,6-trimethyl-2-cyclohexen-1-...",0.0121,GC-MS,"Chou S T, Lai C C, Lai C P, et al. Chemical co...",CC1=CCCC(C1=CCC(=O)C)(C)C,192.151415
2391,Glechoma hederacea,欧活血丹,overground part,Nopol,0.0283,GC-MS,"Chou S T, Lai C C, Lai C P, et al. Chemical co...",CC1(C2CC=C(C1C2)CCO)C,166.135765
2392,Hyssopus officinalis,神香草,leaf,"3,5,5-Trimethylbicyclo[2.2.1]heptan-2-one",0.5727,GC-MS,"KIZIL S, HA??M? N, Tolan V, et al. Chemical co...",CC1C2CC(C1=O)CC2(C)C,152.120115


In [None]:
# Write the filtered plant table to 'filtered_plant.csv' in the current working directory.
# NOTE(review): the row index is written as an extra leading column (pandas
# default). The file is written with to_csv's default encoding, not the
# GB18030 used to read 'plant.csv'. Confirm downstream readers expect that.
filtered_plant_2.to_csv(path_or_buf='filtered_plant.csv')