In [1]:
import pandas as pd  # Import pandas for data manipulation
from rdkit import Chem  # Import RDKit for chemical informatics
from rdkit.Chem import rdMolDescriptors  # Import molecular descriptor functions from RDKit

# GABAA_processing

In [2]:
# Read the GABAA compound table into a DataFrame, decoding the CSV as GB2312
gabaa_csv = '../data/GABAA.csv'
data = pd.read_csv(gabaa_csv, encoding='gb2312')

In [5]:
# Atomic numbers that contains_metal treats as metals by default:
# Li, Na, Al, K, Ca, Mn, Fe, Cu, Zn, Mo, Ag, Sn, Ta, W, Pt, Au, Hg, Pb, Bi.
# NOTE(review): this is not an exhaustive list of metals (e.g. Mg, 12, is absent);
# confirm that the omission is intentional.
METAL_ATOMIC_NUMBERS = frozenset(
    {3, 11, 13, 19, 20, 25, 26, 29, 30, 42, 47, 50, 73, 74, 78, 79, 80, 82, 83}
)


def contains_metal(smiles, metal_atomic_numbers=METAL_ATOMIC_NUMBERS):
    """Return True if the SMILES parses to a molecule containing a listed metal atom.

    Parameters
    ----------
    smiles : str
        SMILES string to inspect. Non-string values (for example the None or
        float NaN that stand in for a missing cell) are reported as
        metal-free, the same way an unparsable SMILES is.
    metal_atomic_numbers : collection of int, optional
        Atomic numbers to treat as metals. Defaults to METAL_ATOMIC_NUMBERS.

    Returns
    -------
    bool
        True when at least one atom's atomic number is in
        ``metal_atomic_numbers``; False otherwise, including when the SMILES
        cannot be converted to a molecule.
    """
    # The SMILES parser is only given strings; anything else is handled like
    # an entry that failed to parse instead of raising inside DataFrame.apply.
    if not isinstance(smiles, str):
        return False

    mol = Chem.MolFromSmiles(smiles)  # None when the SMILES cannot be parsed
    if mol is None:
        return False

    return any(atom.GetAtomicNum() in metal_atomic_numbers for atom in mol.GetAtoms())

In [4]:
# Keep only the rows whose SMILES does not contain a metal atom
gabaa_has_metal = data['smiles'].apply(contains_metal)
filtered_df = data[~gabaa_has_metal]

In [5]:
# Calculate the exact molecular weight for each molecule
# Parse every SMILES once; entries RDKit cannot parse come back as None.
gabaa_mols = filtered_df['smiles'].apply(Chem.MolFromSmiles)

# filtered_df was produced by boolean-mask indexing, so writing a column into it
# with filtered_df[...] = ... triggers pandas' SettingWithCopyWarning.
# .assign() builds a new DataFrame that carries the extra column instead.
filtered_df = filtered_df.assign(
    molecular_weight=gabaa_mols.apply(
        # Unparsable SMILES get None, which pandas stores as a missing value
        lambda mol: rdMolDescriptors.CalcExactMolWt(mol) if mol is not None else None
    )
)

In [6]:
# Keep molecules whose exact weight lies in the closed range [30, 1000] Da
# (Series.between includes both endpoints by default)
gabaa_weight_ok = filtered_df['molecular_weight'].between(30, 1000)
filtered_df_2 = filtered_df[gabaa_weight_ok]

In [7]:
# Display the GABAA rows that passed both the metal filter and the molecular-weight filter
filtered_df_2

Unnamed: 0,name,origin,label,smiles,class,molecular_weight
0,gamma-Aminobutyric acid,10.3390/molecules24152678,GABAA agonist,C(CC(=O)O)CN,1,103.063329
1,Menthol,10.1111/bph.12602,GABAA agonist,CC1CCC(C(C1)O)C(C)C,1,156.151415
2,phenobarbital,10.1002/ana.24967,GABAA agonist,CCC1(C(=O)NC(=O)NC1=O)C2=CC=CC=C2,1,232.084792
3,isoguvacine oxide,10.1002/chir.530070608,GABAA agonist,C1CNCC2C1(O2)C(=O)O,1,143.058243
4,Clomethiazole,10.1016/s0014-2999(02)02233-1,GABAA agonist,CC1=C(CCCl)SC=N1,1,161.006598
...,...,...,...,...,...,...
483,chlorogenic acid,10.1021/jf0303971,GABAA inhibition,C1C(C(C(CC1(C(=O)O)O)OC(=O)C=CC2=CC(=C(C=C2)O)...,0,354.095082
484,maltol,10.1021/jf0303971,GABAA inhibition,CC1=C(C(=O)C=CO1)O,0,126.031694
485,Theobromine,10.1021/jf0303971,GABAA inhibition,CN1C=NC2=C1C(=O)NC(=O)N2C,0,180.064725
486,"2,3,5-trimethylpyrazine",10.1021/jf0303971,GABAA inhibition,CN1C2=C(C(=O)N(C1=O)C)NC=N2,0,180.064725


In [16]:
# Save the filtered dataset to 'GABAA.csv'.
# NOTE(review): this relative path points at the current working directory, not at
# '../data/GABAA.csv' that the data was loaded from - confirm which location is intended.
# NOTE(review): the DataFrame index is written as well (index=False is not passed); the
# 'Unnamed: 0' column in the displayed table looks like an index saved by an earlier
# round trip - confirm whether it should be dropped.
filtered_df_2.to_csv('GABAA.csv')

# plant_processing

In [3]:
# Read the plant volatile-compound workbook into a DataFrame
plant_xlsx = '../data/plant.xlsx'
plant = pd.read_excel(plant_xlsx)

In [6]:
# Keep only the plant rows whose SMILES does not contain a metal atom
plant_has_metal = plant['SMILES'].apply(contains_metal)
filtered_plant = plant[~plant_has_metal]

In [7]:

# Calculate the exact molecular weight for each molecule in the plant dataset
# Parse every SMILES once; entries RDKit cannot parse come back as None.
plant_mols = filtered_plant['SMILES'].apply(Chem.MolFromSmiles)

# filtered_plant was produced by boolean-mask indexing, so writing a column into it
# with filtered_plant[...] = ... triggers pandas' SettingWithCopyWarning.
# .assign() builds a new DataFrame that carries the extra column instead.
filtered_plant = filtered_plant.assign(
    molecular_weight=plant_mols.apply(
        # Unparsable SMILES get None, which pandas stores as a missing value
        lambda mol: rdMolDescriptors.CalcExactMolWt(mol) if mol is not None else None
    )
)

In [8]:
# Keep plant molecules whose exact weight lies in the closed range [30, 1000] Da
# (Series.between includes both endpoints by default)
plant_weight_ok = filtered_plant['molecular_weight'].between(30, 1000)
filtered_plant_2 = filtered_plant[plant_weight_ok]

In [9]:
# Display the plant rows that passed both the metal filter and the molecular-weight filter
filtered_plant_2

Unnamed: 0,Scientific name,Chinese name,Extraction site,Volatile compounds,Compound content,Analysis method,Cite,Species,Genus,Family,Order,Class,Phylum,Kingdom,SMILES,molecular_weight
0,Lamium album,短柄野芝麻,flower,"6,10,14-Trimethylpentadecan-2-one",0.1020,GC-MS,"Morteza-Semnani K, Saeedi M, Akbarzadeh M. Che...",Lamium album,Lamium,Lamiaceae,Lamiales,Magnoliopsida,Angiospermae,Plantae,CC(C)CCCC(C)CCCC(C)CCCC(=O)C,268.276616
1,Lamium album,短柄野芝麻,flower,Diacetone alcohol,0.0910,GC-MS,"Morteza-Semnani K, Saeedi M, Akbarzadeh M. Che...",Lamium album,Lamium,Lamiaceae,Lamiales,Magnoliopsida,Angiospermae,Plantae,CC(=O)CC(C)(C)O,116.083730
2,Lamium album,短柄野芝麻,flower,1-Pentanol,0.0410,GC-MS,"Morteza-Semnani K, Saeedi M, Akbarzadeh M. Che...",Lamium album,Lamium,Lamiaceae,Lamiales,Magnoliopsida,Angiospermae,Plantae,CCCCCO,88.088815
3,Lamium album,短柄野芝麻,flower,Tetracosane,0.0390,GC-MS,"Morteza-Semnani K, Saeedi M, Akbarzadeh M. Che...",Lamium album,Lamium,Lamiaceae,Lamiales,Magnoliopsida,Angiospermae,Plantae,CCCCCCCCCCCCCCCCCCCCCCCC,338.391252
4,Lamium album,短柄野芝麻,flower,Tricosane,0.0370,GC-MS,"Morteza-Semnani K, Saeedi M, Akbarzadeh M. Che...",Lamium album,Lamium,Lamiaceae,Lamiales,Magnoliopsida,Angiospermae,Plantae,CCCCCCCCCCCCCCCCCCCCCCC,324.375602
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10810,Hyssopus officinalis,神香草,leaf,Myrcene,0.0052,GC-MS,"KIZIL S, HAŞİMİ N, Tolan V, et al. Chemical co...",Hyssopus officinalis,Hyssopus,Lamiaceae,Lamiales,Magnoliopsida,Angiospermae,Plantae,CC(=CCCC(=C)C=C)C,136.125201
10811,Hyssopus officinalis,神香草,leaf,cis-Pinonic acid,0.0063,GC-MS,"KIZIL S, HAŞİMİ N, Tolan V, et al. Chemical co...",Hyssopus officinalis,Hyssopus,Lamiaceae,Lamiales,Magnoliopsida,Angiospermae,Plantae,CC(=O)C1CC(C1(C)C)CC(=O)O,184.109944
10812,Hyssopus officinalis,神香草,leaf,beta-bisabolene,0.0057,GC-MS,"KIZIL S, HAŞİMİ N, Tolan V, et al. Chemical co...",Hyssopus officinalis,Hyssopus,Lamiaceae,Lamiales,Magnoliopsida,Angiospermae,Plantae,CC1=CCC(CC1)C(=C)CCC=C(C)C,204.187801
10813,Hyssopus officinalis,神香草,leaf,Elemol,0.0055,GC-MS,"KIZIL S, HAŞİMİ N, Tolan V, et al. Chemical co...",Hyssopus officinalis,Hyssopus,Lamiaceae,Lamiales,Magnoliopsida,Angiospermae,Plantae,CC(=C)C1CC(CCC1(C)C=C)C(C)(C)O,222.198365


In [None]:
# Save the filtered plant dataset as an Excel workbook at '../data/plant.xlsx'.
# NOTE(review): this is the same path the plant data was loaded from, so running this
# cell overwrites the input file with the filtered result. The comment here previously
# named 'filtered_plant.csv' as the destination - confirm which file is intended.
# NOTE(review): the DataFrame index is written as well (index=False is not passed).
filtered_plant_2.to_excel('../data/plant.xlsx')