<a href="https://colab.research.google.com/github/Onionomics/Bioprospection_tools/blob/main/NLP_common%20chems%20extracting%20tool_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Instalar paquetes

In [1]:
!pip install rdkit



### Importar paquetes

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns

### Cargar BD

In [4]:
raw_db = "Scopus2 (1).csv"  # Como se descarga directamente
df = pd.read_csv(raw_db)
print(df.head())

                                             Authors  \
0  Kasamatsu S.; Kinno A.; Hishiyama J.-I.; Akaik...   
1  Roszczenko P.; Szewczyk O.K.; Czarnomysy R.; B...   
2      Shlosberg Y.; Huang A.; Tóth T.N.; Kaner R.B.   
3             Xu H.; Li Y.; Xing J.; Liu L.; Wang Y.   
4  Vuković S.; Popović-Djordjević J.B.; Kostić A....   

                                   Author full names  \
0  Kasamatsu, Shingo (26663334200); Kinno, Ayaka ...   
1  Roszczenko, Piotr (57226442658); Szewczyk, Olg...   
2  Shlosberg, Yaniv (57220860520); Huang, Ailun (...   
3  Xu, Huanhuan (57315874700); Li, Yi (5731587480...   
4  Vuković, Sandra (58074329000); Popović-Djordje...   

                                        Author(s) ID  \
0  26663334200; 57981299300; 57981635500; 7102705...   
1  57226442658; 58625520800; 26429139800; 4326224...   
2   57220860520; 57224007190; 7102154276; 7006509807   
3  57315874700; 57315874800; 57208030437; 1402577...   
4  58074329000; 36601201000; 36439697400; 8654

### Extracción de los apartados de interés: Abstracts por separado

In [5]:
import re

df.columns = df.columns.str.strip()  # Array con los encabezados del df
array_keywords = "Author Keywords"  # nombre exacto de la columna
array_abstracts = "Abstract"
abstract_tok = "Abs_token"
df[array_keywords] = df[array_keywords].astype(str).str.lower()
df[array_abstracts] = df[array_abstracts].astype(str).str.lower()

df[abstract_tok] = df[array_abstracts].apply(lambda x: re.findall(r'\b\w+\b', x))
df[abstract_tok].head()

Unnamed: 0,Abs_token
0,"[alliaceous, and, cruciferous, vegetables, are..."
1,"[nanomedicine, is, a, potential, provider, of,..."
2,"[in, recent, years, extensive, scientific, eff..."
3,"[secondary, metabolites, contribute, to, the, ..."
4,"[ever, since, ancient, times, allium, species,..."


### Revisión de abstracts a partir de términos químicos en CheBI

In [7]:
from rdkit import Chem
import gzip

# Ruta del archivo descargado (ajusta la ruta según corresponda)
BdCheBI_sdf = "ChEBI_lite.sdf"

# Cargar el archivo con SDMolSupplier
supplier = Chem.SDMolSupplier(BdCheBI_sdf)

# Verificar si se cargaron moléculas
molecules = [mol for mol in supplier if mol is not None]
print(f"Se cargaron {len(molecules)} moléculas.")

data = [{"ID": mol.GetProp("ChEBI ID"), "Nombre": mol.GetProp("_Name")} for mol in molecules if mol.HasProp("ChEBI ID")]
df_chebi = pd.DataFrame(data)
df_chebi.columns

df_chebi.head()

[00:35:19] Can't kekulize mol.  Unkekulized atoms: 1 3 5 7 8
[00:35:19] ERROR: Could not sanitize molecule ending on line 174882
[00:35:19] ERROR: Can't kekulize mol.  Unkekulized atoms: 1 3 5 7 8
[00:35:20] 

****
Post-condition Violation
Element 'hv' not found
Violation occurred on line 93 in file /project/build/temp.linux-x86_64-cpython-312/rdkit/Code/GraphMol/PeriodicTable.h
Failed Expression: anum > -1
----------
Stacktrace:
----------
****

[00:35:20] ERROR: Element 'hv' not found
[00:35:20] ERROR: moving to the beginning of the next molecule
[00:35:20] Explicit valence for atom # 1 N, 4, is greater than permitted
[00:35:20] ERROR: Could not sanitize molecule ending on line 282792
[00:35:20] ERROR: Explicit valence for atom # 1 N, 4, is greater than permitted
[00:35:20] Explicit valence for atom # 15 O, 3, is greater than permitted
[00:35:20] ERROR: Could not sanitize molecule ending on line 334358
[00:35:20] ERROR: Explicit valence for atom # 15 O, 3, is greater than permitted
[

Se cargaron 23409 moléculas.


[00:35:27] ERROR: EOF hit while reading atoms
[00:35:27] ERROR: moving to the beginning of the next molecule


Unnamed: 0,ID,Nombre
0,CHEBI:90,
1,CHEBI:165,
2,CHEBI:598,
3,CHEBI:776,
4,CHEBI:943,


In [8]:
compuestos_chebi = [mol.GetProp("ChEBI Name").lower() for mol in molecules if mol.HasProp("ChEBI Name")]

# Verificar si se extrajeron nombres
print(f"Se extrajeron {len(compuestos_chebi)} nombres de compuestos.")
print("Nombres extraídos:", compuestos_chebi[:10])  # Ver los primeros 10 nombres
df_mols = pd.DataFrame(compuestos_chebi)
df_mols.rename(columns={df_mols.columns[0]: "Chemical_comps"}, inplace=True)
df_mols.head()

Se extrajeron 23409 nombres de compuestos.
Nombres extraídos: ['(-)-epicatechin', '(1s,4r)-fenchone', '1-alkyl-2-acylglycerol', '16alpha-hydroxyestrone', '2,6-dichlorobenzonitrile', '2-hydroxybutyric acid', '3-(all-trans-octaprenyl)benzene-1,2-diol', '2-polyprenylphenol', '20-hydroxycholesterol', '3-oxo-5beta-steroid']


Unnamed: 0,Chemical_comps
0,(-)-epicatechin
1,"(1s,4r)-fenchone"
2,1-alkyl-2-acylglycerol
3,16alpha-hydroxyestrone
4,"2,6-dichlorobenzonitrile"


In [9]:
print(df_mols.shape)

(23409, 1)


In [10]:
# Abrir el archivo como df

df_npass = pd.read_csv("NPASSv1.txt", sep="\t", encoding="ISO-8859-1")
df_npass.head()

Unnamed: 0,np_id,pref_name,iupac_name,chembl_id,pubchem_cid,zinc_id
0,NPC100002,Dibromomethane,dibromomethane,CHEMBL1229889,3024,n.a.
1,NPC100017,Yuanhuadine,,CHEMBL1911631,6440572,n.a.
2,NPC100039,4-Methylbenzaldehyde,4-methylbenzaldehyde,CHEMBL190927,7725,n.a.
3,NPC100048,n.a.,,CHEMBL506315,10629555,n.a.
4,NPC100049,"7,7''-Di-O-Methylamentoflavone",5-hydroxy-8-[2-hydroxy-5-(5-hydroxy-7-methoxy-...,CHEMBL1208793,13871759,n.a.


In [11]:
# Filtrar solo las columnas: pref_name y pubchem_cid
df_npass_filt = df_npass[["pref_name", "pubchem_cid"]].copy()

# Eliminar entradas invalidas
df_npass_filt = df_npass_filt[
    (df_npass_filt["pref_name"].str.lower() != "n.a.") &
    (df_npass_filt["pubchem_cid"].str.lower() != "n.a.")
]

df_npass_filt.reset_index(drop=True, inplace=True) #indice de compuesto actualizado

df_npass_filt.head()

Unnamed: 0,pref_name,pubchem_cid
0,Dibromomethane,3024
1,Yuanhuadine,6440572
2,4-Methylbenzaldehyde,7725
3,"7,7''-Di-O-Methylamentoflavone",13871759
4,Olean-12(13)En-3Beta-Hexadecanoate,13915595


## Unir CheBI y NPASS

In [12]:
# Nombres en minuscula y sin espacios en los extremos
df_npass_filt["pref_name"] = df_npass_filt["pref_name"].str.strip().str.lower()

# Filtrar los nombres que NO están ya en df_mols. EVITAR DUPLICADOS
nuevos_compuestos = df_npass_filt[~df_npass_filt["pref_name"].isin(df_mols["Chemical_comps"])]

# Crear un DataFrame con la columna Chemical_comps
df_nuevos = nuevos_compuestos[["pref_name"]].rename(columns={"pref_name": "Chemical_comps"})

# Concatenar los nuevos compuestos a df_mols
df_mols = pd.concat([df_mols, df_nuevos], ignore_index=True)

print(df_mols.shape)

(47901, 1)


In [None]:
df_mols.head()

Unnamed: 0,Chemical_comps
0,(-)-epicatechin
1,"(1s,4r)-fenchone"
2,1-alkyl-2-acylglycerol
3,16alpha-hydroxyestrone
4,"2,6-dichlorobenzonitrile"


## Extraer clase, familia quimica y sinonimos de los compuestos usando Metabocards

In [13]:
# Inicializar listas para almacenar los datos
common_names = []
synonyms_list = []
kingdoms = []
classes = []
families = []

# Abrir el archivo y leer línea por línea
with open("metabocards.txt", "r", encoding="ISO-8859-1") as f:
    lines = f.readlines()

# Diccionario temporal para almacenar datos de un metabolito
current_data = {"Common_Name": None, "Synonyms": None, "Kingdom": None, "Class": None, "Family": None}

i = 0
while i < len(lines):
    line = lines[i].strip()

    if line.startswith("# Common_Name:"):
        current_data["Common_Name"] = lines[i + 1].strip() if i + 1 < len(lines) else None

    elif line.startswith("# Synonyms:"):
        current_data["Synonyms"] = lines[i + 1].strip().split(";") if i + 1 < len(lines) else []

    elif line.startswith("# Taxonomy_Kingdom:"):
        current_data["Kingdom"] = lines[i + 1].strip() if i + 1 < len(lines) else None

    elif line.startswith("# Taxonomy_Class:"):
        current_data["Class"] = lines[i + 1].strip() if i + 1 < len(lines) else None

    elif line.startswith("# Taxonomy_Family:"):
        current_data["Family"] = lines[i + 1].strip() if i + 1 < len(lines) else None
        # Una vez capturamos Family, asumimos que termina el bloque del metabolito y guardamos los datos
        common_names.append(current_data["Common_Name"])
        synonyms_list.append(current_data["Synonyms"])
        kingdoms.append(current_data["Kingdom"])
        classes.append(current_data["Class"])
        families.append(current_data["Family"])

        # Reiniciar el diccionario para el siguiente compuesto
        current_data = {"Common_Name": None, "Synonyms": None, "Kingdom": None, "Class": None, "Family": None}

    i += 1  # Avanzar a la siguiente línea

# Crear DataFrame
df_metabocards = pd.DataFrame({
    "Common_Name": common_names,
    "Synonyms": synonyms_list,
    "Kingdom": kingdoms,
    "Class": classes,
    "Family": families
})

# Mostrar primeras filas para verificar
print(df_metabocards.head())

             Common_Name                                           Synonyms  \
0      1-Methylhistidine  [1 methylhistidine,  1-methyl histidine,  1-MH...   
1     1,3-Diaminopropane  [1,3-Diamino-n-propane,  1,3-Propylenediamine,...   
2     2-Ketobutyric acid  [2-Ketobutanoate,  2-Ketobutanoic acid,  2-Oxo...   
3  2-Hydroxybutyric acid  [(RS)-2-Hydroxybutyrate,  (RS)-2-Hydroxybutyri...   
4       2-Methoxyestrone  [2-(8S,9S,13S,14S)-3-hydroxy-2-methoxy-13-meth...   

   Kingdom                             Class                Family  
0  Organic                       Amino Acids  Mammalian Metabolite  
1  Organic                     Cyclic Amines  Mammalian Metabolite  
2  Organic                        Keto-Acids  Mammalian Metabolite  
3  Organic                     Hydroxy Acids  Mammalian Metabolite  
4  Organic  Steroids and Steroid Derivatives  Mammalian Metabolite  


## Busqueda en abstracts con el listado de compuestos construido

### Funcion de búsqueda con diccionario de sinonimos

In [14]:
# Crear el diccionario de equivalencias
df_mols = pd.DataFrame(columns=["Chemical_comps"])

diccionario_sinonimos = {}

for _, row in df_metabocards.iterrows():
    compuesto_original = row["Common_Name"]
    sinonimos = row["Synonyms"] if isinstance(row["Synonyms"], list) else []

    for sinonimo in sinonimos:
        diccionario_sinonimos[sinonimo.strip().lower()] = compuesto_original.lower()

    diccionario_sinonimos[compuesto_original.lower()] = compuesto_original.lower()

# Filtrar sinónimos genéricos no deseados
stop_synonyms = {
    "acid", "acetate", "sugar", "alcohols", "hydroxy", "m 2",
    "oxygen", "hydrogen", "phosphorus"
}

# Eliminar entradas cuyo key esté en la lista de términos genéricos
diccionario_sinonimos = {
    k: v for k, v in diccionario_sinonimos.items() if k not in stop_synonyms
}

# Set con todos los términos mapeables válidos
chemical_set = set(diccionario_sinonimos.keys()) | set(diccionario_sinonimos.values())

# Asegurar compuestos en df_mols sean minúsculas
df_mols["Chemical_comps"] = df_mols["Chemical_comps"].str.lower()

# Mostrar una muestra del diccionario final
print(list(diccionario_sinonimos.items())[:10])

[('1 methylhistidine', '1-methylhistidine'), ('1-methyl histidine', '1-methylhistidine'), ('1-mhis', '1-methylhistidine'), ('1-methyl-histidine', '1-methylhistidine'), ('1-methyl-l-histidine', '1-methylhistidine'), ('1-n-methyl-l-histidine', '1-methylhistidine'), ('l-1-methylhistidine', '1-methylhistidine'), ('n1-methyl-l-histidine', '1-methylhistidine'), ('pi-methylhistidine', '1-methylhistidine'), ('1-methylhistidine', '3-methylhistidine')]


In [15]:
from nltk.util import ngrams
from collections import defaultdict

# Lista de tokens a excluir por confusión (stop words)
stop_tokens = {
    'a', 'as', 'this', 'that', 'be', 'same', 'result', 'optimum', 'lead',
    'ltd', 'r', 'e', 'c', 'na', 'co', 'fe', 'ag', 'cd', 'cr', 'cl', 'au',
    't3', 'cc', 'aa', 'pca', 'pla', 'met'
}
min_token_length = 3  # longitud mínima para tokens individuales

def find_common_tokens_with_ngrams(abs_tokens, chemical_comps, diccionario_sinonimos):
    # Filtrar tokens individuales
    filtered_tokens = [t for t in abs_tokens if len(t) >= min_token_length and t not in stop_tokens]

    # Generar bigramas (como strings separados por espacio)
    bigram_tokens = [' '.join(bg) for bg in ngrams(abs_tokens, 2)]

    # Unificar tokens y bigramas para búsqueda
    all_terms = filtered_tokens + bigram_tokens

    normalized_tokens = set()
    token_mapping = {}

    for token in all_terms:
        token_lower = token.lower()
        mapped = diccionario_sinonimos.get(token_lower, token_lower)
        if mapped in chemical_comps:
            normalized_tokens.add(mapped)
            token_mapping[token] = mapped

    return list(normalized_tokens), token_mapping

In [16]:
# Aplicar la nueva función
df[["common_chemicals", "chemical_mapping"]] = df["Abs_token"].apply(
    lambda tokens: pd.Series(find_common_tokens_with_ngrams(tokens, chemical_set, diccionario_sinonimos))
)

# Visualizar
df[["Abs_token", "common_chemicals", "chemical_mapping"]].head()

Unnamed: 0,Abs_token,common_chemicals,chemical_mapping
0,"[alliaceous, and, cruciferous, vegetables, are...",[sulfide],{'sulfur': 'sulfide'}
1,"[nanomedicine, is, a, potential, provider, of,...",[],{}
2,"[in, recent, years, extensive, scientific, eff...","[nadph, nadh]","{'nadh': 'nadh', 'nadph': 'nadph'}"
3,"[secondary, metabolites, contribute, to, the, ...",[naringenin],{'naringenin': 'naringenin'}
4,"[ever, since, ancient, times, allium, species,...",[ethanol],{'alcohol': 'ethanol'}


In [17]:
# Guardar la matriz de extraccciones y equivalencias (trazabilidad de términos)
df_supervissed=df[["Abs_token", "common_chemicals", "chemical_mapping"]]
df_supervissed.to_excel("df_extractions.xlsx", index=False)

In [18]:
from collections import Counter

# Aplanar todos los valores del nuevo diccionario de mapeo
all_mappings = df["chemical_mapping"].dropna().apply(lambda d: list(d.items()))
flat_mappings = [pair for sublist in all_mappings for pair in sublist]

# Contar las apariciones más comunes
conteo_mapeos = Counter(flat_mappings).most_common(70)

# Mostrar los más frecuentes
for (original, mapeado), count in conteo_mapeos:
    print(f"{(original, mapeado)}: {count}")

('water', 'water'): 52
('sulfur', 'sulfide'): 39
('quercetin', 'quercetin'): 22
('sodium', 'sodium'): 20
('glutathione', 'glutathione'): 18
('nitrogen', 'nitrogen'): 15
('nitrate', 'nitrate'): 14
('ascorbic acid', 'ascorbic acid'): 14
('glucose', 'd-glucose'): 12
('superoxide', 'superoxide'): 12
('cysteine', 'l-cysteine'): 11
('ethanol', 'ethanol'): 11
('phenol', 'phenol'): 11
('vitamin c', 'ascorbic acid'): 11
('kaempferol', 'kaempferol'): 11
('l cysteine', 'l-cysteine'): 9
('potassium', 'potassium'): 9
('phosphate', 'phosphate'): 9
('hydrogen', 'hydrogen'): 9
('calcium', 'calcium'): 9
('phosphorus', 'phosphorus'): 8
('acetone', 'acetone'): 8
('methanol', 'methanol'): 8
('chloride', 'chloride'): 8
('sucrose', 'sucrose'): 7
('zinc', 'zinc'): 7
('selenium', 'selenium'): 7
('sulphur', 'sulfide'): 6
('gallic acid', 'gallic acid'): 6
('iron', 'iron'): 6
('cholesterol', 'cholesterol'): 6
('pyruvic acid', 'b-hydroxypropionic acid'): 6
('sulfate', 'sulfate'): 6
('nitric oxide', 'nitric oxide'

In [19]:
# Convertir a DataFrame
df_mappings = pd.DataFrame([
    {"original": original, "mapped": mapeado, "count": count}
    for (original, mapeado), count in conteo_mapeos
])
df_mappings.to_excel("df_freq_mappings.xlsx", index=False)

In [20]:
df["common_chemicals"] = df["common_chemicals"].apply(lambda x: "; ".join(x) if x else "")
#para ver toda la lista
pd.set_option("display.max_rows", None)
print(df)

                                               Authors  \
0    Kasamatsu S.; Kinno A.; Hishiyama J.-I.; Akaik...   
1    Roszczenko P.; Szewczyk O.K.; Czarnomysy R.; B...   
2        Shlosberg Y.; Huang A.; Tóth T.N.; Kaner R.B.   
3               Xu H.; Li Y.; Xing J.; Liu L.; Wang Y.   
4    Vuković S.; Popović-Djordjević J.B.; Kostić A....   
5      Kim B.-M.; Suh S.G.; Oh W.; Oh S.-Y.; Jung J.H.   
6    Di Gioia F.; Hong J.C.; Pisani C.; Petropoulos...   
7    Jayaswall K.; Kumar D.; Jayaswal D.; Sharma H....   
8    Vyavahare G.D.; Lee Y.; Seok Y.J.; Kim H.N.; S...   
9    Li X.; Li R.; Wang K.; Kong Y.; Lv Y.; Cao B.;...   
10      Zhou J.; Xin X.; Li W.; Ding H.; Yu S.; Cui X.   
11                    Nishioka T.; Suga H.; Shimizu M.   
12   Čepulienė V.; Juškevičienė D.; Viškelis J.; Mo...   
13   Ou Z.; Deng Y.; Wu Y.; Wang Y.; Zhao Y.; Liu C...   
14   Nguyen N.K.; Vo D.T.V.; Le T.X.; Morton L.W.; ...   
15   Lee D.-Y.; Kim E.-J.; Park S.-E.; Cho K.-M.; K...   
16   Xiang L.;

### Unir y reemplazar (para visualizar en VOS viewer)

In [22]:
scopus_df = pd.read_csv("Scopus2 (1).csv")  # Base de datos de Scopus

In [23]:
scopus_df["Author Keywords"] = df["common_chemicals"]
print(scopus_df["Author Keywords"])

0                                                sulfide
1                                                       
2                                            nadph; nadh
3                                             naringenin
4                                                ethanol
5                                                       
6                               water; nitrate; nitrogen
7                        sulfide; l-cystine; glutathione
8                               nitrogen; water; nitrate
9                                                  water
10                                           acetic acid
11                                            l-cysteine
12                                                      
13     d-glucose; malondialdehyde; coenzyme a; glutat...
14                                             potassium
15           l-leucine; melibiose; l-methionine; glycine
16                                                      
17                             

### Descargar

In [None]:
# Guardar el dataframe con la nueva columna de extracciones
scopus_df.to_csv("common_chems.csv", index=False)