In [1]:
from pathlib import Path

from glycowork.glycan_data.loader import df_species
from glycowork.motif.graph import glycan_to_nxGraph
import pandas as pd

from gifflar.data.utils import GlycanStorage

# SMILES fragment for each bond type name. The three entries differ only in
# the stereo marker on the second carbon: "@" for alpha, "@@" for beta, and
# no marker when the stereochemistry is not specified.
BONDS = dict(
    alpha_bond="C[C@H](OC)CC",
    beta_bond="C[C@@H](OC)CC",
    nostereo_bond="CC(OC)CC",
)

In [2]:
# NOTE(review): machine-specific absolute path -- point this at the local
# GIFFLAR storage directory before running on another machine.
gs = GlycanStorage("/home/daniel/Data1/roman/GIFFLAR/data_pret")

# Derive the working table with a non-mutating chain instead of in-place edits:
#   * the DataFrame object owned by the glycowork module is left untouched, and
#   * the cell can be re-run safely (the in-place `drop` raised KeyError on a
#     second run because "ref" was already gone; `errors="ignore"` covers that).
# `ID` is a sequential, zero-padded identifier (GID00001, GID00002, ...) that
# follows the current row order of the table.
df_species = (
    df_species
    .assign(ID=[f"GID{i + 1:05d}" for i in range(len(df_species))])
    .rename(columns={"glycan": "Glycan"})
    .drop(columns="ref", errors="ignore")
)
df_species.head()

Unnamed: 0,Glycan,Species,Genus,Family,Order,Class,Phylum,Kingdom,Domain,ID
0,Gal(a1-2)[Rha3Me(a1-3)][Xyl(b1-4)]Fuc(a1-3)[Xy...,ATCV-1,Chlorovirus,Phycodnaviridae,Algavirales,Megaviricetes,Nucleocytoviricota,Bamfordvirae,Virus,GID00001
1,Gal(a1-2)[Rha3Me(a1-3)][Xyl(b1-4)]Fuc(a1-3)[Xy...,ATCV-1,Chlorovirus,Phycodnaviridae,Algavirales,Megaviricetes,Nucleocytoviricota,Bamfordvirae,Virus,GID00002
2,Gal(a1-2)[Rha3Me(a1-3)][Xyl4Me(b1-4)]Fuc(a1-3)...,ATCV-1,Chlorovirus,Phycodnaviridae,Algavirales,Megaviricetes,Nucleocytoviricota,Bamfordvirae,Virus,GID00003
3,Gal(a1-2)[Rha3Me(a1-3)][Xyl4Me(b1-4)]Fuc(a1-3)...,ATCV-1,Chlorovirus,Phycodnaviridae,Algavirales,Megaviricetes,Nucleocytoviricota,Bamfordvirae,Virus,GID00004
4,GalA(a1-2)Rha(a1-4)GalA,Abelmoschus_esculentus,Abelmoschus,Malvaceae,Malvales,Dicotyledons,Angiosperms,Plantae,Eukarya,GID00005


In [3]:
def parse_mono(filepath, iupac):
    """Write the monomer/bond description of one glycan and return its monosaccharides.

    The glycan is converted with ``glycan_to_nxGraph``. Nodes with an even id
    are treated as monosaccharides and nodes with an odd id as the bonds
    between them (NOTE(review): this relies on glycowork's node numbering --
    confirm against the installed version).

    The text written to ``filepath`` has three sections:

    * ``SMILES``   -- one ``<name> <smiles>`` line per distinct monosaccharide,
      followed by one line per bond type that occurs in the glycan,
    * ``MONOMERS`` -- one ``<index> <name>`` line per monosaccharide node,
    * ``BONDS``    -- one ``<index> <index> <bond type>`` line per bond node,

    where indices are 1-based and computed as ``node_id // 2 + 1``.

    Args:
        filepath: Despite the name, an open, writable text stream; it is
            passed as ``file=`` to ``print``.
        iupac: Glycan string handed to ``glycan_to_nxGraph``.

    Returns:
        dict mapping each monosaccharide label (wrapped in double quotes when
        the label contains a comma) to the ``"smiles"`` entry of the record
        returned by ``gs.query``.

    Raises:
        ValueError: if ``gs.query`` returns ``None`` for a monosaccharide.
            Nothing has been written to ``filepath`` at that point.
    """
    mono = {}
    bonds = set()
    mono_lines = []
    bond_lines = []

    g = glycan_to_nxGraph(iupac)

    for n in g.nodes:
        label = g.nodes[n]["string_labels"]
        if n % 2 == 0:  # monosaccharide
            r = gs.query(label)
            if r is None:
                # Previously this fell through to r["smiles"] and died with an
                # opaque TypeError; fail with a message that names the culprit.
                raise ValueError(
                    f"No stored entry for monosaccharide {label!r} in glycan {iupac!r}"
                )
            # Labels containing a comma are quoted so they stay a single field
            # when the caller writes them to a comma-separated file.
            m = f'"{label}"' if "," in label else label
            mono[m] = r["smiles"]
            mono_lines.append(f"{n // 2 + 1} {m}")
        else:  # bond
            # Substring test on the linkage label: any "a" means alpha,
            # otherwise any "b" means beta, otherwise stereo is unspecified.
            if "a" in label:
                bond_type = "alpha_bond"
            elif "b" in label:
                bond_type = "beta_bond"
            else:
                bond_type = "nostereo_bond"
            bonds.add(bond_type)
            neighbours = list(g.neighbors(n))
            bond_lines.append(
                f"{min(neighbours) // 2 + 1} {max(neighbours) // 2 + 1} {bond_type}"
            )

    print("SMILES", file=filepath)
    for name, smiles in mono.items():
        print(name, smiles, file=filepath)
    # Sorted so the file content does not depend on set iteration order,
    # which varies between interpreter runs for strings.
    for bond in sorted(bonds):
        print(bond, BONDS[bond], file=filepath)
    # Each header is preceded by a blank line and followed by its entries,
    # one per line (byte-identical to the previous concatenated output).
    print("\nMONOMERS", *mono_lines, sep="\n", file=filepath)
    print("\nBONDS", *bond_lines, sep="\n", file=filepath)
    return mono


def parse_level(level):
    """Export the glycans of one taxonomy level into ``taxonomy_<level>/``.

    Only rows of ``df_species`` whose ``Glycan`` value occurs in the ``IUPAC``
    column of ``../GLAMOUR/taxonomy_<level>.tsv`` are exported. For each such
    row a graph file ``graphs/<ID>_graph.txt`` is written via ``parse_mono``.
    Afterwards the selected rows are saved as ``multilabel.txt``, the bond
    SMILES as ``bonds.txt`` and all monosaccharides collected from
    ``parse_mono`` as ``monos.txt``. A ``Parsing <i>`` progress counter is
    printed for every row of ``df_species``, exported or not.

    Args:
        level: Taxonomy level name; used in the output directory name and in
            the name of the reference TSV file.
    """
    out_dir = Path(f"taxonomy_{level}")
    out_dir.mkdir(exist_ok=True)
    graph_dir = out_dir / "graphs"
    graph_dir.mkdir(exist_ok=True)

    reference = pd.read_csv(f"../GLAMOUR/taxonomy_{level}.tsv", sep="\t")
    valid = set(reference["IUPAC"])

    keep = [False] * len(df_species)
    monos = {}
    for position, (_, row) in enumerate(df_species.iterrows()):
        print(f"\rParsing {position}", end="")
        if row["Glycan"] in valid:
            keep[position] = True
            with open(graph_dir / f"{row['ID']}_graph.txt", "w") as handle:
                monos.update(parse_mono(handle, row["Glycan"]))

    df_species[keep].to_csv(out_dir / "multilabel.txt", index=False)

    def write_table(path, entries):
        # Two-column comma-separated listing with a fixed header line.
        with open(path, "w") as handle:
            print("Molecule,SMILES", file=handle)
            for name, smiles in entries.items():
                print(name, smiles, file=handle, sep=",")

    write_table(out_dir / "bonds.txt", BONDS)
    write_table(out_dir / "monos.txt", monos)


# Export the "Domain" taxonomy level: reads ../GLAMOUR/taxonomy_Domain.tsv and
# writes its results under taxonomy_Domain/ relative to the working directory.
parse_level("Domain")

Parsing 37516