In [3]:
# Notebook setup: standard-library modules first, then third-party packages.
import warnings
import os
# Silence every warning category. This runs before the imports below, so the
# filter is already active while those packages are being imported.
warnings.filterwarnings("ignore")
# Star import: pulls all public names of nbdev.showdoc into the notebook namespace.
from nbdev.showdoc import *
import copy
from IPython.display import HTML
import pandas as pd
# glycowork pieces: the species dataset plus motif analysis, processing and query helpers.
from glycowork.glycan_data.loader import df_species
from glycowork.motif.analysis import plot_embeddings, get_heatmap, characterize_monosaccharide
from glycowork.motif.processing import presence_to_matrix
from glycowork.motif.query import get_insight
# The autoreload extension is left disabled (commented out) here.
#%load_ext autoreload

In [2]:
# Directory (relative path) that the data-loading cells join with each file name.
DATA_PATH = "../../data/glycan_embedding"

# Datasets

In [4]:
# Load the extracted glycans that feed the inference and enrichment steps,
# then preview the first five rows.
df_glycan_list = pd.read_csv(
    os.path.join(DATA_PATH, 'glycan_list.csv')
)
df_glycan_list.head(n=5)

Unnamed: 0,glycan,Composition,tissue_species,tissue_sample
0,Fuc(a1-?)GlcNAc(b1-2)Man(a1-6)[GlcNAc(b1-2)Man...,"{'dHex': 2, 'HexNAc': 4, 'Hex': 3}",['Homo_sapiens'],['blood']
1,Neu5Ac(a2-?)Gal(b1-4)GlcNAc(b1-2)Man(a1-6)[Glc...,"{'Neu5Ac': 1, 'Hex': 4, 'HexNAc': 4, 'dHex': 1}",['Homo_sapiens'],['blood']
2,Neu5Ac(a2-6)Gal(b1-4)GlcNAc(b1-2)Man(a1-6)[Gal...,"{'Neu5Ac': 1, 'Hex': 5, 'HexNAc': 4}",['Homo_sapiens'],['blood']
3,Neu5Ac(a2-6)Gal(b1-4)GlcNAc(b1-2)Man(a1-6)[Glc...,"{'Neu5Ac': 1, 'Hex': 4, 'HexNAc': 4}",['Homo_sapiens'],['blood']
4,Fuc(a1-2)[GalNAc(a1-3)]Gal(b1-4)GlcNAc(b1-2)Ma...,"{'dHex': 1, 'HexNAc': 5, 'Hex': 5}",['Homo_sapiens'],['blood']


In [6]:
# Load the glycan sequences on which the embedding space is learned and
# preview the first five rows.
# NOTE: unpickling can execute arbitrary code; only load this file from a trusted source.
df_glycan = pd.read_pickle(
    os.path.join(DATA_PATH, 'df_glycan.pkl')
)
df_glycan.head(n=5)

Unnamed: 0,glycan,Species,Genus,Family,Order,Class,Phylum,Kingdom,Domain,ref,...,disease_id,disease_sample,disease_direction,disease_ref,disease_species,tissue_sample,tissue_id,tissue_ref,tissue_species,Composition
0,Gal(b1-4)Glc-ol,"[Acinonyx_jubatus, Addax_nasomaculatus, Aepyce...","[Acinonyx, Addax, Aepyceros, Ailuropoda, Alcel...","[Felidae, Bovidae, Bovidae, Ursidae, Bovidae, ...","[Carnivora, Artiodactyla, Artiodactyla, Carniv...","[Mammalia, Mammalia, Mammalia, Mammalia, Mamma...","[Chordata, Chordata, Chordata, Chordata, Chord...","[Animalia, Animalia, Animalia, Animalia, Anima...","[Eukarya, Eukarya, Eukarya, Eukarya, Eukarya, ...","[https://pubmed.ncbi.nlm.nih.gov/31828568/, ht...",...,[],[],[],[],[],"[milk, milk, milk, milk, milk, milk, milk, mil...","[UBERON:0001913, UBERON:0001913, UBERON:000191...","[https://pubmed.ncbi.nlm.nih.gov/31828568/, ht...","[Acinonyx_jubatus, Addax_nasomaculatus, Ailuro...",{'Hex': 2}
1,Neu5Ac(a2-3)Gal(b1-4)Glc1Cer,"[Alces_alces, Balaenoptera_acutorostrata, Bos_...","[Alces, Balaenoptera, Bos, Bubalus, Campylobac...","[Cervidae, Balaenopteridae, Bovidae, Bovidae, ...","[Artiodactyla, Artiodactyla, Artiodactyla, Art...","[Mammalia, Mammalia, Mammalia, Mammalia, Epsil...","[Chordata, Chordata, Chordata, Chordata, Prote...","[Animalia, Animalia, Animalia, Animalia, Bacte...","[Eukarya, Eukarya, Eukarya, Eukarya, Bacteria,...","[https://pubmed.ncbi.nlm.nih.gov/26104834/, ht...",...,"[DOID:83, DOID:5409, DOID:1909]","[lens, tumor, skin]","[up, up, up]","[https://pubmed.ncbi.nlm.nih.gov/7905480/, htt...","[Homo_sapiens, Homo_sapiens, Cricetulus_griseus]","[A549_cell_line, AML_193_cell_line, CHOK1_cell...","[cellosaurus:CVCL_0023, cellosaurus:CVCL_1071,...","[https://pubmed.ncbi.nlm.nih.gov/23345451/, ht...","[Homo_sapiens, Homo_sapiens, Homo_sapiens, Hom...","{'Neu5Ac': 1, 'Hex': 2}"
2,Glc1Cer,"[Acaudina_molpadioides, Acholeplasma_axanthum,...","[Acaudina, Acholeplasma, Agama, Agama, Agelas,...","[Caudinidae, Acholeplasmataceae, Agamidae, Aga...","[Molpadiida, Acholeplasmatales, Squamata, Squa...","[Holothuroidea, Mollicutes, Reptilia, Reptilia...","[Echinodermata, Firmicutes, Chordata, Chordata...","[Animalia, Bacteria, Animalia, Animalia, Anima...","[Eukarya, Bacteria, Eukarya, Eukarya, Eukarya,...","[https://pubmed.ncbi.nlm.nih.gov/22004409/, ht...",...,[DOID:1909],[skin],[up],[https://pubmed.ncbi.nlm.nih.gov/2582447/],[Cricetulus_griseus],"[COS7_cell_line, HT29_cell_line, M1_cell_line,...","[cellosaurus:CVCL_0224, cellosaurus:CVCL_A8EZ,...","[https://pubmed.ncbi.nlm.nih.gov/20157020/, ht...","[Chlorocebus_sabaeus, Homo_sapiens, Mus_muscul...",{'Hex': 1}
3,Gal(b1-4)GlcNAc(b1-2)Man(a1-3)[Gal(b1-4)GlcNAc...,"[Angiostrongylus_cantonensis, AvianInfluenzaA_...","[Angiostrongylus, Alphainfluenzavirus, Bos, Bo...","[Angiostrongylidae, Orthomyxoviridae, Bovidae,...","[Rhabditida, Articulavirales, Artiodactyla, Ar...","[Chromadorea, Insthoviricetes, Mammalia, Mamma...","[Nematoda, Negarnaviricota, Chordata, Chordata...","[Animalia, Riboviria, Animalia, Animalia, Anim...","[Eukarya, Virus, Eukarya, Eukarya, Eukarya, Eu...","[https://pubmed.ncbi.nlm.nih.gov/26650734/, ht...",...,"[DOID:8778, DOID:11729, DOID:9256, DOID:008052...","[serum, serum, tumor, serum, serum, serum, ser...","[down, up, down, down, down, down, down, down,...","[https://pubmed.ncbi.nlm.nih.gov/34643622/, ht...","[Homo_sapiens, Homo_sapiens, Homo_sapiens, Hom...","[2A3_cell_line, A9_fibroblast_cell_line, BMMC_...","[cellosaurus:CVCL_0D71, cellosaurus:CVCL_3984,...","[https://pubmed.ncbi.nlm.nih.gov/36289103/, ht...","[Homo_sapiens, Mus_musculus, Homo_sapiens, Hom...","{'Hex': 5, 'HexNAc': 4, 'dHex': 1}"
4,Man(a1-2)Man(a1-3)[Man(a1-3)[Man(a1-6)]Man(a1-...,"[Adeno-associated_dependoparvovirusA, Angiostr...","[Dependoparvovirus, Angiostrongylus, Arabidops...","[Parvoviridae, Angiostrongylidae, Brassicaceae...","[Piccovirales, Rhabditida, Brassicales, Lepido...","[Quintoviricetes, Chromadorea, Dicotyledons, I...","[Cossaviricota, Nematoda, Angiosperms, Arthrop...","[Shotokuvirae, Animalia, Plantae, Animalia, An...","[Virus, Eukarya, Eukarya, Eukarya, Eukarya, Eu...","[https://pubmed.ncbi.nlm.nih.gov/37774344/, ht...",...,"[DOID:9965, SYMP:0000633, DOID:3908, DOID:3969...","[serum, plasma, serum, serum, serum, urine, , ...","[up, down, down, up, up, up, , , , ]","[https://pubmed.ncbi.nlm.nih.gov/32123198/, ht...","[Mus_musculus, Homo_sapiens, Homo_sapiens, Hom...","[2A3_cell_line, B_cell, CHOK1_cell_line, CHOS_...","[cellosaurus:CVCL_0D71, CL:0000236, cellosauru...","[https://pubmed.ncbi.nlm.nih.gov/36289103/, ht...","[Homo_sapiens, Homo_sapiens, Cricetulus_griseu...","{'Hex': 6, 'HexNAc': 2}"


In [9]:
# Load the glycan-binding table and preview the first ten rows.
# Presumably one row per protein: the preview shows glycan-named columns plus
# `target` and `protein` -- TODO confirm against the data source.
# NOTE: unpickling can execute arbitrary code; only load this file from a trusted source.
df_glycan_binding = pd.read_pickle(
    os.path.join(DATA_PATH, 'glycan_binding.pkl')
)
df_glycan_binding.head(n=10)

Unnamed: 0,3-Anhydro-Gal(a1-3)Gal(b1-4)3-Anhydro-Gal(a1-3)Gal4S,3-Anhydro-Gal(a1-3)Gal4S(b1-4)3-Anhydro-Gal(a1-3)Gal4S,3-Anhydro-Gal(a1-3)Gal4S(b1-4)3-Anhydro-Gal(a1-3)Gal4S(b1-4)3-Anhydro-Gal(a1-3)Gal4S,3-Anhydro-Gal(a1-3)Gal4S(b1-4)3-Anhydro-Gal(a1-3)Gal4S(b1-4)3-Anhydro-Gal(a1-3)Gal4S(b1-4)3-Anhydro-Gal(a1-3)Gal4S,3-Anhydro-Gal(a1-3)Gal4S(b1-4)3-Anhydro-Gal2S(a1-3)Gal4S(b1-4)3-Anhydro-Gal(a1-3)Gal4S,3dGal(b1-3)[Fuc(a1-4)]Glc,3dGal(b1-4)Glc,4d8dNeu5Ac(a2-3)Gal(b1-4)Glc,4dNeu5Ac(a2-3)Gal(b1-4)Glc,7dNeu5Ac(a2-3)Gal(b1-4)Glc,...,wwwSflexneri5c,wwwSflexneriO2c,wwwSflexneriO5c,wwwSisomicin,wwwSmix,wwwTobramycin,wwwTyrS,wwwpHGGs,target,protein
0,,,,,,,,,,,...,,,,,,,,,AADSIPSISPTGIITPTPTQSGMVSNCNKFYDVHSNDGCSAIASSQ...,TAL6-4LysM
1,,,,,,,,,,,...,,,,,,,,,AAFFSLVVLLALLPFGIHASALPSTELTPRVNPNLPGPNDVFVGFR...,rCnSL-proA
2,,,,,,,,,,,...,,,,,,,,,AANEADYQAKLTAYQTELARVQKANADAKAAYEAAVAANNAANAAL...,AntigenI/IIA3VP1
3,,,,,,,,,,,...,,,,,,,,,AASKLGVPQPAQRDQVNCQLYAVQPNDNCIDISSKNNITYAQLLSW...,TAL6-6LysM
4,,,,,,,,,,,...,,,,,,,,,ACNNEWEDEQYEQYISFKSPIPAGGEGVTDIYVRYKEDGKVTYRLP...,SP15308A-bot-339-19-339
5,,,,,,,,,,,...,,,,,,,,,ACPSQCSCSGTEVNCAGKSLASVPAGIPTTTRVLYLNSNQITKLEP...,VLRB.aGPA.23-GCN4-biotin
6,,,,,,,,,,,...,,,,,,,,,ADEPIDLEKLEEKRDKENVGNLPKFDNEVKDGSENPMAKYPDFDDE...,Protein L(A-C2)
7,,,,,,,,,,,...,,,,,,,,,ADGIQDKICIGYLSNNSTDTVDTLTENGVPVTSSIDLVETNHTGTY...,A/H16-2
8,,,,,,,,,,,...,,,,,,,,,ADGIQDKICIGYLSNNSTDTVDTLTENGVPVTSSIDLVETNHTGTY...,A/H16
9,,,,,,,,,,,...,,,,,,,,,ADGIQDRICVGYLSTNSSERVDTLLENGVPVTSSIDLIETNHTGTY...,A/Gull/Maryland/704/1977(H13N6)


In [11]:
# Load the N-glycan table, which serves as a control on the embedding-space
# representation by assessing closeness; preview the first five rows.
# NOTE: unpickling can execute arbitrary code; only load this file from a trusted source.
df_N_glycans = pd.read_pickle(
    os.path.join(DATA_PATH, 'N_glycans_df.pkl')
)
df_N_glycans.head(n=5)

Unnamed: 0,glycan,Species,Genus,Family,Order,Class,Phylum,Kingdom,Domain,ref,...,disease_sample,disease_direction,disease_ref,disease_species,tissue_sample,tissue_id,tissue_ref,tissue_species,Composition,Structure_Type
1,Neu5Ac(a2-3)Gal(b1-4)GlcNAc(b1-2)Man(a1-6)[Gal...,"[Cricetulus_griseus, Homo_sapiens, Mus_musculu...","[Cricetulus, Homo, Mus, Ovis, Rattus]","[Cricetidae, Hominidae, Muridae, Bovidae, Muri...","[Rodentia, Primates, Rodentia, Artiodactyla, R...","[Mammalia, Mammalia, Mammalia, Mammalia, Mamma...","[Chordata, Chordata, Chordata, Chordata, Chord...","[Animalia, Animalia, Animalia, Animalia, Anima...","[Eukarya, Eukarya, Eukarya, Eukarya, Eukarya]","[, , https://unicarb-dr.glycosmos.org/referenc...",...,[],[],[],[],"[2A3_cell_line, AML_193_cell_line, Cal-27_cell...","[cellosaurus:CVCL_0D71, cellosaurus:CVCL_1071,...","[https://pubmed.ncbi.nlm.nih.gov/36289103/, ht...","[Homo_sapiens, Homo_sapiens, Homo_sapiens, Hom...","{'Neu5Ac': 1, 'Hex': 5, 'HexNAc': 4}",Complex_Gal
2,Gal(a1-3)Gal(b1-4)GlcNAc(b1-2)Man(a1-6)[Gal(b1...,"[Bos_taurus, Ginglymostoma_cirratum, Mus_muscu...","[Bos, Ginglymostoma, Mus, Sus]","[Bovidae, Ginglymostomatidae, Muridae, Suidae]","[Artiodactyla, Orectolobiformes, Rodentia, Art...","[Mammalia, Chondrichthyes, Mammalia, Mammalia]","[Chordata, Chordata, Chordata, Chordata]","[Animalia, Animalia, Animalia, Animalia]","[Eukarya, Eukarya, Eukarya, Eukarya]","[, https://pubmed.ncbi.nlm.nih.gov/19156518/, ...",...,[],[],[],[],[],[],[],[],"{'Hex': 6, 'HexNAc': 4}",Complex_Gal
3,Neu5Ac(a2-?)Gal(b1-4)GlcNAc(b1-2)Man(a1-6)[Gal...,"[Notamacropus_eugenii, Homo_sapiens, Homo_sapi...","[Notamacropus, Homo, Homo]","[Macropodidae, Hominidae, Hominidae]","[Diprotodontia, Primates, Primates]","[Mammalia, Mammalia, Mammalia]","[Chordata, Chordata, Chordata]","[Animalia, Animalia, Animalia]","[Eukarya, Eukarya, Eukarya]","[https://pubmed.ncbi.nlm.nih.gov/23053637/, ht...",...,"[, ]","[, ]","[, ]","[, ]","[urine, urine]","[UBERON:0001088, UBERON:0001088]","[https://pubmed.ncbi.nlm.nih.gov/33650863/, ht...","[Homo_sapiens, Homo_sapiens]","{'Neu5Ac': 1, 'Hex': 5, 'HexNAc': 4}",Complex_Gal
4,Fuc(a1-2)Gal(b1-4)GlcNAc(b1-2)Man(a1-6)[Gal(b1...,[Homo_sapiens],[Homo],[Hominidae],[Primates],[Mammalia],[Chordata],[Animalia],[Eukarya],[],...,[],[],[],[],[plasma],[ENVO:01000798],[https://pubmed.ncbi.nlm.nih.gov/1577715/],[Homo_sapiens],"{'dHex': 1, 'Hex': 5, 'HexNAc': 4}",Complex_Gal
5,Fuc(a1-2)[Gal(a1-3)]Gal(b1-4)GlcNAc(b1-2)Man(a...,[Homo_sapiens],[Homo],[Hominidae],[Primates],[Mammalia],[Chordata],[Animalia],[Eukarya],[],...,[],[],[],[],[plasma],[ENVO:01000798],[https://pubmed.ncbi.nlm.nih.gov/1577715/],[Homo_sapiens],"{'dHex': 1, 'Hex': 6, 'HexNAc': 4}",Complex_Gal
