In [1]:
import pandas as pd
import numpy as np
import igraph as ig
from meta_matching_tool import *

In [2]:
import math
import torch
import torch.nn as nn
from torch.nn.parameter import Parameter
import torch.nn.functional as F
import torch.nn.init as init
import os
import zipfile

# Windows
# package_dir = "E:\\SPARSENN\\Modified_model\\meta_matching_tool"
# Macos
package_dir = "/Users/watertank/Desktop/SPARSENN/Modified_model/meta_matching_tool"
# package_dir = os.path.abspath(os.path.dirname(__file__))


zip_path = os.path.join(package_dir, 'data', 'kegg.txt.zip')
output_dir = "data/"

# Check if the zip file exists
if os.path.exists(zip_path):
    # Open the zip file
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        # Extract all the files
        zip_ref.extractall(output_dir)
    os.remove(zip_path)

#################### Function for data pre-processing ####################

# define a function to remove rows with more than 75% of zeros
def remove_rows(dat, data, thres = 0.75):
    """
    dat contains mz and time and other info
    data has shape (n_features, n_samples)
    """
    rowsum = np.sum(data==0,1)
    new_dat = dat.iloc[np.where((rowsum < thres * data.shape[1])==True)[0],:]
    return new_dat

# define a function to find the potential KEGGID for each feature
def find_keggid(dat, kegg_sub, this_adductlist, match_tol_ppm=9):
    """
    dat contains mz and time and other info
    kegg_sub is a subset of kegg that contains only the compounds that are in the graph
    """
    this_kegg = kegg_sub[kegg_sub['Adduct'].isin(this_adductlist)]

    dic = {}
    for i in range(dat.shape[0]):
        # If the mz matches that in the database, we claim that this is a match.
        idx = list(np.where(np.abs(this_kegg['mz']-dat['mz'].iloc[i])/(dat['mz'].iloc[i])<=match_tol_ppm/1e6)[0])
        
        # Get the corresponding KEGGID.
        dic[dat.index[i]] = list(this_kegg['KEGGID'].values[idx])
    return dic

# define a function to get the feature-metabolite matching matrix, adj matrix and feature data
def get_data(dic, new_dat, g):
    # Get all the features and the metabolites.
    features = [key for key, value in dic.items() if value!=[]]
    
    metabolites = np.unique(sum([value for key, value in dic.items() if value!=[]], []))

    
    # get feature data  
    data_anno_new = new_dat.loc[features,:]
    print("The shape of data:", data_anno_new.shape)

    # get feature-metabolite matching matrix
    matching = np.zeros([len(features), len(metabolites)])
    for ix,i in enumerate(features):
        idx = np.where(np.isin(metabolites, dic[i]))[0]
        matching[ix, idx] = 1

    print("The shape of feature-metabolites matching:", matching.shape)

    # get adjacency matrix of metabolites
    subgraph = ig.Graph()

    # add the vertices from order_list to the subgraph
    for v in metabolites:
        if v in g.vs["name"]:
            subgraph.add_vertex(v)

    # add the edges that connect the vertices in the subgraph
    for e in g.es:
        source = e.source
        target = e.target
        if g.vs['name'][source] in metabolites and g.vs['name'][target] in metabolites:
            subgraph.add_edge(g.vs['name'][source], g.vs['name'][target])

    # Not quite clear why didn't use the original adj matrix but a new one. TODO: check this
    adj_new = np.array(subgraph.get_adjacency().data)
    g_sub = ig.Graph.Adjacency((adj_new > 0).tolist(), mode = "undirected")

    adj_matrix = np.array(g_sub.get_adjacency().data)
    print("The shape of metabolic network:", adj_matrix.shape)
    
    return(data_anno_new, matching, adj_matrix, metabolites)


def data_preprocessing(pos=None, neg=None, 
                       pos_adductlist=["M+H","M+NH4","M+Na","M+ACN+H","M+ACN+Na","M+2ACN+H","2M+H","2M+Na","2M+ACN+H"], 
                       neg_adductlist = ["M-H", "M-2H", "M-2H+Na", "M-2H+K", "M-2H+NH4", "M-H2O-H", "M-H+Cl", "M+Cl", "M+2Cl"], 
                       idx_feature = 4, match_tol_ppm=5, zero_threshold=0.75, log_transform=True, scale=1000):

    # Load data
    g = ig.Graph.Read_GraphML(os.path.join(package_dir, 'data', 'graph.graphhml'))
    all_compound = list(g.vs["name"])
    
    # Filter out the lines in the DB where **(kegg['mz']-kegg['AdductMass']==kegg['MonoisotopicMass'])**.
    kegg = pd.read_csv(os.path.join(package_dir, 'data', 'kegg.txt'), sep='\t')
    kegg['r'] = (kegg['mz']-kegg['AdductMass']==kegg['MonoisotopicMass'])
    kegg = kegg[kegg['r']==True]
    kegg_sub = kegg[kegg['KEGGID'].isin(all_compound)]

    if pos is not None:
        pos.columns.values[0] = 'mz'
        pos.columns.values[1] = 'time'
        pos.columns = pos.columns.str.replace('pos', '')
        pos.index = ['pos.' + str(i) for i in pos.index]

    if neg is not None:
        neg.columns.values[0] = 'mz'
        neg.columns.values[1] = 'time'
        neg.columns = neg.columns.str.replace('neg', '')
        neg.index = ['neg.' + str(i) for i in neg.index]

    # concatenate the two dataframes
    if pos is not None and neg is not None:
        dat = pd.concat([pos, neg], axis=0)
    elif pos is not None:
        dat = pos
    elif neg is not None:
        dat = neg
        
    # leave out those with very low expression rate
    new_dat = remove_rows(dat, dat.iloc[:,idx_feature:], thres = zero_threshold)

    # select only the compounds that are in the graph
     
    if pos is not None and neg is not None:
        dic_pos = find_keggid(new_dat.loc[new_dat.index.str.contains('pos')], kegg_sub, pos_adductlist)
        dic_neg = find_keggid(new_dat.loc[new_dat.index.str.contains('neg')], kegg_sub, neg_adductlist)

        dic = {**dic_pos, **dic_neg}
    elif pos is not None:
        dic = find_keggid(new_dat.loc[new_dat.index.str.contains('pos')], kegg_sub, pos_adductlist)
    elif neg is not None:
        dic = find_keggid(new_dat.loc[new_dat.index.str.contains('neg')], kegg_sub, neg_adductlist)

    data_annos, matchings, adj_matrices, metabolites = get_data(dic, new_dat, g)
    
    if log_transform:
        data_annos.iloc[:,idx_feature:] = np.log(data_annos.iloc[:,idx_feature:]+1)
        
    if scale:
        expression = data_annos.iloc[:,idx_feature:].T
        m_min = np.min(expression, 0)
        m_max = np.max(expression, 0)

        expression = ((expression - m_min)/(m_max - m_min)-0.5) * scale
        
        data_annos.iloc[:,idx_feature:] = expression.T

    return(data_annos, matchings, adj_matrices, metabolites)

In [3]:
# define a function to preprocess the data
pos = pd.read_csv('positive.txt', sep='\t')
neg = pd.read_csv('negative.txt', sep='\t')

pos_adductlist = ["M+H","M+NH4","M+Na","M+ACN+H","M+ACN+Na","M+2ACN+H","2M+H","2M+Na","2M+ACN+H"]
neg_adductlist = ["M-H", "M-2H", "M-2H+Na", "M-2H+K", "M-2H+NH4", "M-H2O-H", "M-H+Cl", "M+Cl", "M+2Cl"]

In [4]:
data_annos, matchings, adj_matrices,  metabolites = data_preprocessing(pos=pos, neg=neg, idx_feature = 4, match_tol_ppm=5, zero_threshold=0.75, scale = 1000)

The shape of data: (1174, 704)
The shape of feature-metabolites matching: (1174, 913)
The shape of metabolic network: (913, 913)


In [6]:
print(metabolites.shape)

(913,)


In [25]:
metabolites

array(['C00029', 'C00032', 'C00036', 'C00041', 'C00042', 'C00047',
       'C00048', 'C00049', 'C00052', 'C00058', 'C00059', 'C00061',
       'C00062', 'C00064', 'C00065', 'C00072', 'C00073', 'C00074',
       'C00077', 'C00078', 'C00079', 'C00081', 'C00082', 'C00084',
       'C00086', 'C00089', 'C00090', 'C00095', 'C00097', 'C00099',
       'C00104', 'C00106', 'C00108', 'C00109', 'C00111', 'C00114',
       'C00116', 'C00118', 'C00121', 'C00122', 'C00123', 'C00129',
       'C00133', 'C00134', 'C00135', 'C00137', 'C00140', 'C00141',
       'C00143', 'C00146', 'C00147', 'C00148', 'C00149', 'C00152',
       'C00153', 'C00155', 'C00156', 'C00158', 'C00159', 'C00160',
       'C00163', 'C00164', 'C00166', 'C00170', 'C00178', 'C00179',
       'C00180', 'C00181', 'C00183', 'C00184', 'C00185', 'C00186',
       'C00188', 'C00189', 'C00191', 'C00198', 'C00208', 'C00212',
       'C00213', 'C00214', 'C00217', 'C00218', 'C00219', 'C00221',
       'C00222', 'C00224', 'C00227', 'C00232', 'C00233', 'C002

In [7]:
kegg = pd.read_csv(os.path.join(package_dir, 'data', 'kegg.txt'), sep='\t')

In [26]:
# 步骤1: 提取唯一的 KEGGID-Name 映射（保留第一个出现的 Name）
unique_id_name = kegg[["KEGGID", "Name"]].drop_duplicates(subset="KEGGID", keep="first")

# 步骤2: 将你的 KEGGID 数组转换为 DataFrame（假设你的数组名为 kegg_id_array）
kegg_id_array = metabolites # 替换为你的实际数组
query_df = pd.DataFrame({"KEGGID": kegg_id_array})

# 步骤3: 通过 merge 快速匹配名称（类似 SQL 的 JOIN）
result_df = query_df.merge(unique_id_name, on="KEGGID", how="left")

# 输出结果
print(result_df)

     KEGGID                                               Name
0    C00029  UDP-glucose;UDPglucose;UDP-D-glucose;Uridine d...
1    C00032            Heme;Haem;Protoheme;Heme B;Protoheme IX
2    C00036  Oxaloacetate;Oxalacetic acid;Oxaloacetic acid;...
3    C00041  L-Alanine;L-2-Aminopropionic acid;L-alpha-Alanine
4    C00042  Succinate;Succinic acid;Butanedionic acid;Ethy...
..      ...                                                ...
908  C20775       beta-Citryl-L-glutamate;beta-Citrylglutamate
909  C20776  N-Acetylaspartylglutamylglutamate;N-Acetyl-L-a...
910  C20806                         omega-Hydroxyphylloquinone
911  C20835                alpha-L-Fucopyranose;alpha-L-Fucose
912  C20836                  beta-L-Fucopyranose;beta-L-Fucose

[913 rows x 2 columns]


In [28]:
result_df.to_csv("target_metabolic.txt", sep="\t", index=False, encoding="utf-8")

False