In [1]:
import pandas as pd
import numpy as np
import igraph as ig
from meta_matching_tool import *

In [2]:
# Load the two tab-separated input tables ('positive.txt' and 'negative.txt').
# NOTE(review): no function is defined in this cell, and the same statements
# are repeated in a later cell of this notebook.
pos = pd.read_csv('positive.txt', sep='\t')
neg = pd.read_csv('negative.txt', sep='\t')

# Adduct labels for the positive and negative tables.
# NOTE(review): neither list is referenced again in the visible notebook --
# confirm whether the preprocessing step is supposed to receive them.
pos_adductlist = ["M+H","M+NH4","M+Na","M+ACN+H","M+ACN+Na","M+2ACN+H","2M+H","2M+Na","2M+ACN+H"]
neg_adductlist = ["M-H", "M-2H", "M-2H+Na", "M-2H+K", "M-2H+NH4", "M-H2O-H", "M-H+Cl", "M+Cl", "M+2Cl"]


In [3]:
# Load the tab-separated table stored in 'selected_feature_id.txt'.
# NOTE(review): target_metas is not used anywhere else in the visible notebook.
target_metas = pd.read_csv('selected_feature_id.txt', sep='\t')

In [4]:
###################### Function for main model ######################

def getLayerSizeList(partition, threshold_layer_size, sparsify_coefficient):
    """
    Obtain the size of each sparse layer.

    Layer 0 and layer 1 both have one neuron per metabolite; every later layer
    is the metabolite count multiplied by one more factor of
    ``sparsify_coefficient`` (truncated to an int).

    NOTE(review): a later cell of this notebook defines another function with
    the same name but a different sizing rule; after a top-to-bottom run the
    later definition is the one in effect.

    INPUT:
    partition: the adjacency matrix of the metabolic network (only the length
        of its first dimension is used)
    threshold_layer_size: the threshold of the sparse layers; it determines
        how many layers are generated
    sparsify_coefficient: the coefficient of each sparse level

    OUTPUT:
    sparsify_hidden_layer_size_dict: a dictionary mapping 'n_hidden_<i>' to the
        number of neurons in layer i
    """
    n_meta = np.shape(partition)[0]

    # Use numpy for the floor: `math` is never imported in this notebook, so
    # `math.floor` only worked if another import happened to leak that name.
    shrink_steps = np.floor(
        np.log10(1.0 * threshold_layer_size / n_meta) / np.log10(sparsify_coefficient)
    )
    n_layer = int(shrink_steps) + 3

    # dict for number of neurons in each layer
    sparsify_hidden_layer_size_dict = {'n_hidden_0': int(n_meta)}

    # TODO: check how this sparsifying rate was chosen
    for i in range(1, n_layer):
        sparsify_hidden_layer_size_dict['n_hidden_%d' % (i)] = int(n_meta * (sparsify_coefficient) ** (i - 1))
    return sparsify_hidden_layer_size_dict


def getPartitionMatricesList(sparsify_hidden_layer_size_dict, degree_dict, feature_meta, partition):
    """
    Obtain the linkage matrices between consecutive sparse layers.

    Starting from the full network, each layer i >= 2 drops the lowest-degree
    nodes (ties at the cut-off degree are broken at random) until the layer
    has the size given by ``sparsify_hidden_layer_size_dict``. The linkage
    matrix of a layer is the adjacency sub-matrix (rows: nodes of the previous
    layer, columns: nodes of the next layer) with one extra link per row.

    INPUT:
    sparsify_hidden_layer_size_dict: dict with keys 'n_hidden_<i>' giving the
        size of layer i
    degree_dict: dict mapping node id -> degree. It is MUTATED: nodes removed
        in a layer are popped from it, and the degrees of the remaining nodes
        are not recomputed.
    feature_meta: stored unchanged as the first matrix ("p0")
    partition: adjacency matrix of the metabolic network (needs ``.tolist()``
        and ``.shape``, so presumably a numpy array -- confirm)

    OUTPUT:
    partition_mtx_dict: dict with "p0" (feature_meta), "p1" (partition) and
        "p<i>" for every layer i >= 2
    residual_connection_dic: dict "p<i>" -> for each node kept in layer i, its
        position inside the previous layer

    Side effects: reseeds numpy's global RNG with 1 and prints the residual
    positions of every layer.
    """
    np.random.seed(1);  # for reproducible results (resets the global numpy RNG)
    g = ig.Graph.Adjacency((partition).tolist(), mode = "undirected")
    dist = np.array(g.shortest_paths()) # use the shortest distance matrix to assign links
    
    sum_remove_node_list = []  # keep note of which nodes are already removed
    
    partition_mtx_dict = {}
    residual_connection_dic = {}

    partition_mtx_dict["p0"] = feature_meta  # first matrix being the connection from features to meta
    partition_mtx_dict["p1"] = partition  # second matrix being the whole adjacency matrix

    # Build one linkage matrix per remaining layer.
    # TODO: rewrite this; the selection below is more convoluted than it needs to be.
    for i in range(2, len(sparsify_hidden_layer_size_dict)):
        num_nodes_to_remove = sparsify_hidden_layer_size_dict["n_hidden_%d" % (i-1)] - \
                              sparsify_hidden_layer_size_dict["n_hidden_%d" % (i)]
        # sort node degree dict according to number of degrees (ascending)
        sorted_node_degree_list = sorted(degree_dict.items(), key=lambda item: item[1])

        # Directly take the position of the nodes that are needed to be removed.
        temp_remove_list = []
        # Degree of the last node that falls inside the removal quota.
        # NOTE(review): if num_nodes_to_remove is 0 this index is -1, i.e. the
        # highest-degree node -- confirm that case cannot occur.
        max_to_remove_node_degree = sorted_node_degree_list[num_nodes_to_remove - 1][1]
        
        # any node with degree less than `max_to_remove_node_degree` is certain to be removed
        for j in range(num_nodes_to_remove):  
            if sorted_node_degree_list[j][1] < max_to_remove_node_degree:
                id_to_remove_node = sorted_node_degree_list[j][0]
                # print(sorted_node_degree_list[j])
                temp_remove_list.append(id_to_remove_node)
            else:
                break  # node with more degrees is not under consideration
        
        # sample from all nodes that have max_to_remove_node_degree to reach number of nodes to remove
        sample_list = []
        for j in range(len(temp_remove_list), len(sorted_node_degree_list)):
            if sorted_node_degree_list[j][1] == max_to_remove_node_degree:
                sample_list.append(sorted_node_degree_list[j])
            else:
                break  # node with more degrees is not under consideration
            
        # Break the tie at the cut-off degree: draw, without replacement, as
        # many of the tied nodes as are still needed to fill the quota.
        sample_idx_list = sorted(
            np.random.choice(len(sample_list), num_nodes_to_remove - len(temp_remove_list), replace=False))
        for idx in sample_idx_list:
            temp_remove_list.append(sample_list[idx][0])

        # Rows keep the nodes of the previous layer (everything removed before
        # this iteration is deleted); columns keep the nodes of the next layer
        # (this iteration's removals are deleted as well).
        all_list = np.arange(partition.shape[0])
        previous_layer_list = [x for x in all_list if x not in sum_remove_node_list]
        temp_partition = np.delete(partition, sum_remove_node_list, axis=0)
        sum_remove_node_list += temp_remove_list
        temp_partition = np.delete(temp_partition, sum_remove_node_list, axis=1)
        next_layer_list = [x for x in all_list if x not in sum_remove_node_list]

        # Residual connection layer: position of every surviving node inside the previous layer
        residual_location = [previous_layer_list.index(x) for x in next_layer_list]
        
        # assign each neuron at least one linkage
        # Earlier version, kept for reference. It indexed `dist` with k, the
        # row position inside the previous layer, rather than with the node's
        # id in the full network, which looks like a mistake.
        # for k in range(len(previous_layer_list)):
        #     if sum(dist[k,next_layer_list]==float("inf"))==len(next_layer_list):
        #         idx = np.random.choice(len(next_layer_list), 1, replace=False)
        #     else:
        #         idx = np.argsort(dist[k,next_layer_list], axis = -1)[0]
        #     temp_partition[k, idx] = 1
            
            
        # Current version: look distances up by the node's id in the full network.
        for k in range(len(previous_layer_list)):
            pos = previous_layer_list[k]
            if sum(dist[pos,next_layer_list]==float("inf"))==len(next_layer_list):
                # no next-layer node is reachable: link to a random one
                idx = np.random.choice(len(next_layer_list), 1, replace=False)
            else:
                # otherwise link to the closest next-layer node
                idx = np.argsort(dist[pos,next_layer_list], axis = -1)[0]
            temp_partition[k, idx] = 1
        
        # Forget the removed nodes so later layers cannot select them again
        # (this mutates the caller's degree_dict).
        for j in range(len(temp_remove_list)):
            degree_dict.pop(temp_remove_list[j])
            
        # if i == len(sparsify_hidden_layer_size_dict) - 1:
        #     print(next_layer_list)

        partition_mtx_dict["p%d" % i] = temp_partition

        residual_connection_dic["p%d" % i] = residual_location

        # NOTE(review): debugging output left in; prints one list per layer.
        print(residual_location)

    return partition_mtx_dict, residual_connection_dic


# This might not be used in my settings.
def getNodeDegreeDict(partition):
    """
    Obtain the node degrees from the adjacency matrix of the metabolic network.

    For every row, the degree is the number of entries equal to 1 minus one
    (the original author's note says the subtraction discounts the node's
    "own" entry -- presumably a 1 on the diagonal; confirm).

    Returns a dict mapping row index -> degree.
    """
    n_rows, n_cols = partition.shape
    return {
        node: sum(1 for other in range(n_cols) if partition[node, other] == 1) - 1
        for node in range(n_rows)
    }


## Functions for backward selection.
def getKeggidByIndex(raw_keggid, idxs, output_dir):
    """
    Placeholder that has not been implemented yet.

    NOTE(review): the intended behaviour is not visible in this notebook;
    judging only from the names it is presumably meant to map indices back to
    KEGG ids -- confirm before implementing.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError("getKeggidByIndex is not implemented yet")

In [5]:
import random

# The logic here is disastrous... Maybe there is better implementation
def _frontier(subgraph, connected):
    """Return the vertices adjacent to `connected` that are not in `connected`."""
    neighbours = set()
    for vertex in connected:
        # order=1, mindist=1: direct neighbours only, the vertex itself excluded
        neighbours.update(subgraph.neighborhood(vertex, order=1, mindist=1))
    return neighbours - connected


def backwardSelect(final_keggids, subgraph: "ig.Graph", numberOfNodesList: list):
    """
    Grow the node set of every layer backwards, starting from the output nodes.

    The last layer consists of the vertices found for `final_keggids`. Each
    earlier (larger) layer is obtained by adding graph neighbours of the nodes
    selected so far until the requested size is reached:

    * if the neighbours are more than enough, a random subset is taken;
    * if they are too few, all of them are taken and the search expands again;
    * if no unselected neighbour is left, the rest is drawn at random from all
      vertices that have not been selected yet.

    INPUT:
    final_keggids: identifiers of the output nodes, looked up with
        ``subgraph.vs.find``
    subgraph: the graph to expand in (only ``vs.find``, ``neighborhood`` and
        ``vcount`` are used)
    numberOfNodesList: layer sizes, ordered from the first layer to the output
        layer; the last entry must equal ``len(final_keggids)``. The list is
        NOT modified (the previous implementation reversed and shortened it in
        place, which broke any second call with the same list).

    OUTPUT:
    A list with one sorted list of vertex indices per layer, in the same order
    as `numberOfNodesList` (largest layer first, output layer last). Every
    layer contains all nodes of the layers after it.

    RAISES:
    AssertionError: if the last layer size differs from ``len(final_keggids)``.
    ValueError: if a layer is smaller than the layer after it, or if the graph
        has too few vertices to fill a layer.

    Selection uses the global `random` module; seed it for reproducible layers.
    """
    indices = {subgraph.vs.find(idx).index for idx in final_keggids}

    # number of output nodes must equal to number we pre-set
    assert numberOfNodesList[-1] == len(final_keggids)

    # Walk from the layer just before the output back to the first layer,
    # working on a copy so the caller's list stays intact.
    layerSizes = list(reversed(numberOfNodesList))[1:]

    connected = set(indices)            # every node selected so far
    addedPerLayer = [sorted(indices)]   # nodes newly introduced by each layer

    for layerSize in layerSizes:
        if layerSize < len(connected):
            raise ValueError(
                "layer size %d is smaller than the %d nodes already selected"
                % (layerSize, len(connected))
            )

        added = set()
        while len(connected) < layerSize:
            missing = layerSize - len(connected)
            candidates = _frontier(subgraph, connected)

            if len(candidates) >= missing:
                # enough neighbours: pick exactly as many as are missing
                picked = random.sample(sorted(candidates), missing)
            elif candidates:
                # too few neighbours: take all of them and expand once more
                picked = candidates
            else:
                # the connected component is exhausted: fill up at random
                remaining = sorted(set(range(subgraph.vcount())) - connected)
                if len(remaining) < missing:
                    raise ValueError(
                        "graph has too few vertices to build a layer of size %d" % layerSize
                    )
                picked = random.sample(remaining, missing)

            added.update(picked)
            connected.update(picked)

        addedPerLayer.append(sorted(added))

    # Accumulate: each layer is the next-smaller layer plus its own new nodes.
    mergedNodeList = [addedPerLayer[0]]
    for added in addedPerLayer[1:]:
        mergedNodeList.append(sorted(mergedNodeList[-1] + added))
    mergedNodeList.reverse()
    return mergedNodeList

In [6]:
# kegg = pd.read_csv(os.path.join(package_dir, 'data', 'kegg.txt'), sep='\t')
# # Step 1: extract the unique KEGGID-Name mapping (keep the first Name that appears)
# unique_id_name = kegg[["KEGGID", "Name"]].drop_duplicates(subset="KEGGID", keep="first")

# # Step 2: convert the KEGGID array into a DataFrame (assuming the array is named kegg_id_array)
# kegg_id_array = metabolites # replace with your actual array
# query_df = pd.DataFrame({"KEGGID": kegg_id_array})

# # Step 3: look the names up with a merge (similar to an SQL JOIN)
# result_df = query_df.merge(unique_id_name, on="KEGGID", how="left")

# # Print the result
# print(result_df)

In [7]:
def getLayerSizeList(partition, final_layer_size, sparsify_coefficient):
    """
    Obtain the size of each sparse layer, anchored on the final layer size.

    Layer 0 has one neuron per metabolite. The remaining layers are computed
    backwards from the last one: the last layer has exactly
    ``final_layer_size`` neurons and each layer before it is larger by a
    factor of ``1 / sparsify_coefficient`` (truncated to an int).

    NOTE(review): an earlier cell of this notebook defines a function with the
    same name but a different sizing rule; this later definition replaces it
    once the cell has run.

    INPUT:
    partition: the adjacency matrix of the metabolic network (only the length
        of its first dimension is used)
    final_layer_size: the size of the final sparse layer
    sparsify_coefficient: the coefficient of each sparse level

    OUTPUT:
    sparsify_hidden_layer_size_dict: a dictionary mapping 'n_hidden_<i>' to the
        number of neurons in layer i
    """
    n_meta = np.shape(partition)[0]

    # Use numpy for the floor: `math` is never imported in this notebook, so
    # `math.floor` only worked if another import happened to leak that name.
    shrink_steps = np.floor(
        np.log10(1.0 * final_layer_size / n_meta) / np.log10(sparsify_coefficient)
    )
    n_layer = int(shrink_steps) + 2

    # dict for number of neurons in each layer
    sparsify_hidden_layer_size_dict = {'n_hidden_0': int(n_meta)}

    # TODO: check how this sparsifying rate was chosen
    for i in range(1, n_layer):
        # exponent counts how many shrink steps separate layer i from the last layer
        steps_to_final = n_layer - i - 1
        sparsify_hidden_layer_size_dict['n_hidden_%d' % (i)] = int(
            final_layer_size / (sparsify_coefficient) ** steps_to_final
        )
    return sparsify_hidden_layer_size_dict

In [8]:
# define a function to preprocess the data
pos = pd.read_csv('positive.txt', sep='\t')
neg = pd.read_csv('negative.txt', sep='\t')

pos_adductlist = ["M+H","M+NH4","M+Na","M+ACN+H","M+ACN+Na","M+2ACN+H","2M+H","2M+Na","2M+ACN+H"]
neg_adductlist = ["M-H", "M-2H", "M-2H+Na", "M-2H+K", "M-2H+NH4", "M-H2O-H", "M-H+Cl", "M+Cl", "M+2Cl"]

data_annos, matchings, sub_graph,  metabolites, dic = data_preprocessing(pos=pos, neg=neg, idx_feature = 4, match_tol_ppm=5, zero_threshold=0.75, scale = 1000)

The shape of data: (1174, 704)
The shape of feature-metabolites matching: (1174, 913)
The shape of metabolic network: (913, 913)


In [9]:
# Y information
info = pd.read_csv("y.csv")

icu = (info['icu']).values
cov = (info['cov']).values

print()
idx_cov = np.where((info['cov'] == 'Yes'))[0]  #& (info['day']=='d0')
# HUGE mistake!!!!
expression = (data_annos.iloc[:,idx_cov + 4].T).values

print(expression.shape)

y = np.zeros(len(idx_cov))
y[np.where(icu[idx_cov] == 'Yes')] = 1  # change between cov/icu
target = y.astype(int)
print(y.shape)


(625, 1174)
(625,)


In [10]:
# Sanity check: one label per selected sample (should match expression.shape[0]).
target.shape

(625,)

In [18]:
# Inspect the feature ids used as row labels.
# NOTE(review): this cell's execution count is out of sequence with its
# neighbours; re-run the notebook top to bottom before trusting the outputs.
data_annos.index

Index(['pos.result.20', 'pos.result.23', 'pos.result.24', 'pos.result.27',
       'pos.result.39', 'pos.result.47', 'pos.result.53', 'pos.result.62',
       'pos.result.63', 'pos.result.114',
       ...
       'neg.result.2398', 'neg.result.2454', 'neg.result.2462',
       'neg.result.2495', 'neg.result.2544', 'neg.result.2654',
       'neg.result.2774', 'neg.result.2786', 'neg.result.2800',
       'neg.result.2812'],
      dtype='object', length=1174)

In [12]:
import pandas as pd
import scipy.stats as stats

In [13]:
def featureSelection(data: pd.DataFrame, indices, p_value = 0.05):
    """Rank features with a two-sample t test between two groups of samples.

    INPUT:
    data: a feature-by-sample data frame (one row per feature)
    indices: a pair (group_1, group_2) of positional column selectors (integer
        positions or boolean masks) handed to ``data.iloc``, i.e. ([T],[F])
    p_value: intended threshold for the two-sample t test.
        NOTE(review): this function has never applied it -- every feature is
        returned. The parameter is kept so existing calls keep working; filter
        the result on its 'p_value' column if a cut-off is needed.

    OUTPUT:
    results_df: one row per feature, sorted by ascending p-value, with the
        former row labels as the first column (named 'index' when
        ``data.index`` has no name) followed by 't_statistic' and 'p_value'.
        The frame is also printed.
    """
    group_1 = data.iloc[:, indices[0]]
    group_2 = data.iloc[:, indices[1]]

    # Run the t test for all features in one vectorised call (one test per
    # row). Unlike a per-row lookup by label, this also works when the index
    # contains duplicate labels.
    test = stats.ttest_ind(group_1.to_numpy(), group_2.to_numpy(), axis=1)

    results_df = (
        pd.DataFrame(
            {'t_statistic': test.statistic, 'p_value': test.pvalue},
            index=data.index,
        )
        .sort_values('p_value')  # most significant features first
        .reset_index()           # expose the feature labels as a regular column
    )

    print(results_df)
    return results_df

In [14]:
# All columns from the fifth one onwards (skips the first 4 columns).
# NOTE(review): data_from_5th is not used again in the visible notebook.
data_from_5th = data_annos.iloc[:, 4:]

In [15]:
# Rank every feature by a two-sample t test between the y == 1 and y == 0 samples.
# NOTE(review): the third positional argument (20) binds to `p_value`, which
# featureSelection does not apply, and 20 is not a valid p-value anyway --
# presumably a "top 20" count was intended. Confirm and fix the call.
res = featureSelection(data_annos.iloc[:, 4 + idx_cov], (y == 1, y == 0), 20)

                index  t_statistic       p_value
0     neg.result.1190    10.641740  2.053710e-24
1     pos.result.3601     9.476665  5.393706e-20
2     pos.result.1546     9.288549  2.576048e-19
3      neg.result.898     8.702359  2.896983e-17
4     pos.result.1708     8.475659  1.690401e-16
...               ...          ...           ...
1169   pos.result.954    -0.008117  9.935262e-01
1170  pos.result.1470    -0.004710  9.962433e-01
1171   neg.result.288     0.004550  9.963709e-01
1172   pos.result.290    -0.002384  9.980988e-01
1173   pos.result.625     0.000390  9.996887e-01

[1174 rows x 3 columns]


In [31]:
# Look up `dic` for the 5 features with the smallest p-values (res is sorted
# by ascending p-value). Presumably dic maps a feature id to its matched
# metabolite ids -- confirm against data_preprocessing.
key_metas = [dic[key] for key in res['index'].values[:5]]

In [32]:
# Flatten the per-feature entries into a single plain Python list.
important_meta = np.concatenate(key_metas).flatten().tolist()

In [34]:
# Display the collected ids.
important_meta

['C00822',
 'C05560',
 'C01179',
 'C01197',
 'C05350',
 'C00257',
 'C00800',
 'C16356',
 'C16360',
 'C00352',
 'C06377',
 'C12214']