# Example of COVID data analysis

### Load data

In [1]:
import pandas as pd
import numpy as np
import igraph as ig
from meta_matching_tool import *

# Read the two tab-separated feature tables ('positive.txt' and 'negative.txt').
pos, neg = (
    pd.read_csv(table_path, sep='\t')
    for table_path in ('positive.txt', 'negative.txt')
)

# Adduct names kept for the positive and the negative table respectively.
pos_adductlist = [
    "M+H", "M+NH4", "M+Na",
    "M+ACN+H", "M+ACN+Na", "M+2ACN+H",
    "2M+H", "2M+Na", "2M+ACN+H",
]
neg_adductlist = [
    "M-H", "M-2H", "M-2H+Na", "M-2H+K", "M-2H+NH4",
    "M-H2O-H", "M-H+Cl", "M+Cl", "M+2Cl",
]

The input tables should contain an m/z column and the feature observations (one column per sample).

In [2]:
# Display the table read from 'positive.txt' to check its layout.
pos

Unnamed: 0,m/z,time,min.mz,max.mz,B1_WU350-004_d7_polar_pos.mzML,B1_WU350-005_d14_polar_pos.mzML,B1_WU350-005_d7_polar_pos.mzML,B1_WU350-006_d14_polar_pos.mzML,B1_WU350-006_d3_polar_pos.mzML,B1_WU350-006_d7_polar_pos.mzML,...,B9_WU350-394_d7_polar_pos.mzML,B9_WU350-395_d0_polar_pos.mzML,B9_WU350-397_d0_polar_pos.mzML,B9_WU350-397_d3_polar_pos.mzML,B9_WU350-397_d7_polar_pos.mzML,B9_WU350-398_d0_polar_pos.mzML,B9_WU350-398_d3_polar_pos.mzML,B9_WU350-398_d7_polar_pos.mzML,B9_WU350-400_d0_polar_pos.mzML,B9_WU350-400_d3_polar_pos.mzML
result.1,52.005422,418.545572,52.005384,52.005482,85530.001183,55716.927932,78463.783905,87818.823468,83335.367808,69348.930247,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
result.2,54.033476,93.617746,54.033242,54.033630,48476.359316,27872.703653,47693.335922,44385.837300,29834.844919,40394.442219,...,5000.876527,20239.518701,6589.628202,0.000000,6105.816136,21383.756680,6054.030063,9026.395011,51893.669027,3465.010069
result.3,55.017475,146.695128,55.017313,55.017553,34029.490959,94908.433229,146692.302752,13468.123872,7975.044598,31809.876128,...,78263.212230,0.000000,34920.607365,30482.131505,5930.747594,10383.547527,29295.623397,37627.828189,31036.299486,32305.841706
result.4,55.053901,323.240583,55.053743,55.054051,21315.340175,34056.645449,56681.109544,43867.322409,0.000000,10323.425549,...,92851.568912,7919.722416,10822.030887,15328.775084,39106.559939,12207.480073,16705.343246,38865.297421,17178.390494,11286.559982
result.5,56.049186,309.560707,56.049126,56.049269,231267.953873,446263.344378,478938.814431,547804.473538,246766.686636,313336.330236,...,109851.944644,83096.167305,42002.886303,69363.411231,99242.611848,45811.639528,59755.744440,64734.973832,66517.271257,44766.028207
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
result.4684,1428.219728,513.455827,1428.219099,1428.220357,40071.643984,32831.231610,36937.302413,30940.829917,35533.822435,30879.457646,...,18028.246814,23578.454739,19208.231755,18907.557534,30498.255144,21457.588997,15265.572366,9733.044909,11932.747080,20921.726089
result.4685,1452.679091,601.251092,1452.678723,1452.679885,173598.118785,132401.039890,137114.047948,141214.258859,166015.374076,138962.878979,...,108549.803617,8339.478047,82421.043508,3206.107250,149419.059071,4630.515313,10961.276395,134962.521631,83222.471728,3999.688483
result.4686,1468.645099,582.163235,1468.644448,1468.645364,65608.987858,132943.059950,127358.447540,111715.941322,65581.457570,124621.536800,...,3337.988362,2764.146568,15784.642394,7012.573197,130356.209139,88043.414973,89043.576187,119310.748483,3253.112024,97796.202034
result.4687,1516.201688,511.258353,1516.201016,1516.202464,37276.825363,35638.598305,27610.984528,37541.052261,37149.384096,29078.814672,...,23327.894462,18576.931328,19362.108617,18843.872067,28083.117989,16835.686141,5501.967204,29795.652202,19987.232098,15951.412939


### Conduct data pre-processing using MetaMatching package
Users can also perform the data pre-processing on another platform.

In [3]:
# Pre-process both feature tables with MetaMatching. The shapes printed below
# suggest the call yields the annotated data, the feature-metabolite matching
# matrix and the metabolic network (plus a fourth value, `metabolites`).
# NOTE(review): idx_feature=4 presumably marks where the per-sample columns
# begin — confirm against the package documentation.
data_annos, matchings, sub_graph, metabolites = data_preprocessing(
    pos=pos,
    neg=neg,
    idx_feature=4,
    match_tol_ppm=5,
    zero_threshold=0.75,
    scale=1000,
)

The shape of data: (1174, 704)
The shape of feature-metabolites matching: (1174, 913)
The shape of metabolic network: (913, 913)


In [4]:
data_annos

Unnamed: 0,mz,time,min.mz,max.mz,B1_WU350-004_d7_polar_.mzML,B1_WU350-005_d14_polar_.mzML,B1_WU350-005_d7_polar_.mzML,B1_WU350-006_d14_polar_.mzML,B1_WU350-006_d3_polar_.mzML,B1_WU350-006_d7_polar_.mzML,...,B9_WU350-394_d7_polar_.mzML,B9_WU350-395_d0_polar_.mzML,B9_WU350-397_d0_polar_.mzML,B9_WU350-397_d3_polar_.mzML,B9_WU350-397_d7_polar_.mzML,B9_WU350-398_d0_polar_.mzML,B9_WU350-398_d3_polar_.mzML,B9_WU350-398_d7_polar_.mzML,B9_WU350-400_d0_polar_.mzML,B9_WU350-400_d3_polar_.mzML
pos.result.20,61.039522,196.416835,61.039471,61.039679,437.628474,441.346395,478.359916,460.408374,430.755394,441.446427,...,368.985418,297.286639,355.341877,360.825973,402.911599,302.332963,323.934390,404.019479,303.716424,312.733080
pos.result.23,62.059713,120.385664,62.059579,62.059778,228.158667,399.042876,285.587152,349.035646,389.531525,387.564361,...,377.256213,349.175690,351.353982,297.697866,311.550970,319.463392,334.326957,329.787306,328.222108,254.363931
pos.result.24,62.059773,567.364501,62.059573,62.059899,462.183840,472.388044,475.285648,465.820505,439.920154,473.195084,...,282.592611,256.041209,310.150920,405.168382,402.277061,299.208178,259.927059,431.276804,235.071199,367.336546
pos.result.27,63.043732,139.905082,63.043578,63.043941,370.604943,371.535584,408.435481,378.687633,361.606920,358.681835,...,308.933020,255.800836,249.784720,280.350629,318.432638,272.294414,280.897422,246.782749,282.219570,280.724949
pos.result.39,69.044108,142.410496,69.043938,69.044326,308.575735,258.127862,228.775377,234.163987,257.916146,220.114265,...,454.008163,242.391608,277.975688,461.060247,259.522509,242.129561,220.210086,309.695672,448.315144,486.169317
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
neg.result.2654,681.295731,64.941535,681.295422,681.295955,-171.402797,-197.067069,-324.137229,-135.009998,-373.050473,-374.522160,...,-311.372628,-305.845331,-329.163016,-361.825650,-395.893140,3.153771,-320.913945,-236.174276,-319.196059,-329.939720
neg.result.2774,973.314535,445.032143,973.313996,973.315277,313.108536,306.325139,342.275169,304.629473,342.305529,322.275169,...,256.756923,243.854682,309.148558,257.772842,323.266993,286.894680,244.155129,268.259871,274.192328,325.511937
neg.result.2786,1018.317729,445.073482,1018.317337,1018.318266,407.742276,411.968504,375.843233,401.236418,432.112066,421.547489,...,270.549107,272.193111,336.417488,304.015992,291.483502,307.815055,255.735481,300.808524,293.385342,309.510488
neg.result.2800,1040.298741,447.460359,1040.298282,1040.299241,145.680525,137.318035,-500.000000,444.396520,245.779113,208.515756,...,-500.000000,-500.000000,-500.000000,-500.000000,-500.000000,-500.000000,-500.000000,-500.000000,-500.000000,-500.000000


Our objective for the COVID data is to train a model that accurately predicts whether a patient was admitted to the ICU. The patient's status is encoded as 1 if they were admitted to the ICU and 0 otherwise. The analysis is intended to use only the samples collected at hospital admission (day 0); note, however, that the corresponding `day == 'd0'` filter is commented out in the cell below, so all samples from COVID-positive patients are currently used.

In [5]:
# Build the expression matrix (samples x features) and the binary ICU labels
# from the sample annotations in y.csv.
info = pd.read_csv("y.csv")

icu = info['icu'].values
cov = info['cov'].values

# Offset added to a row position in `info` to obtain the matching column
# position in `data_annos`.
# NOTE(review): assumes `data_annos` starts with this many non-sample columns
# and that the rows of y.csv follow the order of the sample columns — confirm.
N_ANNOTATION_COLS = 4

# Row positions of the COVID-positive samples.
# NOTE(review): the collection-day filter `& (info['day'] == 'd0')` is
# disabled, so samples from every collection day are kept. An earlier note
# here flagged a "HUGE mistake" without saying what it was — TODO confirm
# whether the day-0 filter has to be restored.
idx_cov = np.where(info['cov'] == 'Yes')[0]

# Select the sample columns and transpose so that each row is one sample.
expression = data_annos.iloc[:, idx_cov + N_ANNOTATION_COLS].T.values
print(expression.shape)

# 1.0 where the selected sample's 'icu' entry is 'Yes', 0.0 otherwise.
y = info['icu'].iloc[idx_cov].eq('Yes').to_numpy(dtype=float)
target = y.astype(int)
print(y.shape)


(625, 1174)
(625,)


In [6]:
# Load the tab-separated metabolite table; its 'KEGGID' column is sampled
# from when the target metabolites for training are chosen.
metaboliteDf = pd.read_csv('target_metabolic.txt', sep='\t')

### Train the model
To perform our analysis, we utilize the 'sparse_nn' function within the MetaMatching package. This function takes as input an expression matrix, an observation array, a metabolic network, and a matching matrix that links features and metabolites. Once the analysis is complete, the output of the module is a folder that contains several key results, including the metabolic and feature importance, predicted linkages between features and metabolites, and the model itself.

In [10]:
# Settings passed to sparse_nn.
random_seed = 10
sparsify_coefficient = 0.3
num_hidden_layer_neuron_list = [5]
drop_out = 0.1
batch_size = 16
lr = 0.00001
weight_decay = 0
num_epoch = 50

# Draw the five target KEGG IDs from a generator seeded with `random_seed`,
# so the selection is reproducible instead of depending on the global NumPy
# random state.
keggid_rng = np.random.default_rng(random_seed)
target_keggids = keggid_rng.choice(
    metaboliteDf['KEGGID'].values, 5, replace=False
).tolist()

# Feature-metabolite matching matrix produced by data_preprocessing.
feature_meta = matchings

# `expression` and `target` come from the label-building cell above.
sparse_nn(
    expression=expression,
    target=target,
    target_keggids=target_keggids,
    feature_meta=feature_meta,
    knowledge_graph=sub_graph,
    random_seed=random_seed,
    sparsify_coefficient=sparsify_coefficient,
    num_hidden_layer_neuron_list=num_hidden_layer_neuron_list,
    drop_out=drop_out,
    batch_size=batch_size,
    lr=lr,
    weight_decay=weight_decay,
    num_epoch=num_epoch,
)

epoch: 0, test acc: 0.547872, corr train acc: 0.608696
epoch: 1, test acc: 0.547872, corr train acc: 0.608696
epoch: 2, test acc: 0.547872, corr train acc: 0.608696
epoch: 3, test acc: 0.547872, corr train acc: 0.608696
epoch: 4, test acc: 0.547872, corr train acc: 0.608696
epoch: 5, test acc: 0.547872, corr train acc: 0.608696
epoch: 6, test acc: 0.553191, corr train acc: 0.620137
epoch: 7, test acc: 0.547872, corr train acc: 0.620137
epoch: 8, test acc: 0.553191, corr train acc: 0.620137
epoch: 9, test acc: 0.574468, corr train acc: 0.643021
epoch: 10, test acc: 0.563830, corr train acc: 0.643021
epoch: 11, test acc: 0.579787, corr train acc: 0.656751
epoch: 12, test acc: 0.632979, corr train acc: 0.725400
epoch: 13, test acc: 0.632979, corr train acc: 0.725400
epoch: 14, test acc: 0.622340, corr train acc: 0.725400
epoch: 15, test acc: 0.632979, corr train acc: 0.725400
epoch: 16, test acc: 0.680851, corr train acc: 0.745995
epoch: 17, test acc: 0.718085, corr train acc: 0.778032
ep