# Select adducts deriving from same molecular features 

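To reduce redundancy, we keep only one adduct per group of connected adducts (i.e. per connected component of the ion identity network): the adduct with the most non-zero values, i.e. the fewest zeros across the sample columns. All other adducts of the group are removed from the feature table. Adducts were defined through the ion identity molecular networking workflow within GNPS [Schmid et al., 2021](https://www.nature.com/articles/s41467-021-23953-9).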

load libraries

In [1]:
import operator
import os
from functools import reduce

import networkx as nx  # the cells below use the `nx.` prefix explicitly
import pandas as pd
import session_info
from networkx import *


# Display the session information report of the `session_info` package
# (presumably the versions of the loaded packages, for reproducibility).
session_info.show()

load feature table

In [2]:
# Read the comma-separated feature quantification table into a DataFrame.
ft = pd.read_csv(
    '../../data/PreprocessedData/MS2_timsTOF_quant.csv',
    sep=',',
)

load adduct edges annotation

In [3]:
# Read the comma-separated adduct edge annotation table (one row per edge).
ed = pd.read_csv(
    '../../data/PreprocessedData/MS2_timsTOF_edges_msannotation.csv',
    sep=',',
)

In [4]:
# Remove the empty 'Unnamed: 262' column from the feature table.
ft = ft.drop(columns=['Unnamed: 262'])

In [5]:
# Use the row IDs as index labels while keeping 'row ID' as a regular column.
ft = ft.set_index('row ID', drop=False)

create network from edgelist

In [6]:
# Build a graph from the adduct edge table: 'ID1' and 'ID2' are the edge
# endpoints, every remaining column is stored as an edge attribute.
edge_attr_cols = [col for col in ed.columns if col not in ('ID1', 'ID2')]
G = nx.from_pandas_edgelist(
    ed,
    source='ID1',
    target='ID2',
    edge_attr=edge_attr_cols,
    create_using=nx.Graph(),
)

In [7]:
# Attach every feature-table column to the graph as a node attribute.
# The values are keyed by the frame index (the row IDs), which is how they
# are matched to the node IDs of G.
for attr_name, values in ft.items():
    nx.set_node_attributes(G, values.to_dict(), attr_name)

get all nodes of a connected component

In [8]:
# Example: subgraph induced by the connected component that contains node 3.
# node_connected_component returns exactly that node set, without computing
# a shortest path to every reachable node first.
s = G.subgraph(nx.node_connected_component(G, 3))

In [9]:
# Show the node IDs contained in the example subgraph.
s.nodes()

NodeView((3, 2502, 1226, 434, 4952, 2236))

retrieve all connected components

In [10]:
# One list of node IDs per connected component of the adduct network.
nodes_per_comp = [list(component) for component in nx.connected_components(G)]

In [11]:
# Sample columns: every column except the first 10 (metadata) columns.
# Kept as an ordered list rather than a set: pandas >= 2.0 rejects a set as a
# column indexer (ft[snames]), and a list keeps the column order stable.
metadata_cols = set(ft.columns[:10])
snames = [col for col in ft.columns if col not in metadata_cols]

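select, for each connected component, the adduct with the most non-zero values (fewest zeros); all other adducts of the component are marked for removal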

In [12]:
# For every connected component, collect the row IDs of all adducts except
# the one with the most non-zero sample values; `rem` gets one list per
# component.
rem = list()

# list(): works whether snames is a list or a set (pandas >= 2.0 rejects a
# set as column indexer).
sample_cols = list(snames)
# Number of sample columns with a value > 0, computed once for all features.
detections = ft[sample_cols].gt(0).sum(axis=1)
row_ids = ft['row ID']

for n in nodes_per_comp:
    comp_counts = detections[row_ids.isin(n)]
    if comp_counts.empty:
        # None of the component's nodes is in the feature table: nothing to
        # remove (idxmax would raise on an empty selection).
        rem.append([])
        continue
    # idxmax returns the first maximum, so ties keep the adduct that comes
    # first in the feature table.
    keep = comp_counts.idxmax()
    rem.append(list(set(comp_counts.index) - {keep}))

In [13]:
# Flatten the per-component lists into one list of row IDs to remove.
# A comprehension also handles an empty `rem` (reduce without an initial
# value raises TypeError) and avoids repeated list concatenation.
desel_ids = [row_id for comp_ids in rem for row_id in comp_ids]

number of features to be removed

In [14]:
# Number of row IDs selected for removal.
len(desel_ids)

1776

dimensions of feature table

In [15]:
# (rows, columns) of the feature table before the redundant adducts are removed.
ft.shape

(3844, 262)

column names of feature table

In [16]:
# The first 10 columns, i.e. the ones excluded from `snames`.
ft.columns[:10]

Index(['row ID', 'row m/z', 'row retention time', 'correlation group ID',
       'annotation network number', 'best ion', 'auto MS2 verify',
       'identified by n=', 'partners', 'neutral M mass'],
      dtype='object')

dimensions of the part of the feature table that will be removed

In [17]:
# (rows, columns) of the rows whose 'row ID' is in the removal list.
to_remove_mask = ft['row ID'].isin(desel_ids)
ft.loc[to_remove_mask].shape

(1776, 262)

remove redundant adducts from feature table

In [18]:
# Drop the redundant adducts; desel_ids are index labels (the row IDs).
ft = ft.drop(index=desel_ids)

dimensions of the reduced feature table

In [19]:
# (rows, columns) of the feature table after the redundant adducts were removed.
ft.shape

(2068, 262)

write out reduced feature table

In [20]:
# Make sure the target directory exists: to_csv does not create missing folders.
os.makedirs('output', exist_ok=True)
# Write the reduced table without the index; the row IDs remain available in
# the 'row ID' column.
ft.to_csv('output/MS2_timsTOF_quant_mergedAdducts.csv', sep=',', index=False)