# MS Data Analysis Tools

## Load Data

**Input:**
-   `.tsv` file obtained as an output of `preprocessing_template.ipynb`

In [2]:
import re
import numpy as np
import pandas as pd
import plotly.express as px
import MSprocessing.stats as mss

from alphastats.dataset.keys import Cols
from statsmodels.stats.multitest import multipletests
from alphastats.dataset.preprocessing import PreprocessingStateKeys as PSK
from alphastats.statistics.differential_expression_analysis import DifferentialExpressionAnalysis

In [137]:
# Development aid: re-import the already-loaded `mss` module (MSprocessing.stats)
# so that edits to its source are picked up without restarting the kernel.
import importlib
mss = importlib.reload(mss)   # rebind `mss` to the module object returned by reload

In [3]:
# Columns of the preprocessed table that together form the row index.
index_cols = [
    "label_id",
    "group",
    "AL_kode",
    "plate_position",
    "plate_nr",
    "sample_name",
    "sample_type",
    "study_ID",
    "timepoint",
    "sample_order",
]

# Read the tab-separated table, then keep only rows whose "sample_type"
# index level equals "sample".
data = pd.read_csv("preprocessed_data.tsv", sep="\t", index_col=index_cols)
is_sample = data.index.get_level_values("sample_type") == "sample"
data = data.loc[is_sample].copy()

In [30]:
# Split the loaded table into the two frames returned by mss.split_proteome_meta
# and display the first one (`proteome`).
proteome, meta = mss.split_proteome_meta(data)
proteome

['label_id', 'group', 'AL_kode', 'plate_position', 'plate_nr', 'sample_name', 'sample_type', 'study_ID', 'timepoint', 'sample_order']


Unnamed: 0_level_0,A0A075B6H9,A0A075B6I9,A0A075B6J1,A0A075B6J2,A0A075B6J9,A0A075B6K0,A0A075B6K2,A0A075B6K4,A0A075B6K5,A0A075B6K6,...,Q9Y608,Q9Y613,Q9Y624,Q9Y646,Q9Y696,Q9Y6C2,Q9Y6E0,Q9Y6N7,Q9Y6R7,Q9Y6Z7
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
plate1_A6,17.959,26.861,19.002,18.341,24.072,22.491,17.052,25.470,27.043,17.135,...,16.929,17.444,19.555,14.867,16.305,13.894,18.162,13.373,22.879,22.208
plate1_A7,19.415,28.164,19.662,19.457,24.495,23.547,20.596,25.015,27.182,16.714,...,16.289,16.727,19.098,15.138,16.572,13.664,16.597,13.355,22.218,22.848
plate1_A8,19.936,27.251,21.133,19.775,25.300,23.902,19.002,26.533,28.198,17.070,...,16.254,16.774,18.925,15.796,16.971,13.112,17.103,13.664,22.254,21.890
plate1_A9,21.413,27.443,20.212,19.585,24.971,23.238,19.222,24.639,27.524,16.718,...,15.766,16.395,19.421,15.934,17.248,12.504,17.686,12.984,23.872,22.195
plate1_A10,18.788,27.110,21.624,19.894,24.818,24.758,19.759,24.491,28.049,16.880,...,16.284,16.931,19.979,15.705,17.285,13.428,17.842,13.286,22.795,23.232
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
plate2_E10,19.772,27.739,20.543,18.748,25.064,24.179,19.876,24.815,27.544,16.278,...,16.244,17.014,18.253,16.531,16.653,12.994,16.974,13.225,22.072,22.793
plate2_E11,17.187,27.728,20.508,19.354,24.005,25.485,21.122,26.925,27.773,17.462,...,17.331,16.503,19.583,15.998,17.199,13.653,17.104,13.349,23.552,23.758
plate2_E12,17.544,27.460,22.957,20.281,25.056,24.972,18.657,26.134,27.769,15.011,...,18.382,16.865,17.774,16.353,16.891,13.496,17.581,12.992,25.213,23.718
plate2_F1,18.292,26.506,20.646,19.470,25.136,25.955,19.489,27.164,29.438,17.303,...,17.568,16.396,19.575,15.996,17.472,13.665,17.430,13.937,21.634,22.310


In [5]:
# Display the second frame returned by mss.split_proteome_meta.
meta

Unnamed: 0_level_0,label_id,group,AL_kode,plate_position,plate_nr,sample_type,study_ID,timepoint,sample_order,A0A075B6H9,...,Q9Y608,Q9Y613,Q9Y624,Q9Y646,Q9Y696,Q9Y6C2,Q9Y6E0,Q9Y6N7,Q9Y6R7,Q9Y6Z7
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
plate1_A6,4083906373,Intervention,100005,A6,plate1,sample,67.0,day_15,1,17.959,...,16.929,17.444,19.555,14.867,16.305,13.894,18.162,13.373,22.879,22.208
plate1_A7,4083918930,Intervention,100006,A7,plate1,sample,34.0,day_0,2,19.415,...,16.289,16.727,19.098,15.138,16.572,13.664,16.597,13.355,22.218,22.848
plate1_A8,4083982663,Intervention,100007,A8,plate1,sample,63.0,day_0,3,19.936,...,16.254,16.774,18.925,15.796,16.971,13.112,17.103,13.664,22.254,21.890
plate1_A9,4083907917,Control,100008,A9,plate1,sample,16.0,day_15,4,21.413,...,15.766,16.395,19.421,15.934,17.248,12.504,17.686,12.984,23.872,22.195
plate1_A10,4083985725,Intervention,100009,A10,plate1,sample,79.0,day_0,5,18.788,...,16.284,16.931,19.979,15.705,17.285,13.428,17.842,13.286,22.795,23.232
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
plate2_E10,4083996660,Control,100153,E10,plate2,sample,57.0,day_0,148,19.772,...,16.244,17.014,18.253,16.531,16.653,12.994,16.974,13.225,22.072,22.793
plate2_E11,4083980445,Intervention,100154,E11,plate2,sample,79.0,day_15,149,17.187,...,17.331,16.503,19.583,15.998,17.199,13.653,17.104,13.349,23.552,23.758
plate2_E12,4083966190,Intervention,100155,E12,plate2,sample,35.0,day_15,150,17.544,...,18.382,16.865,17.774,16.353,16.891,13.496,17.581,12.992,25.213,23.718
plate2_F1,4083947093,Control,100156,F1,plate2,sample,45.0,day_15,151,18.292,...,17.568,16.396,19.575,15.996,17.472,13.665,17.430,13.937,21.634,22.310


## Differential Expression Analysis

### Set Parameters

In [140]:
# --- Differential expression analysis parameters ---

# Statistical test; choose between "wald", "sam", "ttest", "welch-ttest" or "paired-ttest".
method = "ttest"

# Variable to test (e.g. group, timepoint, etc.).
variable = "group"

# The two levels of `variable` that are compared against each other.
group1 = "Control"
group2 = "Intervention"

# Multiple-testing correction; choose between "bonferroni", "sidak", "holm",
# "simes-hochberg", "hommel", "fdr_bh", "fdr_by", "fdr_tsbh", "fdr_tsbky".
adjust = "fdr_bh"

# p value cutoff.
alpha = 0.05

# log fold change cutoff.
min_fc = 1

# --- Method-specific parameters ---

# Pairing column for the paired t-test (e.g. subject ID).
pairing = "study_ID"

# Number of SAM permutations.
# This was previously written as `perm: 10`, which Python parses as a bare
# variable annotation: it binds no value, so `perm` was never defined.
# NOTE(review): `perm` is not passed to mss.run_dea in the call below — confirm
# how the SAM method is expected to receive it.
perm = 10


In [141]:
# Run the differential expression analysis.
# All settings are gathered in one mapping and unpacked as keyword arguments;
# the call returns the results table and the volcano plot.
dea_kwargs = {
    "proteome": proteome,
    "meta": meta,
    "method": method,
    "column": variable,
    "group1": group1,
    "group2": group2,
    "pairing": pairing,
    "adjust": adjust,
    "alpha": alpha,
    "min_fc": min_fc,
    "preprocessing_info": {PSK.LOG2_TRANSFORMED: True},
}
dea_results, volcano_plot = mss.run_dea(**dea_kwargs)
dea_results

Unnamed: 0_level_0,log2fc,pval,padj
protein,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Q14247,0.427573,0.000647,0.823615
A0A087WSY6,0.420135,0.002869,0.998360
Q96BM9;Q9NVJ2,-0.173606,0.007688,0.998360
P07711,-0.215119,0.008734,0.998360
P29401,-0.271597,0.009442,0.998360
...,...,...,...
P06312,-0.000545,0.997001,0.998360
P08670,-0.000237,0.997082,0.998360
P13647,-0.000313,0.997450,0.998360
P35443,0.000351,0.997576,0.998360


In [139]:
# Render the volcano plot returned by mss.run_dea.
volcano_plot.show()


## GO Enrichment

In [132]:
# Annotation sources to query for enrichment.
go_sources = ["GO:BP", "GO:MF", "GO:CC", "KEGG", "REAC"]

# Enrichment on the DEA results, reusing `alpha` as the p-value cutoff.
enrichment = mss.go_enrichment(
    dea_results,
    pval_cutoff=alpha,
    organism="hsapiens",
    sources=go_sources,
)
enrichment

Unnamed: 0,source,native,name,p_value,significant,description,term_size,query_size,intersection_size,effective_domain_size,precision,recall,query,parents
0,GO:MF,GO:0000149,SNARE binding,1.0,False,"""Binding to a SNARE (soluble N-ethylmaleimide-...",8,39,1,1255,0.025641,0.125000,query_1,[GO:0005515]
1,GO:BP,GO:0051611,regulation of serotonin uptake,1.0,False,"""Any process that modulates the frequency, rat...",2,39,1,1255,0.025641,0.500000,query_1,"[GO:0051580, GO:0051610]"
2,GO:BP,GO:0051612,negative regulation of serotonin uptake,1.0,False,"""Any process that stops, prevents, or reduces ...",1,39,1,1255,0.025641,1.000000,query_1,"[GO:0051581, GO:0051610, GO:0051611]"
3,GO:BP,GO:0051620,norepinephrine uptake,1.0,False,"""The directed movement of norepinephrine into ...",2,39,1,1255,0.025641,0.500000,query_1,"[GO:0015874, GO:0090493]"
4,GO:BP,GO:0051621,regulation of norepinephrine uptake,1.0,False,"""Any process that modulates the frequency, rat...",2,39,1,1255,0.025641,0.500000,query_1,"[GO:0051049, GO:0051620]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2751,GO:BP,GO:0008228,opsonization,1.0,False,"""The process in which a microorganism (or othe...",12,39,1,1255,0.025641,0.083333,query_1,"[GO:0002252, GO:0006910]"
2752,GO:BP,GO:0008277,regulation of G protein-coupled receptor signa...,1.0,False,"""Any process that modulates the frequency, rat...",12,39,2,1255,0.051282,0.166667,query_1,"[GO:0007186, GO:0009966]"
2753,GO:BP,GO:0008283,cell population proliferation,1.0,False,"""The multiplication or reproduction of cells, ...",204,39,6,1255,0.153846,0.029412,query_1,[GO:0009987]
2754,GO:BP,GO:0008202,steroid metabolic process,1.0,False,"""The chemical reactions and pathways involving...",43,39,1,1255,0.025641,0.023256,query_1,[GO:0006629]
