# MS Data Analysis Tools

## Load Data

**Input:**
-   `.tsv` file obtained as an output of `preprocessing_template.ipynb`

In [3]:
import re
import numpy as np
import pandas as pd
import plotly.express as px
import MSprocessing.stats as mss

from alphastats.dataset.keys import Cols
from statsmodels.stats.multitest import multipletests
from alphastats.dataset.preprocessing import PreprocessingStateKeys as PSK
from alphastats.statistics.differential_expression_analysis import DifferentialExpressionAnalysis

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# Re-import the MSprocessing.stats helpers so that on-disk edits to the module
# are picked up without restarting the kernel.
import importlib

mss = importlib.reload(mss)

In [11]:
# Columns of the TSV that are loaded into the row MultiIndex.
index_cols = [
    "label_id",
    "group",
    "AL_kode",
    "plate_position",
    "plate_nr",
    "sample_name",
    "sample_type",
    "study_ID",
    "timepoint",
    "sample_order",
]

data = pd.read_csv("preprocessed_data.tsv", sep="\t", index_col=index_cols)

# Keep only the rows whose "sample_type" index level equals "sample".
is_sample = data.index.get_level_values("sample_type") == "sample"
data = data.loc[is_sample].copy()

In [None]:
# Split the loaded table into two frames; the second argument names the
# column to use as the index of the returned frames.
proteome, meta = mss.split_proteome_meta(
    data,
    "sample_name",
)
proteome

Unnamed: 0_level_0,A0A075B6H9,A0A075B6I9,A0A075B6J1,A0A075B6J2,A0A075B6J9,A0A075B6K0,A0A075B6K2,A0A075B6K4,A0A075B6K5,A0A075B6K6,...,Q9Y608,Q9Y613,Q9Y624,Q9Y646,Q9Y696,Q9Y6C2,Q9Y6E0,Q9Y6N7,Q9Y6R7,Q9Y6Z7
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
plate1_A6,17.959,26.861,19.002,18.341,24.072,22.491,17.052,25.470,27.043,17.135,...,16.929,17.444,19.555,14.867,16.305,13.894,18.162,13.373,22.879,22.208
plate1_A7,19.415,28.164,19.662,19.457,24.495,23.547,20.596,25.015,27.182,16.714,...,16.289,16.727,19.098,15.138,16.572,13.664,16.597,13.355,22.218,22.848
plate1_A8,19.936,27.251,21.133,19.775,25.300,23.902,19.002,26.533,28.198,17.070,...,16.254,16.774,18.925,15.796,16.971,13.112,17.103,13.664,22.254,21.890
plate1_A9,21.413,27.443,20.212,19.585,24.971,23.238,19.222,24.639,27.524,16.718,...,15.766,16.395,19.421,15.934,17.248,12.504,17.686,12.984,23.872,22.195
plate1_A10,18.788,27.110,21.624,19.894,24.818,24.758,19.759,24.491,28.049,16.880,...,16.284,16.931,19.979,15.705,17.285,13.428,17.842,13.286,22.795,23.232
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
plate2_E10,19.772,27.739,20.543,18.748,25.064,24.179,19.876,24.815,27.544,16.278,...,16.244,17.014,18.253,16.531,16.653,12.994,16.974,13.225,22.072,22.793
plate2_E11,17.187,27.728,20.508,19.354,24.005,25.485,21.122,26.925,27.773,17.462,...,17.331,16.503,19.583,15.998,17.199,13.653,17.104,13.349,23.552,23.758
plate2_E12,17.544,27.460,22.957,20.281,25.056,24.972,18.657,26.134,27.769,15.011,...,18.382,16.865,17.774,16.353,16.891,13.496,17.581,12.992,25.213,23.718
plate2_F1,18.292,26.506,20.646,19.470,25.136,25.955,19.489,27.164,29.438,17.303,...,17.568,16.396,19.575,15.996,17.472,13.665,17.430,13.937,21.634,22.310


In [6]:
# Display the `meta` frame returned by mss.split_proteome_meta.
meta

Unnamed: 0_level_0,label_id,group,AL_kode,plate_position,plate_nr,sample_type,study_ID,timepoint,sample_order,A0A075B6H9,...,Q9Y608,Q9Y613,Q9Y624,Q9Y646,Q9Y696,Q9Y6C2,Q9Y6E0,Q9Y6N7,Q9Y6R7,Q9Y6Z7
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
plate1_A6,4083906373,Intervention,100005,A6,plate1,sample,67.0,day_15,1,17.959,...,16.929,17.444,19.555,14.867,16.305,13.894,18.162,13.373,22.879,22.208
plate1_A7,4083918930,Intervention,100006,A7,plate1,sample,34.0,day_0,2,19.415,...,16.289,16.727,19.098,15.138,16.572,13.664,16.597,13.355,22.218,22.848
plate1_A8,4083982663,Intervention,100007,A8,plate1,sample,63.0,day_0,3,19.936,...,16.254,16.774,18.925,15.796,16.971,13.112,17.103,13.664,22.254,21.890
plate1_A9,4083907917,Control,100008,A9,plate1,sample,16.0,day_15,4,21.413,...,15.766,16.395,19.421,15.934,17.248,12.504,17.686,12.984,23.872,22.195
plate1_A10,4083985725,Intervention,100009,A10,plate1,sample,79.0,day_0,5,18.788,...,16.284,16.931,19.979,15.705,17.285,13.428,17.842,13.286,22.795,23.232
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
plate2_E10,4083996660,Control,100153,E10,plate2,sample,57.0,day_0,148,19.772,...,16.244,17.014,18.253,16.531,16.653,12.994,16.974,13.225,22.072,22.793
plate2_E11,4083980445,Intervention,100154,E11,plate2,sample,79.0,day_15,149,17.187,...,17.331,16.503,19.583,15.998,17.199,13.653,17.104,13.349,23.552,23.758
plate2_E12,4083966190,Intervention,100155,E12,plate2,sample,35.0,day_15,150,17.544,...,18.382,16.865,17.774,16.353,16.891,13.496,17.581,12.992,25.213,23.718
plate2_F1,4083947093,Control,100156,F1,plate2,sample,45.0,day_15,151,18.292,...,17.568,16.396,19.575,15.996,17.472,13.665,17.430,13.937,21.634,22.310


## Differential Expression Analysis

### Set Parameters

In [244]:
# Parameters for the differential expression analysis (DEA).
method = "paired-ttest"     # choose between "sam", "ttest", "welch-ttest" or "paired-ttest"
variable = "timepoint"      # variable to test (e.g. group, timepoint, etc.)
group1 = "day_0"            # first level of `variable` to compare
group2 = "day_15"           # second level of `variable` to compare
adjust = "fdr_bh"           # choose between "bonferroni", "sidak", "holm", "simes-hochberg", "hommel", "fdr_bh", "fdr_by", "fdr_tsbh", "fdr_tsbky"
alpha = 0.05                # p value cutoff
min_fc = 1                  # log fold change cutoff

# Method-specific parameters.
pairing = "study_ID"        # pairing column for paired t-test (e.g. subject ID)
# This must be an assignment: the former `perm: 10` was parsed by Python as a
# bare variable annotation and never bound the name `perm`.
perm = 10                   # SAM permutations


In [243]:
# Run the DEA with the settings chosen in the parameter cell.
# NOTE(review): the SAM permutation count (`perm`) is not forwarded here;
# confirm whether mss.run_dea needs it when method == "sam".
dea_settings = {
    "method": method,
    "column": variable,
    "group1": group1,
    "group2": group2,
    "pairing": pairing,
    "adjust": adjust,
    "alpha": alpha,
    "min_fc": min_fc,
}

dea_results, volcano_plot = mss.run_dea(
    proteome=proteome,
    meta=meta,
    # Sets the LOG2_TRANSFORMED preprocessing flag to True for this run.
    preprocessing_info={PSK.LOG2_TRANSFORMED: True},
    **dea_settings,
)
dea_results


Unnamed: 0_level_0,log2fc,pval,padj
protein,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Q6P179,0.360743,0.001704,0.990397
P10809,-0.353471,0.002378,0.990397
Q6E0U4,0.374157,0.002531,0.990397
P21549,0.274443,0.009310,0.990397
P62495,-0.227571,0.009765,0.990397
...,...,...,...
P00390,-0.000429,0.996188,0.997960
P31944,-0.000471,0.996758,0.997960
Q9UIQ6,0.000429,0.996928,0.997960
Q99969,-0.000386,0.997176,0.997960


In [195]:
# Print the DEA row (log2fc, pval, padj) for a few proteins of interest.
for protein_id in ("Q14247", "A0A087WSY6", "Q96BM9;Q9NVJ2"):
    print(dea_results.loc[protein_id])

log2fc    0.362534
pval      0.007500
padj      0.979417
Name: Q14247, dtype: float64
log2fc    0.413566
pval      0.007870
padj      0.979417
Name: A0A087WSY6, dtype: float64
log2fc   -0.192137
pval      0.002553
padj      0.979417
Name: Q96BM9;Q9NVJ2, dtype: float64


In [158]:
# Render the volcano plot object returned by mss.run_dea.
volcano_plot.show()


## Regression Models

In [8]:
# Logistic regression on the "group" column (Intervention vs. Control).
# Only the second return value (the coefficient table) is kept.
# NOTE(review): cv=5 presumably sets the number of cross-validation folds - confirm.
logreg_settings = dict(
    column="group",
    group1="Intervention",
    group2="Control",
    cv=5,
    adjust="fdr_bh",
)
_, coef_df = mss.run_logreg(proteome=proteome, meta=meta, **logreg_settings)

coef_df

Unnamed: 0_level_0,beta,se,pval,padj
protein,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
P08319,2.065574e-03,0.054616,0.969831,0.999996
Q14247,-1.498426e-03,0.039973,0.970097,0.999996
Q96BM9;Q9NVJ2,6.162893e-04,0.018133,0.972887,0.999996
A0A087WSY6,-1.561232e-03,0.047704,0.973892,0.999996
Q00325,-7.186550e-04,0.022623,0.974659,0.999996
...,...,...,...,...
Q8NBP7,-2.228291e-06,0.037004,0.999952,0.999996
P30048,3.000688e-07,0.017111,0.999986,0.999996
P99999,2.286700e-07,0.033432,0.999995,0.999996
P13647,1.539429e-07,0.026777,0.999995,0.999996


In [15]:
# Mixed linear model using "group" and "timepoint" as the two variables,
# with "study_ID" as the pairing column.
# NOTE(review): which model term the reported beta/pval refer to is decided
# inside mss.run_mixedlm - confirm before interpreting.
mixedlm_factors = {
    "var1": {"group": ["Control", "Intervention"]},
    "var2": {"timepoint": ["day_0", "day_15"]},
}
results = mss.run_mixedlm(
    proteome=proteome,
    meta=meta,
    pairing="study_ID",
    **mixedlm_factors,
)

results




Unnamed: 0_level_0,beta,pval,padj
protein,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
P46777,0.357641,0.000363,0.316785
P60709;P63261,-0.891406,0.000498,0.316785
A0A0C4DH32,-0.779190,0.001648,0.542891
Q9UNW1,-0.649090,0.001707,0.542891
Q93063,-0.539592,0.002574,0.552557
...,...,...,...
Q8NBJ4,0.001132,0.995328,0.998468
P04066,0.000665,0.997416,0.998712
P11215,-0.000588,0.997576,0.998712
A0A1B0GTC6,-0.000375,0.998682,0.998712


## GO Enrichment

In [17]:
# Enrichment analysis on `results` (the table produced by mss.run_mixedlm).
# NOTE(review): how pval_cutoff selects the query proteins is decided inside
# mss.go_enrichment - confirm.
annotation_sources = ["GO:BP", "GO:MF", "GO:CC", "KEGG", "REAC"]
enrichment = mss.go_enrichment(
    results,
    pval_cutoff=0.05,
    organism="hsapiens",
    sources=annotation_sources,
)
enrichment

Unnamed: 0,source,native,name,p_value,significant,description,term_size,query_size,intersection_size,effective_domain_size,precision,recall,query,parents
0,GO:MF,GO:0000146,microfilament motor activity,1.0,False,"""A motor activity that generates movement alon...",5,67,1,1255,0.014925,0.200000,query_1,"[GO:0003774, GO:0120544, GO:0140657]"
1,GO:BP,GO:0051651,maintenance of location in cell,1.0,False,"""Any process in which a substance or cellular ...",30,67,1,1255,0.014925,0.033333,query_1,"[GO:0009987, GO:0051235, GO:0051641]"
2,GO:BP,GO:0051653,spindle localization,1.0,False,"""Any process in which is the spindle is transp...",7,67,1,1255,0.014925,0.142857,query_1,"[GO:0022402, GO:0051640]"
3,GO:BP,GO:0051656,establishment of organelle localization,1.0,False,"""The directed movement of an organelle to a sp...",37,67,3,1255,0.044776,0.081081,query_1,"[GO:0051234, GO:0051640]"
4,GO:BP,GO:0051668,localization within membrane,1.0,False,"""Any process in which a substance or cellular ...",66,67,4,1255,0.059701,0.060606,query_1,[GO:0051641]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3636,GO:BP,GO:0008154,actin polymerization or depolymerization,1.0,False,"""Assembly or disassembly of actin filaments by...",34,67,1,1255,0.014925,0.029412,query_1,[GO:0007015]
3637,GO:BP,GO:0008202,steroid metabolic process,1.0,False,"""The chemical reactions and pathways involving...",43,67,2,1255,0.029851,0.046512,query_1,[GO:0006629]
3638,GO:BP,GO:0008203,cholesterol metabolic process,1.0,False,"""The chemical reactions and pathways involving...",25,67,1,1255,0.014925,0.040000,query_1,"[GO:0016125, GO:1902652]"
3639,GO:BP,GO:0007520,myoblast fusion,1.0,False,"""A process in which non-proliferating myoblast...",8,67,1,1255,0.014925,0.125000,query_1,"[GO:0000768, GO:0014902]"
