In [1]:
import warnings
# NOTE(review): this silences every warning category for the whole notebook,
# including any emitted by the statistical tests in later cells. Consider
# narrowing the filter to specific categories/modules — confirm what it was
# meant to hide.
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV, cross_val_score, GridSearchCV, GroupKFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import explained_variance_score, r2_score, make_scorer
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from scipy.stats import pearsonr, spearmanr, wilcoxon
import scipy.stats as stats

import seaborn as sns
import matplotlib.pyplot as plt

import optuna

# Set seaborn's global plot theme to the "whitegrid" style.
sns.set_theme(style="whitegrid")

# Statistical Tests per Gene

In [15]:
# Load the tab-separated table of per-split Pearson correlations
# (one row per split / gene / method, with the score in `rvalue`).
df = pd.read_csv("../all_results_pearson.tsv", delimiter="\t")

# Preview the first rows to check that the columns were parsed as expected.
df.head(n=5)

Unnamed: 0,split,gene_id,rvalue,method
0,4,ENSG00000229450.2,0.164422,LinRegr
1,3,ENSG00000229450.2,0.208025,LinRegr
2,2,ENSG00000229450.2,0.007842,LinRegr
3,1,ENSG00000229450.2,0.073344,LinRegr
4,5,ENSG00000229450.2,0.174378,LinRegr


## Test between pairs of methods within each gene

In [20]:
def pairwise_wilcoxon_per_gene(scores, method_names):
    """Paired Wilcoxon signed-rank test for every pair of methods within each gene.

    For each gene the scores are reshaped into a split x method table, so the
    two samples handed to the test are paired by ``split`` through index
    alignment rather than by row position.

    Parameters
    ----------
    scores : pandas.DataFrame
        Long-format table with the columns ``gene_id``, ``split``, ``method``
        and ``rvalue``.
    method_names : sequence of str
        Methods to compare. Every unordered pair is tested once, in the order
        given (first vs. second, first vs. third, ...).

    Returns
    -------
    pandas.DataFrame
        One row per (gene, method pair) with the columns ``gene``, ``method1``,
        ``method2``, ``statistic`` and ``pvalue``. Genes appear in order of
        first appearance in ``scores``. A pair with no split shared by both
        methods gets NaN for ``statistic`` and ``pvalue``.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.pivot`` when a gene has more than one row for the
        same (split, method) combination, because the pairing would be ambiguous.
    """
    method_names = list(method_names)
    records = []
    # sort=False keeps the genes in their order of first appearance.
    for gene, gene_scores in scores.groupby("gene_id", sort=False):
        # Rows = splits, columns = methods. Methods absent for this gene become
        # all-NaN columns instead of raising a KeyError further down.
        by_split = (
            gene_scores
            .pivot(index="split", columns="method", values="rvalue")
            .reindex(columns=method_names)
        )
        for i, first in enumerate(method_names[:-1]):
            for second in method_names[i + 1:]:
                # Keep only the splits for which both methods have a score.
                paired = by_split[[first, second]].dropna()
                if paired.empty:
                    statistic, pvalue = float("nan"), float("nan")
                else:
                    test = wilcoxon(paired[first].values, paired[second].values)
                    statistic, pvalue = test.statistic, test.pvalue
                records.append({
                    "gene": gene,
                    "method1": first,
                    "method2": second,
                    "statistic": statistic,
                    "pvalue": pvalue,
                })
    return pd.DataFrame(
        records, columns=["gene", "method1", "method2", "statistic", "pvalue"]
    )


methods = df.method.unique()
# NOTE(review): with very few paired splits per gene the exact two-sided test
# has a coarse p-value floor (0.0625 for five pairs), and these per-gene
# p-values are not corrected for multiple testing.
results = pairwise_wilcoxon_per_gene(df, methods)
results.head()

Unnamed: 0,gene,method1,method2,statistic,pvalue
0,ENSG00000229450.2,LinRegr,Ridge,0.0,0.0625
1,ENSG00000229450.2,LinRegr,SVR,0.0,0.0625
2,ENSG00000229450.2,LinRegr,SNP,0.0,0.0625
3,ENSG00000229450.2,Ridge,SVR,7.0,1.0
4,ENSG00000229450.2,Ridge,SNP,3.0,0.3125


## Test between pairs of methods across all genes

In [27]:
# Mean Ridge rvalue per gene (averaged over its rows), ordered by gene id.
(
    df.loc[df["method"] == "Ridge"]
    .groupby("gene_id", as_index=False)["rvalue"]
    .mean()
    .sort_values("gene_id")
)

Unnamed: 0,gene_id,rvalue
0,ENSG00000176681.9,0.45904
1,ENSG00000179344.11,0.730596
2,ENSG00000196126.6,0.420465
3,ENSG00000196735.6,0.594444
4,ENSG00000198502.5,0.344322
5,ENSG00000214425.1,0.347039
6,ENSG00000229450.2,0.5172
7,ENSG00000232629.4,0.490962
8,ENSG00000237541.3,0.496431
9,ENSG00000238083.3,0.394711


In [48]:
# Mean rvalue per (method, gene), reshaped to rows = genes, columns = methods.
# Selecting "rvalue" explicitly keeps unrelated columns (e.g. `split`) out of
# the average.
gene_means = df.groupby(["method", "gene_id"])["rvalue"].mean().unstack("method")

# The reference method is the one with the highest average of per-gene means.
best_method = gene_means.mean().idxmax()
methods = df.method.unique()

# Compare the reference method against every other method with a paired
# Wilcoxon signed-rank test on the per-gene mean scores.
records = []
for method in methods:
    if method == best_method:
        continue
    # Subtracting two columns of the same table pairs the values by gene_id;
    # genes missing for either method are dropped from the comparison.
    gene_diffs = (gene_means[best_method] - gene_means[method]).dropna()
    test = wilcoxon(gene_diffs.values)
    records.append({
        "method1": best_method,
        "method2": method,
        "statistic": test.statistic,
        "pvalue": test.pvalue,
    })
results = pd.DataFrame(records, columns=["method1", "method2", "statistic", "pvalue"])
results

Unnamed: 0,method1,method2,statistic,pvalue
0,Ridge,LinRegr,0.0,0.001953
1,Ridge,SVR,11.0,0.105469
2,Ridge,SNP,4.0,0.013672


In [50]:
# Adjust the p-values in `results` for multiple comparisons with the
# Benjamini-Hochberg false-discovery-rate procedure and store them alongside
# the raw p-values.
results["pvalue_BH"] = stats.false_discovery_control(
    results["pvalue"].to_numpy(), method="bh"
)
results

Unnamed: 0,method1,method2,statistic,pvalue,pvalue_BH
0,Ridge,LinRegr,0.0,0.001953,0.005859
1,Ridge,SVR,11.0,0.105469,0.105469
2,Ridge,SNP,4.0,0.013672,0.020508
