In [1]:
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt

import numpy as np
np.random.seed(2024)

In [2]:
def read_data(file_path):
    """Load a tab-delimited table and return it transposed, one row per case.

    The file is read with pandas, transposed so that the original column
    labels become the rows, and those row labels are then exposed as a
    regular ``case_id`` column (the result has a fresh integer index).

    Parameters
    ----------
    file_path : str or path-like or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Transposed table whose first column is ``case_id``.
    """
    # NOTE(review): presumably the file stores features as rows and cases as
    # columns, with the feature names picked up as the index -- confirm.
    return (
        pd.read_csv(file_path, sep='\t')
        .T
        .rename_axis('case_id')
        .reset_index()
    )

In [3]:
# Proteomics table; the path is relative to the notebook's working directory.
proteo_path = './data/CPTAC/DISCOVERY_data_freeze_v_2.1/UCEC_discovery_Proteomics_PNNL_ratio_median_polishing_log2.txt'
df_proteo = read_data(proteo_path)
# read_data returns one row per case, so the row count is the number of cases.
print(f'Number of cases: {df_proteo.shape[0]}')
df_proteo.head()

Number of cases: 95


Unnamed: 0,case_id,A1BG,A2M,A2ML1,A4GALT,AAAS,AACS,AADAT,AAED1,AAGAB,...,ZSWIM8,ZSWIM9,ZW10,ZWILCH,ZWINT,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3
0,S001,-1.18,-0.863,-0.802,0.222,0.256,0.665,1.28,-0.339,0.412,...,-0.0877,,0.0229,0.109,,-0.332,-0.433,-1.02,-0.123,-0.0859
1,S002,-0.685,-1.07,-0.684,0.984,0.135,0.334,1.3,0.139,1.33,...,-0.0356,,0.363,1.07,0.737,-0.564,-0.00461,-1.13,-0.0757,-0.473
2,S003,-0.528,-1.32,0.435,,-0.24,1.04,-0.0213,-0.0479,0.419,...,0.00112,-0.145,0.0105,-0.116,,0.151,-0.074,-0.54,0.32,-0.419
3,S005,-1.67,-1.19,-0.443,0.243,-0.0993,0.757,0.74,-0.929,0.229,...,0.0725,-0.0552,-0.0714,0.0933,0.156,-0.398,-0.0752,-0.797,-0.0301,-0.467
4,S006,-0.374,-0.0206,-0.537,0.311,0.375,0.0131,-1.1,,0.565,...,-0.176,,-1.22,-0.562,0.937,-0.646,0.207,-1.85,-0.176,0.0513


In [4]:
# Gene-level somatic mutation table, loaded with the same helper so that rows
# are cases and the first column is `case_id`.
path_mutation_gene = './data/CPTAC/DISCOVERY_data_freeze_v_2.1/UCEC_discovery_somatic_mutation_gene_level.txt'
df_mut_gene = read_data(path_mutation_gene)
df_mut_gene

Unnamed: 0,case_id,A1BG,A1CF,A2M,A2ML1,A3GALT2,A4GALT,A4GNT,AAAS,AACS,...,ZUFSP,ZWILCH,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
0,S001,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,S002,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,S003,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,S005,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,S006,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90,S099,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,1,0,0,0
91,S100,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
92,S101,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
93,S102,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Sum every numeric column over the cases and keep the 10 largest totals.
# NOTE(review): presumably the entries are 0/1 mutation flags, which would make
# each total the number of mutated cases per gene -- confirm against the data.
top_genes = df_mut_gene.sum(axis=0, numeric_only=True).sort_values(ascending=False).head(10)
list(top_genes.index)

['PTEN',
 'PIK3CA',
 'ARID1A',
 'PIK3R1',
 'KRAS',
 'CTNNB1',
 'CTCF',
 'KMT2B',
 'ZFHX3',
 'TP53']

In [6]:
# Proteomics columns that contain at least one missing value; test_correlation
# skips exactly these columns.
columns_na = df_proteo.columns[df_proteo.isna().any()].tolist()
print(f'Columns with NaN values: {len(columns_na)}')

Columns with NaN values: 2983


## 0. Test functions

In [7]:
def test_correlation(df_mut_gene, df_proteo, list_genes, method='pearson'):
    columns_na = df_proteo.columns[df_proteo.isna().any()].tolist()

    correlations = []
    p_values = []

    for gene in list_genes:
        correlations.append({'gene': gene})
        p_values.append({'gene': gene})

        for proteo in list(df_proteo.columns)[1:]:
            if proteo in columns_na:
                continue

            if method=='pearson':
                res = stats.pearsonr(df_mut_gene[gene], df_proteo[proteo])
            elif method=='spearman':
                res = stats.spearmanr(df_mut_gene[gene], df_proteo[proteo])

            correlations[-1][proteo] = res.statistic
            p_values[-1][proteo] = res.pvalue

    df_corr = pd.DataFrame(correlations)
    df_pval = pd.DataFrame(p_values)
    
    return df_corr, df_pval

def pvalue_correction(df_pval):
    p_values_np = df_pval[df_pval.columns[1:]].to_numpy()
    p_values_np_1d = p_values_np.ravel()

    p_values_adusted_np = stats.false_discovery_control(p_values_np_1d)
    p_values_adusted_np = p_values_adusted_np.reshape(p_values_np.shape)

    df_pval_adj = df_pval.copy()

    for i, col in enumerate(df_pval.columns[1:]):
        df_pval_adj[col] = p_values_adusted_np[:,i]

    return df_pval_adj

def get_signif_columns(df_pval, p_value=0.05):
    """List the columns that hold at least one value strictly below ``p_value``.

    The first column of ``df_pval`` is skipped (it carries the row label).
    Columns are returned in their original order.
    """
    return [
        name
        for name in df_pval.columns[1:]
        if (df_pval[name] < p_value).any()
    ]

def rank_columns_signif(df_pval, p_value=0.05, signif_columns=None):
    """Count, per column, the rows whose value is strictly below ``p_value``.

    Parameters
    ----------
    df_pval : pandas.DataFrame
        Table whose first column is a label and whose other columns are
        p-values.
    p_value : float, default 0.05
        Threshold; a row counts when its value is strictly smaller.
    signif_columns : list, optional
        Columns to inspect. When omitted, every column except the first is
        inspected.

    Returns
    -------
    pandas.Series
        Counts indexed by column name, largest first. Columns with no row
        under the threshold are left out; ties keep the inspection order.
    """
    candidates = list(df_pval.columns)[1:] if signif_columns is None else signif_columns

    hits_per_column = {}
    for name in candidates:
        hits = int((df_pval[name] < p_value).sum())
        if hits > 0:
            hits_per_column[name] = hits

    # sorted() is stable, so equal counts stay in their original order.
    ordered_names = sorted(hits_per_column,
                           key=lambda name: hits_per_column[name],
                           reverse=True)
    ranked = {name: hits_per_column[name] for name in ordered_names}

    return pd.Series(ranked)

## 1. Correlation Top 10 Mutated Genes

### 1.1. Pearson correlation test 

In [8]:
# Pearson test of each top-10 mutated gene against every proteomics column
# without missing values, then one multiple-testing correction over the whole
# p-value table.
df_corr_pe, df_pval_pe = test_correlation(df_mut_gene, df_proteo, list(top_genes.index), method='pearson')
df_pval_adj_pe = pvalue_correction(df_pval_pe)

In [9]:
df_corr_pe

Unnamed: 0,gene,A1BG,A2M,AAAS,AACS,AAGAB,AAK1,AAMDC,AAMP,AAR2,...,ZSCAN18,ZSCAN26,ZSWIM8,ZW10,ZWILCH,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3
0,PTEN,-0.086793,-0.044434,-0.364537,0.200953,0.25151,-0.138007,0.140471,-0.265318,-0.255745,...,-0.013847,-0.083439,0.278347,0.077464,-0.018037,-0.398552,-0.204746,0.01475,0.206722,-0.078863
1,PIK3CA,0.012156,0.086673,0.022083,-0.005928,0.065859,-0.037912,0.156773,-0.011303,-0.137067,...,0.076399,0.214318,-0.067936,0.085943,0.120471,-0.083236,-0.055727,-0.093777,-0.013168,-0.016652
2,ARID1A,-0.043181,0.119588,0.021957,0.116556,0.290419,-0.374078,-0.229665,-0.095264,0.103951,...,-0.078484,-0.194122,0.04621,0.206833,0.313958,-0.161658,-0.287984,-0.27635,0.034893,-0.013806
3,PIK3R1,-0.262228,-0.097311,-0.034818,0.353795,-0.033778,-0.212193,-0.048712,-0.121592,-0.053392,...,-0.155873,-0.164955,-0.031169,0.288354,0.160633,-0.09275,-0.220999,-0.179789,0.068925,0.022475
4,KRAS,0.13615,0.021552,-0.184177,-0.081516,0.099109,-0.050409,0.035694,-0.080309,-0.305761,...,0.004197,0.052898,0.173735,-0.051168,-0.146464,-0.098748,0.158028,-0.028458,0.136884,-0.077694
5,CTNNB1,-0.169715,0.043321,0.011932,-0.02903,0.108934,0.063783,-0.03175,-0.135618,0.000121,...,0.142337,0.028823,0.066486,0.20449,-0.022137,-0.0328,-0.251656,-0.165188,-0.040854,0.027348
6,CTCF,-0.08261,-0.059453,0.103244,0.087468,0.18924,-0.145914,-0.128412,0.01892,0.07709,...,-0.113761,-0.155269,0.0281,0.12334,0.187308,-0.135752,-0.034909,-0.058618,0.059992,-0.004831
7,KMT2B,0.023263,0.041062,-0.020477,0.121248,0.191688,-0.226357,-0.080221,-0.00657,0.002823,...,-0.334312,-0.208977,-0.004869,0.064286,0.17991,-0.149123,-0.092314,-0.168264,0.078131,-0.044644
8,ZFHX3,-0.146529,-0.046444,0.109066,0.120714,0.073803,-0.019986,-0.130093,0.061247,0.042964,...,-0.268548,0.031582,-0.109771,0.09699,0.211189,0.068938,-0.01643,-0.117071,0.09886,0.132986
9,TP53,0.006132,0.053974,0.389353,-0.123969,-0.176244,0.178059,-0.094211,0.352616,0.286135,...,-0.023306,0.134403,-0.206922,-0.109639,0.117726,0.196717,0.254779,0.046609,-0.050123,0.05728


In [10]:
df_pval_adj_pe

Unnamed: 0,gene,A1BG,A2M,AAAS,AACS,AAGAB,AAK1,AAMDC,AAMP,AAR2,...,ZSCAN18,ZSCAN26,ZSWIM8,ZW10,ZWILCH,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3
0,PTEN,0.778601,0.902795,0.020676,0.351017,0.195437,0.589244,0.580932,0.160416,0.184019,...,0.97486,0.790222,0.130243,0.808454,0.965635,0.008412,0.337955,0.972414,0.331707,0.804934
1,PIK3CA,0.978108,0.778859,0.955733,0.990087,0.845067,0.918965,0.518244,0.979985,0.591923,...,0.811769,0.306144,0.838705,0.781233,0.656764,0.790747,0.872878,0.753234,0.976155,0.968569
2,ARID1A,0.905522,0.66019,0.955985,0.671332,0.108183,0.01624,0.257929,0.747669,0.716763,...,0.806134,0.376749,0.897719,0.331268,0.068546,0.500066,0.112004,0.134514,0.926023,0.974932
3,PIK3R1,0.167559,0.740025,0.926032,0.027228,0.92863,0.312533,0.891457,0.652738,0.879259,...,0.521612,0.486538,0.934722,0.111538,0.503082,0.757097,0.284215,0.430257,0.835788,0.955061
4,KRAS,0.595274,0.957007,0.412609,0.796191,0.73479,0.886657,0.924086,0.799961,0.081112,...,0.992738,0.880177,0.451743,0.884818,0.558487,0.735439,0.513231,0.941268,0.592825,0.807687
5,CTNNB1,0.467402,0.905285,0.978614,0.939897,0.69885,0.851275,0.9333,0.597623,0.999884,...,0.573055,0.940602,0.843026,0.338555,0.955622,0.930955,0.195232,0.485665,0.911658,0.943937
6,CTCF,0.792756,0.862599,0.719868,0.776507,0.394418,0.560633,0.62526,0.963511,0.80964,...,0.681103,0.523867,0.942038,0.646114,0.401099,0.597108,0.926023,0.86503,0.861549,0.991922
7,KMT2B,0.953499,0.910961,0.959976,0.65385,0.386315,0.268741,0.800491,0.989017,0.995385,...,0.043685,0.323948,0.991839,0.849713,0.429947,0.547572,0.758612,0.472838,0.806708,0.902042
8,ZFHX3,0.558415,0.897091,0.698348,0.656158,0.82003,0.961004,0.619272,0.858332,0.906012,...,0.152239,0.933841,0.695828,0.741329,0.316296,0.835758,0.968985,0.669588,0.735379,0.608132
9,TP53,0.989679,0.877604,0.01089,0.643461,0.442436,0.436231,0.751456,0.028031,0.115325,...,0.953387,0.602785,0.330912,0.695995,0.667393,0.367665,0.18691,0.896629,0.887159,0.868555


In [11]:
# Columns with at least one gene below 0.05 on the *unadjusted* p-values
# (baseline for comparison with the adjusted count).
signif_cols_pe = get_signif_columns(df_pval_pe, p_value=0.05)
print(f'Number of significant columns - p_value: {len(signif_cols_pe)}')

Number of significant columns - p_value: 5763


In [12]:
# Same selection, but on the adjusted p-values.
signif_cols_adj_pe = get_signif_columns(df_pval_adj_pe, p_value=0.05)
print(f'Number of significant columns - adjuested p_value: {len(signif_cols_adj_pe)}')

Number of significant columns - adjuested p_value: 1360


In [13]:
# For each significant column, count how many of the tested genes fall below
# the default 0.05 threshold; then count the columns reached by 3 or more genes.
rank_cols_pe = rank_columns_signif(df_pval_adj_pe, signif_columns=signif_cols_adj_pe)
rank_cols_pe[rank_cols_pe >= 3].shape[0]

44

In [14]:
rank_cols_pe[rank_cols_pe >= 3]

CEP78       5
MYO5C       5
PIK3CA      4
ANXA2       3
AP1M2       3
ARVCF       3
CAPS        3
CCDC186     3
CDKN2A      3
CDV3        3
CMPK2       3
CORO2A      3
CRABP2      3
CTNNB1      3
FAS         3
FBXO22      3
GALNT7      3
GCHFR       3
HERC5       3
KCTD14      3
KIAA1324    3
KIF2A       3
MARVELD2    3
MCF2L       3
MX2         3
OGFR        3
OPA1        3
PLCB4       3
PLEKHS1     3
SCGB2A1     3
SEC16A      3
SMARCE1     3
SORBS2      3
TBC1D8      3
TESC        3
TJP2        3
TJP3        3
TPD52L2     3
TRAF3IP2    3
TRIM2       3
TTC9        3
USP6NL      3
VAV2        3
XRN1        3
dtype: int64

In [35]:
# Build the column selection: the gene label plus every column that is
# significant for at least 3 genes (Pearson ranking).
cols = ['gene']
cols.extend(list(rank_cols_pe[rank_cols_pe >= 3].index))

# NOTE: `cols` is rebuilt later from the Spearman ranking, so cells that use it
# depend on which of the two cells ran last.
df_corr_pe[cols]

Unnamed: 0,gene,CEP78,MYO5C,PIK3CA,ANXA2,AP1M2,ARVCF,CAPS,CCDC186,CDKN2A,...,TESC,TJP2,TJP3,TPD52L2,TRAF3IP2,TRIM2,TTC9,USP6NL,VAV2,XRN1
0,PTEN,0.437358,0.483736,-0.425473,0.367672,0.498807,0.458294,0.413757,0.388019,-0.375559,...,0.361863,0.370184,0.494236,-0.356847,0.345254,0.392241,0.474205,0.335666,0.37691,-0.407393
1,PIK3CA,0.076183,0.061783,-0.220623,0.077778,0.093727,0.128347,0.069589,0.040767,-0.020354,...,0.080885,0.065551,0.105065,0.01783,0.1389,0.059239,0.206958,-0.087565,-0.084082,-0.009028
2,ARID1A,0.454338,0.336527,-0.339752,0.277211,0.351057,0.189101,0.410517,-0.06019,-0.356282,...,0.064438,0.384652,0.409218,-0.398772,0.072091,0.359062,0.199873,-0.00141,0.371365,-0.169858
3,PIK3R1,0.229534,0.348107,-0.365436,0.027622,0.245679,0.232686,0.199442,0.02442,-0.234917,...,-0.031675,0.175045,0.187955,-0.27296,0.079558,0.195773,0.078472,0.081014,0.154623,-0.345566
4,KRAS,-0.009866,0.105169,-0.104342,0.416951,0.142346,0.000405,0.215165,0.195662,-0.082872,...,0.166352,0.18312,0.255374,-0.098029,0.152906,0.26448,0.24067,0.037921,0.150913,0.029228
5,CTNNB1,0.14626,0.096529,-0.205659,-0.010335,0.105327,0.357642,0.022682,0.331143,-0.259288,...,0.356416,0.008202,0.012859,-0.28295,0.39058,-0.00461,0.523632,0.490829,-0.040744,-0.052193
6,CTCF,0.407654,0.329705,-0.150756,0.232428,0.168851,0.235021,0.214406,-0.102375,-0.236772,...,0.041666,0.225732,0.303938,-0.213147,0.026435,0.277178,0.171983,-0.01659,0.201438,-0.077616
7,KMT2B,0.323808,0.100988,-0.205859,0.076998,0.162414,0.158918,0.102483,-0.040893,-0.248358,...,0.061432,0.19471,0.211992,-0.20123,-0.025596,0.229186,-0.003968,-0.031578,0.197299,-0.223497
8,ZFHX3,0.334756,0.051116,-0.242091,0.033057,0.234548,0.087407,0.085419,-0.000197,-0.223709,...,0.005836,0.241584,0.199044,-0.189467,-0.017495,0.22338,0.02205,-0.098665,0.056253,-0.060196
9,TP53,-0.428581,-0.508128,0.387884,-0.328807,-0.477723,-0.368613,-0.407972,-0.348511,0.349925,...,-0.449914,-0.379696,-0.456553,0.50034,-0.466192,-0.492402,-0.385317,-0.360124,-0.43253,0.441981


In [36]:
df_pval_adj_pe[cols]

Unnamed: 0,gene,CEP78,MYO5C,PIK3CA,ANXA2,AP1M2,ARVCF,CAPS,CCDC186,CDKN2A,...,TESC,TJP2,TJP3,TPD52L2,TRAF3IP2,TRIM2,TTC9,USP6NL,VAV2,XRN1
0,PTEN,0.002449,0.000375,0.003651,0.019291,0.000212,0.0011,0.005212,0.011256,0.01561,...,0.022138,0.018134,0.000252,0.02528,0.03356,0.010111,0.000601,0.042445,0.015087,0.006306
1,PIK3CA,0.812601,0.856725,0.285393,0.807396,0.753362,0.625535,0.834028,0.91194,0.959998,...,0.798591,0.845988,0.712624,0.966385,0.585924,0.863466,0.330895,0.776163,0.787954,0.984607
2,ARID1A,0.001284,0.041826,0.038566,0.132681,0.028907,0.394729,0.005837,0.860938,0.025591,...,0.849172,0.01233,0.00603,0.008354,0.824947,0.023868,0.355177,0.99787,0.017553,0.466869
3,PIK3R1,0.258397,0.031502,0.020313,0.943494,0.210237,0.248795,0.356338,0.950979,0.242126,...,0.933521,0.447293,0.398998,0.141336,0.802356,0.371055,0.806134,0.798158,0.526337,0.033429
4,KRAS,0.983021,0.712454,0.715604,0.004737,0.57303,0.999315,0.303656,0.371509,0.792085,...,0.480894,0.416221,0.185084,0.737662,0.533506,0.162129,0.224462,0.918935,0.541345,0.939543
5,CTNNB1,0.559381,0.742676,0.334778,0.98214,0.711866,0.024802,0.954726,0.047158,0.174648,...,0.025516,0.986077,0.976726,0.121683,0.010523,0.992106,7.5e-05,0.000293,0.911947,0.882115
6,CTCF,0.006298,0.048515,0.541801,0.249511,0.470545,0.241794,0.30611,0.722471,0.236457,...,0.909391,0.270159,0.084119,0.309799,0.946102,0.132701,0.458279,0.968694,0.349191,0.807963
7,KMT2B,0.055667,0.727392,0.334333,0.809978,0.496758,0.509674,0.722083,0.911487,0.203425,...,0.857819,0.374747,0.313204,0.349918,0.947945,0.259565,0.993263,0.933841,0.364924,0.276565
8,ZFHX3,0.043265,0.884908,0.220518,0.930206,0.24309,0.776608,0.783302,0.99966,0.276225,...,0.990229,0.221856,0.357994,0.393746,0.966971,0.276834,0.95581,0.735576,0.871295,0.860907
9,TP53,0.003312,0.000142,0.011293,0.049536,0.000504,0.018864,0.006229,0.031179,0.029968,...,0.00152,0.014188,0.001176,0.0002,0.000803,0.000273,0.012052,0.023097,0.002859,0.002097


### 1.2. Spearman correlation 

In [15]:
# Same analysis as the Pearson section, using Spearman rank correlation.
df_corr_sp, df_pval_sp = test_correlation(df_mut_gene, df_proteo, list(top_genes.index), method='spearman')
df_pval_adj_sp = pvalue_correction(df_pval_sp)

In [16]:
# Columns with at least one gene below 0.05 on the unadjusted Spearman p-values.
signif_cols_sp = get_signif_columns(df_pval_sp, p_value=0.05)
print(f'Number of significant columns - p_value: {len(signif_cols_sp)}')

Number of significant columns - p_value: 5813


In [17]:
# Same selection on the adjusted Spearman p-values.
signif_cols_adj_sp = get_signif_columns(df_pval_adj_sp, p_value=0.05)
print(f'Number of significant columns - adjuested p_value: {len(signif_cols_adj_sp)}')

Number of significant columns - adjuested p_value: 1279


In [18]:
# Per significant column, number of tested genes below the default 0.05
# threshold (Spearman); then the number of columns reached by 3 or more genes.
rank_cols_sp = rank_columns_signif(df_pval_adj_sp, signif_columns=signif_cols_adj_sp)
rank_cols_sp[rank_cols_sp >= 3].shape[0]

37

In [19]:
rank_cols_sp[rank_cols_sp >= 3]

CEP78      5
FAS        4
LRRC41     4
MYO5C      4
ALDH1L2    3
ANXA2      3
ARID1A     3
ARL15      3
CAPS       3
CORO2A     3
CRABP2     3
DLG5       3
ENDOG      3
FBXO22     3
FREM2      3
GCHFR      3
HERC5      3
KIF2A      3
LTN1       3
MCF2L      3
MX2        3
OGFR       3
OPA1       3
PIK3CA     3
PLEKHS1    3
SCGB2A1    3
SEC16A     3
TBC1D8     3
TESC       3
TJP2       3
TJP3       3
TPD52L2    3
TRIM2      3
TTC9       3
USP43      3
VAV2       3
ZNF185     3
dtype: int64

In [31]:
# Column selection for the Spearman tables: the gene label plus every column
# significant for at least 3 genes. This overwrites the `cols` built from the
# Pearson ranking.
cols = ['gene']
cols.extend(list(rank_cols_sp[rank_cols_sp >= 3].index))

In [33]:
df_corr_sp[cols]

Unnamed: 0,gene,CEP78,FAS,LRRC41,MYO5C,ALDH1L2,ANXA2,ARID1A,ARL15,CAPS,...,TBC1D8,TESC,TJP2,TJP3,TPD52L2,TRIM2,TTC9,USP43,VAV2,ZNF185
0,PTEN,0.421819,0.386982,0.274938,0.461839,-0.345557,0.381331,-0.339434,0.337078,0.362037,...,0.33708,0.350266,0.346494,0.450543,-0.334728,0.366267,0.456657,0.372858,0.34885,0.005649
1,PIK3CA,0.114396,0.106335,0.087525,0.033398,0.018043,0.076392,-0.03071,-0.024952,0.074474,...,0.015739,0.090981,0.088676,0.152786,0.042227,0.056814,0.200001,0.155471,-0.047601,0.015355
2,ARID1A,0.46888,0.261817,0.338938,0.315417,-0.325829,0.313485,-0.627361,0.120304,0.403336,...,0.212848,0.020437,0.394461,0.401407,-0.415287,0.333923,0.193953,0.34202,0.364386,-0.075576
3,PIK3R1,0.226317,0.054316,0.12477,0.345579,-0.245606,0.02519,0.036998,0.245602,0.176334,...,0.024403,-0.018893,0.204669,0.190109,-0.318421,0.220807,0.092888,0.133822,0.106665,-0.333375
4,KRAS,-0.000409,0.349174,0.344674,0.068771,0.114619,0.413441,-0.193213,-0.0438,0.197309,...,0.359,0.135496,0.180113,0.214911,-0.089239,0.243972,0.26403,0.08187,0.146138,0.178067
5,CTNNB1,0.133782,0.332163,0.263815,0.072518,-0.470118,0.020005,-0.059598,0.353417,-0.019172,...,-0.074185,0.400519,0.002084,-0.021672,-0.303409,-0.037092,0.534712,0.292987,-0.005418,-0.083353
6,CTCF,0.428506,0.046808,0.345956,0.33702,-0.248085,0.243826,-0.39021,0.067233,0.227236,...,0.158297,0.025532,0.219147,0.289363,-0.193617,0.274465,0.16936,0.294465,0.205531,-0.107233
7,KMT2B,0.320361,0.056455,0.30961,0.124113,-0.144724,0.103949,-0.2079,0.207002,0.113361,...,0.090956,0.04391,0.219996,0.198044,-0.17116,0.21462,0.005825,0.116943,0.199386,-0.350381
8,ZFHX3,0.375111,0.051341,0.141997,0.039315,-0.015264,0.071229,-0.223402,0.160959,0.064755,...,-0.084643,0.014801,0.284455,0.234968,-0.208603,0.194725,0.011563,0.24699,0.031915,-0.352909
9,TP53,-0.413501,-0.490743,-0.426917,-0.47502,0.345513,-0.34782,0.288619,-0.409799,-0.338114,...,-0.367249,-0.436171,-0.357997,-0.386217,0.459758,-0.456053,-0.354759,-0.391762,-0.417202,-0.103606


In [34]:
df_pval_adj_sp[cols]

Unnamed: 0,gene,CEP78,FAS,LRRC41,MYO5C,ALDH1L2,ANXA2,ARID1A,ARL15,CAPS,...,TBC1D8,TESC,TJP2,TJP3,TPD52L2,TRIM2,TTC9,USP43,VAV2,ZNF185
0,PTEN,0.005942,0.0145,0.139784,0.00168,0.037521,0.016698,0.04378157,0.045739,0.026337,...,0.045739,0.03405,0.037019,0.002571,0.047954,0.024045,0.002002,0.020616,0.035404,0.989381
1,PIK3CA,0.673343,0.702606,0.766653,0.928325,0.96358,0.804812,0.9351717,0.947353,0.809761,...,0.968491,0.754306,0.762101,0.526343,0.905037,0.86571,0.35197,0.516145,0.891274,0.969367
2,ARID1A,0.00136,0.168314,0.044397,0.070014,0.057208,0.073198,4.112238e-07,0.650546,0.009854,...,0.306049,0.958094,0.012369,0.010429,0.007174,0.048571,0.372643,0.040899,0.024949,0.806462
3,PIK3R1,0.263122,0.872639,0.634236,0.037521,0.207524,0.947353,0.918483,0.207524,0.437597,...,0.949454,0.961499,0.335025,0.387558,0.066313,0.28099,0.747623,0.599896,0.701801,0.04916
4,KRAS,0.998611,0.035041,0.038306,0.829022,0.672274,0.0075,0.375901,0.90066,0.361185,...,0.028173,0.593575,0.424904,0.299547,0.760214,0.212535,0.162493,0.787031,0.552403,0.431835
5,CTNNB1,0.599896,0.049896,0.16343,0.816841,0.001294,0.959176,0.8574449,0.03178,0.960971,...,0.811384,0.010618,0.99663,0.955626,0.086869,0.918483,0.000174,0.103817,0.989476,0.781882
6,CTCF,0.005068,0.893526,0.037359,0.045805,0.200847,0.212987,0.01342817,0.834205,0.260326,...,0.505438,0.946619,0.286056,0.110626,0.374523,0.140902,0.464111,0.101612,0.33225,0.699356
7,KMT2B,0.063813,0.866423,0.078351,0.636911,0.557384,0.710894,0.3234255,0.326323,0.676679,...,0.754306,0.90066,0.283642,0.35877,0.456688,0.300204,0.988931,0.663598,0.354112,0.033975
8,ZFHX3,0.019496,0.881316,0.568001,0.912372,0.969367,0.820637,0.2730062,0.494647,0.841722,...,0.777291,0.970391,0.119968,0.236744,0.320846,0.369257,0.976849,0.203643,0.932294,0.032169
9,TP53,0.0075,0.000673,0.005237,0.001098,0.037521,0.036031,0.1119584,0.008353,0.045116,...,0.023494,0.004061,0.028763,0.014879,0.001777,0.002055,0.031113,0.013105,0.006768,0.712141


## 2. Correlation Selected Mutated Genes

In [20]:
# Hand-picked gene list; it overlaps with the top-10 list but also includes
# genes outside it (INPPL1, KMT2D, JAK1).
# NOTE(review): the selection rationale is not shown in this notebook -- document it.
selec_genes = ['PIK3R1', 'CTNNB1', 'ARID1A', 'INPPL1', 'KMT2D', 'JAK1', 'TP53', 'PTEN']

### 2.1 Pearson

In [21]:
# Pearson test restricted to the selected genes, followed by the
# multiple-testing correction over this (smaller) p-value table.
df_corr_selec_pe, df_pval_selec_pe = test_correlation(df_mut_gene, df_proteo, selec_genes, method='pearson')
df_pval_selec_adj_pe = pvalue_correction(df_pval_selec_pe)

In [22]:
# Columns with at least one selected gene below 0.05 after adjustment.
signif_cols_selec_adj_pe = get_signif_columns(df_pval_selec_adj_pe, p_value=0.05)
print(f'Number of significant columns - adjuested p_value: {len(signif_cols_selec_adj_pe)}')

Number of significant columns - adjuested p_value: 1547


In [23]:
# Per significant column, number of selected genes below the default 0.05
# threshold, shown for columns reached by 3 or more genes.
# NOTE(review): the result is named `..._sp` but is computed from the Pearson
# tables (`..._pe`); consider renaming to `rank_cols_selec_pe` after checking
# that no later cell relies on this name.
rank_cols_selec_sp = rank_columns_signif(df_pval_selec_adj_pe, signif_columns=signif_cols_selec_adj_pe)
rank_cols_selec_sp[rank_cols_selec_sp >= 3]

PIK3CA    5
CEP78     4
GCHFR     4
MYO5C     4
TOM1L1    4
         ..
TUBB3     3
UCHL1     3
USP6NL    3
VAV2      3
XRN1      3
Length: 63, dtype: int64