In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import gseapy as gp
import re
import sys 

import cptac
import cptac.utils as u

  import pandas.util.testing as tm


In [2]:
# Instantiate the cptac Endometrial dataset object; the cells below query it
# for its proteomics table and for the per-sample genotype of `gene`.
endo = cptac.Endometrial()

                                    

In [3]:
# Gene whose mutation type is used to group samples; passed to
# get_genotype_all_vars() for each dataset below.
gene = 'PIK3CA'

In [4]:
# Endometrial t-test, step 1: assemble a single frame that holds the mutation
# label next to every proteomics column, ready to be handed to wrap_ttest.
proteomics = endo.get_proteomics()
prot_list = proteomics.columns.tolist()

# Only the 'Mutation' label is needed from the genotype table.
mut_type = endo.get_genotype_all_vars(gene).drop(columns=["Mutation_Status", "Location"])
prot_and_mutations = mut_type.join(proteomics)

# Restrict the samples to the two groups being compared.
compare = ['Wildtype_Tumor', 'Missense_Mutation']
get = prot_and_mutations['Mutation'].isin(compare)
missense_wt = prot_and_mutations.loc[get]
missense_wt['Mutation'].value_counts()




Missense_Mutation    43
Wildtype_Tumor       41
Name: Mutation, dtype: int64

# Get the median difference between wildtype and missense for each gene

In [5]:
# Split the samples by mutation label, then take the per-protein median of each group.
missense = missense_wt[missense_wt.Mutation == "Missense_Mutation"]  # all missense-mutation samples
wt = missense_wt[missense_wt.Mutation == "Wildtype_Tumor"]  # all wildtype-tumor samples

# Both frames still carry the string 'Mutation' column. numeric_only=True skips
# it explicitly: pandas 1.x dropped such columns silently (later with a
# deprecation warning) and pandas 2.x raises instead of dropping them.
missense_med = missense.median(numeric_only=True)  # median across missense samples, one value per protein
wt_med = wt.median(numeric_only=True)  # median across wildtype samples, one value per protein

In [6]:
# Difference of medians (wildtype minus missense) for every proteomics column,
# keyed by the column label.
endo_d = {prot: wt_med[prot] - missense_med[prot] for prot in proteomics}

# One row per protein, single column 'Endo'.
endo_df = pd.DataFrame.from_dict(endo_d, orient='index', columns=['Endo'])


In [7]:
# Display the wildtype-minus-missense median difference per protein.
endo_df

Unnamed: 0,Endo
A1BG,-0.07100
A2M,-0.10300
A2ML1,-0.47135
A4GALT,-0.22545
AAAS,-0.07410
...,...
ZXDC,-0.08700
ZYG11B,0.03240
ZYX,0.13800
ZZEF1,0.00210


# Get the p-value from the t-test

In [8]:
# Display the frame fed to the t-test: 'Mutation' label first, then one column per protein.
missense_wt

Name,Mutation,A1BG,A2M,A2ML1,A4GALT,AAAS,AACS,AADAT,AAED1,AAGAB,...,ZSWIM8,ZSWIM9,ZW10,ZWILCH,ZWINT,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-00006,Missense_Mutation,-1.180,-0.863,-0.8020,0.2220,0.2560,0.6650,1.2800,-0.3390,0.4120,...,-0.08770,,0.0229,0.1090,,-0.3320,-0.43300,-1.020,-0.12300,-0.0859
C3L-00008,Wildtype_Tumor,-0.685,-1.070,-0.6840,0.9840,0.1350,0.3340,1.3000,0.1390,1.3300,...,-0.03560,,0.3630,1.0700,0.737,-0.5640,-0.00461,-1.130,-0.07570,-0.4730
C3L-00032,Missense_Mutation,-0.528,-1.320,0.4350,,-0.2400,1.0400,-0.0213,-0.0479,0.4190,...,0.00112,-0.1450,0.0105,-0.1160,,0.1510,-0.07400,-0.540,0.32000,-0.4190
C3L-00090,Wildtype_Tumor,-1.670,-1.190,-0.4430,0.2430,-0.0993,0.7570,0.7400,-0.9290,0.2290,...,0.07250,-0.0552,-0.0714,0.0933,0.156,-0.3980,-0.07520,-0.797,-0.03010,-0.4670
C3L-00136,Wildtype_Tumor,-1.080,-0.708,-0.1260,-0.4260,-0.1140,-0.1110,0.8950,1.2600,0.1570,...,0.45500,,0.3970,-0.9990,-0.730,-0.0229,-0.33100,-1.160,-0.11600,0.0025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-01510,Missense_Mutation,-0.269,0.944,1.2000,-0.1550,0.2660,-0.5480,-0.1050,,0.4010,...,-0.20400,,-0.5830,0.9130,0.249,-0.5150,-0.15100,-1.390,-0.12100,-0.4260
C3N-01520,Missense_Mutation,-1.070,-0.712,0.0462,-0.0471,0.2270,1.3500,1.2100,0.0048,0.6820,...,-0.06990,-0.4010,0.5570,0.8270,0.348,-0.1870,-0.10700,-0.830,0.06200,-0.5280
C3N-01537,Wildtype_Tumor,-0.290,-0.320,3.1700,-0.9070,0.0317,-0.0425,,-0.2730,-0.0747,...,-0.17900,-0.5210,0.0523,0.5150,0.585,0.1380,-0.19000,-0.966,-0.00627,-0.2490
C3N-01802,Wildtype_Tumor,0.266,1.390,-0.0655,0.4700,0.3980,-0.1340,0.4610,1.0400,0.3630,...,-0.14000,,-0.0122,0.2500,0.553,0.3870,0.06420,-0.437,0.10400,-0.4980


In [9]:
# Every column after the leading 'Mutation' label is a protein to compare.
cols = missense_wt.columns[1:].tolist()

# Compare the two 'Mutation' groups for each protein column.
# NOTE(review): return_all=True presumably keeps every comparison in the result
# rather than only the significant ones — confirm against the cptac.utils docs.
t_test = u.wrap_ttest(missense_wt, 'Mutation', cols, return_all=True)
t_test

  **kwargs)
  ret = ret.dtype.type(ret / rcount)
  reject = pvals <= alphacBonf
  pvals_corrected[pvals_corrected>1] = 1


Unnamed: 0,Comparison,P_Value
0,PPP2R2D,0.000692
1,PCDHB3,0.001627
2,ZNF432,0.001928
3,KLF6,0.001964
4,CASP7,0.002192
...,...,...
10994,ZNF416,
10995,ZNF506,
10996,ZNF69,
10997,ZNF79,


# Combine the difference in median with results from t-test

In [10]:
# Index the t-test results by protein name so they can be joined on the index.
# Guarded so that re-running this cell is a no-op: once 'Comparison' has become
# the index it is no longer a column, and a second set_index('Comparison')
# would raise KeyError.
if 'Comparison' in t_test.columns:
    t_test = t_test.set_index('Comparison')

In [11]:
# Attach each protein's p-value to its median difference. This is a left join on
# the index, so every row of endo_df is kept.
df = endo_df.join(t_test, how='left')
df

Unnamed: 0,Endo,P_Value
A1BG,-0.07100,0.720461
A2M,-0.10300,0.681688
A2ML1,-0.47135,0.136212
A4GALT,-0.22545,0.492356
AAAS,-0.07410,0.295868
...,...,...
ZXDC,-0.08700,0.808610
ZYG11B,0.03240,0.755896
ZYX,0.13800,0.251409
ZZEF1,0.00210,0.398421


# Brca

In [12]:
# Instantiate the cptac Brca dataset object; the cells below query it for
# proteomics and genotype tables and use its reduce_multiindex() helper.
br = cptac.Brca()

                                    

In [28]:
# Several genes appear under more than one Database_ID; we need to decide which
# Database_ID to use for each protein, so start by isolating those columns.
proteomics = br.get_proteomics()
gene_names = proteomics.columns.get_level_values('Name')
# keep=False flags every occurrence of a repeated gene name, not just the later ones.
fil = gene_names.duplicated(keep=False)
duplicates = proteomics[proteomics.columns[fil]]
duplicates

Name,ABCB7,ABCB7,ABLIM3,ABLIM3,ACIN1,ACIN1,ACLY,ACLY,ACOX1,ACOX1,...,WDR54,WDR54,WHSC1L1,WHSC1L1,YBX3,YBX3,ZAK,ZAK,ZNF185,ZNF185
Database_ID,NP_001258628.1|NP_001258626.1,NP_004290.2|NP_001258625.1|NP_001258627.1,NP_001287947.1|NP_001287944.1,NP_001287956.1,NP_001158286.1,NP_001158287.1|NP_055792.1|NP_001158288.1|NP_001158289.1,NP_001290203.1|NP_001087.2,NP_001290204.1|NP_942127.1,NP_004026.2|NP_001171968.1,NP_009223.2,...,NP_001307752.1|NP_001307753.1,NP_001307754.1,NP_060248.2,NP_075447.1,NP_001138898.1,NP_003642.3,NP_057737.2,NP_598407.1,NP_001171579.1|NP_001171584.1,NP_001171580.1|NP_001171578.1|NP_001171581.1
Patient_ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
CPT000814,1.0917,0.0508,-2.4889,,-1.4695,-1.5928,-0.7593,-5.9971,0.1271,0.0449,...,0.0116,0.6690,1.8430,5.1223,-2.5711,0.1721,-0.2427,0.9723,-1.2406,-0.5460
CPT001846,-0.3479,-0.7086,-0.5013,-1.7350,-1.5887,-1.6706,-0.3044,-3.3574,2.6842,-0.1632,...,0.7900,1.1682,0.4607,-0.8393,-2.0801,0.9434,0.8458,2.2521,0.3352,0.9695
X01BR001,1.1956,-0.0171,0.0961,-1.4710,-3.5740,-0.4782,-0.0726,-2.7093,1.3472,0.9800,...,-0.1943,-0.4163,1.8340,0.1110,-2.2567,-0.7579,-0.1537,-0.1281,0.5017,
X01BR008,-1.5473,-0.2247,-0.4993,,-0.1057,0.8210,-1.1537,-1.1583,-0.3826,-1.2567,...,-3.6662,,0.2718,0.6883,1.6608,2.3106,-1.6823,-1.9889,-1.1515,-1.5633
X01BR009,-1.0404,0.6316,-0.2648,,-0.2494,0.3054,0.1667,-0.7758,0.5365,1.1016,...,-3.1232,,-0.6603,-1.2279,-1.5104,-0.2468,-0.8349,-0.8837,2.4114,2.2290
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
X21BR001,-1.7766,-0.2736,-2.6944,,0.8575,1.1513,-1.4801,-1.4229,2.8285,5.3117,...,-2.3434,,1.1825,3.7567,-2.7230,-0.9887,-1.1473,-0.4036,-2.0106,-1.8650
X21BR002,0.1679,0.7299,1.9825,,-0.5279,0.2388,0.2914,-0.3625,-0.0973,2.7151,...,-0.0815,,-1.1450,-0.4124,1.2367,-0.5490,0.5723,-1.2816,-0.1813,-0.6198
X21BR010,-1.2730,-0.5020,3.1096,4.3258,-1.7271,0.6109,1.0829,-3.0010,0.1647,0.1318,...,1.2257,0.2830,-0.4778,-1.0635,-3.4428,-0.4987,-0.3067,-0.1714,0.9716,1.3576
X22BR005,0.5127,0.5379,-0.6598,,-2.5121,-0.2102,2.0881,2.6132,0.0020,-1.1381,...,-0.7389,-0.2893,-0.0268,-0.2318,0.7825,0.6998,1.5917,1.0378,-0.4871,-1.5122


In [30]:
# Complement of `duplicates`: the columns whose gene name occurs exactly once
# (`fil` marks every column whose 'Name' is repeated).
no_duplicates = proteomics[proteomics.columns[~fil]]

In [31]:
# Display the proteomics columns whose gene name is not duplicated.
no_duplicates

Name,A1BG,A2M,A2ML1,AAAS,AACS,AADAT,AAED1,AAGAB,AAK1,AAMDC,...,ZSCAN31,ZSWIM8,ZW10,ZWILCH,ZWINT,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3
Database_ID,NP_570602.2,NP_000005.2,NP_653271.2|NP_001269353.1,NP_056480.1|NP_001166937.1,NP_076417.2|NP_001306769.1|NP_001306768.1,NP_057312.1|NP_001273611.1,NP_714542.1,NP_078942.3|NP_001258815.1,NP_055726.3,NP_078960.1|NP_001303886.1|NP_001303887.1,...,NP_665916.1|NP_001230172.1|NP_116078.4|NP_001166148.1|NP_001229731.1|NP_001012458.1,NP_055852.2|NP_001229416.1|NP_001229417.1,NP_004715.1,NP_060445.3|NP_001274750.1,NP_008988.2|NP_001005413.1,NP_079388.3|NP_001035743.1,NP_078922.1,NP_001010972.1,NP_055928.3,NP_056349.1|NP_001295166.1
Patient_ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
CPT000814,-0.6712,-0.2075,2.7959,1.3969,-1.0899,,1.6708,-0.3484,-0.4756,-0.7299,...,-5.2868,-0.6536,0.3384,2.1169,1.3910,-2.1230,0.9136,-0.8082,-1.4793,0.9136
CPT001846,1.3964,1.3302,-5.0948,0.7674,-1.6845,,2.1022,-0.5814,0.2916,-2.2857,...,-0.7592,0.4711,0.6018,0.2062,-0.2137,-2.1219,0.0860,2.5814,-0.2852,-0.1074
X01BR001,2.0219,1.6269,-3.2943,0.3352,-1.0739,1.2255,0.2754,-1.1187,-0.0534,-0.2519,...,,0.2306,-0.3010,0.3395,-0.5316,,0.4996,0.7622,-1.5607,0.0256
X01BR008,-0.5290,0.3267,1.4342,0.4938,-2.8676,,,-1.0691,-0.3643,-1.8173,...,-2.1789,0.2695,0.1506,1.0498,0.7546,1.7889,-0.2499,-0.2590,-0.1263,0.3725
X01BR009,1.2556,3.4489,2.8043,-0.2956,-1.7261,,,-2.0471,-0.3547,-0.8298,...,-2.3990,-0.2596,0.1898,-0.5010,-0.4189,0.3080,0.5057,0.2181,-0.2288,-0.2750
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
X21BR001,-0.6610,-0.6402,-4.8578,1.2319,-1.6491,,,-0.3074,-0.3074,-0.0266,...,-0.2528,0.5090,0.0306,0.4908,-0.5570,2.3864,0.3764,-0.6974,1.3541,1.1123
X21BR002,-1.3735,0.4227,-4.9553,0.6327,-3.1434,,,0.3071,0.7562,-1.6912,...,-3.3351,0.1548,1.0792,-0.6619,-1.4444,-0.3704,0.4909,0.3938,0.2992,-0.3494
X21BR010,1.1583,0.3329,-5.7358,-0.1658,-2.0413,-1.2433,0.9090,-0.2410,0.6717,-0.1651,...,-0.7054,0.2752,0.8850,-2.6704,-0.9444,-1.9717,0.0650,0.6300,-0.0686,0.1798
X22BR005,0.4948,-1.0986,-8.8314,0.2826,-1.0123,-2.5732,5.7567,1.7644,0.5415,0.1531,...,-0.3936,-0.0340,-0.9367,-0.1922,1.2572,1.3220,-1.0698,0.4012,-0.3792,1.2752


In [15]:
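# TODO: get the p-value for each duplicated isoform and keep the minimum per gene
# TODO(review): the second note here was left unfinished ("get the ...") — complete or remove it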

Name,ABCB7,ABCB7,ABLIM3,ABLIM3,ACIN1,ACIN1,ACLY,ACLY,ACOX1,ACOX1,...,WDR54,WDR54,WHSC1L1,WHSC1L1,YBX3,YBX3,ZAK,ZAK,ZNF185,ZNF185
Database_ID,NP_001258628.1|NP_001258626.1,NP_004290.2|NP_001258625.1|NP_001258627.1,NP_001287947.1|NP_001287944.1,NP_001287956.1,NP_001158286.1,NP_001158287.1|NP_055792.1|NP_001158288.1|NP_001158289.1,NP_001290203.1|NP_001087.2,NP_001290204.1|NP_942127.1,NP_004026.2|NP_001171968.1,NP_009223.2,...,NP_001307752.1|NP_001307753.1,NP_001307754.1,NP_060248.2,NP_075447.1,NP_001138898.1,NP_003642.3,NP_057737.2,NP_598407.1,NP_001171579.1|NP_001171584.1,NP_001171580.1|NP_001171578.1|NP_001171581.1
Patient_ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
CPT000814,1.0917,0.0508,-2.4889,,-1.4695,-1.5928,-0.7593,-5.9971,0.1271,0.0449,...,0.0116,0.6690,1.8430,5.1223,-2.5711,0.1721,-0.2427,0.9723,-1.2406,-0.5460
CPT001846,-0.3479,-0.7086,-0.5013,-1.7350,-1.5887,-1.6706,-0.3044,-3.3574,2.6842,-0.1632,...,0.7900,1.1682,0.4607,-0.8393,-2.0801,0.9434,0.8458,2.2521,0.3352,0.9695
X01BR001,1.1956,-0.0171,0.0961,-1.4710,-3.5740,-0.4782,-0.0726,-2.7093,1.3472,0.9800,...,-0.1943,-0.4163,1.8340,0.1110,-2.2567,-0.7579,-0.1537,-0.1281,0.5017,
X01BR008,-1.5473,-0.2247,-0.4993,,-0.1057,0.8210,-1.1537,-1.1583,-0.3826,-1.2567,...,-3.6662,,0.2718,0.6883,1.6608,2.3106,-1.6823,-1.9889,-1.1515,-1.5633
X01BR009,-1.0404,0.6316,-0.2648,,-0.2494,0.3054,0.1667,-0.7758,0.5365,1.1016,...,-3.1232,,-0.6603,-1.2279,-1.5104,-0.2468,-0.8349,-0.8837,2.4114,2.2290
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
X21BR001,-1.7766,-0.2736,-2.6944,,0.8575,1.1513,-1.4801,-1.4229,2.8285,5.3117,...,-2.3434,,1.1825,3.7567,-2.7230,-0.9887,-1.1473,-0.4036,-2.0106,-1.8650
X21BR002,0.1679,0.7299,1.9825,,-0.5279,0.2388,0.2914,-0.3625,-0.0973,2.7151,...,-0.0815,,-1.1450,-0.4124,1.2367,-0.5490,0.5723,-1.2816,-0.1813,-0.6198
X21BR010,-1.2730,-0.5020,3.1096,4.3258,-1.7271,0.6109,1.0829,-3.0010,0.1647,0.1318,...,1.2257,0.2830,-0.4778,-1.0635,-3.4428,-0.4987,-0.3067,-0.1714,0.9716,1.3576
X22BR005,0.5127,0.5379,-0.6598,,-2.5121,-0.2102,2.0881,2.6132,0.0020,-1.1381,...,-0.7389,-0.2893,-0.0268,-0.2318,0.7825,0.6998,1.5917,1.0378,-0.4871,-1.5122


In [16]:
# Collapse the two-level (Name, Database_ID) column index of the duplicated-gene
# frame into single labels; the displayed result uses '<Name>_<Database_ID>'.
# NOTE: this rebinds `proteomics` to the duplicated genes only, so the Brca
# cells that follow operate on that subset rather than on the full table.
proteomics = br.reduce_multiindex(duplicates, flatten=True)
proteomics

Name,ABCB7_NP_001258628.1|NP_001258626.1,ABCB7_NP_004290.2|NP_001258625.1|NP_001258627.1,ABLIM3_NP_001287947.1|NP_001287944.1,ABLIM3_NP_001287956.1,ACIN1_NP_001158286.1,ACIN1_NP_001158287.1|NP_055792.1|NP_001158288.1|NP_001158289.1,ACLY_NP_001290203.1|NP_001087.2,ACLY_NP_001290204.1|NP_942127.1,ACOX1_NP_004026.2|NP_001171968.1,ACOX1_NP_009223.2,...,WDR54_NP_001307752.1|NP_001307753.1,WDR54_NP_001307754.1,WHSC1L1_NP_060248.2,WHSC1L1_NP_075447.1,YBX3_NP_001138898.1,YBX3_NP_003642.3,ZAK_NP_057737.2,ZAK_NP_598407.1,ZNF185_NP_001171579.1|NP_001171584.1,ZNF185_NP_001171580.1|NP_001171578.1|NP_001171581.1
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CPT000814,1.0917,0.0508,-2.4889,,-1.4695,-1.5928,-0.7593,-5.9971,0.1271,0.0449,...,0.0116,0.6690,1.8430,5.1223,-2.5711,0.1721,-0.2427,0.9723,-1.2406,-0.5460
CPT001846,-0.3479,-0.7086,-0.5013,-1.7350,-1.5887,-1.6706,-0.3044,-3.3574,2.6842,-0.1632,...,0.7900,1.1682,0.4607,-0.8393,-2.0801,0.9434,0.8458,2.2521,0.3352,0.9695
X01BR001,1.1956,-0.0171,0.0961,-1.4710,-3.5740,-0.4782,-0.0726,-2.7093,1.3472,0.9800,...,-0.1943,-0.4163,1.8340,0.1110,-2.2567,-0.7579,-0.1537,-0.1281,0.5017,
X01BR008,-1.5473,-0.2247,-0.4993,,-0.1057,0.8210,-1.1537,-1.1583,-0.3826,-1.2567,...,-3.6662,,0.2718,0.6883,1.6608,2.3106,-1.6823,-1.9889,-1.1515,-1.5633
X01BR009,-1.0404,0.6316,-0.2648,,-0.2494,0.3054,0.1667,-0.7758,0.5365,1.1016,...,-3.1232,,-0.6603,-1.2279,-1.5104,-0.2468,-0.8349,-0.8837,2.4114,2.2290
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
X21BR001,-1.7766,-0.2736,-2.6944,,0.8575,1.1513,-1.4801,-1.4229,2.8285,5.3117,...,-2.3434,,1.1825,3.7567,-2.7230,-0.9887,-1.1473,-0.4036,-2.0106,-1.8650
X21BR002,0.1679,0.7299,1.9825,,-0.5279,0.2388,0.2914,-0.3625,-0.0973,2.7151,...,-0.0815,,-1.1450,-0.4124,1.2367,-0.5490,0.5723,-1.2816,-0.1813,-0.6198
X21BR010,-1.2730,-0.5020,3.1096,4.3258,-1.7271,0.6109,1.0829,-3.0010,0.1647,0.1318,...,1.2257,0.2830,-0.4778,-1.0635,-3.4428,-0.4987,-0.3067,-0.1714,0.9716,1.3576
X22BR005,0.5127,0.5379,-0.6598,,-2.5121,-0.2102,2.0881,2.6132,0.0020,-1.1381,...,-0.7389,-0.2893,-0.0268,-0.2318,0.7825,0.6998,1.5917,1.0378,-0.4871,-1.5122


In [17]:
# Only the 'Mutation' label is needed from the Brca genotype table.
mut_type = br.get_genotype_all_vars(gene).drop(columns=["Mutation_Status", "Location"])

# Mutation label next to every (flattened) proteomics column.
prot_and_mutations = mut_type.join(proteomics)

# Restrict the samples to the two groups being compared.
compare = ['Wildtype_Tumor', 'Missense_Mutation']
get = prot_and_mutations['Mutation'].isin(compare)
missense_wt = prot_and_mutations.loc[get]
missense_wt['Mutation'].value_counts()



Wildtype_Tumor       53
Missense_Mutation    33
Name: Mutation, dtype: int64

## Get the difference in medians between wildtype and missense

In [18]:
# Split the samples by mutation label, then take the per-protein median of each group.
missense = missense_wt[missense_wt.Mutation == "Missense_Mutation"]  # all missense-mutation samples
wt = missense_wt[missense_wt.Mutation == "Wildtype_Tumor"]  # all wildtype-tumor samples

# Both frames still carry the string 'Mutation' column. numeric_only=True skips
# it explicitly: pandas 1.x dropped such columns silently (later with a
# deprecation warning) and pandas 2.x raises instead of dropping them.
missense_med = missense.median(numeric_only=True)  # median across missense samples, one value per protein
wt_med = wt.median(numeric_only=True)  # median across wildtype samples, one value per protein

In [19]:
# Difference of medians (wildtype minus missense) for every proteomics column,
# keyed by the column label.
brca_d = {prot: wt_med[prot] - missense_med[prot] for prot in proteomics}

# One row per protein isoform, single column 'Brca'.
brca_df = pd.DataFrame.from_dict(brca_d, orient='index', columns=['Brca'])

brca_df

Unnamed: 0,Brca
ABCB7_NP_001258628.1|NP_001258626.1,0.28370
ABCB7_NP_004290.2|NP_001258625.1|NP_001258627.1,0.18180
ABLIM3_NP_001287947.1|NP_001287944.1,-0.33190
ABLIM3_NP_001287956.1,-0.37175
ACIN1_NP_001158286.1,0.10270
...,...
YBX3_NP_003642.3,-0.03920
ZAK_NP_057737.2,-0.09030
ZAK_NP_598407.1,-0.21270
ZNF185_NP_001171579.1|NP_001171584.1,-0.19150


## Get the t-test results

In [20]:
# Every column after the leading 'Mutation' label is a protein isoform to compare.
cols = missense_wt.columns[1:].tolist()

# Run the comparison and index the result by protein label in one step, so the
# frame is ready to be joined on its index.
# NOTE(review): return_all=True presumably keeps every comparison in the result
# rather than only the significant ones — confirm against the cptac.utils docs.
t_test = (
    u.wrap_ttest(missense_wt, 'Mutation', cols, return_all=True)
    .set_index('Comparison')
)
t_test

Unnamed: 0_level_0,P_Value
Comparison,Unnamed: 1_level_1
DNM2_NP_001005361.1|NP_001005362.1,0.001085
CD99_NP_001308297.1|NP_001308298.1,0.002056
AFDN_NP_001278893.1,0.002576
LGALS8_NP_963838.1,0.002597
HAGH_NP_001035517.1,0.004242
...,...
AKAP9_NP_671714.1,0.992857
CAMK1D_NP_065130.1,0.993589
RANBP3_NP_003615.2|NP_001287794.1,0.998764
INF2_NP_001026884.3|NP_116103.1,0.999194


In [21]:
# Attach each isoform's p-value to its median difference. This is a left join on
# the index, so every row of brca_df is kept.
df = brca_df.join(t_test, how='left')
df

Unnamed: 0,Brca,P_Value
ABCB7_NP_001258628.1|NP_001258626.1,0.28370,0.253549
ABCB7_NP_004290.2|NP_001258625.1|NP_001258627.1,0.18180,0.641102
ABLIM3_NP_001287947.1|NP_001287944.1,-0.33190,0.852467
ABLIM3_NP_001287956.1,-0.37175,0.135988
ACIN1_NP_001158286.1,0.10270,0.800310
...,...,...
YBX3_NP_003642.3,-0.03920,0.559371
ZAK_NP_057737.2,-0.09030,0.780759
ZAK_NP_598407.1,-0.21270,0.989303
ZNF185_NP_001171579.1|NP_001171584.1,-0.19150,0.171569


In [22]:
# Now filter out the isoform rows we don't want to keep.
# Select every isoform of one gene. The flattened labels have the form
# '<gene>_<Database_ID>', so match on the '<gene>_' prefix: an unanchored
# str.contains() is a regex/substring search and would also select any other
# gene whose name merely contains the query.
duplicates = df[df.index.get_level_values(0).str.startswith('ABCB7_')]
# Compare the p-values and keep the row(s) with the smallest one.
min_row = duplicates[duplicates.P_Value == duplicates.P_Value.min()]
min_row



Unnamed: 0,Brca,P_Value
ABCB7_NP_001258628.1|NP_001258626.1,0.2837,0.253549


In [23]:
# Instantiate the cptac Colon dataset object; the cells below query it for
# its proteomics table and for the per-sample genotype of `gene`.
cl = cptac.Colon()

                                    

In [24]:
# Step 1 - build the frame used for the wrap_ttest comparisons.
proteomics = cl.get_proteomics()

# Drop every row whose Patient_ID ends in '.N'.
# NOTE(review): presumably these are the normal (non-tumor) samples — confirm.
protfilter = proteomics.index.str.endswith('.N')
proteomics = proteomics.loc[~protfilter]
proteomics

Name,A1BG,A1CF,A2M,AAAS,AACS,AAGAB,AAK1,AAMDC,AAMP,AAR2,...,ZNHIT6,ZNRD1,ZNRF2,ZPR1,ZRANB2,ZW10,ZWILCH,ZWINT,ZYX,ZZEF1
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01CO005,-1.100,0.3180,-0.4870,0.0995,0.1550,0.1690,0.0653,-0.147,0.1140,0.3410,...,,,0.0384,0.0221,0.2500,0.0869,0.0331,,-0.7330,-0.2650
01CO006,-1.120,-0.4410,-0.3470,-0.0029,0.0957,0.3960,-0.0363,-0.549,0.2200,0.2480,...,,0.324,,-0.3940,0.0846,0.0010,-0.3450,,-0.6580,0.0052
01CO008,-1.200,0.1600,-1.8500,0.1190,-0.0924,0.0187,-0.2140,0.328,-0.2820,-0.3480,...,,,-0.3840,-0.1680,0.3570,-0.3250,0.3490,,-0.8210,0.2000
01CO013,-1.890,0.1120,-0.3290,0.6700,0.1160,0.3130,-0.2380,-0.274,-0.5540,0.2700,...,,,,-0.0656,-0.2950,0.4630,-0.4480,0.0493,-0.9040,-0.1580
01CO014,-0.523,-0.2480,-0.6380,0.2890,0.3780,0.8220,0.0018,-1.080,0.4980,-0.1200,...,0.0322,,0.0467,0.2170,0.2360,0.2870,0.3460,,-0.8120,-0.0273
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21CO006,-0.557,-0.4470,0.3420,-0.1390,0.1400,0.0517,0.0039,-0.118,0.1020,0.1750,...,,,,-0.2750,-0.0306,0.0866,,,0.0873,-0.0152
21CO007,-0.975,0.3980,-1.1500,0.5730,-0.1280,0.0600,-0.0559,-1.330,-0.3650,0.2920,...,,,0.2460,0.0047,0.1650,0.1030,,,-0.7520,-0.2410
22CO004,-1.280,0.5180,-0.8720,0.2580,0.4240,0.2980,-0.1500,-0.658,0.0066,-0.0378,...,,,0.4490,-0.0616,0.3850,0.1160,-0.0437,,-0.8840,0.0088
22CO006,0.515,-1.2100,-0.0283,0.1780,-0.3030,0.1500,0.3990,-1.000,0.2060,-0.2970,...,,,-0.5610,0.4740,0.0836,-0.2510,0.1760,,0.0584,-0.3010


In [25]:
prot_list = proteomics.columns.tolist()

# Only the 'Mutation' label is needed from the Colon genotype table.
mut_type = cl.get_genotype_all_vars(gene).drop(columns=["Mutation_Status", "Location"])
mut_type



Name,Mutation
Patient_ID,Unnamed: 1_level_1
01CO001,Wildtype_Tumor
01CO005,Wildtype_Tumor
01CO006,Wildtype_Tumor
01CO008,Amplification
01CO013,Wildtype_Tumor
...,...
21CO007,Amplification
22CO004,Wildtype_Tumor
22CO006,Wildtype_Tumor
24CO005,Wildtype_Tumor


In [26]:
# Preview the mutation label joined with the proteomics columns. The result is
# only displayed, not stored. Some samples present in the genotype table show
# all-NaN protein values in the output (e.g. 01CO001).
mut_type.join(proteomics)

Name,Mutation,A1BG,A1CF,A2M,AAAS,AACS,AAGAB,AAK1,AAMDC,AAMP,...,ZNHIT6,ZNRD1,ZNRF2,ZPR1,ZRANB2,ZW10,ZWILCH,ZWINT,ZYX,ZZEF1
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01CO001,Wildtype_Tumor,,,,,,,,,,...,,,,,,,,,,
01CO005,Wildtype_Tumor,-1.100,0.3180,-0.4870,0.0995,0.1550,0.1690,0.0653,-0.147,0.1140,...,,,0.0384,0.0221,0.2500,0.0869,0.0331,,-0.7330,-0.2650
01CO006,Wildtype_Tumor,-1.120,-0.4410,-0.3470,-0.0029,0.0957,0.3960,-0.0363,-0.549,0.2200,...,,0.324,,-0.3940,0.0846,0.0010,-0.3450,,-0.6580,0.0052
01CO008,Amplification,-1.200,0.1600,-1.8500,0.1190,-0.0924,0.0187,-0.2140,0.328,-0.2820,...,,,-0.3840,-0.1680,0.3570,-0.3250,0.3490,,-0.8210,0.2000
01CO013,Wildtype_Tumor,-1.890,0.1120,-0.3290,0.6700,0.1160,0.3130,-0.2380,-0.274,-0.5540,...,,,,-0.0656,-0.2950,0.4630,-0.4480,0.0493,-0.9040,-0.1580
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21CO007,Amplification,-0.975,0.3980,-1.1500,0.5730,-0.1280,0.0600,-0.0559,-1.330,-0.3650,...,,,0.2460,0.0047,0.1650,0.1030,,,-0.7520,-0.2410
22CO004,Wildtype_Tumor,-1.280,0.5180,-0.8720,0.2580,0.4240,0.2980,-0.1500,-0.658,0.0066,...,,,0.4490,-0.0616,0.3850,0.1160,-0.0437,,-0.8840,0.0088
22CO006,Wildtype_Tumor,0.515,-1.2100,-0.0283,0.1780,-0.3030,0.1500,0.3990,-1.000,0.2060,...,,,-0.5610,0.4740,0.0836,-0.2510,0.1760,,0.0584,-0.3010
24CO005,Wildtype_Tumor,,,,,,,,,,...,,,,,,,,,,


## Join the difference in medians with the t-test result