In [1]:
import pandas as pd
import cptac
import scipy.stats as stats
import numpy as np
import statsmodels.stats.multitest as ssm
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [2]:
# Instantiate one cptac dataset object per cancer type used in this notebook.
# Each constructor may print index/download status messages when it runs.
ccrcc = cptac.Ccrcc()
en = cptac.Endometrial()
luad = cptac.Luad()
lscc = cptac.Lscc()
hnscc = cptac.Hnscc()

Checking that hnscc index is up-to-date...      



                                          

In [3]:
# Keep each short label next to its dataset object so the two parallel
# sequences consumed downstream are always built in the same order.
_datasets_by_name = {
    'ccrcc': ccrcc,
    'en': en,
    'luad': luad,
    'lscc': lscc,
    'hnscc': hnscc,
}
cancer_list = list(_datasets_by_name.values())
cancer_names = list(_datasets_by_name.keys())

In [4]:
def _melt_omics(omics_df, value_name, tissue):
    """Reshape a wide sample-by-gene table into long format.

    Parameters
    ----------
    omics_df : pd.DataFrame
        Wide table whose index identifies samples and whose columns identify
        genes. If the columns are a MultiIndex, the 'Database_ID' level is
        dropped so that only the remaining level is used as the gene label.
    value_name : str
        Name of the column that receives the measured values
        (e.g. 'Transcriptomics' or 'Proteomics').
    tissue : str
        Label written into the 'Tissue' column of every row.

    Returns
    -------
    pd.DataFrame
        Columns: 'patient_ID', 'Gene', ``value_name``, 'Tissue'.
    """
    if isinstance(omics_df.columns, pd.MultiIndex):
        omics_df = omics_df.droplevel('Database_ID', axis=1)
    long_df = (
        omics_df
        # assign() returns a new frame, so the getter's result is not mutated.
        .assign(patient_ID=omics_df.index)
        .melt(id_vars='patient_ID', var_name='Gene', value_name=value_name)
    )
    long_df['Tissue'] = tissue
    return long_df


# Build one long table per cancer holding paired transcriptomics and
# proteomics values for every (sample, gene, tissue) present in both.
cancer_dfs = []
for cancer, name in zip(cancer_list, cancer_names):
    # Every table goes through the same helper. The original tumor
    # transcriptomics branch called droplevel on the already-melted *normal*
    # frame, which is wrong whenever the tumor table has MultiIndex columns.
    trans_df = pd.concat([
        _melt_omics(
            cancer.get_transcriptomics(tissue_type=tissue),
            'Transcriptomics',
            tissue,
        )
        for tissue in ('normal', 'tumor')
    ])
    prot_df = pd.concat([
        _melt_omics(
            cancer.get_proteomics(tissue_type=tissue),
            'Proteomics',
            tissue,
        )
        for tissue in ('normal', 'tumor')
    ])

    cancer_df = (
        # Default inner merge: keep only rows measured in both data types.
        pd.merge(trans_df, prot_df, on=['patient_ID', 'Gene', 'Tissue'])
        .dropna()
        .assign(Cancer=name)
    )
    cancer_dfs.append(cancer_df)

df = pd.concat(cancer_dfs)
df


Unnamed: 0,patient_ID,Gene,Transcriptomics,Tissue,Proteomics,Cancer
0,C3L-00004.N,A1BG,0.859821,normal,0.291127,ccrcc
1,C3L-00010.N,A1BG,0.824992,normal,-0.579658,ccrcc
2,C3L-00011.N,A1BG,0.673249,normal,-0.096200,ccrcc
3,C3L-00026.N,A1BG,3.043751,normal,-0.209196,ccrcc
4,C3L-00079.N,A1BG,0.801038,normal,-0.395915,ccrcc
...,...,...,...,...,...,...
1766693,C3N-04275,ZZZ3,11.520000,tumor,18.577359,hnscc
1766695,C3N-04277,ZZZ3,11.510000,tumor,19.897997,hnscc
1766696,C3N-04278,ZZZ3,11.220000,tumor,19.898459,hnscc
1766697,C3N-04279,ZZZ3,11.330000,tumor,19.755773,hnscc


In [5]:
# Collect the upper-cased gene symbols/aliases listed after "(RefSeq)" on each
# line of the KEGG pathway listing.
RNA_transport_genes = []
file_name = 'data/kegg_rna_transport.txt'

# Capture everything between the "RefSeq" marker (plus its optional closing
# parenthesis) and the first ';' or end of line. An anchored pattern is used
# instead of str.strip('RefSeq)'), because strip() removes any of those
# *characters* from both ends and can therefore eat trailing letters of the
# last symbol (e.g. 'TPR' -> 'TP').
_refseq_pattern = re.compile(r'RefSeq\)?([^;\n]*)')

with open(file_name) as file:
    for line in file:
        print(line)
        refseq_match = _refseq_pattern.search(line)
        if refseq_match is None:
            # Header/separator lines carry no RefSeq symbol list.
            continue
        for gene in refseq_match.group(1).split(','):
            gene = gene.strip().upper()
            if gene != '':
                RNA_transport_genes.append(gene)
                print(gene)

ID                   Definition

----------------------------------------------------------------------------------------------------

hsa:100101267        K14316 nuclear pore complex protein Nup121 | (RefSeq) POM121C, POM121-2; POM121 transmembrane nucleo 

POM121C
POM121-2
hsa:10073            K13151 snurportin-1 | (RefSeq) SNUPN, KPNBL, RNUT1, Snurportin1; snurportin 1 

SNUPN
KPNBL
RNUT1
SNURPORTIN1
hsa:10189            K12881 THO complex subunit 4 | (RefSeq) ALYREF, ALY, ALY/REF, BEF, REF, THOC4; Aly/REF export factor 

ALYREF
ALY
ALY/REF
BEF
REF
THOC4
hsa:101954264        K14276 U1 spliceosomal RNA | (RefSeq) RNVU1-4, RNU1-102, RNU1-50, RNVU1-5, vU1.4, vU1.5; RNA, varian 

RNVU1-4
RNU1-102
RNU1-50
RNVU1-5
VU1.4
VU1.5
hsa:101954268        K14276 U1 spliceosomal RNA | (RefSeq) RNVU1-20, RNU1-110, vU1.20; RNA, variant U1 small nuclear 20 

RNVU1-20
RNU1-110
VU1.20
hsa:101954271        K14280 U6 spliceosomal RNA | (RefSeq) RNU6-9, U6-9; RNA, U6 small nuclear 9 

RNU6-9
U6-9
hsa:10195

In [9]:
# Frequently mutated genes in the LUAD dataset, displayed with the highest
# 'Unique_Samples_Mut' values first.
luad_mutations = cptac.utils.get_frequently_mutated(luad)
luad_mutations.sort_values(by='Unique_Samples_Mut', ascending=False)

Name,Gene,Unique_Samples_Mut,Missense_Mut,Truncation_Mut
94,TP53,0.536364,0.400000,0.145455
31,EGFR,0.345455,0.345455,0.000000
54,MUC16,0.336364,0.300000,0.072727
96,TTN,0.318182,0.281818,0.081818
81,RYR2,0.318182,0.272727,0.081818
...,...,...,...,...
92,TENM1,0.109091,0.100000,0.009091
73,POM121L12,0.109091,0.109091,0.000000
20,COL3A1,0.109091,0.090909,0.027273
76,PXDNL,0.109091,0.109091,0.000000


In [7]:
# Display the installed cptac package version alongside the results.
cptac.version()

'0.8.8'

In [8]:
# Print pandas' report of the Python, OS and dependency versions in use.
pd.show_versions()




INSTALLED VERSIONS
------------------
commit           : f2c8480af2f25efdbd803218b9d87980f416563e
python           : 3.8.3.final.0
python-bits      : 64
OS               : Windows
OS-release       : 10
Version          : 10.0.19041
machine          : AMD64
processor        : Intel64 Family 6 Model 78 Stepping 3, GenuineIntel
byteorder        : little
LC_ALL           : None
LANG             : None
LOCALE           : English_United States.1252

pandas           : 1.2.3
numpy            : 1.18.5
pytz             : 2020.1
dateutil         : 2.8.1
pip              : 20.1.1
setuptools       : 49.2.0.post20200714
Cython           : 0.29.21
pytest           : 5.4.3
hypothesis       : None
sphinx           : 3.1.2
blosc            : None
feather          : None
xlsxwriter       : 1.2.9
lxml.etree       : 4.5.2
html5lib         : 1.1
pymysql          : None
psycopg2         : None
jinja2           : 2.11.2
IPython          : 7.16.1
pandas_datareader: None
bs4              : 4.9.1
bottleneck   