In [1]:
import cptac
import pandas as pd
import pyensembl
import scipy.stats as stats

In [2]:
ensembl = pyensembl.EnsemblRelease()

In [4]:
cptac.download(dataset="luad")
cptac.download(dataset="hnscc")
cptac.download(dataset="ovarian")
cptac.download(dataset="colon")
cptac.download(dataset="brca")
cptac.download(dataset="lscc")

Checking that lscc index is up-to-date...   



Password for lscc dataset: ········      
Wrong password. Try again: ········       
Wrong password. Try again: ········       
                                           

True

In [8]:
luad = cptac.Luad()
brca = cptac.Brca()
ovarian = cptac.Ovarian()
colon = cptac.Colon()
lscc = cptac.Lscc()
hnscc = cptac.Hnscc()

Checking that brca index is up-to-date...



version 3scc v3.2.......                    
Checking that hnscc index is up-to-date...



                                          



In [9]:
luad_cnv = luad.get_CNV()
hnscc_cnv = hnscc.get_CNV()
ovarian_cnv =  ovarian.get_CNV()
colon_cnv = colon.get_CNV()
brca_cnv = brca.get_CNV()
lscc_cnv = lscc.get_CNV()

In [10]:
def somatic_mutations_to_binary(df):
    new_df = df.copy()
    new_df['v'] = 1
    new_df['m'] = df['Gene'] + '_' + df['Location']
    df.reset_index()
    new_df = pd.pivot_table(new_df, values=['v'], index=['Patient_ID'], columns=['m'], fill_value=0)
    if isinstance(new_df.columns, pd.MultiIndex):
        new_df.columns = new_df.columns.droplevel(0)
        new_df.columns = new_df.columns.rename("Name")
    return new_df

In [11]:
luad_mutations = somatic_mutations_to_binary(luad.get_somatic_mutation())
hnscc_mutations = somatic_mutations_to_binary(hnscc.get_somatic_mutation())
ovarian_mutations = somatic_mutations_to_binary(ovarian.get_somatic_mutation())
colon_mutations = colon.get_somatic_mutation_binary()
brca_mutations = somatic_mutations_to_binary(brca.get_somatic_mutation())
lscc_mutations = somatic_mutations_to_binary(lscc.get_somatic_mutation())

## Find if it has event

In [12]:
deletion_event = 30794385.5

In [13]:
def calc_percent(row):
    values = list(row)
    return(len([x for x in values if x <= -0.2]) / len(values))

In [14]:
def add_chromo_and_loc(my_dict):
    chromo = list()
    locations = list()
    not_found = list()
    count = 0
    for gene in list(my_dict.index.get_level_values(0)):
        count += 1
        try:
            e_gene = ensembl.genes_by_name(gene)
            chromo.append(e_gene[0].contig)
            locations.append((e_gene[0].start + e_gene[0].end)/2)
        except:
            not_found.append(gene)
            chromo.append(None)
            locations.append(None)
    my_dict['chromo'] = chromo
    my_dict['location'] = locations
    return my_dict

In [15]:
luad_cnv_with_loc = add_chromo_and_loc(luad_cnv.transpose())
ovarian_cnv_with_loc = add_chromo_and_loc(ovarian_cnv.transpose())
hnscc_cnv_with_loc = add_chromo_and_loc(hnscc_cnv.transpose())
colon_cnv_with_loc = add_chromo_and_loc(colon_cnv.transpose())
brca_cnv_with_loc = add_chromo_and_loc(brca_cnv.transpose())
lscc_cnv_with_loc = add_chromo_and_loc(lscc_cnv.transpose())

In [16]:
# Drop any genes that we don't have a location for
luad_cnv_with_loc = luad_cnv_with_loc.dropna(subset=['location'])
ovarian_cnv_with_loc = ovarian_cnv_with_loc.dropna(subset=['location'])
hnscc_cnv_with_loc = hnscc_cnv_with_loc.dropna(subset=['location'])
colon_cnv_with_loc = colon_cnv_with_loc.dropna(subset=['location'])
brca_cnv_with_loc = brca_cnv_with_loc.dropna(subset=['location'])
lscc_cnv_with_loc = lscc_cnv_with_loc.dropna(subset=['location'])

In [17]:
#subset chromo 8
luad_cnv_8 = luad_cnv_with_loc[luad_cnv_with_loc.chromo == '8']
ovarian_cnv_8 = ovarian_cnv_with_loc[ovarian_cnv_with_loc.chromo == '8']
hnscc_cnv_8 = hnscc_cnv_with_loc[hnscc_cnv_with_loc.chromo == '8']
colon_cnv_8 = colon_cnv_with_loc[colon_cnv_with_loc.chromo == '8']
brca_cnv_8 = brca_cnv_with_loc[brca_cnv_with_loc.chromo == '8']
lscc_cnv_8 = lscc_cnv_with_loc[lscc_cnv_with_loc.chromo == '8']

In [18]:
hnscc_cnv_8

Patient_ID,C3L-00977,C3L-00987,C3L-00994,C3L-00995,C3L-00997,C3L-00999,C3L-01138,C3L-01237,C3L-02621,C3L-02651,...,C3N-04273,C3N-04275,C3N-04276,C3N-04277,C3N-04278,C3N-04279,C3N-04280,C3N-04611,chromo,location
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AARD,0.072361,0.210952,0.103086,0.057373,0.328693,0.301213,0.163875,0.166089,0.037074,0.050557,...,0.202301,0.020019,0.724381,0.170302,-0.002952,0.101854,0.521691,0.096770,8,116941347.0
ABRA,0.072361,0.210952,0.103086,0.048827,0.328693,0.301213,0.163875,-0.037503,0.037074,-0.162906,...,0.202301,0.020019,0.708072,0.170302,-0.002952,0.101854,0.521691,0.096770,8,106764863.5
AC004083.1,0.072361,0.210393,0.103086,0.063999,0.118595,0.296006,0.163875,-0.037503,0.037074,0.054629,...,0.202301,0.084557,-0.039987,0.170302,-0.002952,0.101202,0.521691,0.096770,8,90409233.0
AC004908.1,-0.013255,-0.312128,-0.055534,-0.114651,-0.300611,0.062232,-0.114464,-0.531023,-0.161088,-0.108144,...,-0.171326,-0.021807,0.335878,-0.162786,0.310982,0.049098,-0.068395,0.065471,8,234617.0
AC004908.2,-0.013255,-0.312128,-0.055534,-0.114651,-0.300611,0.062232,-0.114464,-0.531023,-0.161088,-0.108144,...,-0.171326,-0.021807,0.335878,-0.162786,0.310982,0.049098,-0.068395,0.065471,8,233405.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZNF705G,-0.001148,-0.326293,-0.168886,-0.114651,-0.300611,-0.034491,-0.057668,-0.223789,-0.128206,-0.191189,...,-0.345319,-0.139703,0.517813,-0.168402,0.174169,0.059761,-0.068395,-0.190256,8,7370537.5
ZNF706,0.072361,0.244959,0.103086,0.195381,0.328693,0.301213,0.163875,-0.037503,0.037074,0.053359,...,0.202301,0.084557,0.799756,0.170302,-0.002952,0.101202,0.521691,0.096770,8,101192035.5
ZNF707,0.072361,0.190741,0.103086,0.057373,0.309763,0.317816,0.150918,0.158996,0.038811,0.050557,...,0.575884,0.020019,0.033192,0.170302,-0.283259,0.064778,0.528837,0.096770,8,143699175.0
ZNHIT1P1,0.072361,0.190741,0.103086,0.057373,0.309763,0.303817,0.150918,0.158996,0.038811,0.050557,...,0.575884,0.020019,0.033192,0.170302,-0.283259,0.064778,0.528837,0.096770,8,142858575.5


In [19]:
luad_gene_list = list(luad_cnv_8[luad_cnv_8.location < deletion_event].index)
ovarian_gene_list = list(ovarian_cnv_8[ovarian_cnv_8.location < deletion_event].index)
hnscc_gene_list = list(hnscc_cnv_8[hnscc_cnv_8.location < deletion_event].index)
colon_gene_list = list(colon_cnv_8[colon_cnv_8.location < deletion_event].index)
brca_gene_list = list(brca_cnv_8[brca_cnv_8.location < deletion_event].index)
lscc_gene_list = list(lscc_cnv_8[lscc_cnv_8.location < deletion_event].index)

In [20]:
brca_gene_list_edit = [i[0] for i in brca_gene_list]

In [21]:
genes = list(set().union(luad_gene_list, ovarian_gene_list, hnscc_gene_list, colon_gene_list, brca_gene_list_edit, lscc_gene_list))

In [22]:
print(genes)

['AC015468.2', 'AC100861.2', 'AC013643.2', 'AC018437.1', 'NKX3-1', 'AC084838.1', 'AC084121.12', 'AC037459.3', 'AC023403.1', 'AC105046.1', 'INTS9', 'CDCA2', 'AC011008.1', 'AC105233.1', 'UBXN8', 'AC018437.3', 'AC010941.1', 'XPO7', 'SPAG11A', 'AC090197.1', 'AF228730.2', 'AF233439.2', 'AC090820.1', 'AC019270.1', 'MIR598', 'AP006248.5', 'OR7E10P', 'AC068880.3', 'DEFB109D', 'EXTL3', 'MIR6843', 'DEFB104B', 'DOK2', 'AC144568.1', 'ALG1L12P', 'AC015468.1', 'AC021613.2', 'AC100802.1', 'AC130360.1', 'RN7SL293P', 'ZDHHC2', 'TNFRSF10B', 'RNU6-842P', 'AC069185.1', 'MIR383', 'AC022784.3', 'DEFB134', 'TMEM97P2', 'AC011726.1', 'AC104964.3', 'ELP3', 'RPL23AP53', 'SARAF', 'LONRF1', 'AC084121.2', 'AC037459.1', 'RNA5SP260', 'SNRPCP6', 'DEFT1P2', 'MIR3926-2', 'DEFA5', 'AC245123.1', 'LINC00681', 'LPL', 'ERICH1', 'AC104997.1', 'HSPD1P3', 'AC105206.1', 'AC037459.2', 'CSGALNACT1', 'NPM2', 'AC084121.1', 'AC112673.1', 'RN7SL303P', 'AC021678.1', 'USP17L7', 'RPL10P19', 'AC087273.2', 'OR4F21', 'AC022559.1', 'AC019257

In [23]:
luad_df_before_event = luad_cnv[luad_gene_list]
hnscc_df_before_event = hnscc_cnv[hnscc_gene_list]
ovarian_df_before_event = ovarian_cnv[ovarian_gene_list]
colon_df_before_event = colon_cnv[colon_gene_list]
brca_df_before_event = brca_cnv[brca_gene_list]
lscc_df_before_event = lscc_cnv[lscc_gene_list]

In [30]:
luad_df_before_event

Name,ADAM28,ADAM7,ADAMDEC1,ADRA1A,AGPAT5,ANGPT2,ARHGEF10,ASAH1,ATP6V1B2,BIN3,...,XKR5,XKR6,XPO7,ZDHHC2,ZNF395,ZNF596,ZNF705B,ZNF705D,ZNF705G,percent
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-00001,-0.2392,-0.2392,-0.2392,-0.2392,-0.2471,-0.2471,-0.2471,-0.2392,-0.2392,-0.2392,...,-0.2471,-0.2453,-0.2392,-0.2392,-0.2392,-0.2471,-0.3543,-0.2453,-0.1132,0.978495
C3L-00009,-0.1389,-0.1389,-0.1389,-0.1389,-0.1476,-0.1476,-0.1476,-0.1389,-0.1389,-0.1389,...,-0.1476,-0.1391,-0.1389,-0.1389,-0.1389,-0.0163,-0.2660,-0.1391,-0.2660,0.118280
C3L-00080,-0.2973,-0.2973,-0.2973,-0.2973,-0.3089,-0.3089,-0.3089,-0.2973,-0.2973,-0.2973,...,-0.3089,-0.2993,-0.2973,-0.2973,-0.2973,-0.3089,-0.4377,-0.5301,-0.3136,0.983871
C3L-00083,-0.0342,-0.0342,-0.0342,-0.0342,-0.0313,-0.0313,-0.0313,-0.0342,-0.0342,-0.0342,...,-0.0313,-0.0313,-0.0342,-0.0342,-0.0342,-0.0313,-0.0313,-0.0313,-0.0313,0.000000
C3L-00093,-0.2217,-0.2217,-0.2217,-0.2217,-0.2217,-0.2217,-0.2217,-0.2217,-0.2217,-0.2217,...,-0.2217,-0.2217,-0.2217,-0.2217,-0.0052,-0.2217,-0.2217,-0.2217,-0.2217,0.870968
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-02729,-0.7857,-0.7857,-0.7857,-0.7857,-0.7903,-0.7903,-0.7905,0.1526,0.1526,-0.7857,...,-0.7903,0.1357,-0.7857,-0.7520,-0.7857,-0.7905,0.1314,0.4523,0.0878,0.634409
X11LU013,-0.4281,-0.4281,-0.4281,-0.4281,-0.4961,-0.4961,-0.4961,-0.4281,-0.4281,-0.4281,...,-0.4961,-0.4603,-0.4281,-0.4281,-0.4281,-0.4961,-0.3695,-0.4603,-0.6183,0.994624
X11LU016,-0.0210,-0.0210,-0.0210,-0.0210,-0.0210,-0.0210,-0.0210,-0.0210,-0.0210,-0.0210,...,-0.0210,-0.0210,-0.0210,-0.0210,-0.0210,-0.0210,-0.0210,-0.0210,-0.0210,0.000000
X11LU022,-0.0646,-0.0646,-0.0646,-0.0646,0.0649,0.0649,0.1551,-0.0646,-0.0646,-0.0646,...,0.0649,0.0923,-0.0646,-0.0646,-0.0646,0.0435,0.0923,0.0923,0.0696,0.000000


In [25]:
luad_df_before_event['percent'] = luad_df_before_event.apply(calc_percent, axis=1)
hnscc_df_before_event['percent'] = hnscc_df_before_event.apply(calc_percent, axis=1)
ovarian_df_before_event['percent'] = ovarian_df_before_event.apply(calc_percent, axis=1)
colon_df_before_event['percent'] = colon_df_before_event.apply(calc_percent, axis=1)
brca_df_before_event['percent'] = brca_df_before_event.apply(calc_percent, axis=1)
lscc_df_before_event['percent'] = lscc_df_before_event.apply(calc_percent, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

In [26]:
luad_has_event = list(luad_df_before_event[luad_df_before_event.percent > .8].index)
hnscc_has_event = list(hnscc_df_before_event[hnscc_df_before_event.percent > .8].index)
ovarian_has_event = list(ovarian_df_before_event[ovarian_df_before_event.percent > .8].index)
colon_has_event = list(colon_df_before_event[colon_df_before_event.percent > .8].index)
brca_has_event = list(brca_df_before_event[brca_df_before_event.percent > .8].index)
lscc_has_event = list(lscc_df_before_event[lscc_df_before_event.percent > .8].index)

## Append Row

In [27]:
luad_mutations['event'] = luad_mutations.index.isin(luad_has_event).astype(int)
hnscc_mutations['event'] = hnscc_mutations.index.isin(hnscc_has_event).astype(int)
ovarian_mutations['event'] = ovarian_mutations.index.isin(ovarian_has_event).astype(int)
colon_mutations['event'] = colon_mutations.index.isin(colon_has_event).astype(int)
brca_mutations['event'] = brca_mutations.index.isin(brca_has_event).astype(int)
lscc_mutations['event'] = lscc_mutations.index.isin(lscc_has_event).astype(int)

In [28]:
luad_has_event = pd.DataFrame({'loss_event': luad_mutations['event']}, index = luad_mutations.index)
ovarian_has_event = pd.DataFrame({'loss_event': ovarian_mutations['event']}, index = ovarian_mutations.index)
hnscc_has_event = pd.DataFrame({'loss_event': hnscc_mutations['event']}, index = hnscc_mutations.index)
colon_has_event = pd.DataFrame({'loss_event': colon_mutations['event']}, index = colon_mutations.index)
brca_has_event = pd.DataFrame({'loss_event': brca_mutations['event']}, index = brca_mutations.index)
lscc_has_event = pd.DataFrame({'loss_event': lscc_mutations['event']}, index = lscc_mutations.index)

In [29]:
luad_has_event.to_csv("luad_has_loss_event.csv")
ovarian_has_event.to_csv("ovarian_has_loss_event.csv")
hnscc_has_event.to_csv("hnscc_has_loss_event.csv")
colon_has_event.to_csv("colon_has_loss_event.csv")
brca_has_event.to_csv("brca_has_loss_event.csv")
lscc_has_event.to_csv("lscc_has_loss_event.csv")

In [45]:
luad_mutations

Name,A1BG_p.L110Q,A1BG_p.V221V,A1CF_p.D344Y,A1CF_p.E77D,A2ML1_p.L645I,A2M_p.L1365*,A2M_p.S395F,A4GNT_p.Q40L,AAAS_p.S328L,AACS_p.V616L,...,ZYX_p.X137_splice,ZZEF1_p.D2460Y,ZZEF1_p.E1265*,ZZEF1_p.G2696A,ZZEF1_p.Q1356K,ZZEF1_p.X638_splice,ZZZ3_p.D626Y,ZZZ3_p.S875S,ZZZ3_p.V320M,event
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-00001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
C3L-00009,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C3L-00080,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
C3L-00083,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C3L-00093,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-02729,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
X11LU013,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
X11LU016,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
X11LU022,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [173]:
# luad_mutations.to_csv("luad_mutations_parm.csv")
# hnscc_mutations.to_csv("hnscc_muations_parm.csv")
# ovarian_mutations.to_csv("ovarian_mutations_parm.csv")
# colon_mutations.to_csv("colon_mutations_parm.csv")
# brca_mutations.to_csv("brca_mutations_parm.csv")
# lscc_mutations.to_csv("lscc_mutations_parm.csv")

## Run the Fishers Tests

In [43]:
def run_fishers_test(df, cancer):
    results = pd.DataFrame(columns=['Mutation', f'{cancer}_odds', f'{cancer}_pvalue'])
    cols = list(df.columns)
    cols.remove('event')
    i = 0
    for col in cols:
        table = pd.crosstab(df[col], df['event'])
        oddsratio, pvalue = stats.fisher_exact(table)
        results.loc[i] = [col, oddsratio, pvalue]
        i += 1
    return results

In [33]:
 import time 435618

In [48]:
start = time.time()
luad_fishers = run_fishers_test(luad_mutations, 'Luad')
print(time.time()  - start)

1029.2760038375854


In [55]:
luad_fishers.to_csv("luad_fishers.tsv", sep='\t')

In [170]:
start = time.time()
table = pd.crosstab(luad_mutations['A1BG_p.L110Q'], luad_mutations['event'])
oddsratio, pvalue = stats.fisher_exact(table)
print(time.time()  - start)

0.025036334991455078
