In [5]:
import cptac
import pandas as pd
import pyensembl
import scipy.stats as stats

In [6]:
ensembl = pyensembl.EnsemblRelease()

In [7]:
luad = cptac.Luad()
brca = cptac.Brca()
ovarian = cptac.Ovarian()
colon = cptac.Colon()
lscc = cptac.Lscc()
hnscc = cptac.Hnscc()

Checking that brca index is up-to-date...



Checking that hnscc index is up-to-date...  



                                          



In [8]:
luad_cnv = luad.get_CNV()
hnscc_cnv = hnscc.get_CNV()
ovarian_cnv =  ovarian.get_CNV()
colon_cnv = colon.get_CNV()
brca_cnv = brca.get_CNV()
lscc_cnv = lscc.get_CNV()

In [9]:
def somatic_mutations_to_binary(df):
    new_df = df.copy()
    new_df['v'] = 1
    new_df['m'] = df['Gene'] + '_' + df['Location']
    df.reset_index()
    new_df = pd.pivot_table(new_df, values=['v'], index=['Patient_ID'], columns=['m'], fill_value=0)
    if isinstance(new_df.columns, pd.MultiIndex):
        new_df.columns = new_df.columns.droplevel(0)
        new_df.columns = new_df.columns.rename("Name")
    return new_df

In [10]:
luad_mutations = somatic_mutations_to_binary(luad.get_somatic_mutation())
hnscc_mutations = somatic_mutations_to_binary(hnscc.get_somatic_mutation())
ovarian_mutations = somatic_mutations_to_binary(ovarian.get_somatic_mutation())
colon_mutations = colon.get_somatic_mutation_binary()
brca_mutations = somatic_mutations_to_binary(brca.get_somatic_mutation())
lscc_mutations = somatic_mutations_to_binary(lscc.get_somatic_mutation())

## Find if it has event

In [11]:
deletion_event = 30794385.5

In [12]:
def calc_percent(row):
    values = list(row)
    return(len([x for x in values if x <= -0.2]) / len(values))

In [13]:
def add_chromo_and_loc(my_dict):
    chromo = list()
    locations = list()
    not_found = list()
    count = 0
    for gene in list(my_dict.index.get_level_values(0)):
        count += 1
        try:
            e_gene = ensembl.genes_by_name(gene)
            chromo.append(e_gene[0].contig)
            locations.append((e_gene[0].start + e_gene[0].end)/2)
        except:
            not_found.append(gene)
            chromo.append(None)
            locations.append(None)
    my_dict['chromo'] = chromo
    my_dict['location'] = locations
    return my_dict

In [14]:
luad_cnv_with_loc = add_chromo_and_loc(luad_cnv.transpose())
ovarian_cnv_with_loc = add_chromo_and_loc(ovarian_cnv.transpose())
hnscc_cnv_with_loc = add_chromo_and_loc(hnscc_cnv.transpose())
colon_cnv_with_loc = add_chromo_and_loc(colon_cnv.transpose())
brca_cnv_with_loc = add_chromo_and_loc(brca_cnv.transpose())
lscc_cnv_with_loc = add_chromo_and_loc(lscc_cnv.transpose())

In [15]:
# Drop any genes that we don't have a location for
luad_cnv_with_loc = luad_cnv_with_loc.dropna(subset=['location'])
ovarian_cnv_with_loc = ovarian_cnv_with_loc.dropna(subset=['location'])
hnscc_cnv_with_loc = hnscc_cnv_with_loc.dropna(subset=['location'])
colon_cnv_with_loc = colon_cnv_with_loc.dropna(subset=['location'])
brca_cnv_with_loc = brca_cnv_with_loc.dropna(subset=['location'])
lscc_cnv_with_loc = lscc_cnv_with_loc.dropna(subset=['location'])

In [16]:
#subset chromo 8
luad_cnv_8 = luad_cnv_with_loc[luad_cnv_with_loc.chromo == '8']
ovarian_cnv_8 = ovarian_cnv_with_loc[ovarian_cnv_with_loc.chromo == '8']
hnscc_cnv_8 = hnscc_cnv_with_loc[hnscc_cnv_with_loc.chromo == '8']
colon_cnv_8 = colon_cnv_with_loc[colon_cnv_with_loc.chromo == '8']
brca_cnv_8 = brca_cnv_with_loc[brca_cnv_with_loc.chromo == '8']
lscc_cnv_8 = lscc_cnv_with_loc[lscc_cnv_with_loc.chromo == '8']

In [17]:
hnscc_cnv_8

Patient_ID,C3L-00977,C3L-00987,C3L-00994,C3L-00995,C3L-00997,C3L-00999,C3L-01138,C3L-01237,C3L-02621,C3L-02651,...,C3N-04273,C3N-04275,C3N-04276,C3N-04277,C3N-04278,C3N-04279,C3N-04280,C3N-04611,chromo,location
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AARD,0.072361,0.210952,0.103086,0.057373,0.328693,0.301213,0.163875,0.166089,0.037074,0.050557,...,0.202301,0.020019,0.724381,0.170302,-0.002952,0.101854,0.521691,0.096770,8,116941347.0
ABRA,0.072361,0.210952,0.103086,0.048827,0.328693,0.301213,0.163875,-0.037503,0.037074,-0.162906,...,0.202301,0.020019,0.708072,0.170302,-0.002952,0.101854,0.521691,0.096770,8,106764863.5
AC004083.1,0.072361,0.210393,0.103086,0.063999,0.118595,0.296006,0.163875,-0.037503,0.037074,0.054629,...,0.202301,0.084557,-0.039987,0.170302,-0.002952,0.101202,0.521691,0.096770,8,90409233.0
AC004908.1,-0.013255,-0.312128,-0.055534,-0.114651,-0.300611,0.062232,-0.114464,-0.531023,-0.161088,-0.108144,...,-0.171326,-0.021807,0.335878,-0.162786,0.310982,0.049098,-0.068395,0.065471,8,234617.0
AC004908.2,-0.013255,-0.312128,-0.055534,-0.114651,-0.300611,0.062232,-0.114464,-0.531023,-0.161088,-0.108144,...,-0.171326,-0.021807,0.335878,-0.162786,0.310982,0.049098,-0.068395,0.065471,8,233405.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZNF705G,-0.001148,-0.326293,-0.168886,-0.114651,-0.300611,-0.034491,-0.057668,-0.223789,-0.128206,-0.191189,...,-0.345319,-0.139703,0.517813,-0.168402,0.174169,0.059761,-0.068395,-0.190256,8,7370537.5
ZNF706,0.072361,0.244959,0.103086,0.195381,0.328693,0.301213,0.163875,-0.037503,0.037074,0.053359,...,0.202301,0.084557,0.799756,0.170302,-0.002952,0.101202,0.521691,0.096770,8,101192035.5
ZNF707,0.072361,0.190741,0.103086,0.057373,0.309763,0.317816,0.150918,0.158996,0.038811,0.050557,...,0.575884,0.020019,0.033192,0.170302,-0.283259,0.064778,0.528837,0.096770,8,143699175.0
ZNHIT1P1,0.072361,0.190741,0.103086,0.057373,0.309763,0.303817,0.150918,0.158996,0.038811,0.050557,...,0.575884,0.020019,0.033192,0.170302,-0.283259,0.064778,0.528837,0.096770,8,142858575.5


In [18]:
luad_gene_list = list(luad_cnv_8[luad_cnv_8.location < deletion_event].index)
ovarian_gene_list = list(ovarian_cnv_8[ovarian_cnv_8.location < deletion_event].index)
hnscc_gene_list = list(hnscc_cnv_8[hnscc_cnv_8.location < deletion_event].index)
colon_gene_list = list(colon_cnv_8[colon_cnv_8.location < deletion_event].index)
brca_gene_list = list(brca_cnv_8[brca_cnv_8.location < deletion_event].index)
lscc_gene_list = list(lscc_cnv_8[lscc_cnv_8.location < deletion_event].index)

In [19]:
brca_gene_list_edit = [i[0] for i in brca_gene_list]

In [20]:
genes = list(set().union(luad_gene_list, ovarian_gene_list, hnscc_gene_list, colon_gene_list, brca_gene_list_edit, lscc_gene_list))

In [21]:
print(genes)

['AC102945.1', 'ADAM24P', 'AC104964.3', 'AP006248.5', 'AC009623.1', 'AC021242.3', 'FGL1', 'AC079193.2', 'R3HCC1', 'FAM90A3P', 'LONRF1', 'AC018437.1', 'FAM90A4P', 'DEFA6', 'DLC1', 'AC105233.2', 'AC040975.1', 'LPL', 'AF131216.3', 'MCPH1-AS1', 'AC034111.2', 'EXTL3', 'RP1L1', 'AC037459.3', 'OR7E15P', 'AC120193.1', 'AC087203.1', 'PIWIL2', 'RNU6-1276P', 'AC025062.3', 'AC105233.5', 'DEFB4B', 'INTS10', 'PDLIM2', 'HMBOX1', 'EXTL3-AS1', 'AC114550.3', 'OR7E10P', 'MIR4286', 'PTK2B', 'MIR596', 'AC021678.1', 'RPL23AP54', 'RHOBTB2', 'AC018398.1', 'ENPP7P12', 'RPL35P6', 'MIR3926-1', 'ARHGEF10', 'NEFM', 'TNFRSF10B', 'DEFB105A', 'AC022784.6', 'ALG1L11P', 'PINX1', 'AC021242.2', 'AC018437.3', 'AC130352.1', 'MIR4659B', 'AC107959.1', 'AC011586.2', 'RBPMS-AS1', 'TNFRSF10D', 'DEFA9P', 'AC091185.1', 'NAT2', 'RN7SL474P', 'NATP', 'MIR6876', 'AC100861.1', 'AC023403.1', 'AC022784.5', 'DEFB4A', 'AC090150.1', 'DEFA4', 'AC108449.1', 'AC084121.9', 'AC131254.2', 'ENPP7P1', 'SLC25A37', 'AC105046.1', 'LINC02209', 'LINC00

In [22]:
luad_df_before_event = luad_cnv[luad_gene_list]
hnscc_df_before_event = hnscc_cnv[hnscc_gene_list]
ovarian_df_before_event = ovarian_cnv[ovarian_gene_list]
colon_df_before_event = colon_cnv[colon_gene_list]
brca_df_before_event = brca_cnv[brca_gene_list]
lscc_df_before_event = lscc_cnv[lscc_gene_list]

In [23]:
luad_df_before_event

Name,ADAM28,ADAM7,ADAMDEC1,ADRA1A,AGPAT5,ANGPT2,ARHGEF10,ASAH1,ATP6V1B2,BIN3,...,VPS37A,XKR5,XKR6,XPO7,ZDHHC2,ZNF395,ZNF596,ZNF705B,ZNF705D,ZNF705G
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-00001,-0.2392,-0.2392,-0.2392,-0.2392,-0.2471,-0.2471,-0.2471,-0.2392,-0.2392,-0.2392,...,-0.2392,-0.2471,-0.2453,-0.2392,-0.2392,-0.2392,-0.2471,-0.3543,-0.2453,-0.1132
C3L-00009,-0.1389,-0.1389,-0.1389,-0.1389,-0.1476,-0.1476,-0.1476,-0.1389,-0.1389,-0.1389,...,-0.1389,-0.1476,-0.1391,-0.1389,-0.1389,-0.1389,-0.0163,-0.2660,-0.1391,-0.2660
C3L-00080,-0.2973,-0.2973,-0.2973,-0.2973,-0.3089,-0.3089,-0.3089,-0.2973,-0.2973,-0.2973,...,-0.2973,-0.3089,-0.2993,-0.2973,-0.2973,-0.2973,-0.3089,-0.4377,-0.5301,-0.3136
C3L-00083,-0.0342,-0.0342,-0.0342,-0.0342,-0.0313,-0.0313,-0.0313,-0.0342,-0.0342,-0.0342,...,-0.0342,-0.0313,-0.0313,-0.0342,-0.0342,-0.0342,-0.0313,-0.0313,-0.0313,-0.0313
C3L-00093,-0.2217,-0.2217,-0.2217,-0.2217,-0.2217,-0.2217,-0.2217,-0.2217,-0.2217,-0.2217,...,-0.2217,-0.2217,-0.2217,-0.2217,-0.2217,-0.0052,-0.2217,-0.2217,-0.2217,-0.2217
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-02729,-0.7857,-0.7857,-0.7857,-0.7857,-0.7903,-0.7903,-0.7905,0.1526,0.1526,-0.7857,...,-0.0300,-0.7903,0.1357,-0.7857,-0.7520,-0.7857,-0.7905,0.1314,0.4523,0.0878
X11LU013,-0.4281,-0.4281,-0.4281,-0.4281,-0.4961,-0.4961,-0.4961,-0.4281,-0.4281,-0.4281,...,-0.4281,-0.4961,-0.4603,-0.4281,-0.4281,-0.4281,-0.4961,-0.3695,-0.4603,-0.6183
X11LU016,-0.0210,-0.0210,-0.0210,-0.0210,-0.0210,-0.0210,-0.0210,-0.0210,-0.0210,-0.0210,...,-0.0210,-0.0210,-0.0210,-0.0210,-0.0210,-0.0210,-0.0210,-0.0210,-0.0210,-0.0210
X11LU022,-0.0646,-0.0646,-0.0646,-0.0646,0.0649,0.0649,0.1551,-0.0646,-0.0646,-0.0646,...,-0.0646,0.0649,0.0923,-0.0646,-0.0646,-0.0646,0.0435,0.0923,0.0923,0.0696


In [24]:
luad_df_before_event['percent'] = luad_df_before_event.apply(calc_percent, axis=1)
hnscc_df_before_event['percent'] = hnscc_df_before_event.apply(calc_percent, axis=1)
ovarian_df_before_event['percent'] = ovarian_df_before_event.apply(calc_percent, axis=1)
colon_df_before_event['percent'] = colon_df_before_event.apply(calc_percent, axis=1)
brca_df_before_event['percent'] = brca_df_before_event.apply(calc_percent, axis=1)
lscc_df_before_event['percent'] = lscc_df_before_event.apply(calc_percent, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

In [25]:
luad_has_event = list(luad_df_before_event[luad_df_before_event.percent > .8].index)
hnscc_has_event = list(hnscc_df_before_event[hnscc_df_before_event.percent > .8].index)
ovarian_has_event = list(ovarian_df_before_event[ovarian_df_before_event.percent > .8].index)
colon_has_event = list(colon_df_before_event[colon_df_before_event.percent > .8].index)
brca_has_event = list(brca_df_before_event[brca_df_before_event.percent > .8].index)
lscc_has_event = list(lscc_df_before_event[lscc_df_before_event.percent > .8].index)

## Append Row

In [26]:
luad_mutations['event'] = luad_mutations.index.isin(luad_has_event).astype(int)
hnscc_mutations['event'] = hnscc_mutations.index.isin(hnscc_has_event).astype(int)
ovarian_mutations['event'] = ovarian_mutations.index.isin(ovarian_has_event).astype(int)
colon_mutations['event'] = colon_mutations.index.isin(colon_has_event).astype(int)
brca_mutations['event'] = brca_mutations.index.isin(brca_has_event).astype(int)
lscc_mutations['event'] = lscc_mutations.index.isin(lscc_has_event).astype(int)

In [27]:
luad_has_event = pd.DataFrame({'loss_event': luad_mutations['event']}, index = luad_mutations.index)
ovarian_has_event = pd.DataFrame({'loss_event': ovarian_mutations['event']}, index = ovarian_mutations.index)
hnscc_has_event = pd.DataFrame({'loss_event': hnscc_mutations['event']}, index = hnscc_mutations.index)
colon_has_event = pd.DataFrame({'loss_event': colon_mutations['event']}, index = colon_mutations.index)
brca_has_event = pd.DataFrame({'loss_event': brca_mutations['event']}, index = brca_mutations.index)
lscc_has_event = pd.DataFrame({'loss_event': lscc_mutations['event']}, index = lscc_mutations.index)

In [28]:
luad_has_event.to_csv("luad_has_loss_event.csv")
ovarian_has_event.to_csv("ovarian_has_loss_event.csv")
hnscc_has_event.to_csv("hnscc_has_loss_event.csv")
colon_has_event.to_csv("colon_has_loss_event.csv")
brca_has_event.to_csv("brca_has_loss_event.csv")
lscc_has_event.to_csv("lscc_has_loss_event.csv")

In [173]:
# luad_mutations.to_csv("luad_mutations_parm.csv")
# hnscc_mutations.to_csv("hnscc_muations_parm.csv")
# ovarian_mutations.to_csv("ovarian_mutations_parm.csv")
# colon_mutations.to_csv("colon_mutations_parm.csv")
# brca_mutations.to_csv("brca_mutations_parm.csv")
# lscc_mutations.to_csv("lscc_mutations_parm.csv")

## Run the Fishers Tests

In [166]:
def run_fishers_test(df, cancer):
    results = pd.DataFrame(columns=['Mutation', f'{cancer}_odds', f'{cancer}_pvalue'])
    cols = list(df.columns)
    cols.remove('event')
    for col in cols:
        table = pd.crosstab(df[col], df['event'])
        oddsratio, pvalue = stats.fisher_exact(table)
        results.append([col, oddsratio, pvalue], ignore_index=True)
    return results

In [167]:
luad_fishers = run_fishers_test(luad_mutations, 'Luad')

KeyboardInterrupt: 

In [168]:
import time

In [170]:
start = time.time()
table = pd.crosstab(luad_mutations['A1BG_p.L110Q'], luad_mutations['event'])
oddsratio, pvalue = stats.fisher_exact(table)
print(time.time()  - start)

0.025036334991455078
