In [1]:
import cptac
import pandas as pd
import pyensembl
import scipy.stats as stats

In [2]:
ensembl = pyensembl.EnsemblRelease()

In [3]:
luad = cptac.Luad()
brca = cptac.Brca()
ovarian = cptac.Ovarian()
colon = cptac.Colon()
lscc = cptac.Lscc()
hnscc = cptac.Hnscc()

Checking that brca index is up-to-date...

INFO:numexpr.utils:NumExpr defaulting to 8 threads.


Checking that hnscc index is up-to-date...  



                                          



In [4]:
luad_cnv = luad.get_CNV()
hnscc_cnv = hnscc.get_CNV()
ovarian_cnv =  ovarian.get_CNV()
colon_cnv = colon.get_CNV()
brca_cnv = brca.get_CNV()
lscc_cnv = lscc.get_CNV()

In [5]:
def somatic_mutations_to_binary(df):
    new_df = df.copy()
    new_df['v'] = 1
    new_df['m'] = df['Gene'] + '_' + df['Location']
    df.reset_index()
    new_df = pd.pivot_table(new_df, values=['v'], index=['Patient_ID'], columns=['m'], fill_value=0)
    if isinstance(new_df.columns, pd.MultiIndex):
        new_df.columns = new_df.columns.droplevel(0)
        new_df.columns = new_df.columns.rename("Name")
    return new_df

In [6]:
luad_mutations = somatic_mutations_to_binary(luad.get_somatic_mutation())
hnscc_mutations = somatic_mutations_to_binary(hnscc.get_somatic_mutation())
ovarian_mutations = somatic_mutations_to_binary(ovarian.get_somatic_mutation())
colon_mutations = colon.get_somatic_mutation_binary()
brca_mutations = somatic_mutations_to_binary(brca.get_somatic_mutation())
lscc_mutations = somatic_mutations_to_binary(lscc.get_somatic_mutation())

## Find if it has event

In [29]:
insertion_event_start = 80794385.5
insertion_event_end = 130794385.5

In [30]:
def calc_percent(row):
    values = list(row)
    return(len([x for x in values if x >= 0.2]) / len(values))

In [31]:
def add_chromo_and_loc(my_dict):
    chromo = list()
    locations = list()
    not_found = list()
    count = 0
    for gene in list(my_dict.index.get_level_values(0)):
        count += 1
        try:
            e_gene = ensembl.genes_by_name(gene)
            chromo.append(e_gene[0].contig)
            locations.append((e_gene[0].start + e_gene[0].end)/2)
        except:
            not_found.append(gene)
            chromo.append(None)
            locations.append(None)
    my_dict['chromo'] = chromo
    my_dict['location'] = locations
    return my_dict

In [32]:
luad_cnv_with_loc = add_chromo_and_loc(luad_cnv.transpose())
ovarian_cnv_with_loc = add_chromo_and_loc(ovarian_cnv.transpose())
hnscc_cnv_with_loc = add_chromo_and_loc(hnscc_cnv.transpose())
colon_cnv_with_loc = add_chromo_and_loc(colon_cnv.transpose())
brca_cnv_with_loc = add_chromo_and_loc(brca_cnv.transpose())
lscc_cnv_with_loc = add_chromo_and_loc(lscc_cnv.transpose())

In [33]:
# Drop any genes that we don't have a location for
luad_cnv_with_loc = luad_cnv_with_loc.dropna(subset=['location'])
ovarian_cnv_with_loc = ovarian_cnv_with_loc.dropna(subset=['location'])
hnscc_cnv_with_loc = hnscc_cnv_with_loc.dropna(subset=['location'])
colon_cnv_with_loc = colon_cnv_with_loc.dropna(subset=['location'])
brca_cnv_with_loc = brca_cnv_with_loc.dropna(subset=['location'])
lscc_cnv_with_loc = lscc_cnv_with_loc.dropna(subset=['location'])

In [34]:
#subset chromo 8
luad_cnv_8 = luad_cnv_with_loc[luad_cnv_with_loc.chromo == '8']
ovarian_cnv_8 = ovarian_cnv_with_loc[ovarian_cnv_with_loc.chromo == '8']
hnscc_cnv_8 = hnscc_cnv_with_loc[hnscc_cnv_with_loc.chromo == '8']
colon_cnv_8 = colon_cnv_with_loc[colon_cnv_with_loc.chromo == '8']
brca_cnv_8 = brca_cnv_with_loc[brca_cnv_with_loc.chromo == '8']
lscc_cnv_8 = lscc_cnv_with_loc[lscc_cnv_with_loc.chromo == '8']

In [35]:
hnscc_cnv_8

Patient_ID,C3L-00977,C3L-00987,C3L-00994,C3L-00995,C3L-00997,C3L-00999,C3L-01138,C3L-01237,C3L-02621,C3L-02651,...,C3N-04273,C3N-04275,C3N-04276,C3N-04277,C3N-04278,C3N-04279,C3N-04280,C3N-04611,chromo,location
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AARD,0.072361,0.210952,0.103086,0.057373,0.328693,0.301213,0.163875,0.166089,0.037074,0.050557,...,0.202301,0.020019,0.724381,0.170302,-0.002952,0.101854,0.521691,0.096770,8,116941347.0
ABRA,0.072361,0.210952,0.103086,0.048827,0.328693,0.301213,0.163875,-0.037503,0.037074,-0.162906,...,0.202301,0.020019,0.708072,0.170302,-0.002952,0.101854,0.521691,0.096770,8,106764863.5
AC004083.1,0.072361,0.210393,0.103086,0.063999,0.118595,0.296006,0.163875,-0.037503,0.037074,0.054629,...,0.202301,0.084557,-0.039987,0.170302,-0.002952,0.101202,0.521691,0.096770,8,90409233.0
AC004908.1,-0.013255,-0.312128,-0.055534,-0.114651,-0.300611,0.062232,-0.114464,-0.531023,-0.161088,-0.108144,...,-0.171326,-0.021807,0.335878,-0.162786,0.310982,0.049098,-0.068395,0.065471,8,234617.0
AC004908.2,-0.013255,-0.312128,-0.055534,-0.114651,-0.300611,0.062232,-0.114464,-0.531023,-0.161088,-0.108144,...,-0.171326,-0.021807,0.335878,-0.162786,0.310982,0.049098,-0.068395,0.065471,8,233405.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZNF705G,-0.001148,-0.326293,-0.168886,-0.114651,-0.300611,-0.034491,-0.057668,-0.223789,-0.128206,-0.191189,...,-0.345319,-0.139703,0.517813,-0.168402,0.174169,0.059761,-0.068395,-0.190256,8,7370537.5
ZNF706,0.072361,0.244959,0.103086,0.195381,0.328693,0.301213,0.163875,-0.037503,0.037074,0.053359,...,0.202301,0.084557,0.799756,0.170302,-0.002952,0.101202,0.521691,0.096770,8,101192035.5
ZNF707,0.072361,0.190741,0.103086,0.057373,0.309763,0.317816,0.150918,0.158996,0.038811,0.050557,...,0.575884,0.020019,0.033192,0.170302,-0.283259,0.064778,0.528837,0.096770,8,143699175.0
ZNHIT1P1,0.072361,0.190741,0.103086,0.057373,0.309763,0.303817,0.150918,0.158996,0.038811,0.050557,...,0.575884,0.020019,0.033192,0.170302,-0.283259,0.064778,0.528837,0.096770,8,142858575.5


In [36]:
luad_gene_list = list(luad_cnv_8[luad_cnv_8.location > insertion_event_start][luad_cnv_8.location < insertion_event_end].index)
ovarian_gene_list = list(ovarian_cnv_8[ovarian_cnv_8.location > insertion_event_start][ovarian_cnv_8.location < insertion_event_end].index)
hnscc_gene_list = list(hnscc_cnv_8[hnscc_cnv_8.location > insertion_event_start][hnscc_cnv_8.location < insertion_event_end].index)
colon_gene_list = list(colon_cnv_8[colon_cnv_8.location > insertion_event_start][colon_cnv_8.location < insertion_event_end].index)
brca_gene_list = list(brca_cnv_8[brca_cnv_8.location > insertion_event_start][brca_cnv_8.location < insertion_event_end].index)
lscc_gene_list = list(lscc_cnv_8[lscc_cnv_8.location > insertion_event_start][lscc_cnv_8.location < insertion_event_end].index)


  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  """
  


In [37]:
brca_gene_list_edit = [i[0] for i in brca_gene_list]

In [38]:
genes = list(set().union(luad_gene_list, ovarian_gene_list, hnscc_gene_list, colon_gene_list, brca_gene_list_edit, lscc_gene_list))

In [40]:
print(genes)

['AC023590.1', 'YWHAZ', 'RPS23P1', 'SLC30A8', 'ENY2', 'REXO1L1P', 'RN7SL563P', 'ENPP2', 'AC027373.1', 'EIF3E', 'MIR6844', 'SNX31', 'HMGB1P46', 'RFPL4AP5', 'RNU6-1172P', 'AC009901.1', 'HAS2-AS1', 'AC023632.4', 'AC090142.3', 'AP000428.1', 'RPS12P15', 'AC060765.1', 'NIPAL2', 'RPL30', 'AC068228.3', 'AC132219.2', 'MIR4471', 'TAGLN2P1', 'RNU4-37P', 'RRM2B', 'AC105328.1', 'AP001331.1', 'RIDA', 'EEF1A1P37', 'PABPC1', 'RBM12B', 'AC108860.1', 'BAALC', 'FABP5', 'AC079209.1', 'C8orf76', 'AC084083.1', 'AC021546.1', 'AP001205.1', 'AC106038.1', 'AC104370.1', 'AF121898.1', 'AC108860.2', 'AC011626.1', 'VIRMA', 'SLC10A5', 'FABP4', 'ATP6V1C1', 'AP000424.1', 'SLC2A3P4', 'MIR5680', 'AP001330.5', 'RAD21-AS1', 'IARS2P1', 'AP000428.2', 'BAALC-AS1', 'COL14A1', 'AC025647.1', 'MIR378D2', 'AC087752.3', 'RBM12B-AS1', 'AC090572.2', 'LINC01151', 'LINC00861', 'AP001208.1', 'MIR1204', 'AZIN1-AS1', 'LINC02237', 'AC020688.1', 'AC103760.1', 'TMEM74', 'AC090987.1', 'RNU6-1092P', 'RNF19A', 'AC037486.1', 'HNRNPA1P4', 'RBIS'

In [41]:
luad_df_before_event = luad_cnv[luad_gene_list]
hnscc_df_before_event = hnscc_cnv[hnscc_gene_list]
ovarian_df_before_event = ovarian_cnv[ovarian_gene_list]
colon_df_before_event = colon_cnv[colon_gene_list]
brca_df_before_event = brca_cnv[brca_gene_list]
lscc_df_before_event = lscc_cnv[lscc_gene_list]

In [42]:
luad_df_before_event['percent'] = luad_df_before_event.apply(calc_percent, axis=1)
hnscc_df_before_event['percent'] = hnscc_df_before_event.apply(calc_percent, axis=1)
ovarian_df_before_event['percent'] = ovarian_df_before_event.apply(calc_percent, axis=1)
colon_df_before_event['percent'] = colon_df_before_event.apply(calc_percent, axis=1)
brca_df_before_event['percent'] = brca_df_before_event.apply(calc_percent, axis=1)
lscc_df_before_event['percent'] = lscc_df_before_event.apply(calc_percent, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

In [43]:
luad_df_before_event

Name,AARD,ABRA,ANGPT1,ANKRD46,ANXA13,ASAP1,ATAD2,ATP6V0D2,ATP6V1C1,AZIN1,...,WDYHV1,WWP1,YWHAZ,ZFAND1,ZFPM2,ZHX1,ZHX2,ZNF572,ZNF706,percent
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-00001,0.2465,0.2710,0.2710,0.9111,0.2465,0.2379,0.2465,0.2379,0.9111,0.9111,...,0.2465,0.2379,0.9111,-0.0510,0.7149,0.2465,0.2465,0.2465,0.9111,0.923529
C3L-00009,0.1353,0.1353,0.1353,0.1353,0.1353,0.1353,0.1353,0.1353,0.1353,0.1353,...,0.1353,0.1353,0.1353,0.1490,0.1353,0.1353,0.1353,0.1353,0.1353,0.000000
C3L-00080,0.2276,0.2276,0.2276,0.2276,0.2276,0.2276,0.2276,0.2276,0.2276,0.2276,...,0.2276,0.2276,0.2276,0.2276,0.2276,0.2276,0.2276,0.2276,0.2276,1.000000
C3L-00083,0.0324,0.0324,0.0324,0.0324,0.0324,0.0324,0.0324,0.0324,0.0324,0.0324,...,0.0324,0.0324,0.0324,0.0324,0.0324,0.0324,0.0324,0.0324,0.0324,0.000000
C3L-00093,-0.0118,-0.0118,-0.0118,-0.0118,-0.0118,-0.0118,-0.0118,-0.0118,-0.0118,-0.0118,...,-0.0118,-0.0118,-0.0118,-0.0118,-0.0118,-0.0118,-0.0118,-0.0118,-0.0118,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-02729,0.2680,0.2680,0.2680,0.2680,0.2680,0.2680,0.2680,0.2680,0.2680,0.2680,...,0.2680,0.2680,0.2680,0.2680,0.2680,0.2680,0.2680,0.2680,0.2680,1.000000
X11LU013,1.7162,1.6158,1.0356,1.6999,1.9229,2.0949,1.9229,1.4070,1.6999,1.6999,...,1.9229,1.2774,1.6999,1.5233,1.4927,1.9229,2.0650,2.1335,1.6999,1.000000
X11LU016,-0.0182,-0.0182,-0.0182,-0.0209,-0.0182,-0.0120,-0.0182,-0.0209,-0.0209,-0.0209,...,-0.0182,-0.0209,-0.0209,-0.0209,-0.0306,-0.0182,-0.0182,-0.0182,-0.0209,0.000000
X11LU022,0.2046,0.4634,0.4634,0.4634,0.2046,0.2046,0.2046,0.4706,0.4634,0.4634,...,0.2046,0.3191,0.4634,0.0923,0.4634,0.2046,0.2046,0.2046,0.4634,0.888235


In [44]:
luad_has_event = list(luad_df_before_event[luad_df_before_event.percent > .8].index)
hnscc_has_event = list(hnscc_df_before_event[hnscc_df_before_event.percent > .8].index)
ovarian_has_event = list(ovarian_df_before_event[ovarian_df_before_event.percent > .8].index)
colon_has_event = list(colon_df_before_event[colon_df_before_event.percent > .8].index)
brca_has_event = list(brca_df_before_event[brca_df_before_event.percent > .8].index)
lscc_has_event = list(lscc_df_before_event[lscc_df_before_event.percent > .8].index)

In [45]:
luad_has_event

['C3L-00001',
 'C3L-00080',
 'C3L-00893',
 'C3L-01924',
 'C3L-02365',
 'C3N-00175',
 'C3N-00217',
 'C3N-00546',
 'C3N-00547',
 'C3N-00549',
 'C3N-00551',
 'C3N-00556',
 'C3N-00559',
 'C3N-00560',
 'C3N-00574',
 'C3N-00580',
 'C3N-00704',
 'C3N-01016',
 'C3N-01021',
 'C3N-01030',
 'C3N-01074',
 'C3N-01405',
 'C3N-01414',
 'C3N-02000',
 'C3N-02002',
 'C3N-02067',
 'C3N-02089',
 'C3N-02529',
 'C3N-02586',
 'C3N-02729',
 'X11LU013',
 'X11LU022']

## Append Row

In [46]:
luad_mutations['event'] = luad_mutations.index.isin(luad_has_event).astype(int)
hnscc_mutations['event'] = hnscc_mutations.index.isin(hnscc_has_event).astype(int)
ovarian_mutations['event'] = ovarian_mutations.index.isin(ovarian_has_event).astype(int)
colon_mutations['event'] = colon_mutations.index.isin(colon_has_event).astype(int)
brca_mutations['event'] = brca_mutations.index.isin(brca_has_event).astype(int)
lscc_mutations['event'] = lscc_mutations.index.isin(lscc_has_event).astype(int)

In [47]:
luad_mutations['event']

Patient_ID
C3L-00001    1
C3L-00009    0
C3L-00080    1
C3L-00083    0
C3L-00093    0
            ..
C3N-02729    1
X11LU013     1
X11LU016     0
X11LU022     1
X11LU035     0
Name: event, Length: 109, dtype: int32

In [48]:
luad_has_event = pd.DataFrame({'gain_event': luad_mutations['event']}, index = luad_mutations.index)
ovarian_has_event = pd.DataFrame({'gain_event': ovarian_mutations['event']}, index = ovarian_mutations.index)
hnscc_has_event = pd.DataFrame({'gain_event': hnscc_mutations['event']}, index = hnscc_mutations.index)
colon_has_event = pd.DataFrame({'gain_event': colon_mutations['event']}, index = colon_mutations.index)
brca_has_event = pd.DataFrame({'gain_event': brca_mutations['event']}, index = brca_mutations.index)
lscc_has_event = pd.DataFrame({'gain_event': lscc_mutations['event']}, index = lscc_mutations.index)

In [50]:
luad_has_event.to_csv("luad_has_gain_event.csv")
ovarian_has_event.to_csv("ovarian_has_gain_event.csv")
hnscc_has_event.to_csv("hnscc_has_gain_event.csv")
colon_has_event.to_csv("colon_has_gain_event.csv")
brca_has_event.to_csv("brca_has_gain_event.csv")
lscc_has_event.to_csv("lscc_has_gain_event.csv")

In [24]:
# luad_mutations.to_csv("luad_mutations_qarm.csv")
# hnscc_mutations.to_csv("hnscc_muations_qarm.csv")
# ovarian_mutations.to_csv("ovarian_mutations_qarm.csv")
# colon_mutations.to_csv("colon_mutations_qarm.csv")
# brca_mutations.to_csv("brca_mutations_qarm.csv")
# lscc_mutations.to_csv("lscc_mutations_qarm.csv")