In [51]:
import pandas as pd
from locations import merged_maf_file_name
from locations import location_gene_panels
from locations import all_panel_genes_file_name
from locations import location_data
from locations import gene_list_file
from locations import location_output
from locations import gene_coordinates_file
from count_combinations import compute_samples
from count_combinations import are_all_fluxes_computable
import os

In [10]:
# Read the table at merged_maf_file_name, then keep only the rows that are
# not classified as 'Silent' and that have a value in the 'Mutation' column.
db = (
    pd.read_csv(merged_maf_file_name)
    .loc[lambda maf: maf['Variant_Classification'] != 'Silent']
    .loc[lambda maf: maf['Mutation'].notna()]
)

In [11]:
# Display the number of rows in db for each value of the 'Source' column.
db.groupby('Source').size()

Source
Broad       51384
FM-AD       10120
Genie       41176
MSK2015      6769
MSK2017      2695
MSK2018      1932
OncoSG      63415
TCGA       166830
TSP          1015
TracerX     33761
dtype: int64

In [13]:
# Cohorts this notebook can be run on. 'smoking' and 'nonsmoking' restrict the
# data to the sample IDs read from the file of the same name further down;
# 'pan_data' applies no sample restriction.
possible_sample_options = [
    'pan_data',
    'smoking',
    'nonsmoking',
]
# Change the index to switch cohort.
samples_used = possible_sample_options[0]

In [11]:
db = pd.read_csv(merged_maf_file_name)

## Attach panel information (this belongs in the importing_maf_data module)
# For each source with a per-sample panel file, load the file as
# {column name: {sample identifier: value}}.
panels_used = {}
for key in ['Genie', 'MSK2017', 'MSK2018']:
    panel_table = pd.read_csv(
        os.path.join(location_gene_panels, f"{key.lower()}_panels_used.txt"))
    panels_used[key] = panel_table.set_index("Sample Identifier").to_dict()

# Fill 'Panel' for the rows of each of those sources by looking up the
# row's 'Sample ID' in the matching per-sample mapping.
for key in ['Genie', 'MSK2017', 'MSK2018']:
    # The Genie file names its panel column differently from the MSK files.
    panel_column = 'Sequence Assay ID' if key == 'Genie' else 'Gene Panel'
    from_this_source = db['Source'] == key
    db.loc[from_this_source, 'Panel'] = db['Sample ID'].map(
        panels_used[key][panel_column])

# Sources whose rows all receive one fixed panel label.
for source_name, panel_label in [('TSP', 'TSP'), ('FM-AD', 'FoundationOne')]:
    db.loc[db['Source'] == source_name, 'Panel'] = panel_label

In [None]:
#SETUP
# Keep only non-silent variants that have a value in the 'Mutation' column.
db = db[db['Variant_Classification'] != 'Silent']
db = db[~pd.isnull(db['Mutation'])]


all_panel_genes = pd.read_csv(all_panel_genes_file_name)
included_panels = pd.unique(all_panel_genes['SEQ_ASSAY_ID'])

# Gene membership of every panel, built once so the loops below do not
# re-scan all_panel_genes for each (gene, panel) pair.
genes_by_panel = {
    panel: set(all_panel_genes.loc[
        all_panel_genes['SEQ_ASSAY_ID'] == panel, 'Hugo_Symbol'])
    for panel in included_panels}


def panels_missing_any_gene(required_genes):
    """Return the panels of included_panels lacking any of required_genes.

    The result is a list in the order of included_panels. A panel with no
    recorded genes is treated as lacking every gene.
    """
    return [panel for panel in included_panels
            if any(gene not in genes_by_panel.get(panel, ())
                   for gene in required_genes)]


# Drop every row sequenced on a panel that does not cover both TP53 and KRAS.
# Rows whose 'Panel' is not one of the listed panels are kept.
panels_to_remove_for_tp53_kras = panels_missing_any_gene(['TP53', 'KRAS'])
db = db[~db['Panel'].isin(panels_to_remove_for_tp53_kras)]

smoking_sample_ids = pd.read_csv(
    os.path.join(location_data, 'smoking_sample_ids.txt'),
    header=None).iloc[:,0].tolist()
nonsmoking_sample_ids = pd.read_csv(
    os.path.join(location_data, 'nonsmoking_sample_ids.txt'),
    header=None).iloc[:,0].tolist()

# Any other value of samples_used leaves db unrestricted.
if samples_used == 'smoking':
    db = db[db['Sample ID'].isin(smoking_sample_ids)]
elif samples_used == 'nonsmoking':
    db = db[db['Sample ID'].isin(nonsmoking_sample_ids)]

gene_list = list(pd.read_csv(gene_list_file, header=None)[0])

# The first two entries of gene_list are skipped (presumably TP53 and KRAS
# themselves -- TODO confirm); each remaining gene is checked together with
# TP53 and KRAS on the rows from panels that cover that gene.
candidate_genes = gene_list[2:]
genes_with_uncomputable_fluxes = []
for i, gene in enumerate(candidate_genes):
    if i%50 == 0: print(f"(gene number {i}/{len(candidate_genes)})")
    panels_to_remove = panels_missing_any_gene([gene])
    subsetted_db = db[~db['Panel'].isin(panels_to_remove)]

    if not are_all_fluxes_computable(subsetted_db, mutations = ['TP53', 'KRAS', gene]):
        genes_with_uncomputable_fluxes.append(gene)

# Remove the rejected genes only after the scan, so the list being iterated
# above is never modified mid-loop.
for gene in genes_with_uncomputable_fluxes:
    gene_list.remove(gene)

(gene number 0/1288)
(gene number 1/1288)
(gene number 2/1288)
(gene number 3/1288)
(gene number 4/1288)
(gene number 5/1288)
(gene number 6/1288)
(gene number 7/1288)
(gene number 8/1288)
(gene number 9/1288)
(gene number 10/1288)
(gene number 11/1288)
(gene number 12/1288)
(gene number 13/1288)
(gene number 14/1288)
(gene number 15/1288)
(gene number 16/1288)
(gene number 17/1288)
(gene number 18/1288)
(gene number 19/1288)
(gene number 20/1288)
(gene number 21/1288)
(gene number 22/1288)
(gene number 23/1288)
(gene number 24/1288)
(gene number 25/1288)
(gene number 26/1288)
(gene number 27/1288)
(gene number 28/1288)
(gene number 29/1288)
(gene number 30/1288)
(gene number 31/1288)
(gene number 32/1288)
(gene number 33/1288)
(gene number 34/1288)
(gene number 35/1288)
(gene number 36/1288)
(gene number 37/1288)
(gene number 38/1288)
(gene number 39/1288)
(gene number 40/1288)
(gene number 41/1288)
(gene number 42/1288)
(gene number 43/1288)
(gene number 44/1288)
(gene number 45/1288

(gene number 361/1288)
(gene number 362/1288)
(gene number 363/1288)
(gene number 364/1288)
(gene number 365/1288)
(gene number 366/1288)
(gene number 367/1288)
(gene number 368/1288)
(gene number 369/1288)
(gene number 370/1288)
(gene number 371/1288)
(gene number 372/1288)
(gene number 373/1288)
(gene number 374/1288)
(gene number 375/1288)
(gene number 376/1288)
(gene number 377/1288)
(gene number 378/1288)
(gene number 379/1288)
(gene number 380/1288)
(gene number 381/1288)
(gene number 382/1288)
(gene number 383/1288)
(gene number 384/1288)
(gene number 385/1288)
(gene number 386/1288)
(gene number 387/1288)
(gene number 388/1288)
(gene number 389/1288)
(gene number 390/1288)
(gene number 391/1288)
(gene number 392/1288)
(gene number 393/1288)
(gene number 394/1288)
(gene number 395/1288)
(gene number 396/1288)
(gene number 397/1288)
(gene number 398/1288)
(gene number 399/1288)
(gene number 400/1288)
(gene number 401/1288)
(gene number 402/1288)
(gene number 403/1288)
(gene numbe

(gene number 718/1288)
(gene number 719/1288)
(gene number 720/1288)
(gene number 721/1288)
(gene number 722/1288)
(gene number 723/1288)
(gene number 724/1288)
(gene number 725/1288)
(gene number 726/1288)
(gene number 727/1288)
(gene number 728/1288)
(gene number 729/1288)
(gene number 730/1288)
(gene number 731/1288)
(gene number 732/1288)
(gene number 733/1288)
(gene number 734/1288)
(gene number 735/1288)
(gene number 736/1288)
(gene number 737/1288)
(gene number 738/1288)
(gene number 739/1288)
(gene number 740/1288)
(gene number 741/1288)
(gene number 742/1288)
(gene number 743/1288)
(gene number 744/1288)
(gene number 745/1288)
(gene number 746/1288)
(gene number 747/1288)
(gene number 748/1288)
(gene number 749/1288)
(gene number 750/1288)
(gene number 751/1288)
(gene number 752/1288)
(gene number 753/1288)
(gene number 754/1288)
(gene number 755/1288)
(gene number 756/1288)
(gene number 757/1288)
(gene number 758/1288)
(gene number 759/1288)
(gene number 760/1288)
(gene numbe

In [53]:
def compute_pts_per_single_mutation(data, mutation=None, gene_coordinates=None):
    """Count the distinct samples with a variant inside one gene's coordinates.

    Parameters
    ----------
    data : pandas.DataFrame
        Variant table with 'Start_Position', 'Chromosome' and 'Sample ID'
        columns.
    mutation : hashable
        Index label of the gene in the coordinate table. It must be present
        in that index; otherwise the lookup raises.
    gene_coordinates : pandas.DataFrame, optional
        Table indexed by gene with 'start', 'end' and 'chromosome' columns.
        When omitted, the notebook-level ``genes`` table is used, which must
        then already be defined.

    Returns
    -------
    list of int
        A one-element list holding the number of distinct 'Sample ID' values
        among rows on the gene's chromosome whose 'Start_Position' lies in
        the closed interval [start, end].
    """
    if gene_coordinates is None:
        # Backward-compatible fallback to the notebook global.
        gene_coordinates = genes

    # Look the coordinates up once instead of once per comparison.
    start = gene_coordinates.loc[mutation, 'start']
    end = gene_coordinates.loc[mutation, 'end']
    chromosome = gene_coordinates.loc[mutation, 'chromosome']

    in_gene = (
        (data['Start_Position'] >= start)
        & (data['Start_Position'] <= end)
        & (data['Chromosome'] == chromosome))

    pts_per_mutation = [len(set(data.loc[in_gene, 'Sample ID']))]

    return pts_per_mutation

In [57]:
genes = pd.read_csv(gene_coordinates_file, index_col='gene')

# Gene membership of every panel, built once so the loop below does not
# re-scan all_panel_genes for each (gene, panel) pair.
panel_gene_sets = {
    panel: set(all_panel_genes.loc[
        all_panel_genes['SEQ_ASSAY_ID'] == panel, 'Hugo_Symbol'])
    for panel in included_panels}

# For every gene after the first two entries of gene_list, count samples
# using only rows from panels that cover that gene (rows whose 'Panel' is
# not a listed panel are kept). Each value is the one-element list returned
# by compute_pts_per_single_mutation.
patients_per_gene = dict()
for gene in gene_list[2:]:
    panels_to_remove = [panel for panel in included_panels
                        if gene not in panel_gene_sets.get(panel, ())]
    subsetted_db = db[~db['Panel'].isin(panels_to_remove)]
    patients_per_gene[gene] = compute_pts_per_single_mutation(subsetted_db, mutation=gene)

In [63]:
# Replace each stored value by its first element, leaving {gene: value}.
patients_per_gene = dict(
    (gene_name, counts[0]) for gene_name, counts in patients_per_gene.items())

In [64]:
# One row per dictionary key; the resulting column labelled 0 is renamed to
# 'num_patients'.
patients_frame = pd.DataFrame.from_dict(patients_per_gene, orient='index')
patients_per_gene = patients_frame.rename(columns={0: 'num_patients'})

In [66]:
# Build the output path with os.path.join, as the other cells do, so the
# result does not depend on whether location_output ends with a separator.
patients_per_gene.to_csv(
    os.path.join(location_output, 'patients_per_gene.csv'))