In [1]:
import pandas as pd
import math
from scipy.special import comb
import numpy as np
import scipy.stats as stats

In [2]:
df = pd.read_csv("microindels-1.csv")

In [3]:
tumor = set(df['Project'])

In [4]:
tumor

{'TCGA-BRCA',
 'TCGA-COAD',
 'TCGA-HNSC',
 'TCGA-KIRC',
 'TCGA-KIRP',
 'TCGA-LIHC',
 'TCGA-LUAD',
 'TCGA-STAD',
 'TCGA-THCA'}

In [8]:
from  statsmodels.stats.multitest import fdrcorrection
import os

test = []
for project in tumor:
    print(project)
    sub_df = df.loc[df['Project'] == project]
    sum_L = sum(sub_df['GeneLength'])
    sum_m = sum(set(sub_df['SampeNum'].values))
    print("total # of mutated genes",len(set(sub_df['Symbol'])))
    sum_N = sum(sub_df['MutationNum'])
    
    w = sum_N/3e9
    
    sub_genes = list(set(sub_df['Symbol'].values))
    cols_out = list(sub_df.columns.values) + ['pval']
    df_out = pd.DataFrame(columns = cols_out)
    
    for g in sub_genes:
        
        X = sum(sub_df.loc[sub_df["Symbol"] == g]['MutationNum'])
        L = sub_df.loc[sub_df["Symbol"] == g]['GeneLength'].values[0]
        
        #m = sum(sub_df.loc[sub_df["Symbol"] == g]['SampleNum'])
        mean = L*w
        sigma = L*w*(1-w)
        
        pvalue = 1-stats.norm.cdf(X, mean, sigma)
        if pvalue > 0.01:
            test.append(g)
        newline = pd.DataFrame([[project,None,sum_m,g,X,L,pvalue]],columns = cols_out)
        df_out = pd.concat([df_out,newline])

    
    df_out['adjpval'] = fdrcorrection(df_out['pval'])[1]
    #df_out = df_out.loc[df_out['adjpval'] < 1e-3]
    
    df_out.to_csv("combined-stage-0724/all/" + project + ".csv")
    

TCGA-KIRC
total # of mutated genes 10545
TCGA-HNSC
total # of mutated genes 16104
TCGA-LIHC
total # of mutated genes 14248
TCGA-STAD
total # of mutated genes 18085
TCGA-LUAD
total # of mutated genes 17875
TCGA-COAD
total # of mutated genes 19115
TCGA-THCA
total # of mutated genes 5630
TCGA-BRCA
total # of mutated genes 17795
TCGA-KIRP
total # of mutated genes 9437


In [5]:
import pandas as pd
import math
from scipy.special import comb
import numpy as np
import scipy.stats as stats

In [6]:
import os
true = []
df_dict = {}
for root,dirs,files in os.walk("combined-stage-0724/all/"):
    for file in files:
        s_d = pd.read_csv("combined-stage-0724/all/" + file)
        s_d = s_d.loc[s_d['adjpval']<1e-3]
        df_dict[file.split(".")[0]] = s_d
        print(file,len(set(s_d['Symbol'])))
        true.append(set(s_d['Symbol']))

TCGA-BRCA.csv 8534
TCGA-COAD.csv 8956
TCGA-HNSC.csv 7689
TCGA-KIRC.csv 6339
TCGA-KIRP.csv 6073
TCGA-LIHC.csv 7439
TCGA-LUAD.csv 7645
TCGA-STAD.csv 8510
TCGA-THCA.csv 4180


In [7]:
idx,top_10_li = [],[]
for key in df_dict.keys():
    now_df = df_dict[key]
    idx.append(key.split("-")[1])
    top_10_li.append(list(now_df.loc[now_df['adjpval'] == 0].sort_values("MutationNum",ascending=False).head(10)['Symbol']))

In [19]:
pd.DataFrame(top_10_li,index = idx).to_csv("top_10_genes.csv")

In [12]:
all_genes = set()
for item in top_10_li:
    for i in item:
        all_genes.add(i)

In [14]:
all_genes

{'AHNAK2',
 'ALB',
 'APC',
 'APOB',
 'ARID1A',
 'ATM',
 'BAP1',
 'BRAF',
 'CCDC168',
 'CDH1',
 'CDKN2A',
 'COL11A1',
 'CTNNB1',
 'DNAH5',
 'FAT1',
 'FAT4',
 'FLG',
 'FUT9',
 'GATA3',
 'HRAS',
 'INTS2',
 'KDM5C',
 'KIAA1109',
 'KMT2C',
 'KMT2D',
 'KRAS',
 'LRP2',
 'MACF1',
 'MALAT1',
 'MAP3K1',
 'MET',
 'MTOR',
 'MUC16',
 'MUC17',
 'NOTCH1',
 'NRAS',
 'OBSCN',
 'PBRM1',
 'PIK3CA',
 'SETD2',
 'SPTA1',
 'TG',
 'TP53',
 'TTN',
 'VHL',
 'XIRP2',
 'ZFHX4'}