In [1]:
cd ../..

/home/nazif/thesis/mirscribe-vcf


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [2]:
import pandas as pd

In [3]:
# Lookup table: original header in the OncoKB gene list -> snake_case column name.
new_column_names = {
    "Hugo Symbol": "gene_symbol",
    "Entrez Gene ID": "entrez_gene_id",
    "GRCh37 Isoform": "grch37_isoform",
    "GRCh37 RefSeq": "grch37_refseq",
    "GRCh38 Isoform": "grch38_isoform",
    "GRCh38 RefSeq": "grch38_refseq",
    "Is Oncogene": "is_oncogene",
    "Is Tumor Suppressor Gene": "is_tumor_suppressor",
    "# of occurrence within resources (Column J-P)": "occurrence_within_resources",
    "OncoKB Annotated": "oncokb_annotated",
    "MSK-IMPACT": "msk_impact",
    "MSK-HEME": "msk_heme",
    "FOUNDATION ONE": "foundation_one",
    "FOUNDATION ONE HEME": "foundation_one_heme",
    "Vogelstein": "vogelstein",
    "COSMIC CGC (v99)": "is_cosmic_tier_1",
    "Gene Aliases": "gene_aliases",
}

# Read the tab-separated gene list and apply the renames in one chained expression.
# Headers that are not in the table keep their original name.
kb = pd.read_csv("data/oncokb/cancerGeneList.tsv", sep="\t").rename(columns=new_column_names)

# Preview the first rows of the renamed table.
kb.head()

Unnamed: 0,gene_symbol,entrez_gene_id,grch37_isoform,grch37_refseq,grch38_isoform,grch38_refseq,is_oncogene,is_tumor_suppressor,occurrence_within_resources,oncokb_annotated,msk_impact,msk_heme,foundation_one,foundation_one_heme,vogelstein,is_cosmic_tier_1,gene_aliases
0,ABL1,25,ENST00000318560,NM_005157.4,ENST00000318560,NM_005157.4,Yes,No,7,Yes,Yes,Yes,Yes,Yes,Yes,Yes,"ABL, JTK7, c-ABL"
1,AKT1,207,ENST00000349310,NM_001014431.1,ENST00000349310,NM_001014431.1,Yes,No,7,Yes,Yes,Yes,Yes,Yes,Yes,Yes,"AKT, PKB, PRKBA, RAC, RAC-alpha"
2,ALK,238,ENST00000389048,NM_004304.4,ENST00000389048,NM_004304.4,Yes,No,7,Yes,Yes,Yes,Yes,Yes,Yes,Yes,CD246
3,AMER1,139285,ENST00000330258,NM_152424.3,ENST00000374869,NM_152424.3,No,Yes,7,Yes,Yes,Yes,Yes,Yes,Yes,Yes,"FAM123B, FLJ39827, RP11-403E24.2, WTX"
4,APC,324,ENST00000257430,NM_000038.5,ENST00000257430,NM_000038.5,No,Yes,7,Yes,Yes,Yes,Yes,Yes,Yes,Yes,"DP2.5, PPP1R46"


In [4]:
# Build the NCBI (Entrez) gene ID -> Ensembl gene ID lookup table.
ids = (
    pd.read_csv("data/oncokb/entrez_id_to_ensg.tsv", sep="\t")
    .rename(columns={'NCBI Gene ID': 'ncbi_gene_id', 'Ensembl gene ID': 'ensembl_gene_id'})
    # A row without an NCBI ID can never identify a gene, so drop it rather than
    # filling in a placeholder ID of 0 that could join against a row of `kb`
    # whose Entrez ID happens to be 0. This also removes completely empty rows.
    .dropna(subset=['ncbi_gene_id'])
    # The column is read as float when it contains missing values; cast it to
    # int so it compares equal to kb's integer Entrez IDs in the merge.
    .astype({'ncbi_gene_id': int})
)

# Attach the Ensembl gene ID to every gene. how="left" keeps genes without a
# match; their ensembl_gene_id is NaN.
# An ensembl_gene_id column left over from an earlier run of this cell is removed
# first; otherwise the merge would produce ensembl_gene_id_x / ensembl_gene_id_y
# and the cells below, which expect ensembl_gene_id, would fail.
kb = (
    kb.drop(columns="ensembl_gene_id", errors="ignore")
    .merge(ids, left_on="entrez_gene_id", right_on="ncbi_gene_id", how="left")
    # The join key from the lookup table duplicates entrez_gene_id.
    .drop(columns="ncbi_gene_id")
)

In [5]:
# A gene is of interest when it is flagged as an oncogene or as a tumor suppressor.
is_cancer_gene = kb["is_oncogene"].eq("Yes") | kb["is_tumor_suppressor"].eq("Yes")
lacks_ensembl_id = kb["ensembl_gene_id"].isna()

# Entrez IDs of the flagged genes that have no Ensembl gene ID.
missing_genes = kb.loc[is_cancer_gene & lacks_ensembl_id, "entrez_gene_id"]
missing_genes

826    6955
827    6957
828    6964
829    6965
Name: entrez_gene_id, dtype: int64

# Checking whether these genes are available in HGNC

In [6]:
# Load the HGNC table and display the rows whose NCBI gene ID is one of the
# Entrez IDs collected in `missing_genes`.
hgnc = pd.read_csv("data/hgnc/hgnc.tsv", sep="\t")
hgnc_hits = hgnc["NCBI Gene ID"].isin(missing_genes)
hgnc.loc[hgnc_hits]

Unnamed: 0,Approved symbol,Approved name,MANE Select Ensembl transcript ID (supplied by NCBI),Ensembl gene ID,Gene group name,NCBI Gene ID,Locus type,Enzyme IDs
39737,TRA,T cell receptor alpha locus,,,T cell receptor alpha locus at 14q11.2,6955.0,gene with protein product,
39963,TRB,T cell receptor beta locus,,,T cell receptor beta locus at 7q34,6957.0,gene with protein product,
40091,TRD,T cell receptor delta locus,,,T cell receptor delta locus at 14q11.2,6964.0,gene with protein product,
40201,TRG,T cell receptor gamma locus,,,T cell receptor gamma locus at 7p14,6965.0,gene with protein product,


# These genes (the T cell receptor loci TRA, TRB, TRD and TRG) have no reference in either GRCh37 or GRCh38.

In [7]:
# Rows to discard: oncogenes / tumor suppressors without an Ensembl gene ID.
# Genes flagged as neither are kept even when their Ensembl ID is missing.
is_cancer_gene = (kb["is_oncogene"] == "Yes") | (kb["is_tumor_suppressor"] == "Yes")
unmapped = is_cancer_gene & kb["ensembl_gene_id"].isna()

# Keep only these columns. Selecting rows and columns in a single .loc call and
# taking an explicit copy yields an independent frame, so the column assignments
# below do not raise SettingWithCopyWarning (the previous `kb = kb[cols_to_keep]`
# followed by `kb[column] = ...` assigned into a slice).
cols_to_keep = ['gene_symbol', 'ensembl_gene_id', 'is_oncogene', 'is_tumor_suppressor', 'is_cosmic_tier_1']
kb = kb.loc[~unmapped, cols_to_keep].copy()

# Convert the "Yes"/"No" flag columns to booleans; any other value becomes NaN.
# Columns that are already boolean are skipped so that re-running this cell
# leaves them intact: mapping True/False through the Yes/No table would turn
# every value into NaN.
for column in kb.columns:
    if column.startswith("is_") and kb[column].dtype != bool:
        kb[column] = kb[column].map({"Yes": True, "No": False})

In [10]:
# Preview the first five rows of the filtered table.
kb.head(n=5)

Unnamed: 0,gene_symbol,ensembl_gene_id,is_oncogene,is_tumor_suppressor,is_cosmic_tier_1
0,ABL1,ENSG00000097007,True,False,True
1,AKT1,ENSG00000142208,True,False,True
2,ALK,ENSG00000171094,True,False,True
3,AMER1,ENSG00000184675,False,True,True
4,APC,ENSG00000134982,False,True,True


In [12]:
# Write the filtered gene table to CSV; index=False leaves the row index out of the file.
kb.to_csv(path_or_buf="data/oncokb/oncokb.csv", index=False)