In [1]:
cd ../..

/home/nazif/thesis/mirscribe-vcf


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [2]:
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
# db engine
engine = create_engine('sqlite:///data/db/mirscribe.db')

# pyensembl db engine
pyensembl = create_engine('sqlite:////home/nazif/.cache/pyensembl/GRCh37/ensembl75/Homo_sapiens.GRCh37.75.gtf.db')


In [3]:
genes = pd.read_sql("gene", pyensembl, columns=["gene_id", "gene_name", "seqname", "start", "end", "gene_biotype"])

genes = genes.replace(r'^\s*$', np.nan, regex=True)
genes = genes.infer_objects(copy=False)

genes.rename(columns={"seqname": "chr"}, inplace=True)

genes.head()


Unnamed: 0,gene_id,gene_name,chr,start,end,gene_biotype
0,ENSG00000223972,DDX11L1,1,11869,14412,pseudogene
1,ENSG00000227232,WASH7P,1,14363,29806,pseudogene
2,ENSG00000243485,MIR1302-10,1,29554,31109,lincRNA
3,ENSG00000237613,FAM138A,1,34554,36081,lincRNA
4,ENSG00000268020,OR4G4P,1,52473,54936,pseudogene


# add biomart data

In [4]:
e75 = pd.read_csv("data/biomart/ensembl75_g37.tsv", sep="\t")
colnames = {'Gene stable ID': 'gene_id',
            'EntrezGene ID': 'entrez_id',
            'Gene description': 'gene_description',
            'Gene name': 'gene_name',
            'Gene type': 'biomart_biotype',
            }

e75 = e75.rename(columns={"Ensembl Gene ID": "gene_id"})


e112 = pd.read_csv("data/biomart/ensembl112_g37.tsv", sep="\t")
colnames = {'Gene stable ID': 'gene_id',
            'NCBI gene (formerly Entrezgene) ID': 'entrez_id',
            'Gene description': 'gene_description',
            'Gene type': 'biomart_biotype',
            }

e112 = e112.rename(columns=colnames)

In [5]:
pyen = set(genes.gene_id.tolist())
e75s = set(e75.gene_id.tolist())
e112s = set(e112.gene_id.tolist())

# compare 
print(f"there are {len(pyen - e75s)} genes in pyensembl75 not in e75")
print(f"there are {len(e75s - pyen)} genes in e75 not in pyensembl75")
print(f"there are {len(pyen & e75s)} genes in both")
print("######################")
print(f"there are {len(pyen - e112s)} genes in pyensembl75 not in e112")
print(f"there are {len(e112s - pyen)} genes in e112 not in pyensembl75")
print(f"there are {len(pyen & e112s)} genes in both")

there are 0 genes in pyensembl75 not in e75
there are 425 genes in e75 not in pyensembl75
there are 63677 genes in both
######################
there are 0 genes in pyensembl75 not in e112
there are 0 genes in e112 not in pyensembl75
there are 63677 genes in both


whole pipeline is done with pyensembl75, therefore i must use biomart export from ensembl 112. They're fully compatible

In [6]:
e112.entrez_id.value_counts()

entrez_id
80864.0        20
259215.0       17
100169763.0    16
554313.0       16
8367.0         16
               ..
149837.0        1
200634.0        1
6718.0          1
29125.0         1
100653067.0     1
Name: count, Length: 25788, dtype: int64

In [7]:
e112[e112.entrez_id == 80864.0]

Unnamed: 0,gene_id,entrez_id,gene_description,biomart_biotype
24600,ENSG00000206329,80864.0,palmitoyl-protein thioesterase 2 [Source:HGNC ...,protein_coding
25563,ENSG00000254875,80864.0,,protein_coding
26024,ENSG00000244444,80864.0,"EGF-like-domain, multiple 8 [Source:HGNC Symbo...",protein_coding
30170,ENSG00000206256,80864.0,palmitoyl-protein thioesterase 2 [Source:HGNC ...,protein_coding
33847,ENSG00000242038,80864.0,"EGF-like-domain, multiple 8 [Source:HGNC Symbo...",protein_coding
36844,ENSG00000227600,80864.0,palmitoyl-protein thioesterase 2 [Source:HGNC ...,protein_coding
37573,ENSG00000239974,80864.0,"EGF-like-domain, multiple 8 [Source:HGNC Symbo...",protein_coding
39416,ENSG00000236649,80864.0,palmitoyl-protein thioesterase 2 [Source:HGNC ...,protein_coding
39637,ENSG00000240389,80864.0,"EGF-like-domain, multiple 8 [Source:HGNC Symbo...",protein_coding
53506,ENSG00000168452,80864.0,palmitoyl-protein thioesterase 2 [Source:HGNC ...,protein_coding


In [8]:
e112_to_merge = e112[["gene_id","gene_description"]].drop_duplicates()
df = pd.merge(genes, e112_to_merge, on="gene_id", how="left")

df.head()

Unnamed: 0,gene_id,gene_name,chr,start,end,gene_biotype,gene_description
0,ENSG00000223972,DDX11L1,1,11869,14412,pseudogene,DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 l...
1,ENSG00000227232,WASH7P,1,14363,29806,pseudogene,WAS protein family homolog 7 pseudogene [Sourc...
2,ENSG00000243485,MIR1302-10,1,29554,31109,lincRNA,microRNA 1302-10 [Source:HGNC Symbol;Acc:38233]
3,ENSG00000237613,FAM138A,1,34554,36081,lincRNA,"family with sequence similarity 138, member A ..."
4,ENSG00000268020,OR4G4P,1,52473,54936,pseudogene,"olfactory receptor, family 4, subfamily G, mem..."


In [9]:
df.isna().sum()

gene_id                 0
gene_name               0
chr                     0
start                   0
end                     0
gene_biotype            0
gene_description    23066
dtype: int64

In [10]:
df["gene_description"] = df["gene_description"].fillna("no_description")
# df["entrez_id"] = df["entrez_id"].fillna(-1).astype(int)


In [11]:
df.isna().sum()

gene_id             0
gene_name           0
chr                 0
start               0
end                 0
gene_biotype        0
gene_description    0
dtype: int64

# adding oncokb data

In [12]:
kb = pd.read_csv("data/oncokb/oncokb.csv")
kb = kb.rename(columns={"gene_symbol": "gene_name", "ensembl_gene_id": "gene_id", "is_oncogene": "is_oncogene_oncokb", "is_tumor_suppressor": "is_tumor_suppressor_oncokb", "is_cosmic_tier_1": "is_cosmic_tier_1_oncokb"})

# drop columns where all bool values are false
cols = ["is_oncogene_oncokb", "is_tumor_suppressor_oncokb", "is_cosmic_tier_1_oncokb"]
kb = kb[~(kb[cols] == False).all(axis=1)]

kb.head()



Unnamed: 0,gene_name,gene_id,is_oncogene_oncokb,is_tumor_suppressor_oncokb,is_cosmic_tier_1_oncokb
0,ABL1,ENSG00000097007,True,False,True
1,AKT1,ENSG00000142208,True,False,True
2,ALK,ENSG00000171094,True,False,True
3,AMER1,ENSG00000184675,False,True,True
4,APC,ENSG00000134982,False,True,True


In [13]:
kb.gene_id.isin(df.gene_id).value_counts()

gene_id
True     931
False      7
Name: count, dtype: int64

In [14]:
missing_gene_names = kb[~kb.gene_id.isin(df.gene_id)].gene_name
missing_gene_names

146      H3C2
453    H2AC17
700      H4C9
715       IGH
716       IGK
717       IGL
741     MLLT6
Name: gene_name, dtype: object

146      H3C2 is HIST1H3D in grch37

453    H2AC17 is whole locus

700      H4C9 not found in grch37

715       IGH whole locus

716       IGK whole locus

717       IGL whole locus

741     MLLT6 found in gene_names

whole loci are discarded. other genes are appended

In [15]:
kb = kb.drop(columns=["gene_name"])
df = pd.merge(df, kb, on="gene_id", how="left")
df.head()

Unnamed: 0,gene_id,gene_name,chr,start,end,gene_biotype,gene_description,is_oncogene_oncokb,is_tumor_suppressor_oncokb,is_cosmic_tier_1_oncokb
0,ENSG00000223972,DDX11L1,1,11869,14412,pseudogene,DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 l...,,,
1,ENSG00000227232,WASH7P,1,14363,29806,pseudogene,WAS protein family homolog 7 pseudogene [Sourc...,,,
2,ENSG00000243485,MIR1302-10,1,29554,31109,lincRNA,microRNA 1302-10 [Source:HGNC Symbol;Acc:38233],,,
3,ENSG00000237613,FAM138A,1,34554,36081,lincRNA,"family with sequence similarity 138, member A ...",,,
4,ENSG00000268020,OR4G4P,1,52473,54936,pseudogene,"olfactory receptor, family 4, subfamily G, mem...",,,


# adding details of 2 genes

In [16]:
df[df.gene_name.isin(["MLLT6", "HIST1H3D"])]

Unnamed: 0,gene_id,gene_name,chr,start,end,gene_biotype,gene_description,is_oncogene_oncokb,is_tumor_suppressor_oncokb,is_cosmic_tier_1_oncokb
18369,ENSG00000197409,HIST1H3D,6,26197068,26199521,protein_coding,"histone cluster 1, H3d [Source:HGNC Symbol;Acc...",,,
48365,ENSG00000108292,MLLT6,17,36861795,36886056,protein_coding,myeloid/lymphoid or mixed-lineage leukemia (tr...,,,


In [17]:
df.loc[df.gene_name == 'HIST1H3D', 'is_cosmic_tier_1_oncokb'] = True
df.loc[df.gene_name == 'MLLT6', 'is_cosmic_tier_1_oncokb'] = True

In [18]:
df.isna().sum()

gene_id                           0
gene_name                         0
chr                               0
start                             0
end                               0
gene_biotype                      0
gene_description                  0
is_oncogene_oncokb            62746
is_tumor_suppressor_oncokb    62746
is_cosmic_tier_1_oncokb       62744
dtype: int64

In [19]:
df.fillna(False, inplace=True)
df.head()

  df.fillna(False, inplace=True)


Unnamed: 0,gene_id,gene_name,chr,start,end,gene_biotype,gene_description,is_oncogene_oncokb,is_tumor_suppressor_oncokb,is_cosmic_tier_1_oncokb
0,ENSG00000223972,DDX11L1,1,11869,14412,pseudogene,DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 l...,False,False,False
1,ENSG00000227232,WASH7P,1,14363,29806,pseudogene,WAS protein family homolog 7 pseudogene [Sourc...,False,False,False
2,ENSG00000243485,MIR1302-10,1,29554,31109,lincRNA,microRNA 1302-10 [Source:HGNC Symbol;Acc:38233],False,False,False
3,ENSG00000237613,FAM138A,1,34554,36081,lincRNA,"family with sequence similarity 138, member A ...",False,False,False
4,ENSG00000268020,OR4G4P,1,52473,54936,pseudogene,"olfactory receptor, family 4, subfamily G, mem...",False,False,False


# adding intogen data

In [20]:
into = pd.read_json("data/intogen/driver_genes.json")

into.head()

Unnamed: 0,gene_name,is_driver
0,ACVR1,True
1,ACVR2A,True
2,ARAF,True
3,ASXL2,True
4,BAP1,True


In [21]:
into.gene_name.isin(df.gene_name).value_counts()

gene_name
True     86
False     1
Name: count, dtype: int64

In [22]:
into[~into.gene_name.isin(df.gene_name)].gene_name

78    TENT5C
Name: gene_name, dtype: object

FAM46C is the grch37 name of TENT5C

source: https://www.ensembl.org/Homo_sapiens/Gene/Summary?g=ENSG00000183508;r=1:117606048-117628389;t=ENST00000369448

In [23]:
df = pd.merge(df, into, on="gene_name", how="left")

In [24]:
df.loc[df.gene_name == "FAM46C", "is_driver"] = True

In [25]:
df["is_driver"] = df["is_driver"].fillna(False)

  df["is_driver"] = df["is_driver"].fillna(False)


In [26]:
df

Unnamed: 0,gene_id,gene_name,chr,start,end,gene_biotype,gene_description,is_oncogene_oncokb,is_tumor_suppressor_oncokb,is_cosmic_tier_1_oncokb,is_driver
0,ENSG00000223972,DDX11L1,1,11869,14412,pseudogene,DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 l...,False,False,False,False
1,ENSG00000227232,WASH7P,1,14363,29806,pseudogene,WAS protein family homolog 7 pseudogene [Sourc...,False,False,False,False
2,ENSG00000243485,MIR1302-10,1,29554,31109,lincRNA,microRNA 1302-10 [Source:HGNC Symbol;Acc:38233],False,False,False,False
3,ENSG00000237613,FAM138A,1,34554,36081,lincRNA,"family with sequence similarity 138, member A ...",False,False,False,False
4,ENSG00000268020,OR4G4P,1,52473,54936,pseudogene,"olfactory receptor, family 4, subfamily G, mem...",False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
63672,ENSG00000198695,MT-ND6,MT,14149,14673,protein_coding,mitochondrially encoded NADH dehydrogenase 6 [...,False,False,False,False
63673,ENSG00000210194,MT-TE,MT,14674,14742,Mt_tRNA,mitochondrially encoded tRNA glutamic acid [So...,False,False,False,False
63674,ENSG00000198727,MT-CYB,MT,14747,15887,protein_coding,mitochondrially encoded cytochrome b [Source:H...,False,False,False,False
63675,ENSG00000210195,MT-TT,MT,15888,15953,Mt_tRNA,mitochondrially encoded tRNA threonine [Source...,False,False,False,False


In [28]:
df.isna().sum()

gene_id                       0
gene_name                     0
chr                           0
start                         0
end                           0
gene_biotype                  0
gene_description              0
is_oncogene_oncokb            0
is_tumor_suppressor_oncokb    0
is_cosmic_tier_1_oncokb       0
is_driver                     0
dtype: int64

In [27]:
from sqlalchemy import create_engine

# db engine
engine = create_engine('sqlite:///data/db/mirscribe.db')

df.to_sql(name="genes", con=engine, if_exists="replace", index=False)

63677