In [1]:
cd ../..

/home/nazif/thesis/mirscribe-vcf


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [2]:
from sqlalchemy import create_engine, inspect
from sqlalchemy import create_engine, Column, Integer, String, ForeignKey, Float, UniqueConstraint
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship
import pandas as pd
import numpy as np

# db engine
engine = create_engine('sqlite:///data/db/mirscribe.db')

# pyensembl db engine
pyensembl = create_engine('sqlite:////home/nazif/.cache/pyensembl/GRCh38/ensembl111/Homo_sapiens.GRCh38.111.gtf.db')


# mirnas

In [3]:
mirna_df = pd.read_csv('data/mirna/mirna.csv')
mirna_df["seed"] = mirna_df.sequence.str[1:8]

mirna_df.head()

Unnamed: 0,mirna_name,mirna_accession,sequence,mirna_family,conservation,seed
0,hsa-let-7a-5p,MIMAT0000062,AACTATACAACCTACTACCTCA,let-7-5p/98-5p,2.0,ACTATAC
1,hsa-let-7b-5p,MIMAT0000063,AACCACACAACCTACTACCTCA,let-7-5p/98-5p,2.0,ACCACAC
2,hsa-let-7c-5p,MIMAT0000064,AACCATACAACCTACTACCTCA,let-7-5p/98-5p,2.0,ACCATAC
3,hsa-let-7d-5p,MIMAT0000065,AACTATGCAACCTACTACCTCT,let-7-5p/98-5p,2.0,ACTATGC
4,hsa-let-7e-5p,MIMAT0000066,AACTATACAACCTCCTACCTCA,let-7-5p/98-5p,2.0,ACTATAC


In [4]:
mirna_df.isna().sum()

mirna_name         0
mirna_accession    0
sequence           0
mirna_family       0
conservation       0
seed               0
dtype: int64

# genes

In [5]:
genes = pd.read_sql("gene", pyensembl, columns=["gene_id", "gene_name", "seqname", "start", "end", "gene_biotype"])

genes = genes.replace(r'^\s*$', np.nan, regex=True)
# /tmp/ipykernel_91379/1564060209.py:1: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
genes = genes.infer_objects(copy=False)

genes.rename(columns={"seqname": "chr"}, inplace=True)

genes.head()


Unnamed: 0,gene_id,gene_name,chr,start,end,gene_biotype
0,ENSG00000279928,DDX11L17,1,182696,184174,unprocessed_pseudogene
1,ENSG00000228037,,1,2581560,2584533,lncRNA
2,ENSG00000142611,PRDM16,1,3069168,3438621,protein_coding
3,ENSG00000284616,,1,5301928,5307394,lncRNA
4,ENSG00000157911,PEX10,1,2403964,2413797,protein_coding


# transcripts

In [6]:
transcripts = pd.read_sql("transcript", pyensembl, columns=["transcript_id", "transcript_name", "seqname", "start", "end", "gene_id", "transcript_biotype"])

transcripts = transcripts.replace(r'^\s*$', np.nan, regex=True)

transcripts.rename(columns={"seqname": "chr"}, inplace=True)
transcripts.head()

Unnamed: 0,transcript_id,transcript_name,chr,start,end,gene_id,transcript_biotype
0,ENST00000624431,DDX11L17-201,1,182696,184174,ENSG00000279928,unprocessed_pseudogene
1,ENST00000424215,,1,2581560,2584533,ENSG00000228037,lncRNA
2,ENST00000511072,PRDM16-206,1,3069168,3434342,ENSG00000142611,protein_coding
3,ENST00000607632,PRDM16-210,1,3069183,3186591,ENSG00000142611,retained_intron
4,ENST00000378391,PRDM16-203,1,3069197,3435421,ENSG00000142611,protein_coding


In [7]:
transcripts.isna().sum()

transcript_id             0
transcript_name       31617
chr                       0
start                     0
end                       0
gene_id                   0
transcript_biotype        0
dtype: int64

# gsea hallmark

In [8]:
import json
import pandas as pd

# Load the JSON data
with open("data/gsea/h.all.v2023.2.Hs.json") as file:
    gsea_data = json.load(file)

# Create a dictionary to store the data
data_dict = {}

# Iterate over each hallmark property
for hallmark, hallmark_data in gsea_data.items():
    # Extract the gene symbols for the current hallmark
    gene_symbols = hallmark_data['geneSymbols']
    
    # Add the gene symbols as a column in the data dictionary
    data_dict[hallmark] = pd.Series(True, index=gene_symbols)

# Create the DataFrame from the data dictionary
gsea = pd.DataFrame(data_dict)

# Fill missing values with False
gsea.fillna(False, inplace=True)

# Reset the index to make gene IDs as rows
gsea.reset_index(inplace=True)
gsea.rename(columns={'index': 'gene_name'}, inplace=True)

# Function to remove "HALLMARK_" prefix and convert to lowercase
def clean_column_name(col_name):
    if col_name.startswith("HALLMARK_"):
        return col_name[len("HALLMARK_"):].lower()
    return col_name.lower()

# Apply the function to all column names
gsea.rename(columns=lambda col: clean_column_name(col), inplace=True)

gsea

  gsea.fillna(False, inplace=True)


Unnamed: 0,gene_name,tnfa_signaling_via_nfkb,hypoxia,cholesterol_homeostasis,mitotic_spindle,wnt_beta_catenin_signaling,tgf_beta_signaling,il6_jak_stat3_signaling,dna_repair,g2m_checkpoint,...,heme_metabolism,coagulation,il2_stat5_signaling,bile_acid_metabolism,peroxisome,allograft_rejection,spermatogenesis,kras_signaling_up,kras_signaling_dn,pancreas_beta_cells
0,A2M,False,False,False,False,False,False,True,False,False,...,False,True,False,False,False,False,False,False,False,False
1,AAAS,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
2,AADAT,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,AARS1,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
4,ABAT,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4379,ZNRF4,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
4380,ZPBP,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
4381,ZW10,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4382,ZWINT,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False


In [9]:
genes_dict = dict(zip(genes.gene_name, genes.gene_id))

gsea["gene_id"] = gsea["gene_name"].map(genes_dict)

gsea.isna().sum()

gene_name                            0
tnfa_signaling_via_nfkb              0
hypoxia                              0
cholesterol_homeostasis              0
mitotic_spindle                      0
wnt_beta_catenin_signaling           0
tgf_beta_signaling                   0
il6_jak_stat3_signaling              0
dna_repair                           0
g2m_checkpoint                       0
apoptosis                            0
notch_signaling                      0
adipogenesis                         0
estrogen_response_early              0
estrogen_response_late               0
androgen_response                    0
myogenesis                           0
protein_secretion                    0
interferon_alpha_response            0
interferon_gamma_response            0
apical_junction                      0
apical_surface                       0
hedgehog_signaling                   0
complement                           0
unfolded_protein_response            0
pi3k_akt_mtor_signaling  

In [10]:
# manually taken from ENSEMBL
gsea.loc[gsea.gene_name == 'METTL7B', 'gene_id'] = 'ENSG00000163575'
gsea.loc[gsea.gene_name == 'THEG', 'gene_id'] = 'ENSG00000105549'


# intogen

In [11]:
cols = ["SYMBOL", "CGC_GENE", "CGC_CANCER_GENE"]

into = pd.read_csv("data/intogen/Compendium_Cancer_Genes.tsv", sep="\t", usecols=cols)
into.drop_duplicates(subset=["SYMBOL"], inplace=True)

into.rename(columns={"SYMBOL": "gene_name", "CGC_GENE": "in_cgc", "CGC_CANCER_GENE": "is_driver"}, inplace=True)


into.head()

Unnamed: 0,gene_name,in_cgc,is_driver
0,ABCC4,False,False
1,ABL1,True,False
2,ABL2,True,False
4,ACKR3,True,False
5,ACSL3,True,False


In [12]:
genes_set = set(genes.gene_name)
into_set = set(into.gene_name)

# see if all genes in into are in genes
print(genes_set.issuperset(into_set))

True


In [13]:
# add driver data to genes table
genes = genes.merge(into, on="gene_name", how="left").fillna({"in_cgc": False, "is_driver": False})

  genes = genes.merge(into, on="gene_name", how="left").fillna({"in_cgc": False, "is_driver": False})


In [14]:
genes

Unnamed: 0,gene_id,gene_name,chr,start,end,gene_biotype,in_cgc,is_driver
0,ENSG00000279928,DDX11L17,1,182696,184174,unprocessed_pseudogene,False,False
1,ENSG00000228037,,1,2581560,2584533,lncRNA,False,False
2,ENSG00000142611,PRDM16,1,3069168,3438621,protein_coding,True,False
3,ENSG00000284616,,1,5301928,5307394,lncRNA,False,False
4,ENSG00000157911,PEX10,1,2403964,2413797,protein_coding,False,False
...,...,...,...,...,...,...,...,...
63236,ENSG00000271254,,KI270711.1,4612,29626,protein_coding,False,False
63237,ENSG00000275987,U1,KI270713.1,30437,30580,snRNA,False,False
63238,ENSG00000268674,,KI270713.1,35407,35916,protein_coding,False,False
63239,ENSG00000277475,,KI270713.1,31698,32528,protein_coding,False,False


In [15]:
genes[genes.is_driver].gene_biotype.value_counts()

gene_biotype
protein_coding    87
Name: count, dtype: int64

# SAVE TO DB

In [16]:
mirna_df.to_sql('mirnas', engine, if_exists='replace', index=False)
genes.to_sql('genes', engine, if_exists='replace', index=False)
transcripts.to_sql('transcripts', engine, if_exists='replace', index=False)
gsea.to_sql('gsea', engine, if_exists='replace', index=False)
 

4384