In [1]:
import pandas as pd
import csv

import os
from pyensembl import EnsemblRelease

# Point pyensembl at the cache directory (relative to the notebook's working
# directory). This is set before the release object is created.
os.environ['PYENSEMBL_CACHE_DIR'] = "../../data"

# Ensembl release 60 annotation handle; later cells in this notebook query
# this `data` object directly.
data = EnsemblRelease(60)

# Fetch the release files and build the local index so that lookups below
# (transcript_by_id, gene_ids_at_locus, ...) can run.
data.download()
data.index()

INFO:pyensembl.sequence_data:Loaded sequence dictionary from /run/media/nazif/2F946E411BA61D49/data/pyensembl/GRCh37/ensembl60/Homo_sapiens.GRCh37.60.cdna.all.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /run/media/nazif/2F946E411BA61D49/data/pyensembl/GRCh37/ensembl60/Homo_sapiens.GRCh37.60.ncrna.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /run/media/nazif/2F946E411BA61D49/data/pyensembl/GRCh37/ensembl60/Homo_sapiens.GRCh37.60.pep.all.fa.gz.pickle


In [2]:
def _split_name_fields(names, column, min_fields=3):
    """Split every underscore-delimited identifier in *names* into its fields.

    Parameters
    ----------
    names : iterable
        Values of one identifier column (e.g. a pandas Series).
    column : str
        Column name, used only to build the error message.
    min_fields : int
        Minimum number of ``_``-separated fields each value must contain.

    Returns
    -------
    list[list[str]]
        One list of fields per input value, in input order.

    Raises
    ------
    ValueError
        If a value is not a string (e.g. a missing cell) or has fewer than
        ``min_fields`` fields. The offending value is included in the message.
    """
    split_rows = []
    for name in names:
        # Non-string cells (NaN from an empty field) are treated as malformed.
        fields = name.split("_") if isinstance(name, str) else []
        if len(fields) < min_fields:
            raise ValueError(
                f"column {column!r}: expected at least {min_fields} "
                f"'_'-separated fields, got {name!r}"
            )
        split_rows.append(fields)
    return split_rows


def preprocess_clash_data(data="../data/supplementary_files/clash_data_parsed.tsv", drop_irrelevant_columns=True):
    """
    Preprocess the parsed CLASH data and return a df

    The file is read as tab-separated text whose column names sit on the
    third line (``header=2``). The two composite identifier columns are then
    unpacked into separate columns and removed:

    * ``microRNA_name``: field 0 -> ``mirna_accession``, field 2 -> ``mirna_name``
    * ``mRNA_name``: field 0 -> ``mrna_ensg_id``, field 1 -> ``mrna_enst_id``,
      field 2 -> ``mrna_name``

    Parameters
    ----------
    data : str
        Path to the parsed CLASH TSV file.
    drop_irrelevant_columns : bool
        When True, only the seven columns used downstream are kept before the
        identifier columns are unpacked; when False every column of the file
        is retained.

    Returns
    -------
    pandas.DataFrame
        The preprocessed table (a new frame; nothing outside the function is
        modified).

    Raises
    ------
    KeyError
        If an expected column is absent from the file.
    ValueError
        If an identifier value is missing or has fewer than three
        ``_``-separated fields.
    """
    clash_df = pd.read_csv(data, sep="\t", header=2)

    if drop_irrelevant_columns:
        columns_to_keep = ["microRNA_name", "miRNA_seq", "mRNA_name", "mRNA_start", "mRNA_end_extended", "mRNA_seq_extended", "seed_type"]
        # Explicit copy: new columns are added below, so work on an
        # independent frame rather than on a selection of the raw one.
        clash_df = clash_df[columns_to_keep].copy()

    # unzipping columns
    mirna_fields = _split_name_fields(clash_df["microRNA_name"], "microRNA_name")
    clash_df["mirna_accession"] = [fields[0] for fields in mirna_fields]
    clash_df["mirna_name"] = [fields[2] for fields in mirna_fields]
    clash_df = clash_df.drop(columns=["microRNA_name"])

    # --------------------------------------------------
    mrna_fields = _split_name_fields(clash_df["mRNA_name"], "mRNA_name")
    # Column insertion order (enst, ensg, name) is kept as in the original
    # so the resulting column layout does not change.
    clash_df["mrna_enst_id"] = [fields[1] for fields in mrna_fields]
    clash_df["mrna_ensg_id"] = [fields[0] for fields in mrna_fields]
    clash_df["mrna_name"] = [fields[2] for fields in mrna_fields]
    clash_df = clash_df.drop(columns=["mRNA_name"])

    return clash_df

def _load_ensembl_release(release):
    """Return a pyensembl ``EnsemblRelease`` for *release*, downloaded and indexed.

    The cache directory environment variable is (re)set on every call, and
    ``download()`` / ``index()`` are invoked each time, exactly as the two
    public functions did before they shared this helper.
    """
    # set pyensembl
    os.environ['PYENSEMBL_CACHE_DIR'] = "../../data"
    genome = EnsemblRelease(release)
    genome.download()
    genome.index()
    return genome


def _add_transcript_loci(clash_df, release, tolerate_missing):
    """Add ``ensembl<release>_contig/_start/_end`` columns to *clash_df*.

    Parameters
    ----------
    clash_df : pandas.DataFrame
        Table with an ``mrna_enst_id`` column; it is modified in place.
    release : int
        Ensembl release number to look the transcripts up in.
    tolerate_missing : bool
        When True, a transcript ID unknown to the release gets the string
        ``"NA"`` in all three locus columns and a ``found_in_ensembl<release>``
        column (1 = found, 0 = missing) is added. When False the lookup
        error propagates and no ``found_in_...`` column is written.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    genome = _load_ensembl_release(release)
    prefix = f"ensembl{release}"

    contigs, starts, ends, found = [], [], [], []
    for enst_id in clash_df["mrna_enst_id"].values.tolist():
        try:
            transcript = genome.transcript_by_id(enst_id)
        except ValueError:
            # pyensembl signals an unknown transcript ID with ValueError.
            # Any other failure (e.g. a broken index) is left to propagate
            # instead of being recorded as "not found".
            if not tolerate_missing:
                raise
            contigs.append("NA")
            starts.append("NA")
            ends.append("NA")
            found.append(0)
        else:
            contigs.append(transcript.contig)
            starts.append(transcript.start)
            ends.append(transcript.end)
            found.append(1)

    clash_df[f"{prefix}_contig"] = contigs
    clash_df[f"{prefix}_start"] = starts
    clash_df[f"{prefix}_end"] = ends
    if tolerate_missing:
        clash_df[f"found_in_{prefix}"] = found

    return clash_df


def get_ensembl60_data(clash_df):
    """Annotate *clash_df* with Ensembl release 60 transcript loci.

    Adds ``ensembl60_contig``, ``ensembl60_start`` and ``ensembl60_end`` for
    every ``mrna_enst_id``. The frame is modified in place and returned.

    Raises
    ------
    ValueError
        Propagated from pyensembl when a transcript ID is not present in
        release 60 (this variant does not tolerate missing transcripts).
    """
    return _add_transcript_loci(clash_df, 60, tolerate_missing=False)


def get_ensembl109_data(clash_df):
    """Annotate *clash_df* with Ensembl release 109 transcript loci.

    Adds ``ensembl109_contig``, ``ensembl109_start``, ``ensembl109_end`` and
    ``found_in_ensembl109`` (1/0). Transcript IDs that release 109 does not
    know get the string ``"NA"`` in the three locus columns, so those columns
    hold mixed str/int values whenever any transcript is missing. The frame
    is modified in place and returned.
    """
    return _add_transcript_loci(clash_df, 109, tolerate_missing=True)

In [3]:
# Build the working table in one step: parse the CLASH file with its default
# path, then attach the Ensembl 60 locus of every target transcript.
clash_df = get_ensembl60_data(preprocess_clash_data())
# The Ensembl 109 annotation step is currently left disabled:
# clash_df = get_ensembl109_data(clash_df)

# Preview the first rows of the annotated table.
clash_df.head()

INFO:pyensembl.sequence_data:Loaded sequence dictionary from /run/media/nazif/2F946E411BA61D49/data/pyensembl/GRCh37/ensembl60/Homo_sapiens.GRCh37.60.cdna.all.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /run/media/nazif/2F946E411BA61D49/data/pyensembl/GRCh37/ensembl60/Homo_sapiens.GRCh37.60.ncrna.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /run/media/nazif/2F946E411BA61D49/data/pyensembl/GRCh37/ensembl60/Homo_sapiens.GRCh37.60.pep.all.fa.gz.pickle


Unnamed: 0,miRNA_seq,mRNA_start,mRNA_end_extended,mRNA_seq_extended,seed_type,mirna_accession,mirna_name,mrna_enst_id,mrna_ensg_id,mrna_name,ensembl60_contig,ensembl60_start,ensembl60_end
0,TGAGGTAGTAGGTTGTATAGTT,1791,1890,ATTTGTATCTACGATAAAAATTTTTATACAGAACCTACTGCCTCAA...,noncanonical_seed,MIMAT0000062,let-7a,ENST00000340828,ENSG00000113328,CCNG1,5,162864587,162872022
1,TGAGGTAGTAGGTTGTATAGTT,3857,3928,CAGGAAATACCCGTGCAACCAACTACCTCATATTCCATTCAGAATT...,9-mer,MIMAT0000062,let-7a,ENST00000343455,ENSG00000100697,DICER1,14,95552566,95623759
2,TGAGGTAGTAGGTTGTATAGTT,2385,2434,ACCCGCTATATGACCTGATGCCTTTCCTTCATTAAAGATGATTCTG...,noncanonical_seed,MIMAT0000062,let-7a,ENST00000436639,ENSG00000080546,SESN1,6,109307640,109416022
3,TGAGGTAGTAGGTTGTATAGTT,6570,6623,CAATGACTATGCAACCATACCTTACCACTAAATGTAGTACGCAAAA...,noncanonical_seed,MIMAT0000062,let-7a,ENST00000282516,ENSG00000164190,NIPBL,5,36876861,37066515
4,TGAGGTAGTAGGTTGTATAGTT,1164,1208,AATACTGGGAAACCTGCTACTTCGTCAGCTAACCAGAAACCTGTG,noncanonical_seed,MIMAT0000062,let-7a,ENST00000340139,ENSG00000138785,INTS12,4,106603784,106629838


In [10]:
# Inspect the contig column that get_ensembl60_data added for each row.
clash_df["ensembl60_contig"]

0         5
1        14
2         6
3         5
4         4
         ..
18509    MT
18510    16
18511    14
18512    12
18513    19
Name: ensembl60_contig, Length: 18514, dtype: object

In [14]:
# Reverse lookup: which Ensembl 60 gene IDs overlap the interval
# 5:162864587-162872022? These are the ensembl60_start / ensembl60_end values
# of the first row (ENST00000340828) in the table preview.
gene = data.gene_ids_at_locus(contig=5,position=162864587, end=162872022)

# `gene` holds whatever gene_ids_at_locus returned; display it.
gene

['ENSG00000113328']

In [16]:
# Display the sequence pyensembl stores for this Ensembl 60 transcript
# (the transcript of the first row in the table preview).
data.transcript_by_id('ENST00000340828').sequence

'AGGGCAGGCGCGGCCCCTTCGGCTCCGAGCTGACCCTGATCAGGGCCGAGTTGTCTCGGCGGCGCTGCCGAGGCCTCCACCCAGGACAGTCCCCCTCCCCGGGCCTCTCTCCTCTTGCCTACGAGTCCCCTCTCCTCGTAGGCCTCTCGGATCTGATATCGTGGGGTGAGGTGAGCAGGCCCGGGGAGGGTGGTTACCGCTGAGGAGCTGCAGTCTCTGTCAAGATGATAGAGGTACTGACAACAACTGACTCTCAGAAACTGCTACACCAGCTGAATGCCCTGTTGGAACAGGAGTCTAGATGTCAGCCAAAGGTCTGTGGTTTGAGACTAATTGAGTCTGCACACGATAATGGCCTCAGAATGACTGCAAGACTAAGGGACTTTGAAGTAAAAGATCTTCTTAGTCTAACTCAGTTCTTTGGCTTTGACACAGAGACATTTTCTCTAGCTGTGAATTTACTGGACAGATTCCTGTCTAAAATGAAGGTACAGCCCAAGCACCTTGGGTGTGTTGGACTGAGCTGCTTTTATTTGGCTGTAAAATCAATAGAAGAGGAAAGGAATGTCCCATTGGCAACTGACTTGATCCGAATAAGTCAATATAGGTTTACGGTTTCAGACTTGATGAGAATGGAAAAGATTGTATTGGAGAAGGTGTGTTGGAAAGTCAAAGCTACTACTGCCTTTCAATTTCTGCAACTGTATTATTCACTCCTTCAAGAGAACTTGCCACTTGAAAGGAGAAATAGCATTAATTTTGAAAGACTAGAAGCTCAACTGAAGGCATGTCATTGCAGGATCATATTTTCTAAAGCAAAGCCTTCTGTGTTGGCATTGTCTATCATTGCATTAGAGATCCAAGCACAGAAGTGTGTAGAGTTAACAGAAGGAATAGAATGTCTTCAGAAACATTCCAAGATAAATGGCAGAGATCTGACCTTCTGGCAAGAGCTTGTATCCAAATGTTTAACTGAATATTCATCAAATAAGTGTTCCA

In [18]:
# All transcript IDs known to the `data` release object.
# NOTE(review): `asd` is a placeholder name; a later cell repeats the same
# call as `total_transcripts` -- consider keeping only one of the two.
asd = data.transcript_ids()

# Number of transcript IDs returned.
len(asd)

157480

In [23]:
# Size of the annotation: every transcript ID and every contig name that
# the `data` release object reports.
total_transcripts = data.transcript_ids()
contigs = data.contigs()

# Report both counts.
print(f"the number of ensembl60 transcripts: {len(total_transcripts)}")
print(f"the number of ensembl60 contigs: {len(contigs)}")

the number of ensembl60 transcripts: 157480
the number of ensembl60 contigs: 72


In [27]:
# Sanity check: can each CLASH target-site sequence be recovered by slicing
# the Ensembl 60 transcript sequence at the coordinates given in the table?
mrna_enst_ids = clash_df["mrna_enst_id"].values.tolist()
mRNA_starts = clash_df["mRNA_start"].values.tolist()
mRNA_end_extendeds = clash_df["mRNA_end_extended"].values.tolist()
mRNA_seq_extendeds = clash_df["mRNA_seq_extended"].values.tolist()

n_correct = 0

for i, mrna in enumerate(mrna_enst_ids):
    # NOTE(review): pyensembl may return None here when it holds no sequence
    # for the transcript; such rows are reported as "not correct" below.
    mrna_seq = data.transcript_by_id(mrna).sequence

    start = mRNA_starts[i]
    end = mRNA_end_extendeds[i]

    # The table's coordinates span (end - start + 1) bases: the previewed row
    # with start=1164 and end=1208 carries a 45-nt mRNA_seq_extended. A plain
    # [start:end] slice is therefore always one base short and can never
    # match. Treat the positions as 1-based inclusive.
    # NOTE(review): if they turn out to be 0-based inclusive instead, the
    # slice should be [start:end + 1] -- confirm against the CLASH table's
    # column description.
    seq_slice = None if mrna_seq is None else mrna_seq[start - 1:end]
    if seq_slice == mRNA_seq_extendeds[i]:
        n_correct += 1
        print(f"{mrna} is correct")
    else:
        print(f"{mrna} is not correct")

# One-line summary so the result is readable without scanning every row.
print(f"{n_correct} of {len(mrna_enst_ids)} target sequences match their transcript slice")

ENST00000340828 is not correct
ENST00000343455 is not correct
ENST00000436639 is not correct
ENST00000282516 is not correct
ENST00000340139 is not correct
ENST00000307968 is not correct
ENST00000281182 is not correct
ENST00000261674 is not correct
ENST00000288207 is not correct
ENST00000373204 is not correct
ENST00000523638 is not correct
ENST00000294256 is not correct
ENST00000397354 is not correct
ENST00000375643 is not correct
ENST00000378444 is not correct
ENST00000336783 is not correct
ENST00000299543 is not correct
ENST00000263095 is not correct
ENST00000182096 is not correct
ENST00000443029 is not correct
ENST00000263563 is not correct
ENST00000399878 is not correct
ENST00000355540 is not correct
ENST00000313382 is not correct
ENST00000360428 is not correct
ENST00000434452 is not correct
ENST00000355522 is not correct
ENST00000297145 is not correct
ENST00000373625 is not correct
ENST00000317943 is not correct
ENST00000405409 is not correct
ENST00000421745 is not correct
ENST0000

In [32]:
# Fetch one transcript object to explore what pyensembl exposes on it.
# NOTE(review): despite the name, `random_gene` holds the result of
# transcript_by_id (a transcript, not a gene); later cells reuse this name.
random_gene = data.transcript_by_id('ENST00000340828')

# Every attribute and method name available on the object.
methods = dir(random_gene)

print(methods)

['_SERIALIZABLE_KEYWORD_ALIASES', '_TRANSCRIPT_FEATURES', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_codon_positions', '_contiguous_offsets', '_reconstruct_nested_objects', '_sequence', '_transcript_feature_position_ranges', '_transcript_feature_positions', '_update_kwargs', 'biotype', 'can_overlap', 'coding_sequence', 'coding_sequence_position_ranges', 'complete', 'contains', 'contains_locus', 'contains_start_codon', 'contains_stop_codon', 'contig', 'db', 'distance_to_interval', 'distance_to_locus', 'end', 'exon_intervals', 'exons', 'first_start_codon_spliced_offset', 'five_prime_utr_sequence', 'from_dict', 'from_json', 'gene', 'gene_id', 'gene_name', 'genome', 'id', 'is_prote

In [37]:
# Show the object's repr (IDs, biotype, contig, start/end, strand, genome).
random_gene

Transcript(transcript_id='ENST00000340828', transcript_name='CCNG1-001', gene_id='ENSG00000113328', biotype='protein_coding', contig='5', start=162864587, end=162872022, strand='+', genome='GRCh37')

In [44]:
# Ask the object whether it contains the interval 5:162864587-162872020,
# i.e. its own start coordinate up to 2 bp before its reported end (162872022).
random_gene.contains(5,162864587,162872020)

True