In [1]:
# Move the working directory up one level; the relative paths used later in the
# notebook are resolved from there.
# NOTE(review): a relative `%cd` is not idempotent - re-running this cell moves up again.
%cd ..
import os
from pyensembl import EnsemblRelease
import pandas as pd


# Point pyensembl's cache at ../data through the PYENSEMBL_CACHE_DIR environment variable.
# NOTE(review): this is assigned after pyensembl has been imported and the value is a
# relative path - confirm pyensembl reads it late enough and from the intended directory.
os.environ['PYENSEMBL_CACHE_DIR'] = "../data"



/run/media/nazif/2F946E411BA61D49/thesis


INFO:pyensembl.sequence_data:Loaded sequence dictionary from /run/media/nazif/2F946E411BA61D49/data/pyensembl/GRCh37/ensembl75/Homo_sapiens.GRCh37.75.cdna.all.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /run/media/nazif/2F946E411BA61D49/data/pyensembl/GRCh37/ensembl75/Homo_sapiens.GRCh37.75.ncrna.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /run/media/nazif/2F946E411BA61D49/data/pyensembl/GRCh37/ensembl75/Homo_sapiens.GRCh37.75.pep.all.fa.gz.pickle


In [20]:
def get_transcripts(assembly):
    """Tabulate one transcript per gene of a genome annotation.

    For every gene id returned by ``assembly.gene_ids()``, the first entry of
    that gene's ``transcripts`` list is taken and its attributes are copied
    into one row of the result.

    Parameters
    ----------
    assembly
        Object exposing ``gene_ids()`` and ``gene_by_id(gene_id)``; in this
        notebook an ``EnsemblRelease`` instance is passed.

    Returns
    -------
    pandas.DataFrame
        Columns ``transcript_id``, ``is_protein_coding``, ``biotype``,
        ``contig``, ``start``, ``end``, ``length``, ``strand`` and
        ``sequence``. Rows are sorted by ``contig`` then ``start`` and the
        index is renumbered from 0.
    """
    # (output column, transcript attribute) pairs, listed in output column order.
    column_sources = (
        ("transcript_id", "id"),
        ("is_protein_coding", "is_protein_coding"),
        ("biotype", "biotype"),
        ("contig", "contig"),
        ("start", "start"),
        ("end", "end"),
        ("length", "length"),
        ("strand", "strand"),
        ("sequence", "sequence"),
    )

    # Only the first transcript of each gene is kept; "first" is simply the
    # order in which the gene object lists its transcripts.
    picked = [
        assembly.gene_by_id(gene_id).transcripts[0]
        for gene_id in assembly.gene_ids()
    ]

    table = pd.DataFrame({
        column: [getattr(transcript, attribute) for transcript in picked]
        for column, attribute in column_sources
    })

    ordered = table.sort_values(by=["contig", "start"])
    return ordered.reset_index(drop=True)



In [None]:
# Create the handle for Ensembl release 75 (the recorded cache paths place it under GRCh37).
grch37 = EnsemblRelease(75)

# Fetch the release's data files and build the local index before any queries.
grch37.download()
grch37.index()

In [18]:
# Create the handle for Ensembl release 109 (the recorded cache paths place it under GRCh38).
grch38 = EnsemblRelease(109)

# Fetch the release's data files and build the local index before any queries.
grch38.download()
grch38.index()

INFO:pyensembl.sequence_data:Loaded sequence dictionary from /run/media/nazif/2F946E411BA61D49/data/pyensembl/GRCh38/ensembl109/Homo_sapiens.GRCh38.cdna.all.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /run/media/nazif/2F946E411BA61D49/data/pyensembl/GRCh38/ensembl109/Homo_sapiens.GRCh38.ncrna.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /run/media/nazif/2F946E411BA61D49/data/pyensembl/GRCh38/ensembl109/Homo_sapiens.GRCh38.pep.all.fa.gz.pickle


In [7]:
# Build the one-transcript-per-gene table for the GRCh37 release and preview it.
df = get_transcripts(grch37)
df.head()

Unnamed: 0,transcript_id,is_protein_coding,biotype,contig,start,end,length,strand,sequence
0,ENST00000456328,False,processed_transcript,1,11869,14409,2541,+,GTTAACTTGCCGTCAGCCTTTTCTTTGACCTCTTCTTTCTGTTCAT...
1,ENST00000438504,False,unprocessed_pseudogene,1,14363,29370,15008,-,TCCGGCAGAGCGGAAGCGGCGGCGGGAGCTTCCGGGAGGGCGGCTC...
2,ENST00000473358,False,lincRNA,1,29554,31097,1544,+,GTGCACACGGCTCCCATGCGTTGTCTTCCGAGCGTCAGGCCGCCCC...
3,ENST00000417324,False,lincRNA,1,34554,36081,1528,-,CACACAACGGGGTTTCGGGGCTGTGGACCCTGTGCCAGGAAAGGAA...
4,ENST00000606857,False,unprocessed_pseudogene,1,52473,53312,840,+,GCGGTATCTAAATTTGTATTGATTGGACTTTCAAGCTCTTGGGAGA...


In [22]:
# Build the one-transcript-per-gene table for the GRCh38 release and preview it.
df2 = get_transcripts(grch38)
df2.head()

Unnamed: 0,transcript_id,is_protein_coding,biotype,contig,start,end,length,strand,sequence
0,ENST00000456328,False,lncRNA,1,11869,14409,2541,+,GTTAACTTGCCGTCAGCCTTTTCTTTGACCTCTTCTTTCTGTTCAT...
1,ENST00000450305,False,transcribed_unprocessed_pseudogene,1,12010,13670,1661,+,GTGTCTGACTTCCAGCAACTGCTGGCCTGTGCCAGGGTGCAAGCTG...
2,ENST00000488147,False,unprocessed_pseudogene,1,14404,29570,15167,-,ATGGGAGCCGTGTGCACGTCGGGAGCTCGGAGTGAGCGCACCATGA...
3,ENST00000619216,False,miRNA,1,17369,17436,68,-,TGTGGGAGAGGAACATGGGCTCAGGACAGCGGGTGTCAGCTTGCCT...
4,ENST00000473358,False,lncRNA,1,29554,31097,1544,+,GTGCACACGGCTCCCATGCGTTGTCTTCCGAGCGTCAGGCCGCCCC...


In [29]:
# Count missing values per column in the GRCh37 table.
df.isna().sum()

transcript_id        0
is_protein_coding    0
biotype              0
contig               0
start                0
end                  0
length               0
strand               0
sequence             0
dtype: int64

In [30]:
# Count missing values per column in the GRCh38 table.
df2.isna().sum()


transcript_id           0
is_protein_coding       0
biotype                 0
contig                  0
start                   0
end                     0
length                  0
strand                  0
sequence             1073
dtype: int64

In [31]:
# Preview the GRCh38 rows whose sequence is missing.
df2[df2.sequence.isna()].head()

Unnamed: 0,transcript_id,is_protein_coding,biotype,contig,start,end,length,strand,sequence
177,ENST00000456687,False,TEC,1,2566410,2569888,3479,+,
195,ENST00000624175,False,TEC,1,3205988,3208664,2677,+,
349,ENST00000639753,False,TEC,1,9826289,9828271,1983,-,
637,ENST00000624418,False,TEC,1,18109389,18115861,6473,+,
753,ENST00000624125,False,TEC,1,22364630,22366482,1853,-,


In [33]:
# Break down the GRCh38 rows with a missing sequence by biotype.
df2[df2.sequence.isna()].biotype.value_counts()

biotype
TEC         1054
artifact      19
Name: count, dtype: int64

In [34]:
# Break down the GRCh38 rows with a missing sequence by contig.
df2[df2.sequence.isna()].contig.value_counts()


contig
16    132
12     99
17     99
11     76
19     72
5      71
2      50
18     50
15     47
21     45
7      39
4      35
1      35
6      33
10     30
13     29
8      29
3      26
14     21
9      20
X      15
20     10
22     10
Name: count, dtype: int64

In [None]:
# Drop GRCh38 transcripts that have no sequence. In the counts shown earlier these
# are mostly biotype "TEC" (to be experimentally confirmed) plus a few "artifact" rows.
# Reassign rather than mutate in place, so the frame displayed by earlier cells is
# not changed behind the reader's back.
df2 = df2.dropna(subset=["sequence"])

# Export both tables. Create the target directory first so to_csv does not fail
# when it does not exist yet.
os.makedirs("data/pyensembl", exist_ok=True)
df.to_csv("data/pyensembl/grch37_transcripts.csv", index=False)
df2.to_csv("data/pyensembl/grch38_transcripts.csv", index=False)


In [39]:
# Write both transcript tables to CSV without the index column.
# NOTE(review): these are the same two files the preceding cell exports; df2 only
# excludes rows with a missing sequence if that cell's dropna has been run first.
df.to_csv("data/pyensembl/grch37_transcripts.csv", index=False)
df2.to_csv("data/pyensembl/grch38_transcripts.csv", index=False)