In [1]:
import os 
import subprocess
from transcriptomics import * 

Initializing transcriptomics package


# O. biroi

Build transcriptome object 

In [2]:
# Name under which the built object is stored; the cell output reports it
# being saved as "<object_name>.pkl".
object_name = "biroi_transcriptome"

# Inputs for the build: the genome assembly (FASTA) and the annotation (GTF).
genome_path = "/Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/genome/Obir.assembly.v5.4.fasta"
transcriptome_path = "/Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/transcriptome/GTF/RefSeq_With_GenBank_MT_And_ORs.gtf"

# NOTE(review): `other_fields` presumably lists extra GTF attributes to keep
# on the parsed objects -- confirm against the transcriptomics package.
update_transcriptome_object(
    genome_path,
    transcriptome_path,
    object_name,
    other_fields=['gene_biotype', 'transcript_biotype'],
)


Found 14255 unique genes.


100%|██████████| 14255/14255 [00:22<00:00, 644.41it/s] 


Transcriptome(genes=14153)
Transcriptome object has been updated and saved to biroi_transcriptome.pkl


In [11]:
# Load the transcriptome object by name and bind it to `tr`, the handle that
# every later cell in this notebook reads from.
# NOTE(review): presumably this reads the "<object_name>.pkl" file written by
# update_transcriptome_object -- confirm against the transcriptomics package.
object_name = "biroi_transcriptome"
tr = load_transcriptome_object(object_name)

In [14]:
# Spot-check one gene: fetch it by name, take the transcript returned by
# get_transcript_longest_cds(), and display that transcript's CDS segments.
or5_l16_gene = tr.get_gene("Or5-L16")
or5_l16_transcript = or5_l16_gene.get_transcript_longest_cds()
or5_l16_transcript.cds

[CDS(transcript=gnl|WGS:QOIP|geneOr5-L16-RA, Chr12:8748132-8748919, strand=+),
 CDS(transcript=gnl|WGS:QOIP|geneOr5-L16-RA, Chr12:8749267-8749366, strand=+),
 CDS(transcript=gnl|WGS:QOIP|geneOr5-L16-RA, Chr12:8749626-8749736, strand=+),
 CDS(transcript=gnl|WGS:QOIP|geneOr5-L16-RA, Chr12:8749882-8750037, strand=+),
 CDS(transcript=gnl|WGS:QOIP|geneOr5-L16-RA, Chr12:8750266-8750319, strand=+)]

In [4]:
# Run the package's exon/feature consistency check on the loaded transcriptome.
# NOTE(review): the cell produced no output; presumably the check only reports
# when something is wrong -- confirm against the transcriptomics package.
check_exons_contain_all_features(tr)

In [5]:
# Collect the names of genes whose transcript list is empty.
# (Author's note carried over: these are all pseudogenes.)
genes_no_transcripts = [
    name
    for name in tr.genes.keys()
    if not len(tr.get_gene(name).transcripts)
]
len(genes_no_transcripts)

343

In [10]:
# Gather the names of genes whose `transcript_biotype` is 'lnc_RNA',
# then display how many were found.
lncRNAs = []
for candidate in tr.genes.keys():
    if tr.get_gene(candidate).transcript_biotype == 'lnc_RNA':
        lncRNAs.append(candidate)
len(lncRNAs)

1424

Export a CSV of all genes that have at least one transcript, with chromosome, strand, number of transcripts, start and end

In [8]:
import pandas as pd

# Build a per-gene table: one row per gene name in the transcriptome, with
# its chromosome, strand and number of transcripts.
all_genes = list(tr.genes.keys())
df = pd.DataFrame(all_genes, columns=['gene_name'])
df['chromosome'] = [tr.get_gene(gene).chromosome for gene in all_genes]
df['strand'] = [tr.get_gene(gene).strand for gene in all_genes]
df['num_transcripts'] = [len(tr.get_gene(gene).transcripts) for gene in all_genes]

# Keep only genes that have at least one transcript, since the bounds below
# are read from a transcript. `.copy()` makes the filtered frame independent
# of the original, so the column assignments that follow do not write to a
# slice (which triggers pandas' SettingWithCopyWarning).
df = df[df['num_transcripts'] > 0].copy()

# Look up the bounds once per gene and reuse them for both columns, instead
# of repeating the lookup for `start` and again for `end`.
# NOTE(review): assumes get_bounds() returns the same (start, end) pair on
# repeated calls -- confirm against the transcriptomics package.
longest_bounds = [
    tr.get_gene(gene).get_transcript_longest_bounds().get_bounds()
    for gene in df['gene_name']
]
df['start'] = [bounds[0] for bounds in longest_bounds]
df['end'] = [bounds[1] for bounds in longest_bounds]
df

Unnamed: 0,gene_name,chromosome,strand,num_transcripts,start,end
0,CSP10.2,Chr13,+,1,1251897,1257450
1,Csp1,Chr10,-,1,2992852,2994140
2,Csp10,Chr13,+,1,1227032,1231037
3,Csp11,Chr10,+,1,8970673,8972214
4,Csp12,Chr5,+,1,13419564,13420947
...,...,...,...,...,...,...
14148,Trnay-gua_3,Chr8,+,1,9477506,9477604
14149,Trnay-gua_4,Chr8,+,1,9477679,9477777
14150,Trnay-gua_5,Chr8,+,1,9477853,9477951
14151,Trnay-gua_6,Chr8,+,1,9478429,9478525


In [9]:
# Write the gene table to disk without the DataFrame index.
csv_path = "/Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/transcriptome/all-genes/OBir_gene_db.csv"
# to_csv does not create missing parent directories, so make sure the target
# folder exists first (the FASTA export cells do the same for their outputs).
os.makedirs(os.path.dirname(csv_path), exist_ok=True)
df.to_csv(csv_path, index=False)

Export mRNA (introns excluded) to FASTA file

In [13]:
# Write one FASTA record per transcript, using each transcript's
# `mrna_sequence`, to mRNA_no_introns.fasta.
output_dir = "/Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/transcriptome/mRNA_no_introns"
# exist_ok=True replaces the separate exists() check: it is a single call and
# does not fail if the directory appears between the check and the creation.
os.makedirs(output_dir, exist_ok=True)

output_path = os.path.join(output_dir, "mRNA_no_introns.fasta")

with open(output_path, "w") as output_file:
    for gene_name in tr.genes.keys():
        gene = tr.genes[gene_name]
        for transcript in gene.transcripts:
            # Header: transcript name first, then gene/location/strand as
            # space-separated key=value annotations.
            header = (
                f">{transcript.name} gene={gene.name} "
                f"location={transcript.chromosome}:{transcript.position[0]}-{transcript.position[1]} "
                f"strand={transcript.strand}"
            )
            # No trailing space after the header or the sequence: stray
            # whitespace on sequence lines can trip strict FASTA parsers.
            output_file.write(f"{header}\n{transcript.mrna_sequence}\n")


Export mRNA (introns included) to FASTA file

In [14]:
# Write one FASTA record per transcript, using each transcript's
# `dna_sequence`, to mRNA_yes_introns.fasta.
output_dir = "/Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/transcriptome/mRNA_yes_introns"
# exist_ok=True replaces the separate exists() check: it is a single call and
# does not fail if the directory appears between the check and the creation.
os.makedirs(output_dir, exist_ok=True)

output_path = os.path.join(output_dir, "mRNA_yes_introns.fasta")

with open(output_path, "w") as output_file:
    for gene_name in tr.genes.keys():
        gene = tr.genes[gene_name]
        for transcript in gene.transcripts:
            # Header: transcript name first, then gene/location/strand as
            # space-separated key=value annotations.
            header = (
                f">{transcript.name} gene={gene.name} "
                f"location={transcript.chromosome}:{transcript.position[0]}-{transcript.position[1]} "
                f"strand={transcript.strand}"
            )
            # No trailing space after the header or the sequence: stray
            # whitespace on sequence lines can trip strict FASTA parsers.
            output_file.write(f"{header}\n{transcript.dna_sequence}\n")


Create Blast Databases

In [15]:
# Check that makeblastdb is installed
# (prints its version; the cells below invoke this executable to build the
# BLAST databases, so a failure here means those cells will fail too)
!makeblastdb -version

makeblastdb: 2.15.0+
 Package: blast 2.15.0, build Oct 19 2023 15:16:13


In [16]:
# Build a nucleotide BLAST database from the intron-free mRNA FASTA file.
input_path = "/Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/transcriptome/mRNA_no_introns/mRNA_no_introns.fasta"
output_path = "/Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/transcriptome/mRNA_no_introns/mRNA_no_introns"
# Pass the arguments as a list and skip the shell: each path reaches
# makeblastdb as a single argument even if it contains spaces or shell
# metacharacters, which an interpolated command string would mis-split.
command = [
    "makeblastdb",
    "-in", input_path,
    "-dbtype", "nucl",
    "-parse_seqids",
    "-out", output_path,
]
# check=True raises CalledProcessError on a non-zero exit status, so a failed
# build stops the notebook instead of passing silently. On success the
# CompletedProcess is still the cell's displayed value.
subprocess.run(command, check=True)



Building a new DB, current time: 11/09/2024 14:54:15
New DB name:   /Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/transcriptome/mRNA_no_introns/mRNA_no_introns
New DB title:  /Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/transcriptome/mRNA_no_introns/mRNA_no_introns.fasta
Sequence type: Nucleotide
Deleted existing Nucleotide BLAST database named /Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/transcriptome/mRNA_no_introns/mRNA_no_introns
Keep MBits: T
Maximum file size: 3000000000B
Adding sequences from FASTA; added 26777 sequences in 0.814745 seconds.




CompletedProcess(args='makeblastdb -in /Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/transcriptome/mRNA_no_introns/mRNA_no_introns.fasta -dbtype nucl -parse_seqids -out /Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/transcriptome/mRNA_no_introns/mRNA_no_introns', returncode=0)

In [17]:
# Build a nucleotide BLAST database from the intron-containing FASTA file.
input_path = "/Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/transcriptome/mRNA_yes_introns/mRNA_yes_introns.fasta"
output_path = "/Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/transcriptome/mRNA_yes_introns/mRNA_yes_introns"
# Pass the arguments as a list and skip the shell: each path reaches
# makeblastdb as a single argument even if it contains spaces or shell
# metacharacters, which an interpolated command string would mis-split.
command = [
    "makeblastdb",
    "-in", input_path,
    "-dbtype", "nucl",
    "-parse_seqids",
    "-out", output_path,
]
# check=True raises CalledProcessError on a non-zero exit status, so a failed
# build stops the notebook instead of passing silently. On success the
# CompletedProcess is still the cell's displayed value.
subprocess.run(command, check=True)



Building a new DB, current time: 11/09/2024 14:54:16
New DB name:   /Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/transcriptome/mRNA_yes_introns/mRNA_yes_introns
New DB title:  /Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/transcriptome/mRNA_yes_introns/mRNA_yes_introns.fasta
Sequence type: Nucleotide
Deleted existing Nucleotide BLAST database named /Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/transcriptome/mRNA_yes_introns/mRNA_yes_introns
Keep MBits: T
Maximum file size: 3000000000B
Adding sequences from FASTA; added 26777 sequences in 2.2277 seconds.




CompletedProcess(args='makeblastdb -in /Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/transcriptome/mRNA_yes_introns/mRNA_yes_introns.fasta -dbtype nucl -parse_seqids -out /Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/transcriptome/mRNA_yes_introns/mRNA_yes_introns', returncode=0)