# Build Transcriptome Object and Prepare BLAST Databases

Run this notebook with the `antman` environment activated.

In [2]:
import os
import shutil
import subprocess
import sys
import tempfile

sys.path.append('../')
from transcriptomics import *

Initializing transcriptomics package


# O. biroi

Build transcriptome object 

In [5]:
# List the genome directory to confirm the assembly FASTA and its .fai index are present.
!ls ../raw-data/OBir/genome

Obir.assembly.v5.4.fasta     Obir.assembly.v5.4.fasta.fai


In [6]:
# Build the transcriptome object from the genome assembly (FASTA) and the
# annotation (GTF), passing the two biotype attributes as extra fields. The
# helper reports the location of the saved object when it finishes.
obir_dir = "../raw-data/OBir"
genome_path = f"{obir_dir}/genome/Obir.assembly.v5.4.fasta"
transcriptome_path = f"{obir_dir}/transcriptome/RefSeq_With_GenBank_MT_And_ORs.gtf"
object_name = f"{obir_dir}/transcriptome/biroi_transcriptome"
biotype_fields = ['gene_biotype', 'transcript_biotype']
update_transcriptome_object(
    genome_path,
    transcriptome_path,
    object_name,
    other_fields=biotype_fields,
)


Found 14255 unique genes.


100%|██████████| 14255/14255 [00:22<00:00, 625.96it/s] 


Transcriptome(genes=14153)
Transcriptome object has been updated and saved to ../raw-data/OBir/transcriptome/biroi_transcriptome.pkl


In [7]:
# Load the saved transcriptome object. object_name is repeated here so this cell
# can be run without re-running the (slower) build cell.
# NOTE(review): the loader presumably appends the ".pkl" suffix itself -- confirm.
object_name = "../raw-data/OBir/transcriptome/biroi_transcriptome"
tr = load_transcriptome_object(object_name)

In [8]:
# Sanity check from the transcriptomics package; it produced no output in the recorded run.
# NOTE(review): presumably verifies that every annotated feature of a transcript
# lies within its exons -- confirm against the helper's implementation.
check_exons_contain_all_features(tr)

In [9]:
# Collect the names of genes that carry no transcripts at all
# (author's observation: these are all pseudogenes), then show how many there are.
genes_no_transcripts = [
    name
    for name in tr.genes.keys()
    if len(tr.get_gene(name).transcripts) == 0
]
len(genes_no_transcripts)

343

In [10]:
# Names of all genes whose transcript_biotype is 'lnc_RNA', followed by their count.
lnc_biotype = 'lnc_RNA'
lncRNAs = [
    name
    for name in list(tr.genes.keys())
    if tr.get_gene(name).transcript_biotype == lnc_biotype
]
len(lncRNAs)

1424

# BLAST DATABASES

Export spliced mRNA sequences (introns removed) to a FASTA file

In [11]:
# Write one FASTA record per transcript using its spliced sequence
# (transcript.mrna_sequence). The header carries the transcript name first,
# followed by gene, genomic location and strand.
output_dir = "../raw-data/OBir/mRNA_no_introns"
# exist_ok=True replaces the check-then-create pair: it is a no-op when the
# directory already exists and cannot race with another process creating it.
os.makedirs(output_dir, exist_ok=True)

output_path = os.path.join(output_dir, "mRNA_no_introns.fasta")

with open(output_path, "w") as output_file:
    for gene_name in tr.genes.keys():
        gene = tr.genes[gene_name]
        for transcript in gene.transcripts:
            header = (
                f">{transcript.name} gene={gene.name} "
                f"location={transcript.chromosome}:{transcript.position[0]}-{transcript.position[1]} "
                f"strand={transcript.strand} "
            )
            # The bytes written are unchanged, including the trailing space
            # before each newline.
            output_file.write(f"{header}\n{transcript.mrna_sequence} \n")


Export transcript sequences with introns included to a FASTA file

In [12]:
# Write one FASTA record per transcript using its intron-containing sequence
# (transcript.dna_sequence). The header carries the transcript name first,
# followed by gene, genomic location and strand.
output_dir = "../raw-data/OBir/mRNA_yes_introns"
# exist_ok=True replaces the check-then-create pair: it is a no-op when the
# directory already exists and cannot race with another process creating it.
os.makedirs(output_dir, exist_ok=True)

output_path = os.path.join(output_dir, "mRNA_yes_introns.fasta")

with open(output_path, "w") as output_file:
    for gene_name in tr.genes.keys():
        gene = tr.genes[gene_name]
        for transcript in gene.transcripts:
            header = (
                f">{transcript.name} gene={gene.name} "
                f"location={transcript.chromosome}:{transcript.position[0]}-{transcript.position[1]} "
                f"strand={transcript.strand} "
            )
            # The bytes written are unchanged, including the trailing space
            # before each newline.
            output_file.write(f"{header}\n{transcript.dna_sequence} \n")


Create BLAST databases

In [13]:
# Check that makeblastdb (part of the BLAST+ package) is installed and on PATH,
# and record which version built the databases below.
!makeblastdb -version

makeblastdb: 2.15.0+
 Package: blast 2.15.0, build Oct 19 2023 15:16:13


In [15]:
# Create a BLAST database from the mRNA sequences without introns
input_path = "../raw-data/OBir/mRNA_no_introns/mRNA_no_introns.fasta"
output_path = "../raw-data/OBir/mRNA_no_introns/mRNA_no_introns"
command = f"makeblastdb -in {input_path} -dbtype nucl -parse_seqids -out {output_path}"
subprocess.run(command, shell=True)



Building a new DB, current time: 04/22/2025 18:56:44
New DB name:   /Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/lncRNA Paper/code/raw-data/OBir/mRNA_no_introns/mRNA_no_introns
New DB title:  ../raw-data/OBir/mRNA_no_introns/mRNA_no_introns.fasta
Sequence type: Nucleotide
Deleted existing Nucleotide BLAST database named /Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/lncRNA Paper/code/raw-data/OBir/mRNA_no_introns/mRNA_no_introns
Keep MBits: T
Maximum file size: 3000000000B
Adding sequences from FASTA; added 26777 sequences in 0.847079 seconds.




BLAST Database error: No alias or index file found for nucleotide database [/Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/lncRNA] in search path [/Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/lncRNA Paper/code/transcriptomics::]


CompletedProcess(args='makeblastdb -in ../raw-data/OBir/mRNA_no_introns/mRNA_no_introns.fasta -dbtype nucl -parse_seqids -out ../raw-data/OBir/mRNA_no_introns/mRNA_no_introns', returncode=2)

In [17]:
# Create a BLAST database from the mRNA sequences with introns 
input_path = "../raw-data/OBir/mRNA_yes_introns/mRNA_yes_introns.fasta"
output_path = "../raw-data/OBir/mRNA_yes_introns/mRNA_yes_introns"
command = f"makeblastdb -in {input_path} -dbtype nucl -parse_seqids -out {output_path}"
subprocess.run(command, shell=True)



Building a new DB, current time: 04/22/2025 18:57:10
New DB name:   /Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/lncRNA Paper/code/raw-data/OBir/mRNA_yes_introns/mRNA_yes_introns
New DB title:  ../raw-data/OBir/mRNA_yes_introns/mRNA_yes_introns.fasta
Sequence type: Nucleotide
Keep MBits: T
Maximum file size: 3000000000B
Adding sequences from FASTA; added 26777 sequences in 2.3002 seconds.




BLAST Database error: No alias or index file found for nucleotide database [/Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/lncRNA] in search path [/Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/lncRNA Paper/code/transcriptomics::]


CompletedProcess(args='makeblastdb -in ../raw-data/OBir/mRNA_yes_introns/mRNA_yes_introns.fasta -dbtype nucl -parse_seqids -out ../raw-data/OBir/mRNA_yes_introns/mRNA_yes_introns', returncode=2)