This script creates a file containing all the intergenic regions between ORs and then creates a blast database using those intergenic regions. Do this prior to using the intergenic blast database in probe design. 

In [1]:
import Bio.SeqIO as SeqIO
import pandas as pd
import numpy as np
import os
import subprocess
import sys
sys.path.append('../')
from transcriptomics import * 
from hcr import * 

Initializing transcriptomics package
Initializing HCR package


In [2]:
# Load genome
genome_path = "../raw-data/OBir/genome/Obir.assembly.v5.4.fasta"
genome_seq = SeqIO.to_dict(SeqIO.parse(genome_path, "fasta"))

In [3]:
# Load transcriptome 
tr = load_transcriptome_object("../raw-data/OBir/transcriptome/biroi_transcriptome") 

In [4]:
# Load or_db
or_db = pd.read_excel('../raw-data/OBir/ORs/OR List.xlsx') 

In [6]:
# Make a blast database of all intergenic regions in the transcriptome 

# Export gene sequence for each intergenic region for blasting against transcriptome 
output_dir = '../raw-data/OBir/intergenics/'

# Create output directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Clear contents of the output directory
for file in os.listdir(output_dir):
    os.remove(f"{output_dir}/{file}")

with open(f"{output_dir}/all_intergenic_regions.fa", 'w') as f:
    TAs = np.unique(or_db.tandem_array.values)
    for TA in TAs: 
        # Get genes 
        genes = or_db[or_db['tandem_array'] == TA]['gene_name'].values

        # Get gene objects from tr 
        gene_objects = [tr.get_gene(gene) for gene in genes]

        # Sort by start position of each transcript 
        gene_objects = sorted(gene_objects, key=lambda x: x.get_transcript_longest_bounds().get_bounds()[0])

        # Get chromosome 
        chromosome = gene_objects[0].chromosome

        # Make a list of intergenic regions and sequences, saving them to files for blasting 
        for i in range(len(gene_objects)-1): 
            first_gene = gene_objects[i]
            second_gene = gene_objects[i+1]
            # Get bounds 
            first_gene_bounds = first_gene.get_transcript_longest_bounds().get_bounds()
            first_gene_end = first_gene_bounds[1] + 1 
            second_gene_bounds = second_gene.get_transcript_longest_bounds().get_bounds()
            second_gene_start = second_gene_bounds[0] - 1 
            # Get intergenic distance
            intergenic_distance = second_gene_bounds[0] - first_gene_bounds[1]
            # Verify the intergenic_distance > 10bp 
            if intergenic_distance > 10: 
                # Get strand 
                strand = first_gene.strand
                # Get intergenic sequence
                intergenic_sequence = get_sequence(genome_seq, chromosome, first_gene_end, second_gene_start, strand)

                # Write to file 
                f.write(f">T{TA}-{first_gene.name}-{second_gene.name}-{chromosome}:{first_gene_end}-{second_gene_start}\n{intergenic_sequence}\n")

print(f"Exported intergenic regions to {output_dir}")
    

Exported intergenic regions to ../raw-data/OBir/intergenics/


In [8]:
# Blast probe regions against these intergenic regions
input_path = '../raw-data/OBir/intergenics/all_intergenic_regions.fa'
output_path = '../raw-data/OBir/intergenics/all_intergenic_regions'
command = f"makeblastdb -in {input_path} -dbtype nucl -parse_seqids -out {output_path}"
subprocess.run(command, shell=True)



Building a new DB, current time: 05/24/2025 12:05:11
New DB name:   /Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/lncRNA Paper/code/raw-data/OBir/intergenics/all_intergenic_regions
New DB title:  ../raw-data/OBir/intergenics/all_intergenic_regions.fa
Sequence type: Nucleotide
Deleted existing Nucleotide BLAST database named /Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/lncRNA Paper/code/raw-data/OBir/intergenics/all_intergenic_regions
Keep MBits: T
Maximum file size: 3000000000B
Adding sequences from FASTA; added 419 sequences in 0.0135281 seconds.




BLAST Database error: No alias or index file found for nucleotide database [/Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/lncRNA] in search path [/Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/lncRNA Paper/code/hcr-probe-design::]


CompletedProcess(args='makeblastdb -in ../raw-data/OBir/intergenics/all_intergenic_regions.fa -dbtype nucl -parse_seqids -out ../raw-data/OBir/intergenics/all_intergenic_regions', returncode=2)