`antman` kernel

This script creates a file containing all the intergenic regions between ORs and then creates a blast database using those intergenic regions. Do this prior to using the lncRNA blast database in probe design. 

In [1]:
import Bio.SeqIO as SeqIO
import pandas as pd
import numpy as np
import os
import subprocess
import sys
sys.path.append('../')
from transcriptomics import * 
from hcr import * 

Initializing transcriptomics package
Initializing HCR package


In [2]:
# Load genome
genome_path = "../raw-data/OBir/genome/Obir.assembly.v5.4.fasta"
genome_seq = SeqIO.to_dict(SeqIO.parse(genome_path, "fasta"))

In [3]:
# Load transcriptome 
tr = load_transcriptome_object("../raw-data/OBir/transcriptome/biroi_transcriptome") 

In [4]:
# Load or_db
or_db = pd.read_excel('../raw-data/OBir/ORs/OR List.xlsx') 

In [5]:
# Make a blast database of all intergenic regions in the transcriptome 

# Export gene sequence for each intergenic region for blasting against transcriptome 
output_dir = '../raw-data/OBir/lncRNAs/'

# Create output directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Make directory if it doesn't exist 
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Clear contents of the output directory
for file in os.listdir(output_dir):
    os.remove(f"{output_dir}/{file}")

gene_names = np.unique(or_db.gene_name.values)
with open(f"{output_dir}/all_ORs_antisense.fa", 'w') as f:
    for gene_name in gene_names: 
        # Get object 
        gene_obj = tr.get_gene(gene_name)
        # Get transcript with the longest bounds 
        transcript = gene_obj.get_transcript_longest_bounds()
        chromosome = transcript.chromosome 
        strand = transcript.strand 
        antisense_strand = '+' if strand == '-' else '-'
        start, end = transcript.get_bounds()
        # Get sequence
        sequence = get_sequence(genome_seq, chromosome, start, end, antisense_strand)
        # Make a name 
        name = f"{gene_name}"
        f.write(f">{name}-lnc\n{sequence}\n")

print(f"Exported antisense OR sequences to {output_dir}")
    

Exported antisense OR sequences to ../raw-data/OBir/lncRNAs/


In [6]:
# Blast probe regions against these intergenic regions
input_path = '../raw-data/OBir/lncRNAs/all_ORs_antisense.fa'
output_path = '../raw-data/OBir/lncRNAs/all_ORs_antisense'
command = f"makeblastdb -in {input_path} -dbtype nucl -parse_seqids -out {output_path}"
subprocess.run(command, shell=True)



Building a new DB, current time: 05/24/2025 12:07:58
New DB name:   /Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/lncRNA Paper/code/raw-data/OBir/lncRNAs/all_ORs_antisense
New DB title:  ../raw-data/OBir/lncRNAs/all_ORs_antisense.fa
Sequence type: Nucleotide
Keep MBits: T
Maximum file size: 3000000000B
Adding sequences from FASTA; added 502 sequences in 0.0183821 seconds.




BLAST Database error: No alias or index file found for nucleotide database [/Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/lncRNA] in search path [/Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/lncRNA Paper/code/hcr-probe-design::]


CompletedProcess(args='makeblastdb -in ../raw-data/OBir/lncRNAs/all_ORs_antisense.fa -dbtype nucl -parse_seqids -out ../raw-data/OBir/lncRNAs/all_ORs_antisense', returncode=2)