This notebook writes the antisense-strand sequence of every OR gene to a single FASTA file and then builds a BLAST nucleotide database from that file.

In [1]:
import Bio.SeqIO as SeqIO
import pandas as pd
import numpy as np
import os
import subprocess
import sys
sys.path.append('/Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/analysis')
from transcriptomics import * 
from hcr import * 

# Genome assembly: parse the FASTA and collect the records into a dict
genome_fasta_path = '/Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/genome/Obir.assembly.v5.4.fasta'
genome_records = SeqIO.parse(genome_fasta_path, "fasta")
genome_seq = SeqIO.to_dict(genome_records)

# Transcriptome object (loader presumably comes from the `transcriptomics` star import)
tr = load_transcriptome_object('biroi_transcriptome')

# OR spreadsheet, read into a DataFrame
or_db_path = '/Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/OR List.xlsx'
or_db = pd.read_excel(or_db_path)

Initializing transcriptomics package
Initializing HCR package


In [2]:
# Export the antisense-strand sequence of every OR gene to one FASTA file.
# A later cell builds a BLAST nucleotide database from this file.
output_dir = '/Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/transcriptome/lncRNAs/'

# Create the output directory if needed (no error when it already exists)
os.makedirs(output_dir, exist_ok=True)

# Clear the previous contents of the output directory.
# os.remove() raises on a real directory, so sub-directories are skipped
# instead of aborting the whole cell.
for file in os.listdir(output_dir):
    file_path = os.path.join(output_dir, file)
    if os.path.isdir(file_path) and not os.path.islink(file_path):
        continue
    os.remove(file_path)

# Unique, sorted gene names. Blank spreadsheet cells are read as NaN, and
# np.unique cannot sort a mix of str and float, so drop them first.
gene_names = np.unique(or_db['gene_name'].dropna().values)

fasta_path = os.path.join(output_dir, "all_ORs_antisense.fa")
with open(fasta_path, 'w') as f:
    for gene_name in gene_names:
        # Look up the gene and pick the transcript used to define its bounds
        gene_obj = tr.get_gene(gene_name)
        transcript = gene_obj.get_transcript_longest_bounds()
        chromosome = transcript.chromosome
        strand = transcript.strand
        # Flip the strand: '-' becomes '+', anything else becomes '-'
        antisense_strand = '+' if strand == '-' else '-'
        start, end = transcript.get_bounds()
        # Fetch the sequence over the transcript bounds on the opposite strand
        sequence = get_sequence(genome_seq, chromosome, start, end, antisense_strand)
        # One FASTA record per gene, identified as "<gene_name>-lnc"
        f.write(f">{gene_name}-lnc\n{sequence}\n")

print(f"Exported antisense OR sequences to {output_dir}")
    

Exported antisense OR sequences to /Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/transcriptome/lncRNAs/


In [3]:
# Build a BLAST nucleotide database from the antisense OR FASTA file
input_path = '/Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/transcriptome/lncRNAs/all_ORs_antisense.fa'
output_path = '/Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/transcriptome/lncRNAs/all_ORs_antisense'
# Pass the arguments as a list and skip the shell, so paths containing spaces
# or shell metacharacters reach makeblastdb unchanged.
command = [
    "makeblastdb",
    "-in", input_path,
    "-dbtype", "nucl",
    "-parse_seqids",
    "-out", output_path,
]
# check=True raises CalledProcessError if makeblastdb exits with a non-zero status
subprocess.run(command, check=True)



Building a new DB, current time: 11/25/2024 09:46:02
New DB name:   /Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/transcriptome/lncRNAs/all_ORs_antisense
New DB title:  /Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/transcriptome/lncRNAs/all_ORs_antisense.fa
Sequence type: Nucleotide
Keep MBits: T
Maximum file size: 3000000000B
Adding sequences from FASTA; added 502 sequences in 0.0212119 seconds.




CompletedProcess(args='makeblastdb -in /Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/transcriptome/lncRNAs/all_ORs_antisense.fa -dbtype nucl -parse_seqids -out /Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/transcriptome/lncRNAs/all_ORs_antisense', returncode=0)