This notebook writes a FASTA file containing the intergenic regions between adjacent ORs within each tandem array, and then builds a BLAST nucleotide database from those intergenic regions.

In [2]:
import Bio.SeqIO as SeqIO
import pandas as pd
import numpy as np
import os
import subprocess
import sys
sys.path.append('/Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/analysis')
from transcriptomics import * 
from hcr import * 

Initializing transcriptomics package
Initializing HCR package


In [3]:
# Load the genome assembly: parse the FASTA file and hold all of its records
# in memory as a dict (SeqIO.to_dict keys each record by its record id).
# genome_seq is later passed to get_sequence() together with a chromosome name,
# so the FASTA record ids are presumably chromosome/scaffold names -- TODO confirm.
genome_fasta_path = '/Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/genome/Obir.assembly.v5.4.fasta'
genome_seq = SeqIO.to_dict(SeqIO.parse(genome_fasta_path, "fasta"))

In [4]:
# Load the transcriptome object named 'biroi_transcriptome'.
# load_transcriptome_object is not defined in this notebook; presumably it is
# provided by one of the star imports (transcriptomics / hcr) -- TODO confirm.
# `tr` is queried further down with tr.get_gene(<gene name>).
tr = load_transcriptome_object('biroi_transcriptome') 

In [5]:
# Load the OR table from the Excel workbook into a DataFrame.
# The export cell below reads two of its columns: `tandem_array` (used to group
# genes) and `gene_name` (used to look each gene up in the transcriptome).
or_db = pd.read_excel('/Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/OR List.xlsx') 

In [None]:
# Export the intergenic region between every pair of neighbouring ORs in each
# tandem array to a single FASTA file. A later cell turns that FASTA file into
# a BLAST database.

# Destination directory for the FASTA file (and, later, the BLAST database files)
output_dir = '/Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/transcriptome/intergenics/'

# Start from a clean directory so files from a previous run never survive a re-run.
# Create the directory on first use, and only delete files/symlinks:
# os.remove() raises if it is handed a sub-directory.
os.makedirs(output_dir, exist_ok=True)
for file in os.listdir(output_dir):
    file_path = os.path.join(output_dir, file)
    if os.path.isfile(file_path) or os.path.islink(file_path):
        os.remove(file_path)

with open(os.path.join(output_dir, "all_intergenic_regions.fa"), 'w') as f:
    # Unique tandem-array labels, sorted. Rows without a tandem array (NaN) are
    # dropped first: a NaN label never compares equal to itself, so it would
    # select zero genes below and crash on gene_objects[0].
    TAs = np.unique(or_db['tandem_array'].dropna().values)
    for TA in TAs:
        # Names of the genes that belong to this tandem array
        genes = or_db[or_db['tandem_array'] == TA]['gene_name'].values

        # Look each gene up in the transcriptome object
        gene_objects = [tr.get_gene(gene) for gene in genes]

        # A tandem array needs at least two genes to have an intergenic region
        if len(gene_objects) < 2:
            continue

        # Order the genes by the start coordinate of their longest-transcript bounds
        gene_objects = sorted(gene_objects, key=lambda x: x.get_transcript_longest_bounds().get_bounds()[0])

        # The first gene's chromosome is used for the whole array
        # (assumes every gene of a tandem array is on the same chromosome -- TODO confirm)
        chromosome = gene_objects[0].chromosome

        # Walk over neighbouring gene pairs and write one FASTA record per gap
        for i in range(len(gene_objects)-1):
            first_gene = gene_objects[i]
            second_gene = gene_objects[i+1]

            # Bounds are indexed as [0] = start, [1] = end
            first_gene_bounds = first_gene.get_transcript_longest_bounds().get_bounds()
            second_gene_bounds = second_gene.get_transcript_longest_bounds().get_bounds()

            # The intergenic region starts one base after the first gene ends
            # and stops one base before the second gene starts
            first_gene_end = first_gene_bounds[1] + 1
            second_gene_start = second_gene_bounds[0] - 1

            # Distance between the end of the first gene and the start of the second.
            # Overlapping genes give a value <= 0 and are skipped by the check below.
            # NOTE(review): if get_sequence treats both coordinates as inclusive, the
            # exported sequence is intergenic_distance - 1 bases long -- confirm that
            # "> 10" is the intended cut-off.
            intergenic_distance = second_gene_bounds[0] - first_gene_bounds[1]
            if intergenic_distance > 10:
                # The sequence is extracted on the strand of the first gene of the pair
                strand = first_gene.strand
                intergenic_sequence = get_sequence(genome_seq, chromosome, first_gene_end, second_gene_start, strand)

                # Header: T<array>-<first gene>-<second gene>-<chromosome>:<start>-<end>
                f.write(f">T{TA}-{first_gene.name}-{second_gene.name}-{chromosome}:{first_gene_end}-{second_gene_start}\n{intergenic_sequence}\n")

print(f"Exported intergenic regions to {output_dir}")
    

Exported intergenic regions to /Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/transcriptome/intergenics/


In [7]:
# Build a BLAST nucleotide database from the exported intergenic regions.
# (This cell only creates the database; it does not run a BLAST search.)
input_path = '/Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/transcriptome/intergenics/all_intergenic_regions.fa'
output_path = '/Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/transcriptome/intergenics/all_intergenic_regions'

# Pass the arguments as a list rather than a shell string, so the paths reach
# makeblastdb verbatim with no shell word-splitting or interpretation.
command = [
    "makeblastdb",
    "-in", input_path,
    "-dbtype", "nucl",
    "-parse_seqids",
    "-out", output_path,
]

# check=True raises subprocess.CalledProcessError if makeblastdb exits non-zero
subprocess.run(command, check=True)



Building a new DB, current time: 11/09/2024 14:09:15
New DB name:   /Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/transcriptome/intergenics/all_intergenic_regions
New DB title:  /Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/transcriptome/intergenics/all_intergenic_regions.fa
Sequence type: Nucleotide
Keep MBits: T
Maximum file size: 3000000000B
Adding sequences from FASTA; added 419 sequences in 0.0150938 seconds.




CompletedProcess(args='makeblastdb -in /Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/transcriptome/intergenics/all_intergenic_regions.fa -dbtype nucl -parse_seqids -out /Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/transcriptome/intergenics/all_intergenic_regions', returncode=0)