Updated November 9, 2024. Designs probes against a single intergenic region between two ORs. Blasts against transcriptome with introns, transcriptome without introns, and intergenic databases to verify specificity. 
Author: Giacomo Glotzer. 

In [1]:
from pygenomeviz import GenomeViz
import Bio.SeqIO as SeqIO
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import os
import subprocess
import sys
sys.path.append('/Users/giacomo.glotzer/Desktop/Rockefeller/Kronauer/analysis')
from transcriptomics import * 
from hcr import *
from plotting import * 

# Specify blast databases
new_transcriptome_db_no_introns = "../raw-data/OBir/mRNA_no_introns/mRNA_no_introns"
new_transcriptome_db_yes_introns = "../raw-data/OBir/mRNA_yes_introns/mRNA_yes_introns"
all_intergenic_regions_db = '../raw-data/OBir/intergenics/all_intergenic_regions'
all_lncRNAs_db = '../raw-data/OBir/lncRNAs/all_ORs_antisense'

# Load genome
genome_path = "../raw-data/OBir/genome/Obir.assembly.v5.4.fasta"
genome_seq = SeqIO.to_dict(SeqIO.parse(genome_path, "fasta"))

# Load transcriptome
tr = load_transcriptome_object("../raw-data/OBir/transcriptome/biroi_transcriptome.pkl")

# Load or_db
or_db = pd.read_excel('../raw-data/OBir/ORs/OR List.xlsx') 

Initializing transcriptomics package
Initializing HCR package
Initializing plotting package


In [2]:
# set main directory for writing files 
main_directory = '../raw-data/single_intergenic_probe_design/'
if not os.path.exists(main_directory):
    os.makedirs(main_directory)

In [3]:
# Specify Gene 
flanking_genes = ["Or5-9E213", "Or5-9E214"] 

In [4]:
# Get gene objects
gene_objects = [tr.get_gene(gene) for gene in flanking_genes]

In [5]:
# Get chromosome 
chromosome = gene_objects[0].chromosome
chromosome

'Chr6'

In [6]:
# Get strand 
strand = [gene.strand for gene in gene_objects]
strand

['+', '+']

In [7]:
# Get gene bounds 
gene_bounds = [gene.get_transcript_longest_bounds().get_bounds() for gene in gene_objects]
gene_bounds

[[10898300, 10901757], [10905031, 10909861]]

In [8]:
# Verify that the bounds are in ascending order 
flattened_bounds = [bound for gene in gene_bounds for bound in gene]
assert flattened_bounds == sorted(flattened_bounds)

In [9]:
# Get intergenic bounds 
intergenic_bounds = [gene_bounds[0][1], gene_bounds[1][0]]

In [10]:
# Permitted off-targets
permitted_off_targets = []

In [11]:
# Here we use only the EXONS and UTRs, NO INTRONS 
sequence = get_sequence(genome_seq, chromosome, intergenic_bounds[0], intergenic_bounds[1], strand[0])
print(f"Sequence length: {len(sequence)}")
sequence

Sequence length: 3274


'TTATGTAACATTTAATGCGTGCTATCTATTCCTATCTGATACATTATGAACATTAATACATCGAAATTTTTTTGTTTGTTTAAAAAACTAACTTTTAATTTTATACTTGGTTTATTCAATCACATTATATGTAAATCAATTTATTATTTTAATAACGTCGCAGAGAGGTACGTCGAGAGTAGGTACTATATGTTGACAAAAGAATTGAACAGAGAAATAAAACTGGTTCTTTTAAATGAACAAACATTACATGAGAAATTGAAGAGCAGGAACACAGCTAAGGAACTAAAAATGAAACAAAAAGCAGAATTAGAGAAACTAATGAAAGAAAATTTAGAACGATATTAAGGGATCGGAATCAAAATGAACGATTGCATTATCTGTTAAGCACAGTAAAAATTTATTCTTAAAAACAGGCGAAAAAAAAACCGACGGAAGATAATGTTGTACCAGAAATTCAGAAGCAAAATATCAAGAAGAAAAAAAAATTGAAGTTTAGAGAGATAATATAGAGAGATGATAATTATATAATTTATATATTAAAATGATTGGATACAATAAATAATCAACTGAAAAGTATGTAAATAAATAACGTTCTTTTATATCTTCTTTGACAAATTTTTATTATAAATAAAAAATAATTGTTGGAGGAAATATATGTATATATAAAACTGATTCGCAGTAGCAAAATTAAAGCAGTTGGTGATTTCTCCTATGTGACTATCTCTATCTCGGAATTACTCTACATTAAAAAGGATGTATGCAAATAGTTTAAAATGTATGAAAATATAAAGCAGAATTATTTTTAGTGATCGTGATATTAGATATTTATGCAGCTATCGTTGTTAGCTGCATGCATATAAAATATAATACTGCTATAAAATTTAATACTGTAAAAAAAGATTCGATGCTTACAGAGAAATCATTCTTTTAAAGCATATCAGCTTCAAAAGAATTCTGTCACTTGCGCTTGGCTTGCAGTCAACATCTAAATTCGAG

In [12]:
# define a object called intergenic 
class Intergenic: 
    def __init__(self, gene_1, gene_2, chromosome, start, end, strand, sequence): 
        self.gene_1 = gene_1
        self.gene_2 = gene_2
        self.chromosome = chromosome
        self.start = start
        self.end = end
        self.strand = strand
        self.sequence = sequence
        self.length = len(sequence)
        self.name = f"Intergenic-{re.sub("Or5-","",gene_1)}-{re.sub("Or5-","",gene_2)}"
        self.short_name = f"{gene_1}-{gene_2}"

In [13]:
intergenic = Intergenic(flanking_genes[0], flanking_genes[1], chromosome, intergenic_bounds[0], intergenic_bounds[1], strand[0], sequence)
intergenic.name

'Intergenic-9E213-9E214'

In [14]:
# Export each gene sequence to a fasta file 
output_dir = os.path.join(main_directory, 'intergenic_seq_blast_input')

# Create output directory if it does not exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Clear contents of the output directory
for file in os.listdir(output_dir):
    os.remove(f"{output_dir}/{file}")

with open(f"{output_dir}/{intergenic.short_name}.fasta", 'w') as f: 
    f.write(f">{intergenic.name}\n{intergenic.sequence}")

In [15]:
## Blast genes against the transcriptome without introns (mature mRNA), transcriptome with introns (pre-mRNA), and intergenic regions 
input_dir = os.path.join(main_directory, 'intergenic_seq_blast_input')
output_dir = os.path.join(main_directory, 'intergenic_seq_blast_output')

# create output directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Clear contents of the output directory
for file in os.listdir(output_dir):
    os.remove(f"{output_dir}/{file}")

# get gene sequence path 
input_path = f"{input_dir}/{intergenic.short_name}.fasta"
# no introns 
output_path = f"{output_dir}/{intergenic.short_name}_blasted_no_introns.csv"
command = f"blastn -task blastn -query {input_path} -db {new_transcriptome_db_no_introns} -ungapped -word_size 15 -reward 1 -penalty -5 -dust no -soft_masking false -max_target_seqs 10000 -out {output_path} -outfmt '10 qseqid sseqid sacc pident length mismatch gapopen qstart qend sstart send evalue bitscore' -num_threads 4"
subprocess.run(command, shell=True, check=True)
# yes introns
output_path = f"{output_dir}/{intergenic.short_name}_blasted_yes_introns.csv"
command = f"blastn -task blastn -query {input_path} -db {new_transcriptome_db_yes_introns} -ungapped -word_size 15 -reward 1 -penalty -5 -dust no -soft_masking false -max_target_seqs 10000 -out {output_path} -outfmt '10 qseqid sseqid sacc pident length mismatch gapopen qstart qend sstart send evalue bitscore' -num_threads 4"
subprocess.run(command, shell=True, check=True)
# intergenic 
output_path = f"{output_dir}/{intergenic.short_name}_blasted_intergenics.csv"
command = f"blastn -task blastn -query {input_path} -db {all_intergenic_regions_db} -ungapped -word_size 15 -reward 1 -penalty -5 -dust no -soft_masking false -max_target_seqs 10000 -out {output_path} -outfmt '10 qseqid sseqid sacc pident length mismatch gapopen qstart qend sstart send evalue bitscore' -num_threads 4"
subprocess.run(command, shell=True, check=True)

CompletedProcess(args="blastn -task blastn -query ../raw-data/single_intergenic_probe_design/intergenic_seq_blast_input/Or5-9E213-Or5-9E214.fasta -db ../raw-data/OBir/intergenics/all_intergenic_regions -ungapped -word_size 15 -reward 1 -penalty -5 -dust no -soft_masking false -max_target_seqs 10000 -out ../raw-data/single_intergenic_probe_design/intergenic_seq_blast_output/Or5-9E213-Or5-9E214_blasted_intergenics.csv -outfmt '10 qseqid sseqid sacc pident length mismatch gapopen qstart qend sstart send evalue bitscore' -num_threads 4", returncode=0)

In [16]:
length_thresh = 50 

## Iterate through blast results and annotate non-unique regions 
input_dir = os.path.join(main_directory, 'intergenic_seq_blast_output')
output_dir = os.path.join(main_directory, 'intergenic_seq_unique_regions') 

# Create output directory if it does not exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Clear contents of the output directory
for file in os.listdir(output_dir):
    os.remove(f"{output_dir}/{file}")

# No introns
input_path = f"{input_dir}/{intergenic.short_name}_blasted_no_introns.csv"
blast_results_no_introns = pd.read_csv(input_path, header=None, names=['query_id', 'subject_id', 'subject_acc', 'percent_identity', 'length', 'mismatches', 'gap_opens', 'q_start', 'q_end', 's_start', 's_end', 'evalue', 'bitscore'])
blast_results_no_introns['source'] = 'no_introns'

# Yes introns
input_path = f"{input_dir}/{intergenic.short_name}_blasted_yes_introns.csv"
blast_results_yes_introns = pd.read_csv(input_path, header=None, names=['query_id', 'subject_id', 'subject_acc', 'percent_identity', 'length', 'mismatches', 'gap_opens', 'q_start', 'q_end', 's_start', 's_end', 'evalue', 'bitscore'])
blast_results_yes_introns['source'] = 'yes_introns'

# Yes introns
input_path = f"{input_dir}/{intergenic.short_name}_blasted_intergenics.csv"
blast_results_intergenics = pd.read_csv(input_path, header=None, names=['query_id', 'subject_id', 'subject_acc', 'percent_identity', 'length', 'mismatches', 'gap_opens', 'q_start', 'q_end', 's_start', 's_end', 'evalue', 'bitscore'])
blast_results_intergenics['source'] = 'intergenics'

# Concatenate 
blast_results = pd.concat([blast_results_no_introns, blast_results_yes_introns, blast_results_intergenics], axis=0)

# remove rows where length < length_thresh 
blast_results = blast_results.loc[(blast_results['length'].values >= length_thresh), :]

# GEt intergenic_location 
blast_results['intergenic_location'] = blast_results['subject_id'].apply(
    lambda x: "Chr"+x.split("Chr")[1] # for intergenic regions
    if 'Chr' in x
    else ""
)

# Get location of intergenic hits 
blast_results['intergenic_chr'] = blast_results['intergenic_location'].apply(
    lambda x: "Chr" + x.split('Chr')[1].split(':')[0] # for intergenic regions
    if 'Chr' in x
    else ""
)

# Get location of intergenic start 
blast_results['intergenic_start'] = blast_results['intergenic_location'].apply(
    lambda x: int(x.split('Chr')[1].split(':')[1].split('-')[0]) # for intergenic regions
    if 'Chr' in x
    else ""
)

# Get location of intergenic end 
blast_results['intergenic_end'] = blast_results['intergenic_location'].apply(
    lambda x: int(x.split('Chr')[1].split(':')[1].split('-')[1]) # for intergenic regions
    if 'Chr' in x
    else ""
)

# Overlapping with intergenic region? 
blast_results['overlapping_with_intergenic'] = blast_results.apply(
    lambda x: overlapping(intergenic.chromosome, intergenic.start, intergenic.end, x['intergenic_chr'], x['intergenic_start'], x['intergenic_end']),
    axis=1
)

# permitted_off_targets 
blast_results['permitted_off_target'] = blast_results.apply(
    lambda x: any([keyword in x['subject_id'] for keyword in permitted_off_targets]),
    axis=1
)

# Retrieve sequence
sequence = intergenic.sequence

# Get off-targets
off_targets = blast_results.loc[(blast_results['length'].values >= length_thresh) & # length threshold, no need for mismatch threshold here 
                                    (blast_results['overlapping_with_intergenic'] == False) & 
                                    (blast_results['permitted_off_target'] == False), :]  # ignore overlapping intergenics 

# Annotate off-targets 
for j, row in off_targets.iterrows():
    sequence = sequence[:row['q_start']] + '-'*(row['q_end'] - row['q_start']) + sequence[row['q_end']:]

# Save the annotated sequence to the gene object
intergenic.unique_sequence = sequence 

# Save the blast results to the gene object 
intergenic.blast_results = blast_results

# Save the sequence to new file 
with open(f"{output_dir}/{intergenic.short_name}_unique.fasta", 'w') as f:
    f.write(f">{intergenic.short_name}\n{intergenic.unique_sequence}")

print(f"Unique regions have been annotated and exported to {output_dir}")

Unique regions have been annotated and exported to ../raw-data/single_intergenic_probe_design/intergenic_seq_unique_regions


In [17]:
blast_results

Unnamed: 0,query_id,subject_id,subject_acc,percent_identity,length,mismatches,gap_opens,q_start,q_end,s_start,s_end,evalue,bitscore,source,intergenic_location,intergenic_chr,intergenic_start,intergenic_end,overlapping_with_intergenic,permitted_off_target
0,Intergenic-9E213-9E214,ref|XR_003407399.1|,XR_003407399,100.000,57,0,0,1598,1654,973,1029,1.200000e-23,114.0,no_introns,,,,,False,False
1,Intergenic-9E213-9E214,ref|XR_003407399.1|,XR_003407399,100.000,51,0,0,1598,1648,969,1019,4.880000e-20,102.0,no_introns,,,,,False,False
2,Intergenic-9E213-9E214,ref|XR_003407399.1|,XR_003407399,100.000,51,0,0,1598,1648,971,1021,4.880000e-20,102.0,no_introns,,,,,False,False
3,Intergenic-9E213-9E214,ref|XR_003407399.1|,XR_003407399,100.000,50,0,0,1599,1648,968,1017,1.950000e-19,100.0,no_introns,,,,,False,False
4,Intergenic-9E213-9E214,ref|XR_003407399.1|,XR_003407399,98.182,55,1,0,1601,1655,968,1022,7.790000e-19,98.4,no_introns,,,,,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
403013,Intergenic-9E213-9E214,ref|XM_011350430.3|,XM_011350430,90.385,52,5,0,1597,1648,236,287,7.200000e-02,44.4,yes_introns,,,,,False,False
403031,Intergenic-9E213-9E214,ref|XM_011350428.3|,XM_011350428,90.385,52,5,0,1597,1648,103,154,7.200000e-02,44.4,yes_introns,,,,,False,False
0,Intergenic-9E213-9E214,T37-Or5-9E213-Or5-9E214-Chr6:10901758-10905030,T37-Or5-9E213-Or5-9E214-Chr6:10901758-10905030,100.000,3272,0,0,2,3273,1,3272,0.000000e+00,6540.0,intergenics,Chr6:10901758-10905030,Chr6,10901758,10905030,True,False
7,Intergenic-9E213-9E214,T37-Or5-9E213-Or5-9E214-Chr6:10901758-10905030,T37-Or5-9E213-Or5-9E214-Chr6:10901758-10905030,98.000,50,1,0,1606,1655,1597,1646,6.130000e-18,88.4,intergenics,Chr6:10901758-10905030,Chr6,10901758,10905030,True,False


In [18]:
sequence

'TTATGTAACATTTAATGCGTGCTATCTATTCCTATCTGATACATTATGAACATTAATACATCGAAATTTTTTTGTTTGTTTAAAAAACTAACTTTTAATTTTATACTTGGTTTATTCAATCACATTATATGTAAATCAATTTATTATTTTAATAACGTCGCAGAGAGGTACGTCGAGAGTAGGTACTATATGTTGACAAAAGAATTGAACAGAGAAATAAAACTGGTTCTTTTAAATGAACAAACATTACATGAGAAATTGAAGAGCAGGAACAC------------------------------------------------------------AGAACGATATTAAGGGATCGGAATCAAAATGAACGATTGCATTATCTGTTAAGCACAGTAAAAATTTATTCTTAAAAACAGGCGAAAAAAAAACCGACGGAAGATAATGTTGTACCAGAAATTCAGAAGCAAAATATCAAGAAGAAAAAAAAATTGAAGTTTAGAGAGATAATATAGAGAGATGATAATTATATAATTTATATATTAAAATGATTGGATACAATAAATAATCAACTGAAAAGTATGTAAATAAATAACGTTCTTTTATATCTTCTTTGACAAATTTTTATTATAAATAAAAAATAATTGTTGGAGGAAATATATGTATATATAAAACTGATTCGCAGTAGCAAAATTAAAGCAGTTGGTGATTTCTCCTATGTGACTATCTCTATCTCGGAATTACTCTACATTAAAAAGGATGTATGCAAATAGTTTAAAATGTATGAAAATATAAAGCAGAATTATTTTTAGTGATCGTGATATTAGATATTTATGCAGCTATCGTTGTTAGCTGCATGCATATAAAATATAATACTGCTATAAAATTTAATACTGTAAAAAAAGATTCGATGCTTACAGAGAAATCATTCTTTTAAAGCATATCAGCTTCAAAAGAATTCTGTCACTTGCGCTTGGCTTGCAGTCAACATCTAAATTCGAG

In [19]:
off_targets

Unnamed: 0,query_id,subject_id,subject_acc,percent_identity,length,mismatches,gap_opens,q_start,q_end,s_start,s_end,evalue,bitscore,source,intergenic_location,intergenic_chr,intergenic_start,intergenic_end,overlapping_with_intergenic,permitted_off_target
0,Intergenic-9E213-9E214,ref|XR_003407399.1|,XR_003407399,100.000,57,0,0,1598,1654,973,1029,1.200000e-23,114.0,no_introns,,,,,False,False
1,Intergenic-9E213-9E214,ref|XR_003407399.1|,XR_003407399,100.000,51,0,0,1598,1648,969,1019,4.880000e-20,102.0,no_introns,,,,,False,False
2,Intergenic-9E213-9E214,ref|XR_003407399.1|,XR_003407399,100.000,51,0,0,1598,1648,971,1021,4.880000e-20,102.0,no_introns,,,,,False,False
3,Intergenic-9E213-9E214,ref|XR_003407399.1|,XR_003407399,100.000,50,0,0,1599,1648,968,1017,1.950000e-19,100.0,no_introns,,,,,False,False
4,Intergenic-9E213-9E214,ref|XR_003407399.1|,XR_003407399,98.182,55,1,0,1601,1655,968,1022,7.790000e-19,98.4,no_introns,,,,,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
392348,Intergenic-9E213-9E214,ref|XM_011342392.3|,XM_011342392,90.000,60,6,0,1596,1655,10331,10272,5.000000e-03,48.4,yes_introns,,,,,False,False
392356,Intergenic-9E213-9E214,ref|XM_011342391.3|,XM_011342391,90.000,60,6,0,1596,1655,7115,7056,5.000000e-03,48.4,yes_introns,,,,,False,False
402995,Intergenic-9E213-9E214,ref|XM_011350431.3|,XM_011350431,90.385,52,5,0,1597,1648,103,154,7.200000e-02,44.4,yes_introns,,,,,False,False
403013,Intergenic-9E213-9E214,ref|XM_011350430.3|,XM_011350430,90.385,52,5,0,1597,1648,236,287,7.200000e-02,44.4,yes_introns,,,,,False,False


In [20]:
# Determine how many probes fit on each gene 
sequence = intergenic.unique_sequence 
probes, regions, positions = design_hcr_probes(sequence, "B1")

print(f"There is enough space for {len(probes)} probes on {intergenic.name}")

There is enough space for 28 probes on Intergenic-9E213-9E214


In [21]:
# Specify amplifier 
amplifier = "B2"

In [22]:
# Specify how many probes 
n_probes = 30

In [23]:
# Export excel sheet with probes and excel sheet with probe binding regions 

# Design probes
sequence = intergenic.unique_sequence 
probes, regions, positions = design_hcr_probes(sequence, amplifier)

# Randomly select n_probes_per_gene probes 
np.random.seed(1)
if len(probes) < n_probes:
    indices = range(len(probes))
else: 
    indices = np.random.choice(range(len(probes)), n_probes, replace=False)
probes = [probes[i] for i in indices]
regions = [regions[i] for i in indices]

# Save probes and regions to gene object
intergenic.probes = probes
intergenic.regions = regions

# Collapse probes 
probes_collapsed = [probe for sublist in probes for probe in sublist]

# Get date 
today = pd.Timestamp.now().strftime('%Y-%m-%d')

# Export probes 
output_dir = os.path.join(main_directory, 'IDT_sheets')
# Create output directory if it does not exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
# Create output path
output_path = f"{output_dir}/{intergenic.name}-{amplifier}-{today}.xlsx"
# Create dataframe
df = pd.DataFrame({'Sequence': probes_collapsed})
# Add pool name column 
df['Pool name'] = f'{intergenic.name}-{amplifier}'
df = df[['Pool name', 'Sequence']]
df.to_excel(output_path, index=False)
print(f"{len(probes_collapsed)} probes have been exported to {output_path}")


# Export probe binding regions
output_dir = os.path.join(main_directory, 'probe_binding_regions_sheets') 
# Create output directory if it does not exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
# Create output path
output_path = f"{output_dir}/{intergenic.name}-{amplifier}-regions-{today}.xlsx"
# Make a vector of the gene names for each probe 
df = pd.DataFrame({'Gene': [intergenic.name for _ in range(len(probes))], 
                   'Region': regions, 
                   'Probe 1': [probe[0] for probe in intergenic.probes], 
                   'Probe 2': [probe[1] for probe in intergenic.probes]})
df.to_excel(output_path, index=False)

56 probes have been exported to ../raw-data/single_intergenic_probe_design/IDT_sheets/Intergenic-9E213-9E214-B2-2025-05-24.xlsx


In [24]:
# Export all_regions to individual fasta files 

output_dir = os.path.join(main_directory, 'probe_region_blast_input')
# Create output directory if it does not exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Clear contents of the output directory
for file in os.listdir(output_dir):
    os.remove(f"{output_dir}/{file}")

# Export all_regions to individual fasta files in output_dir 
for i, region in enumerate(regions): 
    with open(f"{output_dir}/region-{i}.fasta", 'w') as f: 
        f.write(f">region-{i}\n{region}")

In [25]:
# Blast all regions against the transcriptome

input_dir = os.path.join(main_directory, 'probe_region_blast_input')
output_dir = os.path.join(main_directory, 'probe_region_blast_output') 

# Create output directory if it does not exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Clear contents of the output directory
for file in os.listdir(output_dir):
    os.remove(f"{output_dir}/{file}")

# Blast each region against the transcriptome
for i, region in enumerate(regions): 
    input_path = f"{input_dir}/region-{i}.fasta"

    # Blast against transcriptome without introns
    output_path = f"{output_dir}/region-{i}-blasted_no_introns.csv"
    command = f"blastn -task blastn -query {input_path} -db {new_transcriptome_db_no_introns} -ungapped -word_size 15 -reward 1 -penalty -1 -dust no -soft_masking false -max_target_seqs 10000 -out {output_path} -outfmt '10 qseqid sseqid sacc pident length mismatch gapopen qstart qend sstart send evalue bitscore' -num_threads 4"
    subprocess.run(command, shell=True, check=True)

    # Blast against transcriptome with introns
    output_path = f"{output_dir}/region-{i}-blasted_yes_introns.csv"
    command = f"blastn -task blastn -query {input_path} -db {new_transcriptome_db_yes_introns} -ungapped -word_size 15 -reward 1 -penalty -1 -dust no -soft_masking false -max_target_seqs 10000 -out {output_path} -outfmt '10 qseqid sseqid sacc pident length mismatch gapopen qstart qend sstart send evalue bitscore' -num_threads 4"
    subprocess.run(command, shell=True, check=True)

    # Blast against intergenics 
    output_path = f"{output_dir}/region-{i}-blasted_intergenics.csv"
    command = f"blastn -task blastn -query {input_path} -db {all_intergenic_regions_db} -ungapped -word_size 15 -reward 1 -penalty -1 -dust no -soft_masking false -max_target_seqs 10000 -out {output_path} -outfmt '10 qseqid sseqid sacc pident length mismatch gapopen qstart qend sstart send evalue bitscore' -num_threads 4"
    subprocess.run(command, shell=True, check=True)

print("All regions have been blasted against the transcriptome")

All regions have been blasted against the transcriptome


In [26]:
# Load in blast results
output_dir = os.path.join(main_directory, 'probe_region_blast_output') 
all_blast_results = []
for i, region in enumerate(regions):
    output_path = f"{output_dir}/region-{i}-blasted_no_introns.csv"
    blast_results = pd.read_csv(output_path, header=None, names=['query_id', 'subject_id', 'subject_acc', 'percent_identity', 'length', 'mismatches', 'gap_opens', 'q_start', 'q_end', 's_start', 's_end', 'evalue', 'bitscore'])
    blast_results['source']  = 'no_introns'
    all_blast_results.append(blast_results)
    output_path = f"{output_dir}/region-{i}-blasted_yes_introns.csv"
    blast_results = pd.read_csv(output_path, header=None, names=['query_id', 'subject_id', 'subject_acc', 'percent_identity', 'length', 'mismatches', 'gap_opens', 'q_start', 'q_end', 's_start', 's_end', 'evalue', 'bitscore'])
    blast_results['source']  = 'yes_introns'
    all_blast_results.append(blast_results)
    output_path = f"{output_dir}/region-{i}-blasted_intergenics.csv"
    blast_results = pd.read_csv(output_path, header=None, names=['query_id', 'subject_id', 'subject_acc', 'percent_identity', 'length', 'mismatches', 'gap_opens', 'q_start', 'q_end', 's_start', 's_end', 'evalue', 'bitscore'])
    blast_results['source']  = 'intergenics'
    all_blast_results.append(blast_results)

In [27]:
# Check for non-unique regions

num_hits_over_thresh = 0
length_thresh = 50
mismatch_thresh = 5 
all_off_targets = pd.DataFrame()

# Iterate through blast results 
for i, blast_results in enumerate(all_blast_results):
    if len(blast_results) == 0: 
        continue


    # GEt intergenic_location
    blast_results['intergenic_location'] = blast_results['subject_id'].apply(
        lambda x: "Chr"+x.split("Chr")[1] # for intergenic regions
        if 'Chr' in x
        else ""
    )

    # Get location of intergenic hits 
    blast_results['intergenic_chr'] = blast_results['intergenic_location'].apply(
        lambda x: "Chr" + x.split('Chr')[1].split(':')[0] # for intergenic regions
        if 'Chr' in x
        else ""
    )

    # Get location of intergenic start 
    blast_results['intergenic_start'] = blast_results['intergenic_location'].apply(
        lambda x: int(x.split('Chr')[1].split(':')[1].split('-')[0]) # for intergenic regions
        if 'Chr' in x
        else ""
    )

    # Get location of intergenic end 
    blast_results['intergenic_end'] = blast_results['intergenic_location'].apply(
        lambda x: int(x.split('Chr')[1].split(':')[1].split('-')[1]) # for intergenic regions
        if 'Chr' in x
        else ""
    )

    # Overlapping with intergenic region? 
    blast_results['overlapping_with_intergenic'] = blast_results.apply(
        lambda x: overlapping(intergenic.chromosome, intergenic.start, intergenic.end, x['intergenic_chr'], x['intergenic_start'], x['intergenic_end']),
        axis=1
    )

    # Get off-targets
    off_targets = blast_results.loc[(blast_results['length'].values >= length_thresh) & # length threshold, no need for mismatch threshold here 
                                    (blast_results['mismatches'].values <= mismatch_thresh) & # mismatch threshold
                                    (blast_results['overlapping_with_intergenic'] == False), :]  # ignore overlapping intergenics 

    # Add to the all_off_targets dataframe
    all_off_targets = pd.concat([all_off_targets, off_targets])

    # Update num_hits_over_thresh   
    num_hits_over_thresh += len(off_targets)

print(f"Found {num_hits_over_thresh} off-target hits with length >= {length_thresh}bp, mismatches <= {mismatch_thresh}")

Found 0 off-target hits with length >= 50bp, mismatches <= 5


In [28]:
all_off_targets

Unnamed: 0,query_id,subject_id,subject_acc,percent_identity,length,mismatches,gap_opens,q_start,q_end,s_start,s_end,evalue,bitscore,source,intergenic_location,intergenic_chr,intergenic_start,intergenic_end,overlapping_with_intergenic
