In [1]:
import os 
import sys
import subprocess
import pandas as pd
import numpy as np
import pybedtools
from Bio.Seq import Seq
from Bio.SeqUtils import gc_fraction
from gene2probe import *

In the previous tutorials, we have seen how to generate probes starting from a [gene](https://github.com/Teichlab/gene2probe/blob/main/notebooks/002_make_exon_probes_XIST.ipynb) or a [specific region](https://github.com/Teichlab/gene2probe/blob/main/notebooks/003_make_probes_for_custom_region_CD45RA.ipynb).

But what if we are interested in designing probes against a sequence that is not included in the human genome (e.g., GFP)?
We can achieve that by directly providing the sequence as input and following all subsequent steps.

We note that we don't recommend this approach for the detection of bacterial or viral reads for metagenomics analyses, as there are many additional considerations that have to be taken into account to ensure that a probe is specifically detecting a certain species.

### 1. Specify parameters

In [2]:
## Specify gene of interest and feature of interest
gene_ID = 'EGFP'
sequence = 'GTGCCCATCCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTGTCCGGCGAGGGCGAGGGCGATGCCACCTACGGCAAGCTGACCCTGAAGTTCATCTGCACCACCGGCAAGCTGCCCGTGCCCTGGCCCACCCTCGTGACCACCCTGACCTACGGCGTGCAGTGCTTCAGCCGCTACCCCGACCACATGAAGCAGCACGACTTCTTCAAGTCCGCCATGCCCGAAGGCTACGTCCAGGAGCGCACCATCTTCTTCAAGGACGACGGCAACTACAAGACCCGCGCCGAGGTGAAGTTCGAGGGCGACACCCTGGTGAACCGCATCGAGCTGAAGGGCATCGACTTCAAGGAGGACGGCAACATCCTGGGGCACAAGCTGGAGTACAACTACAACAGCCACAACGTCTATATCATGGCCGACAAGCAGAAGAACGGCATCAAGGTGAACTTCAAGATCCGCCACAACATCGAGGACGGCAGCGTGCAGCTCGCCGACCACTACCAGCAGAACACCCCCATCGGCGACGGCCCCGTGCTGCTGCCCGACAACCACTACCTGAGCACCCAGTCCGCCCTGAGCAAAGACCCCAACGAGAAGCGCGATCACATGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGTAA'

## specify output directory
out_dir = '../sample_run/probeDesign_' + gene_ID +'/'
## Create output directory
os.makedirs(out_dir, exist_ok=True)

In [3]:
sequence

'GTGCCCATCCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTGTCCGGCGAGGGCGAGGGCGATGCCACCTACGGCAAGCTGACCCTGAAGTTCATCTGCACCACCGGCAAGCTGCCCGTGCCCTGGCCCACCCTCGTGACCACCCTGACCTACGGCGTGCAGTGCTTCAGCCGCTACCCCGACCACATGAAGCAGCACGACTTCTTCAAGTCCGCCATGCCCGAAGGCTACGTCCAGGAGCGCACCATCTTCTTCAAGGACGACGGCAACTACAAGACCCGCGCCGAGGTGAAGTTCGAGGGCGACACCCTGGTGAACCGCATCGAGCTGAAGGGCATCGACTTCAAGGAGGACGGCAACATCCTGGGGCACAAGCTGGAGTACAACTACAACAGCCACAACGTCTATATCATGGCCGACAAGCAGAAGAACGGCATCAAGGTGAACTTCAAGATCCGCCACAACATCGAGGACGGCAGCGTGCAGCTCGCCGACCACTACCAGCAGAACACCCCCATCGGCGACGGCCCCGTGCTGCTGCCCGACAACCACTACCTGAGCACCCAGTCCGCCCTGAGCAAAGACCCCAACGAGAAGCGCGATCACATGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGTAA'

Since we already provide the sequence, we just need a blast database to check for offtargets. 

We can use the one we generated in the [previous tutorial](https://github.com/Teichlab/gene2probe/blob/main/notebooks/001_make_blast_database.ipynb).

In [4]:
blast_db = '../hg38_resources/blastdb/hg38_ncbiRefSeq_transcripts_db' ## Database of all human transcripts to blast against

In [5]:
## Path to blast binaries.
## Replace with your conda environment
## This can also be omitted if you started the jupyter session from within the gene2probe conda environment
blast_exec_path = '/nfs/team205/is10/miniconda/envs/gene2probe_env/bin/'

Finally, we need to provide a set of parameters related to our probe's length, at which nucleotide it's split (if at all), the acceptable range for GC content and any specific requirements for individual nucleotides.

Here we are following the [recommendations of 10x Genomics for custom probes for VisiumHD/VisiumFFPE/Flex](https://cdn.10xgenomics.com/image/upload/v1697739385/support-documents/CG000621_CustomProbeDesign_TechNote_RevC.pdf).

In [6]:
## Additional parameters regarding how the probe should look like
probe_length = 50 ## Length of probe in nucleotides
split_nt = 25 ## Index of nucleotide to split the probe at (start of RHS) - set to None if splitting probe is not needed
min_GC = 0.44 ## Minimum GC content for probe (if split probe, applied to both LHS and RHS)
max_GC = 0.72 ## Maximum GC content for probe (if split probe, applied to both LHS and RHS)
required_nts = {24: 'T'} ## Dictionary of index (0-based) for required nts - by default, 25th nucleotide must be a T - set to None if no requirements
probe_offset = 100 ## Minimum distance between probes - 10 bp is the recommended minimum by 10x, this can also be adjusted depending on how many probes pass other cutoffs
n_desired_probes =3 ## Number of probes to be designed.
min_mismatches = 5 ## Minimum number of mismatches (in at least LHS or RHS) - here we require in both to be more conservative

In [7]:
## Optionally, we can also specify adapters that have to be added to the probes.
## For example, for visiumHD:
LHS_pref = 'CCTTGGCACCCGAGAATTCCA' ## Will be added to the 5' of the LHS probe
LHS_suff = '' ## Will be added to the 3' of the LHS probe
RHS_pref = '/5Phos/' ## Will be added to the 5' of the RHS probe
RHS_suff = 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA' ## Will be added to the 3' of the RHS probe

## Leave as empty strings if you don't want to use them

### 2. Generate k-mers

Since we started with a sequence, we can directly make kmers from it.

In [8]:
def generate_kmers_from_seq(seq, k, outfasta, name_pref):
    """
    Given a sequence and an integer k, split the sequence into all possible kmers.
    Kmers are named based on their index, preceded by an optional prefix.
    The function exports all kmers into a fasta file (path specified as outfasta)
    """
    l = len(seq)
    if l < k:
        raise ValueError('k should be smaller than the length of the sequence')
    i = 0
    kmers_names = []
    kmers_seqs = []
    kmers_start = []
    kmers_end = []
    while i < (l-k):
        kmers_names.append((name_pref + str(i)))
        kmers_seqs.append(seq[i: (i+k)])
        kmers_start.append(i)
        kmers_end.append((i + k))
        i+=1
    ## Export fasta
    write_fasta(kmers_names, kmers_seqs, outfasta)

    ## Create dataframe
    kmers_df = pd.DataFrame({
        'seqname': 'custom_seq',
        'start': kmers_start,
        'end': kmers_end,
        'name': kmers_names,
        'transcript_seq': kmers_seqs
    })
    return kmers_df

In [9]:
kmers = generate_kmers_from_seq(sequence, k = probe_length, outfasta=(out_dir + 'all_kmers_unfiltered.fa'), name_pref=(gene_ID + '_'))

In [10]:
kmers

Unnamed: 0,seqname,start,end,name,transcript_seq
0,custom_seq,0,50,EGFP_0,GTGCCCATCCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGT...
1,custom_seq,1,51,EGFP_1,TGCCCATCCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTT...
2,custom_seq,2,52,EGFP_2,GCCCATCCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTC...
3,custom_seq,3,53,EGFP_3,CCCATCCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCA...
4,custom_seq,4,54,EGFP_4,CCATCCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAG...
...,...,...,...,...,...
629,custom_seq,629,679,EGFP_629,GGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTG...
630,custom_seq,630,680,EGFP_630,GAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGT...
631,custom_seq,631,681,EGFP_631,AGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTA...
632,custom_seq,632,682,EGFP_632,GTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTAC...


In [11]:
## Export unfiltered
kmers.to_csv((out_dir + 'kmers_all.csv'))

### 3. Exclude annotated repeats/polymorphism

Since we started from a custom sequence, we skip this part.
(alternatively, if a SNP is known, we can provide it as a relative coordinate to the custom sequence).

### 4. Filter for desirable sequence features

The next step is to consider their sequence features.

For this, we first extract the sequences of each k-mer, and then estimate features such as GC content and the presence of desired nucleotides in specific positions.

In [12]:
## We can read in the sequences and simultaneously monitor GC content and count the longest homopolymer stretch
kmers_seq_stats = get_sequence_stats((out_dir + 'all_kmers_unfiltered.fa'), probe_length, split_nt)

In [13]:
## Combining with our dataframe
kmers = pd.merge(kmers, kmers_seq_stats, on = 'transcript_seq', how='left')

In [14]:
kmers

Unnamed: 0,seqname,start,end,name,transcript_seq,kmer_coord,probe_seq,GC_content_full,longest_homopolymer,GC_content_LHS,GC_content_RHS
0,custom_seq,0,50,EGFP_0,GTGCCCATCCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGT...,EGFP_0,CTGAACTTGTGGCCGTTTACGTCGCCGTCCAGCTCGACCAGGATGG...,0.62,3,0.56,0.68
1,custom_seq,1,51,EGFP_1,TGCCCATCCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTT...,EGFP_1,GCTGAACTTGTGGCCGTTTACGTCGCCGTCCAGCTCGACCAGGATG...,0.62,3,0.56,0.68
2,custom_seq,2,52,EGFP_2,GCCCATCCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTC...,EGFP_2,CGCTGAACTTGTGGCCGTTTACGTCGCCGTCCAGCTCGACCAGGAT...,0.64,3,0.56,0.72
3,custom_seq,3,53,EGFP_3,CCCATCCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCA...,EGFP_3,ACGCTGAACTTGTGGCCGTTTACGTCGCCGTCCAGCTCGACCAGGA...,0.62,3,0.52,0.72
4,custom_seq,4,54,EGFP_4,CCATCCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAG...,EGFP_4,CACGCTGAACTTGTGGCCGTTTACGTCGCCGTCCAGCTCGACCAGG...,0.62,3,0.56,0.68
...,...,...,...,...,...,...,...,...,...,...,...
629,custom_seq,629,679,EGFP_629,GGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTG...,EGFP_629,TGTACAGCTCGTCCATGCCGAGAGTGATCCCGGCGGCGGTCACGAA...,0.64,3,0.56,0.72
630,custom_seq,630,680,EGFP_630,GAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGT...,EGFP_630,TTGTACAGCTCGTCCATGCCGAGAGTGATCCCGGCGGCGGTCACGA...,0.62,3,0.56,0.68
631,custom_seq,631,681,EGFP_631,AGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTA...,EGFP_631,CTTGTACAGCTCGTCCATGCCGAGAGTGATCCCGGCGGCGGTCACG...,0.62,3,0.56,0.68
632,custom_seq,632,682,EGFP_632,GTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTAC...,EGFP_632,ACTTGTACAGCTCGTCCATGCCGAGAGTGATCCCGGCGGCGGTCAC...,0.62,3,0.56,0.68


In [15]:
## Check for required nucleotides in specific positions:
if required_nts is not None:
    kmers['has_required_nts'] = check_for_required_nts(kmers, required_nts)
    print(kmers['has_required_nts'].value_counts())
    ## Filter for required nucleotides
    kmers = kmers[kmers['has_required_nts']==True].reset_index(drop=True)

has_required_nts
False    478
True     156
Name: count, dtype: int64


In [16]:
## Export kmers before filtering
kmers.to_csv((out_dir + 'kmers_candidates_unfiltered.csv'))

In [17]:
kmers

Unnamed: 0,seqname,start,end,name,transcript_seq,kmer_coord,probe_seq,GC_content_full,longest_homopolymer,GC_content_LHS,GC_content_RHS,has_required_nts
0,custom_seq,3,53,EGFP_3,CCCATCCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCA...,EGFP_3,ACGCTGAACTTGTGGCCGTTTACGTCGCCGTCCAGCTCGACCAGGA...,0.62,3,0.52,0.72,True
1,custom_seq,7,57,EGFP_7,TCCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGT...,EGFP_7,GGACACGCTGAACTTGTGGCCGTTTACGTCGCCGTCCAGCTCGACC...,0.62,3,0.56,0.68,True
2,custom_seq,8,58,EGFP_8,CCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTG...,EGFP_8,CGGACACGCTGAACTTGTGGCCGTTTACGTCGCCGTCCAGCTCGAC...,0.64,3,0.60,0.68,True
3,custom_seq,9,59,EGFP_9,CTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTGT...,EGFP_9,CCGGACACGCTGAACTTGTGGCCGTTTACGTCGCCGTCCAGCTCGA...,0.64,3,0.64,0.64,True
4,custom_seq,15,65,EGFP_15,GAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTGTCCGGCG...,EGFP_15,CCCTCGCCGGACACGCTGAACTTGTGGCCGTTTACGTCGCCGTCCA...,0.66,3,0.64,0.68,True
...,...,...,...,...,...,...,...,...,...,...,...,...
151,custom_seq,593,643,EGFP_593,AGACCCCAACGAGAAGCGCGATCACATGGTCCTGCTGGAGTTCGTG...,EGFP_593,CGGTCACGAACTCCAGCAGGACCATGTGATCGCGCTTCTCGTTGGG...,0.60,4,0.60,0.60,True
152,custom_seq,606,656,EGFP_606,AAGCGCGATCACATGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGA...,EGFP_606,GTGATCCCGGCGGCGGTCACGAACTCCAGCAGGACCATGTGATCGC...,0.64,3,0.68,0.60,True
153,custom_seq,614,664,EGFP_614,TCACATGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTC...,EGFP_614,TGCCGAGAGTGATCCCGGCGGCGGTCACGAACTCCAGCAGGACCAT...,0.64,3,0.72,0.56,True
154,custom_seq,626,676,EGFP_626,GCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAG...,EGFP_626,ACAGCTCGTCCATGCCGAGAGTGATCCCGGCGGCGGTCACGAACTC...,0.66,3,0.56,0.76,True


In [18]:
## Filter for GC content
kmers = filter_by_GC_content(kmers, min_GC, max_GC)

In [19]:
kmers

Unnamed: 0,seqname,start,end,name,transcript_seq,kmer_coord,probe_seq,GC_content_full,longest_homopolymer,GC_content_LHS,GC_content_RHS,has_required_nts
0,custom_seq,7,57,EGFP_7,TCCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGT...,EGFP_7,GGACACGCTGAACTTGTGGCCGTTTACGTCGCCGTCCAGCTCGACC...,0.62,3,0.56,0.68,True
1,custom_seq,8,58,EGFP_8,CCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTG...,EGFP_8,CGGACACGCTGAACTTGTGGCCGTTTACGTCGCCGTCCAGCTCGAC...,0.64,3,0.60,0.68,True
2,custom_seq,9,59,EGFP_9,CTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTGT...,EGFP_9,CCGGACACGCTGAACTTGTGGCCGTTTACGTCGCCGTCCAGCTCGA...,0.64,3,0.64,0.64,True
3,custom_seq,15,65,EGFP_15,GAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTGTCCGGCG...,EGFP_15,CCCTCGCCGGACACGCTGAACTTGTGGCCGTTTACGTCGCCGTCCA...,0.66,3,0.64,0.68,True
4,custom_seq,17,67,EGFP_17,GCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTGTCCGGCGAG...,EGFP_17,CGCCCTCGCCGGACACGCTGAACTTGTGGCCGTTTACGTCGCCGTC...,0.68,3,0.68,0.68,True
...,...,...,...,...,...,...,...,...,...,...,...,...
94,custom_seq,582,632,EGFP_582,GCCCTGAGCAAAGACCCCAACGAGAAGCGCGATCACATGGTCCTGC...,EGFP_582,TCCAGCAGGACCATGTGATCGCGCTTCTCGTTGGGGTCTTTGCTCA...,0.60,4,0.60,0.60,True
95,custom_seq,588,638,EGFP_588,AGCAAAGACCCCAACGAGAAGCGCGATCACATGGTCCTGCTGGAGT...,EGFP_588,ACGAACTCCAGCAGGACCATGTGATCGCGCTTCTCGTTGGGGTCTT...,0.56,4,0.52,0.60,True
96,custom_seq,591,641,EGFP_591,AAAGACCCCAACGAGAAGCGCGATCACATGGTCCTGCTGGAGTTCG...,EGFP_591,GTCACGAACTCCAGCAGGACCATGTGATCGCGCTTCTCGTTGGGGT...,0.56,4,0.56,0.56,True
97,custom_seq,593,643,EGFP_593,AGACCCCAACGAGAAGCGCGATCACATGGTCCTGCTGGAGTTCGTG...,EGFP_593,CGGTCACGAACTCCAGCAGGACCATGTGATCGCGCTTCTCGTTGGG...,0.60,4,0.60,0.60,True


In [20]:
## Candidate kmers
kmers

Unnamed: 0,seqname,start,end,name,transcript_seq,kmer_coord,probe_seq,GC_content_full,longest_homopolymer,GC_content_LHS,GC_content_RHS,has_required_nts
0,custom_seq,7,57,EGFP_7,TCCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGT...,EGFP_7,GGACACGCTGAACTTGTGGCCGTTTACGTCGCCGTCCAGCTCGACC...,0.62,3,0.56,0.68,True
1,custom_seq,8,58,EGFP_8,CCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTG...,EGFP_8,CGGACACGCTGAACTTGTGGCCGTTTACGTCGCCGTCCAGCTCGAC...,0.64,3,0.60,0.68,True
2,custom_seq,9,59,EGFP_9,CTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTGT...,EGFP_9,CCGGACACGCTGAACTTGTGGCCGTTTACGTCGCCGTCCAGCTCGA...,0.64,3,0.64,0.64,True
3,custom_seq,15,65,EGFP_15,GAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTGTCCGGCG...,EGFP_15,CCCTCGCCGGACACGCTGAACTTGTGGCCGTTTACGTCGCCGTCCA...,0.66,3,0.64,0.68,True
4,custom_seq,17,67,EGFP_17,GCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTGTCCGGCGAG...,EGFP_17,CGCCCTCGCCGGACACGCTGAACTTGTGGCCGTTTACGTCGCCGTC...,0.68,3,0.68,0.68,True
...,...,...,...,...,...,...,...,...,...,...,...,...
94,custom_seq,582,632,EGFP_582,GCCCTGAGCAAAGACCCCAACGAGAAGCGCGATCACATGGTCCTGC...,EGFP_582,TCCAGCAGGACCATGTGATCGCGCTTCTCGTTGGGGTCTTTGCTCA...,0.60,4,0.60,0.60,True
95,custom_seq,588,638,EGFP_588,AGCAAAGACCCCAACGAGAAGCGCGATCACATGGTCCTGCTGGAGT...,EGFP_588,ACGAACTCCAGCAGGACCATGTGATCGCGCTTCTCGTTGGGGTCTT...,0.56,4,0.52,0.60,True
96,custom_seq,591,641,EGFP_591,AAAGACCCCAACGAGAAGCGCGATCACATGGTCCTGCTGGAGTTCG...,EGFP_591,GTCACGAACTCCAGCAGGACCATGTGATCGCGCTTCTCGTTGGGGT...,0.56,4,0.56,0.56,True
97,custom_seq,593,643,EGFP_593,AGACCCCAACGAGAAGCGCGATCACATGGTCCTGCTGGAGTTCGTG...,EGFP_593,CGGTCACGAACTCCAGCAGGACCATGTGATCGCGCTTCTCGTTGGG...,0.60,4,0.60,0.60,True


In [21]:
kmers.to_csv((out_dir + 'kmers_candidates_filtered.csv'))

### 5. Remove probes with potential off-targets

Having identified a set of kmers that fulfill our sequence requirements, we can next proceed with testing whether they are specific to our transcript/exon of interest.

For this we rely on using BLAST. 

We recommend blasting against transcripts (i.e., exons and introns combined) to be as conservative as possible in terms of off-targets. 
However, in cases where it is not possible to obtain enough suitable kmers (e.g., for  short transcripts), it is reasonable to relax this requirement by BLASTing against exons only (a much smaller search space).

At this step, we also want to consider whether our probes are split (as in the current specifications for VisiumHD) or a single oligo. If probes are split, it's best to BLAST each side separately, to make sure that both sides are specific.

In [22]:
## The first thing to do is to export our sequences in fasta format, so that we can use them for BLAST
write_fasta(kmers['name'], kmers['transcript_seq'], (out_dir + 'kmers_candidates_filtered_transcript_seqs.fa'))
## If our probes are meant to be split, we should additionally blast them separately 
## Note that the LHS/RHS in the transcript are reversed compared to the probe (i.e., the LHS of the transcript is complementary to the RHS of the probe)
if split_nt is not None: 
    ## Make split probes
    kmers['transcript_seq_LHS'] = [seq[0:split_nt] for seq in kmers['transcript_seq']]
    kmers['transcript_seq_RHS'] = [seq[split_nt: probe_length] for seq in kmers['transcript_seq']]

    ## We are exporting the transcript sequence as that's the one that has to be blasted against the human transcriptome
    write_fasta(kmers['name'], kmers['transcript_seq_LHS'], (out_dir + 'kmers_candidates_filtered_transcript_seqs_LHS.fa'))
    write_fasta(kmers['name'], kmers['transcript_seq_RHS'], (out_dir + 'kmers_candidates_filtered_transcript_seqs_RHS.fa'))    

In [23]:
blast_res = {}
## First, blast the full probe
blast_res['full'] = run_blast(fasta=(out_dir + 'kmers_candidates_filtered_transcript_seqs.fa'),
                              blastdb = blast_db,
                              path2blastn=(blast_exec_path + 'blastn'),
                              outfile = (out_dir + 'kmers_candidates_filtered_blast_output.txt'))

## Additionally, if probe is split, blast each side separately
if split_nt is not None: 
    blast_res['LHS'] = run_blast(fasta=(out_dir + 'kmers_candidates_filtered_transcript_seqs_LHS.fa'),
                                     blastdb = blast_db,
                                     path2blastn=(blast_exec_path + 'blastn'),
                                     outfile = (out_dir + 'kmers_candidates_filtered_blast_output_LHS.txt'))
    blast_res['RHS'] = run_blast(fasta=(out_dir + 'kmers_candidates_filtered_transcript_seqs_RHS.fa'),
                                     blastdb = blast_db,
                                     path2blastn=(blast_exec_path + 'blastn'),
                                     outfile = (out_dir + 'kmers_candidates_filtered_blast_output_RHS.txt'))

In [24]:
for k in blast_res.keys():
    print(("The following genes were detected in mode: " +  k))
    print(blast_res[k]['sgeneid'].value_counts().head(10))

The following genes were detected in mode: full
sgeneid
CCDC85C      4
LINC00630    3
Name: count, dtype: int64
The following genes were detected in mode: LHS
sgeneid
LMTK2           2
LHFPL6          2
LOC105374785    1
Name: count, dtype: int64
The following genes were detected in mode: RHS
sgeneid
LOC105374785    1
Name: count, dtype: int64


In [25]:
blast_res['full']

Unnamed: 0,name,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,sgeneid
0,EGFP_42,CCDC85C::chr14:99500189-99604207(-),100.0,21,0,0,16,36,211,231,0.47,39.2,CCDC85C
1,EGFP_42,CCDC85C::chr14:99524540-99604207(-),100.0,21,0,0,16,36,211,231,0.47,39.2,CCDC85C
2,EGFP_42,CCDC85C::chr14:99524540-99604207(-),100.0,21,0,0,16,36,211,231,0.47,39.2,CCDC85C
3,EGFP_42,CCDC85C::chr14:99526286-99604207(-),100.0,21,0,0,16,36,211,231,0.47,39.2,CCDC85C
4,EGFP_68,LINC00630::chrX:102769152-102964523(+),82.051,39,7,0,9,47,192909,192947,0.47,39.2,LINC00630
5,EGFP_74,LINC00630::chrX:102769152-102964523(+),82.051,39,7,0,3,41,192909,192947,0.47,39.2,LINC00630
6,EGFP_75,LINC00630::chrX:102769152-102964523(+),82.051,39,7,0,2,40,192909,192947,0.47,39.2,LINC00630


Not all BLAST hits will be off-targets. Hopefully, our gene of interest is included in the BLAST output. We therefore need to filter for hits with different gene IDs.

In [26]:
offtargets = []
for k in blast_res.keys():
    offtargets += (detect_offtargets(blast_res[k], gene_ID, min_mismatches=min_mismatches))
## Remove redundancies
offtargets = list(set(offtargets))

In [27]:
len(offtargets)

5

In [28]:
## Remove off-targets
kmers = kmers[kmers['name'].isin(offtargets)==False].reset_index(drop=True)

In [29]:
kmers 

Unnamed: 0,seqname,start,end,name,transcript_seq,kmer_coord,probe_seq,GC_content_full,longest_homopolymer,GC_content_LHS,GC_content_RHS,has_required_nts,transcript_seq_LHS,transcript_seq_RHS
0,custom_seq,7,57,EGFP_7,TCCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGT...,EGFP_7,GGACACGCTGAACTTGTGGCCGTTTACGTCGCCGTCCAGCTCGACC...,0.62,3,0.56,0.68,True,TCCTGGTCGAGCTGGACGGCGACGT,AAACGGCCACAAGTTCAGCGTGTCC
1,custom_seq,8,58,EGFP_8,CCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTG...,EGFP_8,CGGACACGCTGAACTTGTGGCCGTTTACGTCGCCGTCCAGCTCGAC...,0.64,3,0.60,0.68,True,CCTGGTCGAGCTGGACGGCGACGTA,AACGGCCACAAGTTCAGCGTGTCCG
2,custom_seq,9,59,EGFP_9,CTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTGT...,EGFP_9,CCGGACACGCTGAACTTGTGGCCGTTTACGTCGCCGTCCAGCTCGA...,0.64,3,0.64,0.64,True,CTGGTCGAGCTGGACGGCGACGTAA,ACGGCCACAAGTTCAGCGTGTCCGG
3,custom_seq,15,65,EGFP_15,GAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTGTCCGGCG...,EGFP_15,CCCTCGCCGGACACGCTGAACTTGTGGCCGTTTACGTCGCCGTCCA...,0.66,3,0.64,0.68,True,GAGCTGGACGGCGACGTAAACGGCC,ACAAGTTCAGCGTGTCCGGCGAGGG
4,custom_seq,17,67,EGFP_17,GCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTGTCCGGCGAG...,EGFP_17,CGCCCTCGCCGGACACGCTGAACTTGTGGCCGTTTACGTCGCCGTC...,0.68,3,0.68,0.68,True,GCTGGACGGCGACGTAAACGGCCAC,AAGTTCAGCGTGTCCGGCGAGGGCG
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89,custom_seq,582,632,EGFP_582,GCCCTGAGCAAAGACCCCAACGAGAAGCGCGATCACATGGTCCTGC...,EGFP_582,TCCAGCAGGACCATGTGATCGCGCTTCTCGTTGGGGTCTTTGCTCA...,0.60,4,0.60,0.60,True,GCCCTGAGCAAAGACCCCAACGAGA,AGCGCGATCACATGGTCCTGCTGGA
90,custom_seq,588,638,EGFP_588,AGCAAAGACCCCAACGAGAAGCGCGATCACATGGTCCTGCTGGAGT...,EGFP_588,ACGAACTCCAGCAGGACCATGTGATCGCGCTTCTCGTTGGGGTCTT...,0.56,4,0.52,0.60,True,AGCAAAGACCCCAACGAGAAGCGCG,ATCACATGGTCCTGCTGGAGTTCGT
91,custom_seq,591,641,EGFP_591,AAAGACCCCAACGAGAAGCGCGATCACATGGTCCTGCTGGAGTTCG...,EGFP_591,GTCACGAACTCCAGCAGGACCATGTGATCGCGCTTCTCGTTGGGGT...,0.56,4,0.56,0.56,True,AAAGACCCCAACGAGAAGCGCGATC,ACATGGTCCTGCTGGAGTTCGTGAC
92,custom_seq,593,643,EGFP_593,AGACCCCAACGAGAAGCGCGATCACATGGTCCTGCTGGAGTTCGTG...,EGFP_593,CGGTCACGAACTCCAGCAGGACCATGTGATCGCGCTTCTCGTTGGG...,0.60,4,0.60,0.60,True,AGACCCCAACGAGAAGCGCGATCAC,ATGGTCCTGCTGGAGTTCGTGACCG


### 6. Select non-overlapping probes

At this point, we have effectively acquired a set of usable probes. They don't overlap undesirable regions (repeats/polymorphism), have desirable sequence features (GC content, specific nucleotides) and are specific to our gene of interest.

In this particular case, we still have a lot of possible k-mers (much more than the number of probes we intend to design). We can therefore choose to prioritise k-mers with shorter homopolymer stretches, as these are also discouraged by the [10x recommendations](https://cdn.10xgenomics.com/image/upload/v1697739385/support-documents/CG000621_CustomProbeDesign_TechNote_RevC.pdf).

However, at this stage you might want to consider ranking probes in a diffferent way, depending on your application.

After having ranked our k-mers in whatever way we think is reasonable at this stage, we can proceed with selecting the top probe, then removing all overlapping/adjacent probes (within a window determined by `probe_offset`).

Since we have so many available probes, we will be increasing `probe_offset` from `100 (default)` to `1000`.

In [30]:
## Sort in increasing homopolymer length
kmers = kmers.sort_values('longest_homopolymer', ascending=True).reset_index(drop=True)

In [31]:
kmers

Unnamed: 0,seqname,start,end,name,transcript_seq,kmer_coord,probe_seq,GC_content_full,longest_homopolymer,GC_content_LHS,GC_content_RHS,has_required_nts,transcript_seq_LHS,transcript_seq_RHS
0,custom_seq,450,500,EGFP_450,AAGGTGAACTTCAAGATCCGCCACAACATCGAGGACGGCAGCGTGC...,EGFP_450,AGCTGCACGCTGCCGTCCTCGATGTTGTGGCGGATCTTGAAGTTCA...,0.56,2,0.64,0.48,True,AAGGTGAACTTCAAGATCCGCCACA,ACATCGAGGACGGCAGCGTGCAGCT
1,custom_seq,452,502,EGFP_452,GGTGAACTTCAAGATCCGCCACAACATCGAGGACGGCAGCGTGCAG...,EGFP_452,CGAGCTGCACGCTGCCGTCCTCGATGTTGTGGCGGATCTTGAAGTT...,0.60,2,0.68,0.52,True,GGTGAACTTCAAGATCCGCCACAAC,ATCGAGGACGGCAGCGTGCAGCTCG
2,custom_seq,464,514,EGFP_464,GATCCGCCACAACATCGAGGACGGCAGCGTGCAGCTCGCCGACCAC...,EGFP_464,GGTAGTGGTCGGCGAGCTGCACGCTGCCGTCCTCGATGTTGTGGCG...,0.66,2,0.68,0.64,True,GATCCGCCACAACATCGAGGACGGC,AGCGTGCAGCTCGCCGACCACTACC
3,custom_seq,471,521,EGFP_471,CACAACATCGAGGACGGCAGCGTGCAGCTCGCCGACCACTACCAGC...,EGFP_471,TTCTGCTGGTAGTGGTCGGCGAGCTGCACGCTGCCGTCCTCGATGT...,0.62,2,0.60,0.64,True,CACAACATCGAGGACGGCAGCGTGC,AGCTCGCCGACCACTACCAGCAGAA
4,custom_seq,422,472,EGFP_422,CATGGCCGACAAGCAGAAGAACGGCATCAAGGTGAACTTCAAGATC...,EGFP_422,GGCGGATCTTGAAGTTCACCTTGATGCCGTTCTTCTGCTTGTCGGC...,0.54,2,0.48,0.60,True,CATGGCCGACAAGCAGAAGAACGGC,ATCAAGGTGAACTTCAAGATCCGCC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89,custom_seq,186,236,EGFP_186,TACCCCGACCACATGAAGCAGCACGACTTCTTCAAGTCCGCCATGC...,EGFP_186,TCGGGCATGGCGGACTTGAAGAAGTCGTGCTGCTTCATGTGGTCGG...,0.58,4,0.56,0.60,True,TACCCCGACCACATGAAGCAGCACG,ACTTCTTCAAGTCCGCCATGCCCGA
90,custom_seq,168,218,EGFP_168,GTGCAGTGCTTCAGCCGCTACCCCGACCACATGAAGCAGCACGACT...,EGFP_168,AAGAAGTCGTGCTGCTTCATGTGGTCGGGGTAGCGGCTGAAGCACT...,0.58,4,0.48,0.68,True,GTGCAGTGCTTCAGCCGCTACCCCG,ACCACATGAAGCAGCACGACTTCTT
91,custom_seq,162,212,EGFP_162,TACGGCGTGCAGTGCTTCAGCCGCTACCCCGACCACATGAAGCAGC...,EGFP_162,TCGTGCTGCTTCATGTGGTCGGGGTAGCGGCTGAAGCACTGCACGC...,0.62,4,0.60,0.64,True,TACGGCGTGCAGTGCTTCAGCCGCT,ACCCCGACCACATGAAGCAGCACGA
92,custom_seq,155,205,EGFP_155,CCTGACCTACGGCGTGCAGTGCTTCAGCCGCTACCCCGACCACATG...,EGFP_155,GCTTCATGTGGTCGGGGTAGCGGCTGAAGCACTGCACGCCGTAGGT...,0.64,4,0.64,0.64,True,CCTGACCTACGGCGTGCAGTGCTTC,AGCCGCTACCCCGACCACATGAAGC


In [32]:
## We have a lot of probes here - increasing the offset to 1000 bp to space them out
probe_offset = 100

In [33]:
selected_probes_list = []
df = kmers.copy()
i = 0
while i < n_desired_probes:
    selected_probes_list.append(df.iloc[0:1,:])
    ## Going for a high offset as we have a large span:
    df = remove_overlapping_probes(df,
                                   df['name'][0], 
                                   offset=probe_offset).reset_index(drop=True).copy()
    i +=1

  start = int(probe_df['start'].iloc[i])
  end = int(probe_df['end'].iloc[i])


In [34]:
selected_probes_df = pd.concat(selected_probes_list, axis=0).reset_index(drop=True)

In [35]:
selected_probes_df

Unnamed: 0,seqname,start,end,name,transcript_seq,kmer_coord,probe_seq,GC_content_full,longest_homopolymer,GC_content_LHS,GC_content_RHS,has_required_nts,transcript_seq_LHS,transcript_seq_RHS
0,custom_seq,450,500,EGFP_450,AAGGTGAACTTCAAGATCCGCCACAACATCGAGGACGGCAGCGTGC...,EGFP_450,AGCTGCACGCTGCCGTCCTCGATGTTGTGGCGGATCTTGAAGTTCA...,0.56,2,0.64,0.48,True,AAGGTGAACTTCAAGATCCGCCACA,ACATCGAGGACGGCAGCGTGCAGCT
1,custom_seq,233,283,EGFP_233,CGAAGGCTACGTCCAGGAGCGCACCATCTTCTTCAAGGACGACGGC...,EGFP_233,AGTTGCCGTCGTCCTTGAAGAAGATGGTGCGCTCCTGGACGTAGCC...,0.58,2,0.48,0.68,True,CGAAGGCTACGTCCAGGAGCGCACC,ATCTTCTTCAAGGACGACGGCAACT
2,custom_seq,8,58,EGFP_8,CCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTG...,EGFP_8,CGGACACGCTGAACTTGTGGCCGTTTACGTCGCCGTCCAGCTCGAC...,0.64,3,0.6,0.68,True,CCTGGTCGAGCTGGACGGCGACGTA,AACGGCCACAAGTTCAGCGTGTCCG


In [36]:
for seq in selected_probes_df['transcript_seq']:
    print(seq)

AAGGTGAACTTCAAGATCCGCCACAACATCGAGGACGGCAGCGTGCAGCT
CGAAGGCTACGTCCAGGAGCGCACCATCTTCTTCAAGGACGACGGCAACT
CCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTGTCCG


And we are done! We have now selected three potential probes for our gene.

If our probes are meant to be split, we can additionally generate these columns:

In [37]:
if split_nt is not None: 
    ## Make split probes (and add adapters if provided)
    selected_probes_df['probe_seq_LHS'] = [(LHS_pref + seq[0:split_nt] + LHS_suff) for seq in selected_probes_df['probe_seq']]
    selected_probes_df['probe_seq_RHS'] = [(RHS_pref +seq[split_nt: probe_length] + RHS_suff) for seq in selected_probes_df['probe_seq']]

In [38]:
## Also add gene_ID for completeness
selected_probes_df['gene_ID'] = gene_ID

In [39]:
## Export selected probes as dataframe:
selected_probes_df.to_csv((out_dir + 'kmers_selected_probes.csv'))

We always recommend additionally performing a manual BLAST of these probe sequences to make sure that there are no off-target effects.

In [40]:
selected_probes_df

Unnamed: 0,seqname,start,end,name,transcript_seq,kmer_coord,probe_seq,GC_content_full,longest_homopolymer,GC_content_LHS,GC_content_RHS,has_required_nts,transcript_seq_LHS,transcript_seq_RHS,probe_seq_LHS,probe_seq_RHS,gene_ID
0,custom_seq,450,500,EGFP_450,AAGGTGAACTTCAAGATCCGCCACAACATCGAGGACGGCAGCGTGC...,EGFP_450,AGCTGCACGCTGCCGTCCTCGATGTTGTGGCGGATCTTGAAGTTCA...,0.56,2,0.64,0.48,True,AAGGTGAACTTCAAGATCCGCCACA,ACATCGAGGACGGCAGCGTGCAGCT,CCTTGGCACCCGAGAATTCCAAGCTGCACGCTGCCGTCCTCGATGT,/5Phos/TGTGGCGGATCTTGAAGTTCACCTTAAAAAAAAAAAAAA...,EGFP
1,custom_seq,233,283,EGFP_233,CGAAGGCTACGTCCAGGAGCGCACCATCTTCTTCAAGGACGACGGC...,EGFP_233,AGTTGCCGTCGTCCTTGAAGAAGATGGTGCGCTCCTGGACGTAGCC...,0.58,2,0.48,0.68,True,CGAAGGCTACGTCCAGGAGCGCACC,ATCTTCTTCAAGGACGACGGCAACT,CCTTGGCACCCGAGAATTCCAAGTTGCCGTCGTCCTTGAAGAAGAT,/5Phos/GGTGCGCTCCTGGACGTAGCCTTCGAAAAAAAAAAAAAA...,EGFP
2,custom_seq,8,58,EGFP_8,CCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTG...,EGFP_8,CGGACACGCTGAACTTGTGGCCGTTTACGTCGCCGTCCAGCTCGAC...,0.64,3,0.6,0.68,True,CCTGGTCGAGCTGGACGGCGACGTA,AACGGCCACAAGTTCAGCGTGTCCG,CCTTGGCACCCGAGAATTCCACGGACACGCTGAACTTGTGGCCGTT,/5Phos/TACGTCGCCGTCCAGCTCGACCAGGAAAAAAAAAAAAAA...,EGFP
