In [1]:
import os 
import sys
import subprocess
import pandas as pd
import numpy as np
import pybedtools
from Bio.Seq import Seq
from Bio.SeqUtils import gc_fraction
from gene2probe import *

In [2]:
## Target definition: which gene to design probes for, and which GTF feature type to tile
gene_ID = 'XIST'
mode = 'exon' ## Compared against the GTF 'feature' column further down (exons / introns / full gene per the original note)

## Output directory, named after the gene and the feature type
out_dir = ''.join(['../sample_run/probeDesign_', gene_ID, '_', mode, '/'])
## Create it up front; an already existing directory is not an error
os.makedirs(out_dir, exist_ok=True)

In [3]:
## Required resources (the original note on where to download them was left unfinished;
## the file names look like UCSC Genome Browser downloads - TODO confirm and document the source)
gtf = '../hg38_resources/hg38.ncbiRefSeq.gtf' ## Gene annotation in gtf file
## We recommend using RefSeq as this is manually curated and more likely to contain an isoform that is present across most cell types
## Alternatively, one can filter based on RNA-seq data for a cell type/tissue of interest
fasta = '../hg38_resources/hg38.fa' ## Genome in fasta file
snp_db = '../hg38_resources/hg38_snp151Common.txt' ## Database of known SNPs and small indels
repeats = '../hg38_resources/hg38_rmsk.bed' ## bed file with repeats/low complexity regions to be excluded
## NOTE(review): `gaps` is assigned the same file as `repeats` (hg38_rmsk.bed), so the gap filter further down
## just repeats the repeat filter - presumably a dedicated assembly-gap bed file was intended; confirm and update the path
gaps = '../hg38_resources/hg38_rmsk.bed' ## bed file with gaps in the genome assembly to be excluded
blast_db = '../hg38_resources/blastdb/hg38_ncbiRefSeq_exons_db' ## Database of all human transcripts to blast against

In [4]:
## Directory holding the BLAST binaries. The BLAST cell prepends this to 'blastn', so it must either end with a
## path separator or be '' (use '' when blastn is already on PATH, e.g. when the jupyter session was started
## from within the gene2probe conda environment).
## It can be overridden without editing the notebook by setting the BLAST_EXEC_PATH environment variable;
## the fallback below is the original author's conda environment and should be replaced with your own.
blast_exec_path = os.environ.get(
    'BLAST_EXEC_PATH',
    '/nfs/team205/is10/miniconda/envs/gene2probe_env/bin/',
)

In [5]:
## Probe design parameters
## Geometry: total probe length (nt) and the index where the right-hand side starts
## (split_nt = None when the probe should not be split)
probe_length = 50
split_nt = 25
## Accepted GC-content window; for split probes it is applied to both LHS and RHS
min_GC, max_GC = 0.44, 0.72
## Required nucleotides keyed by 0-based index - by default the 25th nucleotide must be a T (None = no requirement)
required_nts = {24: 'T'}
## Minimum distance between probes - 10 bp is the recommended minimum by 10x;
## can be adjusted depending on how many probes pass the other cutoffs
probe_offset = 100

In [6]:
## Load the gene annotation
gene_anno = read_gtf(gtf)
## Pull the gene symbol (gene_name attribute) out of every annotation row
## (per the original note, a different attribute would be used when matching on Ensembl IDs)
gene_ids = gene_anno['attribute'].apply(extract_feature_from_gtf, feature='gene_name')
## Keep the rows whose symbol equals the gene of interest
is_target_gene = (gene_ids == gene_ID).to_numpy()
roi = gene_anno.loc[is_target_gene]

In [7]:
## Inspect every annotation row matched for the gene of interest (all feature types)
roi

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attribute
325814,chrX,ncbiRefSeq.2022-10-28,transcript,73820651,73852753,.,-,.,"gene_id ""XIST""; transcript_id ""NR_001564.2""; ..."
325815,chrX,ncbiRefSeq.2022-10-28,exon,73820651,73827984,.,-,.,"gene_id ""XIST""; transcript_id ""NR_001564.2""; e..."
325816,chrX,ncbiRefSeq.2022-10-28,exon,73829068,73829231,.,-,.,"gene_id ""XIST""; transcript_id ""NR_001564.2""; e..."
325817,chrX,ncbiRefSeq.2022-10-28,exon,73831066,73831274,.,-,.,"gene_id ""XIST""; transcript_id ""NR_001564.2""; e..."
325818,chrX,ncbiRefSeq.2022-10-28,exon,73833238,73833374,.,-,.,"gene_id ""XIST""; transcript_id ""NR_001564.2""; e..."
325819,chrX,ncbiRefSeq.2022-10-28,exon,73837440,73837503,.,-,.,"gene_id ""XIST""; transcript_id ""NR_001564.2""; e..."
325820,chrX,ncbiRefSeq.2022-10-28,exon,73841382,73852753,.,-,.,"gene_id ""XIST""; transcript_id ""NR_001564.2""; e..."


In [8]:
## Keep only the rows whose GTF feature type equals the configured mode
is_selected_feature = roi['feature'].eq(mode)
roi = roi.loc[is_selected_feature]

In [9]:
## Inspect the annotation rows left after restricting to the feature type of interest
roi

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attribute
325815,chrX,ncbiRefSeq.2022-10-28,exon,73820651,73827984,.,-,.,"gene_id ""XIST""; transcript_id ""NR_001564.2""; e..."
325816,chrX,ncbiRefSeq.2022-10-28,exon,73829068,73829231,.,-,.,"gene_id ""XIST""; transcript_id ""NR_001564.2""; e..."
325817,chrX,ncbiRefSeq.2022-10-28,exon,73831066,73831274,.,-,.,"gene_id ""XIST""; transcript_id ""NR_001564.2""; e..."
325818,chrX,ncbiRefSeq.2022-10-28,exon,73833238,73833374,.,-,.,"gene_id ""XIST""; transcript_id ""NR_001564.2""; e..."
325819,chrX,ncbiRefSeq.2022-10-28,exon,73837440,73837503,.,-,.,"gene_id ""XIST""; transcript_id ""NR_001564.2""; e..."
325820,chrX,ncbiRefSeq.2022-10-28,exon,73841382,73852753,.,-,.,"gene_id ""XIST""; transcript_id ""NR_001564.2""; e..."


In [10]:
## Turn the GTF rows into a bed-style data frame; region names are prefixed with the gene ID
bed_name_prefix = gene_ID + '_'
roi_bed = gtf_2_bed(roi, name_pref=bed_name_prefix)

In [11]:
## Inspect the bed-style regions (one row per selected feature)
roi_bed

Unnamed: 0,seqname,start,end,name,score,strand
0,chrX,73820650,73827984,XIST_0,.,-
1,chrX,73829067,73829231,XIST_1,.,-
2,chrX,73831065,73831274,XIST_2,.,-
3,chrX,73833237,73833374,XIST_3,.,-
4,chrX,73837439,73837503,XIST_4,.,-
5,chrX,73841381,73852753,XIST_5,.,-


In [12]:
## Generate candidate kmers of probe_length nt from the regions of interest
## (the output below suggests a 1-nt sliding window within each region - TODO confirm in generate_kmers)
kmers = generate_kmers(roi_bed, k=probe_length)

In [13]:
## Inspect the full, unfiltered set of candidate kmers
kmers

Unnamed: 0,seqname,start,end,name,score,strand
0,chrX,73820650,73820700,XIST_0_0,.,-
1,chrX,73820651,73820701,XIST_0_1,.,-
2,chrX,73820652,73820702,XIST_0_2,.,-
3,chrX,73820653,73820703,XIST_0_3,.,-
4,chrX,73820654,73820704,XIST_0_4,.,-
...,...,...,...,...,...,...
18981,chrX,73852699,73852749,XIST_5_11318,.,-
18982,chrX,73852700,73852750,XIST_5_11319,.,-
18983,chrX,73852701,73852751,XIST_5_11320,.,-
18984,chrX,73852702,73852752,XIST_5_11321,.,-


In [14]:
## Write the complete, unfiltered kmer table to the output directory
all_kmers_csv = out_dir + 'kmers_all.csv'
kmers.to_csv(all_kmers_csv)

In [15]:
## Convert the kmer data frame to a pybedtools BedTool so the interval filters below can be applied
kmers_bed = pybedtools.BedTool.from_dataframe(kmers)

In [16]:
## Remove repeat/low complexity regions (skipped when no repeat file is configured)
if repeats is not None:
    ## Read the repeat annotation (rmsk)
    repeats_bed = pybedtools.BedTool(repeats)
    ## Remove any kmers overlapping any masked regions (v=True keeps only kmers without an overlap)
    kmers_bed = kmers_bed.intersect(repeats_bed, v=True)

In [17]:
## Same for gaps in the genome assembly (skipped when no gap file is configured)
## NOTE(review): `gaps` is currently assigned the same path as `repeats`, so this step adds nothing until that path is corrected
if gaps is not None:
    ## Read the gap annotation
    gaps_bed = pybedtools.BedTool(gaps)
    ## Remove any kmers overlapping any gap region (v=True keeps only kmers without an overlap)
    kmers_bed = kmers_bed.intersect(gaps_bed, v=True)

In [18]:
## Convert the filtered intervals back to a data frame with the standard six bed column names
kmers = kmers_bed.to_dataframe(names=['seqname', 'start', 'end', 'name', 'score', 'strand'])

In [19]:
## Inspect the kmers remaining after the repeat and gap filters
kmers

Unnamed: 0,seqname,start,end,name,score,strand
0,chrX,73820650,73820700,XIST_0_0,.,-
1,chrX,73820651,73820701,XIST_0_1,.,-
2,chrX,73820652,73820702,XIST_0_2,.,-
3,chrX,73820653,73820703,XIST_0_3,.,-
4,chrX,73820654,73820704,XIST_0_4,.,-
...,...,...,...,...,...,...
16853,chrX,73852699,73852749,XIST_5_11318,.,-
16854,chrX,73852700,73852750,XIST_5_11319,.,-
16855,chrX,73852701,73852751,XIST_5_11320,.,-
16856,chrX,73852702,73852752,XIST_5_11321,.,-


In [20]:
## Same with SNPs (skipped when no SNP database is configured)
if snp_db is not None:
    ## Read the database of known SNPs / small indels
    snps_bed = pybedtools.BedTool(snp_db)
    ## Remove any kmers overlapping a known variant (v=True keeps only kmers without an overlap)
    kmers_bed = kmers_bed.intersect(snps_bed, v=True)

In [21]:
## Rebuild the data frame from the SNP-filtered intervals so `kmers` matches `kmers_bed` row for row
kmers = kmers_bed.to_dataframe(names=['seqname', 'start', 'end', 'name', 'score', 'strand'])

In [22]:
## Inspect the kmers remaining after the SNP filter
kmers

Unnamed: 0,seqname,start,end,name,score,strand
0,chrX,73820650,73820700,XIST_0_0,.,-
1,chrX,73820651,73820701,XIST_0_1,.,-
2,chrX,73820652,73820702,XIST_0_2,.,-
3,chrX,73820653,73820703,XIST_0_3,.,-
4,chrX,73820654,73820704,XIST_0_4,.,-
...,...,...,...,...,...,...
15711,chrX,73852696,73852746,XIST_5_11315,.,-
15712,chrX,73852697,73852747,XIST_5_11316,.,-
15713,chrX,73852698,73852748,XIST_5_11317,.,-
15714,chrX,73852699,73852749,XIST_5_11318,.,-


In [23]:
## Get DNA for the transcript: extract the genome sequence of every remaining kmer
## (s=True asks for strand-aware extraction - presumably minus-strand kmers come back reverse-complemented; confirm)
kmers_seq = kmers_bed.sequence(fi=fasta, s=True) 

## We can read in the sequences and simultaneously monitor GC content and count the longest homopolymer stretch
## (the merged table further down shows the columns this returns, incl. separate LHS/RHS GC content)
kmers_seq_stats = get_sequence_stats(kmers_seq.seqfn, probe_length, split_nt)

In [24]:
## Attach the per-sequence statistics to the kmer table, matching rows on the index
## (assumes get_sequence_stats returns rows in the same order as kmers_bed - TODO confirm)
kmers = kmers.merge(kmers_seq_stats, left_index=True, right_index=True)

In [25]:
## Inspect the kmers together with their sequences and sequence statistics
kmers

Unnamed: 0,seqname,start,end,name,score,strand,kmer_coord,transcript_seq,probe_seq,GC_content_full,longest_homopolymer,GC_content_LHS,GC_content_RHS
0,chrX,73820650,73820700,XIST_0_0,.,-,chrX:73820650-73820700(-),GGGGAGTTGGTTGCTATTGTAAAATAAAATATACTGTTTTGAAAAC...,CAAAGTTTTCAAAACAGTATATTTTATTTTACAATAGCAACCAACT...,0.30,4,0.20,0.40
1,chrX,73820651,73820701,XIST_0_1,.,-,chrX:73820651-73820701(-),TGGGGAGTTGGTTGCTATTGTAAAATAAAATATACTGTTTTGAAAA...,AAAGTTTTCAAAACAGTATATTTTATTTTACAATAGCAACCAACTC...,0.28,4,0.16,0.40
2,chrX,73820652,73820702,XIST_0_2,.,-,chrX:73820652-73820702(-),CTGGGGAGTTGGTTGCTATTGTAAAATAAAATATACTGTTTTGAAA...,AAGTTTTCAAAACAGTATATTTTATTTTACAATAGCAACCAACTCC...,0.30,4,0.16,0.44
3,chrX,73820653,73820703,XIST_0_3,.,-,chrX:73820653-73820703(-),ACTGGGGAGTTGGTTGCTATTGTAAAATAAAATATACTGTTTTGAA...,AGTTTTCAAAACAGTATATTTTATTTTACAATAGCAACCAACTCCC...,0.30,4,0.16,0.44
4,chrX,73820654,73820704,XIST_0_4,.,-,chrX:73820654-73820704(-),AACTGGGGAGTTGGTTGCTATTGTAAAATAAAATATACTGTTTTGA...,GTTTTCAAAACAGTATATTTTATTTTACAATAGCAACCAACTCCCC...,0.30,4,0.16,0.44
...,...,...,...,...,...,...,...,...,...,...,...,...,...
15711,chrX,73852696,73852746,XIST_5_11315,.,-,chrX:73852696-73852746(-),TTCTTAAAGCGCTGCAATTCGCTGCTGCAGCCATATTTCTTACTCT...,CGAGAGAGTAAGAAATATGGCTGCAGCAGCGAATTGCAGCGCTTTA...,0.46,3,0.44,0.48
15712,chrX,73852697,73852747,XIST_5_11316,.,-,chrX:73852697-73852747(-),GTTCTTAAAGCGCTGCAATTCGCTGCTGCAGCCATATTTCTTACTC...,GAGAGAGTAAGAAATATGGCTGCAGCAGCGAATTGCAGCGCTTTAA...,0.46,3,0.44,0.48
15713,chrX,73852698,73852748,XIST_5_11317,.,-,chrX:73852698-73852748(-),AGTTCTTAAAGCGCTGCAATTCGCTGCTGCAGCCATATTTCTTACT...,AGAGAGTAAGAAATATGGCTGCAGCAGCGAATTGCAGCGCTTTAAG...,0.44,3,0.44,0.44
15714,chrX,73852699,73852749,XIST_5_11318,.,-,chrX:73852699-73852749(-),CAGTTCTTAAAGCGCTGCAATTCGCTGCTGCAGCCATATTTCTTAC...,GAGAGTAAGAAATATGGCTGCAGCAGCGAATTGCAGCGCTTTAAGA...,0.46,3,0.44,0.48


In [26]:
## Enforce position-specific nucleotide requirements (skipped when none are configured)
if required_nts is not None:
    ## Flag each kmer, then report how many pass / fail
    required_nt_flags = check_for_required_nts(kmers, required_nts)
    kmers['has_required_nts'] = required_nt_flags
    print(kmers['has_required_nts'].value_counts())
    ## Keep only the kmers carrying every required nucleotide
    passes_required_nts = kmers['has_required_nts'].eq(True)
    kmers = kmers.loc[passes_required_nts].reset_index(drop=True)

has_required_nts
False    11525
True      4191
Name: count, dtype: int64


In [27]:
## Save the candidates as they stand before the GC-content filter is applied
unfiltered_candidates_csv = out_dir + 'kmers_candidates_unfiltered.csv'
kmers.to_csv(unfiltered_candidates_csv)

In [28]:
## Inspect the kmers that passed the required-nucleotide check
kmers

Unnamed: 0,seqname,start,end,name,score,strand,kmer_coord,transcript_seq,probe_seq,GC_content_full,longest_homopolymer,GC_content_LHS,GC_content_RHS,has_required_nts
0,chrX,73820650,73820700,XIST_0_0,.,-,chrX:73820650-73820700(-),GGGGAGTTGGTTGCTATTGTAAAATAAAATATACTGTTTTGAAAAC...,CAAAGTTTTCAAAACAGTATATTTTATTTTACAATAGCAACCAACT...,0.30,4,0.20,0.40,True
1,chrX,73820652,73820702,XIST_0_2,.,-,chrX:73820652-73820702(-),CTGGGGAGTTGGTTGCTATTGTAAAATAAAATATACTGTTTTGAAA...,AAGTTTTCAAAACAGTATATTTTATTTTACAATAGCAACCAACTCC...,0.30,4,0.16,0.44,True
2,chrX,73820653,73820703,XIST_0_3,.,-,chrX:73820653-73820703(-),ACTGGGGAGTTGGTTGCTATTGTAAAATAAAATATACTGTTTTGAA...,AGTTTTCAAAACAGTATATTTTATTTTACAATAGCAACCAACTCCC...,0.30,4,0.16,0.44,True
3,chrX,73820654,73820704,XIST_0_4,.,-,chrX:73820654-73820704(-),AACTGGGGAGTTGGTTGCTATTGTAAAATAAAATATACTGTTTTGA...,GTTTTCAAAACAGTATATTTTATTTTACAATAGCAACCAACTCCCC...,0.30,4,0.16,0.44,True
4,chrX,73820655,73820705,XIST_0_5,.,-,chrX:73820655-73820705(-),AAACTGGGGAGTTGGTTGCTATTGTAAAATAAAATATACTGTTTTG...,TTTTCAAAACAGTATATTTTATTTTACAATAGCAACCAACTCCCCA...,0.28,4,0.12,0.44,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4186,chrX,73852664,73852714,XIST_5_11283,.,-,chrX:73852664-73852714(-),ATATTTCTTACTCTCTCGGGGCTGGAAGCTTCCTGACTGAAGATCT...,AGAGAGATCTTCAGTCAGGAAGCTTCCAGCCCCGAGAGAGTAAGAA...,0.46,4,0.44,0.48,True
4187,chrX,73852680,73852730,XIST_5_11299,.,-,chrX:73852680-73852730(-),ATTCGCTGCTGCAGCCATATTTCTTACTCTCTCGGGGCTGGAAGCT...,AGGAAGCTTCCAGCCCCGAGAGAGTAAGAAATATGGCTGCAGCAGC...,0.52,4,0.60,0.44,True
4188,chrX,73852687,73852737,XIST_5_11306,.,-,chrX:73852687-73852737(-),CGCTGCAATTCGCTGCTGCAGCCATATTTCTTACTCTCTCGGGGCT...,TTCCAGCCCCGAGAGAGTAAGAAATATGGCTGCAGCAGCGAATTGC...,0.54,4,0.48,0.60,True
4189,chrX,73852689,73852739,XIST_5_11308,.,-,chrX:73852689-73852739(-),AGCGCTGCAATTCGCTGCTGCAGCCATATTTCTTACTCTCTCGGGG...,CCAGCCCCGAGAGAGTAAGAAATATGGCTGCAGCAGCGAATTGCAG...,0.56,4,0.48,0.64,True


In [29]:
## Filter for GC content using the configured [min_GC, max_GC] window
## (per the parameter notes, applied to both LHS and RHS for split probes - exact rule lives in filter_by_GC_content)
kmers = filter_by_GC_content(kmers, min_GC, max_GC)

In [30]:
## Inspect the kmers remaining after the GC-content filter
kmers

Unnamed: 0,seqname,start,end,name,score,strand,kmer_coord,transcript_seq,probe_seq,GC_content_full,longest_homopolymer,GC_content_LHS,GC_content_RHS,has_required_nts
0,chrX,73821073,73821123,XIST_0_423,.,-,chrX:73821073-73821123(-),TGGACTCAGTAACACCCCTTTCTTCAGCTGGGGATGGGGAATGGAT...,AATAATCCATTCCCCATCCCCAGCTGAAGAAAGGGGTGTTACTGAG...,0.48,4,0.48,0.48,True
1,chrX,73821085,73821135,XIST_0_435,.,-,chrX:73821085-73821135(-),TGTGGTATCAGCTGGACTCAGTAACACCCCTTTCTTCAGCTGGGGA...,CCCATCCCCAGCTGAAGAAAGGGGTGTTACTGAGTCCAGCTGATAC...,0.54,4,0.60,0.48,True
2,chrX,73821087,73821137,XIST_0_437,.,-,chrX:73821087-73821137(-),TGTGTGGTATCAGCTGGACTCAGTAACACCCCTTTCTTCAGCTGGG...,CATCCCCAGCTGAAGAAAGGGGTGTTACTGAGTCCAGCTGATACCA...,0.52,4,0.56,0.48,True
3,chrX,73821088,73821138,XIST_0_438,.,-,chrX:73821088-73821138(-),ATGTGTGGTATCAGCTGGACTCAGTAACACCCCTTTCTTCAGCTGG...,ATCCCCAGCTGAAGAAAGGGGTGTTACTGAGTCCAGCTGATACCAC...,0.50,4,0.52,0.48,True
4,chrX,73821091,73821141,XIST_0_441,.,-,chrX:73821091-73821141(-),TCAATGTGTGGTATCAGCTGGACTCAGTAACACCCCTTTCTTCAGC...,CCCAGCTGAAGAAAGGGGTGTTACTGAGTCCAGCTGATACCACACA...,0.50,4,0.52,0.48,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
529,chrX,73852654,73852704,XIST_5_11273,.,-,chrX:73852654-73852704(-),CTCTCTCGGGGCTGGAAGCTTCCTGACTGAAGATCTCTCTGCACTT...,CCCCAAGTGCAGAGAGATCTTCAGTCAGGAAGCTTCCAGCCCCGAG...,0.58,4,0.52,0.64,True
530,chrX,73852663,73852713,XIST_5_11282,.,-,chrX:73852663-73852713(-),TATTTCTTACTCTCTCGGGGCTGGAAGCTTCCTGACTGAAGATCTC...,CAGAGAGATCTTCAGTCAGGAAGCTTCCAGCCCCGAGAGAGTAAGA...,0.48,4,0.48,0.48,True
531,chrX,73852687,73852737,XIST_5_11306,.,-,chrX:73852687-73852737(-),CGCTGCAATTCGCTGCTGCAGCCATATTTCTTACTCTCTCGGGGCT...,TTCCAGCCCCGAGAGAGTAAGAAATATGGCTGCAGCAGCGAATTGC...,0.54,4,0.48,0.60,True
532,chrX,73852689,73852739,XIST_5_11308,.,-,chrX:73852689-73852739(-),AGCGCTGCAATTCGCTGCTGCAGCCATATTTCTTACTCTCTCGGGG...,CCAGCCCCGAGAGAGTAAGAAATATGGCTGCAGCAGCGAATTGCAG...,0.56,4,0.48,0.64,True


In [31]:
## Candidate kmers after the sequence-based filters (same table as displayed in the previous cell)
kmers

Unnamed: 0,seqname,start,end,name,score,strand,kmer_coord,transcript_seq,probe_seq,GC_content_full,longest_homopolymer,GC_content_LHS,GC_content_RHS,has_required_nts
0,chrX,73821073,73821123,XIST_0_423,.,-,chrX:73821073-73821123(-),TGGACTCAGTAACACCCCTTTCTTCAGCTGGGGATGGGGAATGGAT...,AATAATCCATTCCCCATCCCCAGCTGAAGAAAGGGGTGTTACTGAG...,0.48,4,0.48,0.48,True
1,chrX,73821085,73821135,XIST_0_435,.,-,chrX:73821085-73821135(-),TGTGGTATCAGCTGGACTCAGTAACACCCCTTTCTTCAGCTGGGGA...,CCCATCCCCAGCTGAAGAAAGGGGTGTTACTGAGTCCAGCTGATAC...,0.54,4,0.60,0.48,True
2,chrX,73821087,73821137,XIST_0_437,.,-,chrX:73821087-73821137(-),TGTGTGGTATCAGCTGGACTCAGTAACACCCCTTTCTTCAGCTGGG...,CATCCCCAGCTGAAGAAAGGGGTGTTACTGAGTCCAGCTGATACCA...,0.52,4,0.56,0.48,True
3,chrX,73821088,73821138,XIST_0_438,.,-,chrX:73821088-73821138(-),ATGTGTGGTATCAGCTGGACTCAGTAACACCCCTTTCTTCAGCTGG...,ATCCCCAGCTGAAGAAAGGGGTGTTACTGAGTCCAGCTGATACCAC...,0.50,4,0.52,0.48,True
4,chrX,73821091,73821141,XIST_0_441,.,-,chrX:73821091-73821141(-),TCAATGTGTGGTATCAGCTGGACTCAGTAACACCCCTTTCTTCAGC...,CCCAGCTGAAGAAAGGGGTGTTACTGAGTCCAGCTGATACCACACA...,0.50,4,0.52,0.48,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
529,chrX,73852654,73852704,XIST_5_11273,.,-,chrX:73852654-73852704(-),CTCTCTCGGGGCTGGAAGCTTCCTGACTGAAGATCTCTCTGCACTT...,CCCCAAGTGCAGAGAGATCTTCAGTCAGGAAGCTTCCAGCCCCGAG...,0.58,4,0.52,0.64,True
530,chrX,73852663,73852713,XIST_5_11282,.,-,chrX:73852663-73852713(-),TATTTCTTACTCTCTCGGGGCTGGAAGCTTCCTGACTGAAGATCTC...,CAGAGAGATCTTCAGTCAGGAAGCTTCCAGCCCCGAGAGAGTAAGA...,0.48,4,0.48,0.48,True
531,chrX,73852687,73852737,XIST_5_11306,.,-,chrX:73852687-73852737(-),CGCTGCAATTCGCTGCTGCAGCCATATTTCTTACTCTCTCGGGGCT...,TTCCAGCCCCGAGAGAGTAAGAAATATGGCTGCAGCAGCGAATTGC...,0.54,4,0.48,0.60,True
532,chrX,73852689,73852739,XIST_5_11308,.,-,chrX:73852689-73852739(-),AGCGCTGCAATTCGCTGCTGCAGCCATATTTCTTACTCTCTCGGGG...,CCAGCCCCGAGAGAGTAAGAAATATGGCTGCAGCAGCGAATTGCAG...,0.56,4,0.48,0.64,True


In [32]:
## Save the candidates that passed the sequence-based filters
filtered_candidates_csv = out_dir + 'kmers_candidates_filtered.csv'
kmers.to_csv(filtered_candidates_csv)

In [33]:
## The transcript sequence (not the probe sequence) is exported, since that is what gets blasted
## against the human transcriptome
candidate_fasta = out_dir + 'kmers_candidates_filtered_transcript_seqs.fa'
write_fasta(kmers['name'], kmers['transcript_seq'], candidate_fasta)

In [34]:
## Run BLAST - you can also adjust the parameters here, though the outfmt needs to be consistent for downstream functions to work
command = [
    os.path.join(blast_exec_path, 'blastn'),  # works with or without a trailing slash, and with '' (blastn on PATH)
    '-query', (out_dir + 'kmers_candidates_filtered_transcript_seqs.fa'), ## Sequences of targeted regions
    '-db', blast_db,                     # BLAST database
    '-out', (out_dir + 'kmers_candidates_filtered_blast_output.txt'),  # Output file
    '-outfmt', '6',     # Tabular output; the column names used when reading the file depend on this
    '-strand', 'plus',  # Only search the plus strand of the query
    '-evalue', '1e-3',  # More lenient e-value to detect potential off-targets
    '-dust', 'no'       # Turn off low-complexity filter
]

# Run the command, capturing stdout/stderr as text
result = subprocess.run(command, capture_output=True, text=True)

# Fail loudly if BLAST did not succeed: otherwise the next cell would read a missing,
# truncated or stale output file and the off-target filter would silently be wrong
if result.returncode != 0:
    raise RuntimeError(
        'blastn failed with exit code ' + str(result.returncode) + ':\n' + result.stderr
    )

In [35]:
## Load the tabular BLAST hits. The file has no header row, so the column names are supplied here, in file order:
##   name (query id), sseqid (subject id), pident (% identical matches), length (alignment length),
##   mismatch (number of mismatches), gapopen (number of gap openings),
##   qstart / qend (alignment start / end in query), sstart / send (alignment start / end in subject),
##   evalue (expect value), bitscore (bit score)
blast_columns = [
    "name", "sseqid", "pident", "length", "mismatch", "gapopen",
    "qstart", "qend", "sstart", "send", "evalue", "bitscore",
]

blast_output_file = out_dir + 'kmers_candidates_filtered_blast_output.txt'
blast_res = pd.read_csv(blast_output_file, sep='\t', header=None, names=blast_columns)

In [36]:
## Derive the subject gene ID: the part of the subject sequence ID before the first '::'
## (assumes the BLAST database names its sequences as '<gene>::...' - confirm against how the db was built)
subject_id_parts = blast_res['sseqid'].str.split('::')
blast_res['sgeneid'] = subject_id_parts.str.get(0)

In [37]:
## Number of BLAST hits per subject gene - anything other than the target gene is a potential off-target
blast_res['sgeneid'].value_counts()

sgeneid
XIST    641
Name: count, dtype: int64

In [38]:
## Summary of the e-value distribution across all reported hits
blast_res['evalue'].describe()

count    6.410000e+02
mean     1.067253e-12
std      1.594191e-11
min      1.300000e-18
25%      1.300000e-18
50%      1.300000e-18
75%      1.300000e-18
max      2.840000e-10
Name: evalue, dtype: float64

In [39]:
## Names of the query kmers with at least one hit to a gene other than the target
hits_other_gene = blast_res['sgeneid'] != gene_ID
offtargets = blast_res.loc[hits_other_gene, 'name'].unique()

In [40]:
## Inspect the kmers flagged as having off-target hits (an empty array means none were found)
offtargets

array([], dtype=object)

In [41]:
## Drop every kmer that was flagged as having an off-target hit
is_offtarget = kmers['name'].isin(offtargets)
kmers = kmers[~is_offtarget].reset_index(drop=True)

In [42]:
## Inspect the kmers remaining after the off-target filter
kmers 

Unnamed: 0,seqname,start,end,name,score,strand,kmer_coord,transcript_seq,probe_seq,GC_content_full,longest_homopolymer,GC_content_LHS,GC_content_RHS,has_required_nts
0,chrX,73821073,73821123,XIST_0_423,.,-,chrX:73821073-73821123(-),TGGACTCAGTAACACCCCTTTCTTCAGCTGGGGATGGGGAATGGAT...,AATAATCCATTCCCCATCCCCAGCTGAAGAAAGGGGTGTTACTGAG...,0.48,4,0.48,0.48,True
1,chrX,73821085,73821135,XIST_0_435,.,-,chrX:73821085-73821135(-),TGTGGTATCAGCTGGACTCAGTAACACCCCTTTCTTCAGCTGGGGA...,CCCATCCCCAGCTGAAGAAAGGGGTGTTACTGAGTCCAGCTGATAC...,0.54,4,0.60,0.48,True
2,chrX,73821087,73821137,XIST_0_437,.,-,chrX:73821087-73821137(-),TGTGTGGTATCAGCTGGACTCAGTAACACCCCTTTCTTCAGCTGGG...,CATCCCCAGCTGAAGAAAGGGGTGTTACTGAGTCCAGCTGATACCA...,0.52,4,0.56,0.48,True
3,chrX,73821088,73821138,XIST_0_438,.,-,chrX:73821088-73821138(-),ATGTGTGGTATCAGCTGGACTCAGTAACACCCCTTTCTTCAGCTGG...,ATCCCCAGCTGAAGAAAGGGGTGTTACTGAGTCCAGCTGATACCAC...,0.50,4,0.52,0.48,True
4,chrX,73821091,73821141,XIST_0_441,.,-,chrX:73821091-73821141(-),TCAATGTGTGGTATCAGCTGGACTCAGTAACACCCCTTTCTTCAGC...,CCCAGCTGAAGAAAGGGGTGTTACTGAGTCCAGCTGATACCACACA...,0.50,4,0.52,0.48,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
529,chrX,73852654,73852704,XIST_5_11273,.,-,chrX:73852654-73852704(-),CTCTCTCGGGGCTGGAAGCTTCCTGACTGAAGATCTCTCTGCACTT...,CCCCAAGTGCAGAGAGATCTTCAGTCAGGAAGCTTCCAGCCCCGAG...,0.58,4,0.52,0.64,True
530,chrX,73852663,73852713,XIST_5_11282,.,-,chrX:73852663-73852713(-),TATTTCTTACTCTCTCGGGGCTGGAAGCTTCCTGACTGAAGATCTC...,CAGAGAGATCTTCAGTCAGGAAGCTTCCAGCCCCGAGAGAGTAAGA...,0.48,4,0.48,0.48,True
531,chrX,73852687,73852737,XIST_5_11306,.,-,chrX:73852687-73852737(-),CGCTGCAATTCGCTGCTGCAGCCATATTTCTTACTCTCTCGGGGCT...,TTCCAGCCCCGAGAGAGTAAGAAATATGGCTGCAGCAGCGAATTGC...,0.54,4,0.48,0.60,True
532,chrX,73852689,73852739,XIST_5_11308,.,-,chrX:73852689-73852739(-),AGCGCTGCAATTCGCTGCTGCAGCCATATTTCTTACTCTCTCGGGG...,CCAGCCCCGAGAGAGTAAGAAATATGGCTGCAGCAGCGAATTGCAG...,0.56,4,0.48,0.64,True


In [43]:
## Rank the candidates so that kmers with the shortest homopolymer stretch come first;
## the index is renumbered so that row 0 is the current best candidate
kmers = kmers.sort_values(by='longest_homopolymer', ignore_index=True)

In [44]:
## Inspect the candidates ranked by longest homopolymer stretch
kmers

Unnamed: 0,seqname,start,end,name,score,strand,kmer_coord,transcript_seq,probe_seq,GC_content_full,longest_homopolymer,GC_content_LHS,GC_content_RHS,has_required_nts
0,chrX,73850556,73850606,XIST_5_9175,.,-,chrX:73850556-73850606(-),TGGCAAGGACCAGAATGGATCACAGATGATCGTTGGCCAACAGGTG...,CTGCCACCTGTTGGCCAACGATCATCTGTGATCCATTCTGGTCCTT...,0.54,2,0.56,0.52,True
1,chrX,73850818,73850868,XIST_5_9437,.,-,chrX:73850818-73850868(-),AGTGCTGCCTCTTGCAGTGCTGGATATCTGGCTGTGTGGTCTGAAC...,GGAGGTTCAGACCACACAGCCAGATATCCAGCACTGCAAGAGGCAG...,0.56,2,0.56,0.56,True
2,chrX,73826123,73826173,XIST_0_5473,.,-,chrX:73826123-73826173(-),CAGATGTGCCAGACTTCTGAGAAGCACCTGCCAGCAACAGCTTCCT...,AAGAAGGAAGCTGTTGCTGGCAGGTGCTTCTCAGAAGTCTGGCACA...,0.52,2,0.52,0.52,True
3,chrX,73826126,73826176,XIST_0_5476,.,-,chrX:73826126-73826176(-),ACACAGATGTGCCAGACTTCTGAGAAGCACCTGCCAGCAACAGCTT...,AAGGAAGCTGTTGCTGGCAGGTGCTTCTCAGAAGTCTGGCACATCT...,0.52,2,0.56,0.48,True
4,chrX,73826127,73826177,XIST_0_5477,.,-,chrX:73826127-73826177(-),TACACAGATGTGCCAGACTTCTGAGAAGCACCTGCCAGCAACAGCT...,AGGAAGCTGTTGCTGGCAGGTGCTTCTCAGAAGTCTGGCACATCTG...,0.52,2,0.56,0.48,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
529,chrX,73850757,73850807,XIST_5_9376,.,-,chrX:73850757-73850807(-),TGTATTGGTGCCTCACCTAAGGCTAAGTATACCTCCccccccaccc...,tggggggtggggggGGAGGTATACTTAGCCTTAGGTGAGGCACCAA...,0.58,6,0.68,0.48,True
530,chrX,73850758,73850808,XIST_5_9377,.,-,chrX:73850758-73850808(-),CTGTATTGGTGCCTCACCTAAGGCTAAGTATACCTCCccccccacc...,ggggggtggggggGGAGGTATACTTAGCCTTAGGTGAGGCACCAAT...,0.60,6,0.68,0.52,True
531,chrX,73850763,73850813,XIST_5_9382,.,-,chrX:73850763-73850813(-),TTCCTCTGTATTGGTGCCTCACCTAAGGCTAAGTATACCTCCcccc...,gtggggggGGAGGTATACTTAGCCTTAGGTGAGGCACCAATACAGA...,0.54,6,0.60,0.48,True
532,chrX,73850764,73850814,XIST_5_9383,.,-,chrX:73850764-73850814(-),ATTCCTCTGTATTGGTGCCTCACCTAAGGCTAAGTATACCTCCccc...,tggggggGGAGGTATACTTAGCCTTAGGTGAGGCACCAATACAGAG...,0.52,6,0.56,0.48,True


In [None]:
## We have a lot of probes here - increasing the offset to 1000 bp to space them out
## NOTE(review): this overrides the probe_offset = 100 set in the parameter cell, and this cell carries no
## execution count in the saved notebook - make sure it is run before the selection loop on a fresh kernel,
## or move the value into the parameter cell
probe_offset = 1000

In [45]:
## Greedy probe selection: repeatedly take the top-ranked remaining candidate, then discard the candidates that
## lie within probe_offset of it (as decided by remove_overlapping_probes) before picking the next one.
n_probes = 3  ## Number of probes to select
selected_probes_list = []
df = kmers.copy()
for _ in range(n_probes):
    ## Stop cleanly when the candidates run out instead of failing on an empty frame
    if df.empty:
        print('Only ' + str(len(selected_probes_list)) + ' of ' + str(n_probes)
              + ' requested probes could be selected with probe_offset=' + str(probe_offset))
        break
    ## The frame is sorted and re-indexed, so the first row is the best remaining candidate
    selected_probes_list.append(df.iloc[0:1,:])
    ## Going for a high offset as we have a large span:
    df = remove_overlapping_probes(df,
                                   df['name'].iloc[0],
                                   offset=probe_offset).reset_index(drop=True).copy()

  start = int(probe_df['start'].iloc[i])
  end = int(probe_df['end'].iloc[i])


In [46]:
## Stack the individually selected single-row frames into one table with a fresh 0..n-1 index
selected_probes_df = pd.concat(selected_probes_list, ignore_index=True)

In [47]:
## Inspect the selected probes
selected_probes_df

Unnamed: 0,seqname,start,end,name,score,strand,kmer_coord,transcript_seq,probe_seq,GC_content_full,longest_homopolymer,GC_content_LHS,GC_content_RHS,has_required_nts
0,chrX,73850556,73850606,XIST_5_9175,.,-,chrX:73850556-73850606(-),TGGCAAGGACCAGAATGGATCACAGATGATCGTTGGCCAACAGGTG...,CTGCCACCTGTTGGCCAACGATCATCTGTGATCCATTCTGGTCCTT...,0.54,2,0.56,0.52,True
1,chrX,73826123,73826173,XIST_0_5473,.,-,chrX:73826123-73826173(-),CAGATGTGCCAGACTTCTGAGAAGCACCTGCCAGCAACAGCTTCCT...,AAGAAGGAAGCTGTTGCTGGCAGGTGCTTCTCAGAAGTCTGGCACA...,0.52,2,0.52,0.52,True
2,chrX,73851704,73851754,XIST_5_10323,.,-,chrX:73851704-73851754(-),AGATGGATGATAGCAGGTCAGGCAGAGGAAGTCATGTGCATTGCAT...,GCTCATGCAATGCACATGACTTCCTCTGCCTGACCTGCTATCATCC...,0.5,2,0.48,0.52,True


In [48]:
## Print the targeted transcript sequence of each selected probe, one per line
for transcript_seq in selected_probes_df['transcript_seq'].tolist():
    print(transcript_seq)

TGGCAAGGACCAGAATGGATCACAGATGATCGTTGGCCAACAGGTGGCAG
CAGATGTGCCAGACTTCTGAGAAGCACCTGCCAGCAACAGCTTCCTTCTT
AGATGGATGATAGCAGGTCAGGCAGAGGAAGTCATGTGCATTGCATGAGC


In [49]:
## Make split probes: LHS is the probe up to (not including) index split_nt, RHS runs from split_nt to probe_length.
## Skipped when split_nt is None (documented as "no split needed"): slicing with None would otherwise
## copy the full probe sequence into both columns.
if split_nt is not None:
    probe_seqs = selected_probes_df['probe_seq']
    selected_probes_df['probe_seq_LHS'] = [seq[0:split_nt] for seq in probe_seqs]
    selected_probes_df['probe_seq_RHS'] = [seq[split_nt: probe_length] for seq in probe_seqs]

In [50]:
## Also add gene_ID for completeness (the same value is written to every row)
selected_probes_df['gene_ID'] = gene_ID

In [51]:
## Write the final table of selected probes to the output directory
selected_probes_csv = out_dir + 'kmers_selected_probes.csv'
selected_probes_df.to_csv(selected_probes_csv)