In [1]:
import os 
import sys
import subprocess
import pandas as pd
import numpy as np
import pybedtools
from Bio.Seq import Seq
from Bio.SeqUtils import gc_fraction
from Bio.Blast import NCBIXML
from Bio.Blast.Applications import NcbiblastnCommandline
from gene2probe import *


Due to the on going maintenance burden of keeping command line application
wrappers up to date, we have decided to deprecate and eventually remove these
modules.

We instead now recommend building your command line and invoking it directly
with the subprocess module.


In [2]:
## specify inputs
gene_ID = 'XIST'
feature = 'exon' ## Whether to consider only exons / introns or full gene
gtf = '/nfs/team205/is10/resources/ucsc_genomes/hg38.ncbiRefSeq.gtf' ## Gene annotation in gtf file
## We recommend using RefSeq as this is manually curated and more likely to contain an isoform that is present across most cell types
## Alternatively, one can filter based on RNA-seq data for a cell type/tissue of interest
fasta = '/nfs/team205/is10/resources/ucsc_genomes/hg38.fa' ## Genome in fasta file
snp_db = '/nfs/team205/is10/resources/hg38_dbSNP/hg38_snp151Common_chrX.txt' ## Database of known SNPs
masked_regions_bed = '/nfs/team205/is10/resources/hg38_repeatmasker/hg38_rmsk.bed' ## bed file with coordinates to be excluded
blast_db = '/nfs/team205/is10/projects/thytrans/data/gene2probe/blastdb/hg38_ncbiRefSeq_exons_db' ## Database of all human transcripts to blast against
blast_exec_path = '/nfs/team205/is10/miniconda/envs/gene2probe/bin/'

In [3]:
## additional parameters
probe_length = 50
split_nt = 25 ## Index of nucleotide to split the probe at (start of RHS) - leave None if splitting probe is not needed
min_GC = 0.44
max_GC = 0.72
required_nts = {24: 'T'} ## Dictionary of index (0-based) for required nts - by default, 25th nucleotide must be a T

In [4]:
## Create output directory
out_dir = '/nfs/team205/is10/projects/thytrans/data/gene2probe/' + gene_ID + '/'
os.makedirs(out_dir, exist_ok=True)

In [5]:
## Read gtf file
gene_anno = read_gtf(gtf)
## Extract regions corresponding to gene of interest (symbol: gene_name, Ensembl ID: gene_ID)
gene_ids = gene_anno['attribute'].apply(extract_feature_from_gtf, feature='gene_name')
roi = gene_anno.iloc[np.where(gene_ids == gene_ID)[0],:]

In [6]:
roi

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attribute
325814,chrX,ncbiRefSeq.2022-10-28,transcript,73820651,73852753,.,-,.,"gene_id ""XIST""; transcript_id ""NR_001564.2""; ..."
325815,chrX,ncbiRefSeq.2022-10-28,exon,73820651,73827984,.,-,.,"gene_id ""XIST""; transcript_id ""NR_001564.2""; e..."
325816,chrX,ncbiRefSeq.2022-10-28,exon,73829068,73829231,.,-,.,"gene_id ""XIST""; transcript_id ""NR_001564.2""; e..."
325817,chrX,ncbiRefSeq.2022-10-28,exon,73831066,73831274,.,-,.,"gene_id ""XIST""; transcript_id ""NR_001564.2""; e..."
325818,chrX,ncbiRefSeq.2022-10-28,exon,73833238,73833374,.,-,.,"gene_id ""XIST""; transcript_id ""NR_001564.2""; e..."
325819,chrX,ncbiRefSeq.2022-10-28,exon,73837440,73837503,.,-,.,"gene_id ""XIST""; transcript_id ""NR_001564.2""; e..."
325820,chrX,ncbiRefSeq.2022-10-28,exon,73841382,73852753,.,-,.,"gene_id ""XIST""; transcript_id ""NR_001564.2""; e..."


In [7]:
## Subset to feature type of interest
roi = roi[roi['feature']==feature]

In [8]:
roi

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attribute
325815,chrX,ncbiRefSeq.2022-10-28,exon,73820651,73827984,.,-,.,"gene_id ""XIST""; transcript_id ""NR_001564.2""; e..."
325816,chrX,ncbiRefSeq.2022-10-28,exon,73829068,73829231,.,-,.,"gene_id ""XIST""; transcript_id ""NR_001564.2""; e..."
325817,chrX,ncbiRefSeq.2022-10-28,exon,73831066,73831274,.,-,.,"gene_id ""XIST""; transcript_id ""NR_001564.2""; e..."
325818,chrX,ncbiRefSeq.2022-10-28,exon,73833238,73833374,.,-,.,"gene_id ""XIST""; transcript_id ""NR_001564.2""; e..."
325819,chrX,ncbiRefSeq.2022-10-28,exon,73837440,73837503,.,-,.,"gene_id ""XIST""; transcript_id ""NR_001564.2""; e..."
325820,chrX,ncbiRefSeq.2022-10-28,exon,73841382,73852753,.,-,.,"gene_id ""XIST""; transcript_id ""NR_001564.2""; e..."


In [9]:
## Convert to bed-style df
roi_bed = gtf_2_bed(roi, name_pref = (gene_ID + '_'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bed['start'] = df_bed['start'].astype(int) - 1


In [10]:
roi_bed

Unnamed: 0,seqname,start,end,name,score,strand
0,chrX,73820650,73827984,XIST_0,.,-
1,chrX,73829067,73829231,XIST_1,.,-
2,chrX,73831065,73831274,XIST_2,.,-
3,chrX,73833237,73833374,XIST_3,.,-
4,chrX,73837439,73837503,XIST_4,.,-
5,chrX,73841381,73852753,XIST_5,.,-


In [11]:
roi_kmers = generate_50mers(roi_bed)

In [12]:
roi_kmers

Unnamed: 0,seqname,start,end,name,score,strand
0,chrX,73820650,73820700,XIST_0_0,.,-
1,chrX,73820651,73820701,XIST_0_1,.,-
2,chrX,73820652,73820702,XIST_0_2,.,-
3,chrX,73820653,73820703,XIST_0_3,.,-
4,chrX,73820654,73820704,XIST_0_4,.,-
...,...,...,...,...,...,...
18981,chrX,73852699,73852749,XIST_5_11318,.,-
18982,chrX,73852700,73852750,XIST_5_11319,.,-
18983,chrX,73852701,73852751,XIST_5_11320,.,-
18984,chrX,73852702,73852752,XIST_5_11321,.,-


In [13]:
## Export unfiltered
roi_kmers.to_csv((out_dir + 'kmers_all.csv'))

In [14]:
## Convert to pybedtools
roi_kmers_bed = pybedtools.BedTool.from_dataframe(roi_kmers)

In [15]:
## Remove masked regions
if masked_regions_bed is not None:
    ## Read rmsk
    rmsk = pybedtools.BedTool(masked_regions_bed)
    ## Remove any kmers overlapping any masked regions
    roi_kmers_bed = roi_kmers_bed.intersect(rmsk, v=True)

In [16]:
roi_kmers_filtered = roi_kmers_bed.to_dataframe(names=['seqname', 'start', 'end', 'name', 'score', 'strand'])

In [17]:
roi_kmers_filtered

Unnamed: 0,seqname,start,end,name,score,strand
0,chrX,73820650,73820700,XIST_0_0,.,-
1,chrX,73820651,73820701,XIST_0_1,.,-
2,chrX,73820652,73820702,XIST_0_2,.,-
3,chrX,73820653,73820703,XIST_0_3,.,-
4,chrX,73820654,73820704,XIST_0_4,.,-
...,...,...,...,...,...,...
16853,chrX,73852699,73852749,XIST_5_11318,.,-
16854,chrX,73852700,73852750,XIST_5_11319,.,-
16855,chrX,73852701,73852751,XIST_5_11320,.,-
16856,chrX,73852702,73852752,XIST_5_11321,.,-


In [18]:
## Same with SNPs
if snp_db is not None:
    ## Read SNP file (by chromosome as this can be quite large)
    snps = pd.read_csv(snp_db, sep='\t')  
    snps_bed = pybedtools.BedTool.from_dataframe(snps[['#chrom', 'chromStart', 'chromEnd']])
    ## Remove any kmers overlapping any masked regions
    roi_kmers_bed = roi_kmers_bed.intersect(snps_bed, v=True)

In [19]:
roi_kmers_filtered = roi_kmers_bed.to_dataframe(names=['seqname', 'start', 'end', 'name', 'score', 'strand'])

In [20]:
roi_kmers_filtered

Unnamed: 0,seqname,start,end,name,score,strand
0,chrX,73820650,73820700,XIST_0_0,.,-
1,chrX,73820651,73820701,XIST_0_1,.,-
2,chrX,73820652,73820702,XIST_0_2,.,-
3,chrX,73820653,73820703,XIST_0_3,.,-
4,chrX,73820654,73820704,XIST_0_4,.,-
...,...,...,...,...,...,...
15728,chrX,73852699,73852749,XIST_5_11318,.,-
15729,chrX,73852700,73852750,XIST_5_11319,.,-
15730,chrX,73852701,73852751,XIST_5_11320,.,-
15731,chrX,73852702,73852752,XIST_5_11321,.,-


In [21]:
## Get DNA for transcript and probe (reverse complement)
## Simultaneously we can monitor GC content and count the longest homopolymer stretch
roi_kmers_seq = roi_kmers_bed.sequence(fi=fasta, s=True) 

kmer_coord = []
transcript_seq = []
probe_seq = []
gc_content = []
gc_content_lhs = []
gc_content_rhs = []
longest_homopol = []
with open(roi_kmers_seq.seqfn) as f:
    for line in f:
        if line.startswith('>'):
            kmer_coord.append(line.strip().replace('>', ''))
        else:
            ## Get sequence
            kmer_seq = Seq(line.strip())
            transcript_seq.append(str(kmer_seq))
            ## Reverse complement - this is the sequence of the probe
            probe_seq.append(str(kmer_seq.reverse_complement()))
            ## Estimate GC content
            gc_content.append(gc_fraction(kmer_seq))
            gc_content_rhs.append(gc_fraction(kmer_seq[0:split_nt])) ## RHS of the probe is LHS of the target - GC content is the same
            gc_content_lhs.append(gc_fraction(kmer_seq[split_nt: probe_length])) ## RHS of the probe is LHS of the target - GC content is the same
            ## Estimate longest homopolymer
            longest_homopol.append(get_longest_homopolymer(kmer_seq))

## Incorporate sequences and stats into the dataframe
roi_kmers_filtered['kmer_coord'] = kmer_coord
roi_kmers_filtered['transcript_seq'] = transcript_seq
roi_kmers_filtered['probe_seq'] = probe_seq
roi_kmers_filtered['GC_content_full'] = gc_content
roi_kmers_filtered['GC_content_LHS'] = gc_content_lhs
roi_kmers_filtered['GC_content_RHS'] = gc_content_rhs
roi_kmers_filtered['longest_homopolymer'] = longest_homopol

In [22]:
roi_kmers_filtered

Unnamed: 0,seqname,start,end,name,score,strand,kmer_coord,transcript_seq,probe_seq,GC_content_full,GC_content_LHS,GC_content_RHS,longest_homopolymer
0,chrX,73820650,73820700,XIST_0_0,.,-,chrX:73820650-73820700(-),GGGGAGTTGGTTGCTATTGTAAAATAAAATATACTGTTTTGAAAAC...,CAAAGTTTTCAAAACAGTATATTTTATTTTACAATAGCAACCAACT...,0.30,0.20,0.40,4
1,chrX,73820651,73820701,XIST_0_1,.,-,chrX:73820651-73820701(-),TGGGGAGTTGGTTGCTATTGTAAAATAAAATATACTGTTTTGAAAA...,AAAGTTTTCAAAACAGTATATTTTATTTTACAATAGCAACCAACTC...,0.28,0.16,0.40,4
2,chrX,73820652,73820702,XIST_0_2,.,-,chrX:73820652-73820702(-),CTGGGGAGTTGGTTGCTATTGTAAAATAAAATATACTGTTTTGAAA...,AAGTTTTCAAAACAGTATATTTTATTTTACAATAGCAACCAACTCC...,0.30,0.16,0.44,4
3,chrX,73820653,73820703,XIST_0_3,.,-,chrX:73820653-73820703(-),ACTGGGGAGTTGGTTGCTATTGTAAAATAAAATATACTGTTTTGAA...,AGTTTTCAAAACAGTATATTTTATTTTACAATAGCAACCAACTCCC...,0.30,0.16,0.44,4
4,chrX,73820654,73820704,XIST_0_4,.,-,chrX:73820654-73820704(-),AACTGGGGAGTTGGTTGCTATTGTAAAATAAAATATACTGTTTTGA...,GTTTTCAAAACAGTATATTTTATTTTACAATAGCAACCAACTCCCC...,0.30,0.16,0.44,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
15728,chrX,73852699,73852749,XIST_5_11318,.,-,chrX:73852699-73852749(-),CAGTTCTTAAAGCGCTGCAATTCGCTGCTGCAGCCATATTTCTTAC...,GAGAGTAAGAAATATGGCTGCAGCAGCGAATTGCAGCGCTTTAAGA...,0.46,0.44,0.48,3
15729,chrX,73852700,73852750,XIST_5_11319,.,-,chrX:73852700-73852750(-),TCAGTTCTTAAAGCGCTGCAATTCGCTGCTGCAGCCATATTTCTTA...,AGAGTAAGAAATATGGCTGCAGCAGCGAATTGCAGCGCTTTAAGAA...,0.44,0.44,0.44,3
15730,chrX,73852701,73852751,XIST_5_11320,.,-,chrX:73852701-73852751(-),TTCAGTTCTTAAAGCGCTGCAATTCGCTGCTGCAGCCATATTTCTT...,GAGTAAGAAATATGGCTGCAGCAGCGAATTGCAGCGCTTTAAGAAC...,0.44,0.48,0.40,3
15731,chrX,73852702,73852752,XIST_5_11321,.,-,chrX:73852702-73852752(-),CTTCAGTTCTTAAAGCGCTGCAATTCGCTGCTGCAGCCATATTTCT...,AGTAAGAAATATGGCTGCAGCAGCGAATTGCAGCGCTTTAAGAACT...,0.44,0.48,0.40,3


In [23]:
## Mark probes matching nucleotide requirement:
has_required_nt = [True for i in range(roi_kmers_filtered.shape[0])]
for nt_idx in required_nts.keys():
    for i in range(roi_kmers_filtered.shape[0]):
        if roi_kmers_filtered['probe_seq'][i][nt_idx] != required_nts[nt_idx]:
            has_required_nt[i] = False

In [24]:
roi_kmers_filtered['has_required_nts'] = has_required_nt

In [25]:
roi_kmers_filtered['has_required_nts'].value_counts()

has_required_nts
False    11549
True      4184
Name: count, dtype: int64

In [26]:
## Export kmers before filtering
roi_kmers_filtered.to_csv((out_dir + 'kmers_candidates_unfiltered.csv'))

In [27]:
roi_kmers_filtered

Unnamed: 0,seqname,start,end,name,score,strand,kmer_coord,transcript_seq,probe_seq,GC_content_full,GC_content_LHS,GC_content_RHS,longest_homopolymer,has_required_nts
0,chrX,73820650,73820700,XIST_0_0,.,-,chrX:73820650-73820700(-),GGGGAGTTGGTTGCTATTGTAAAATAAAATATACTGTTTTGAAAAC...,CAAAGTTTTCAAAACAGTATATTTTATTTTACAATAGCAACCAACT...,0.30,0.20,0.40,4,True
1,chrX,73820651,73820701,XIST_0_1,.,-,chrX:73820651-73820701(-),TGGGGAGTTGGTTGCTATTGTAAAATAAAATATACTGTTTTGAAAA...,AAAGTTTTCAAAACAGTATATTTTATTTTACAATAGCAACCAACTC...,0.28,0.16,0.40,4,False
2,chrX,73820652,73820702,XIST_0_2,.,-,chrX:73820652-73820702(-),CTGGGGAGTTGGTTGCTATTGTAAAATAAAATATACTGTTTTGAAA...,AAGTTTTCAAAACAGTATATTTTATTTTACAATAGCAACCAACTCC...,0.30,0.16,0.44,4,True
3,chrX,73820653,73820703,XIST_0_3,.,-,chrX:73820653-73820703(-),ACTGGGGAGTTGGTTGCTATTGTAAAATAAAATATACTGTTTTGAA...,AGTTTTCAAAACAGTATATTTTATTTTACAATAGCAACCAACTCCC...,0.30,0.16,0.44,4,True
4,chrX,73820654,73820704,XIST_0_4,.,-,chrX:73820654-73820704(-),AACTGGGGAGTTGGTTGCTATTGTAAAATAAAATATACTGTTTTGA...,GTTTTCAAAACAGTATATTTTATTTTACAATAGCAACCAACTCCCC...,0.30,0.16,0.44,4,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15728,chrX,73852699,73852749,XIST_5_11318,.,-,chrX:73852699-73852749(-),CAGTTCTTAAAGCGCTGCAATTCGCTGCTGCAGCCATATTTCTTAC...,GAGAGTAAGAAATATGGCTGCAGCAGCGAATTGCAGCGCTTTAAGA...,0.46,0.44,0.48,3,False
15729,chrX,73852700,73852750,XIST_5_11319,.,-,chrX:73852700-73852750(-),TCAGTTCTTAAAGCGCTGCAATTCGCTGCTGCAGCCATATTTCTTA...,AGAGTAAGAAATATGGCTGCAGCAGCGAATTGCAGCGCTTTAAGAA...,0.44,0.44,0.44,3,False
15730,chrX,73852701,73852751,XIST_5_11320,.,-,chrX:73852701-73852751(-),TTCAGTTCTTAAAGCGCTGCAATTCGCTGCTGCAGCCATATTTCTT...,GAGTAAGAAATATGGCTGCAGCAGCGAATTGCAGCGCTTTAAGAAC...,0.44,0.48,0.40,3,False
15731,chrX,73852702,73852752,XIST_5_11321,.,-,chrX:73852702-73852752(-),CTTCAGTTCTTAAAGCGCTGCAATTCGCTGCTGCAGCCATATTTCT...,AGTAAGAAATATGGCTGCAGCAGCGAATTGCAGCGCTTTAAGAACT...,0.44,0.48,0.40,3,False


In [28]:
## Filter for GC content
roi_kmers_filtered = roi_kmers_filtered[(roi_kmers_filtered['GC_content_full'] > min_GC) & 
(roi_kmers_filtered['GC_content_LHS'] > min_GC) & (roi_kmers_filtered['GC_content_RHS'] > min_GC) & 
(roi_kmers_filtered['GC_content_full'] < max_GC) & 
(roi_kmers_filtered['GC_content_LHS'] < max_GC) & (roi_kmers_filtered['GC_content_RHS'] < max_GC)].reset_index(drop=True)

In [29]:
roi_kmers_filtered

Unnamed: 0,seqname,start,end,name,score,strand,kmer_coord,transcript_seq,probe_seq,GC_content_full,GC_content_LHS,GC_content_RHS,longest_homopolymer,has_required_nts
0,chrX,73821069,73821119,XIST_0_419,.,-,chrX:73821069-73821119(-),CTCAGTAACACCCCTTTCTTCAGCTGGGGATGGGGAATGGATTATT...,TTCCAATAATCCATTCCCCATCCCCAGCTGAAGAAAGGGGTGTTAC...,0.48,0.48,0.48,4,False
1,chrX,73821070,73821120,XIST_0_420,.,-,chrX:73821070-73821120(-),ACTCAGTAACACCCCTTTCTTCAGCTGGGGATGGGGAATGGATTAT...,TCCAATAATCCATTCCCCATCCCCAGCTGAAGAAAGGGGTGTTACT...,0.48,0.48,0.48,4,False
2,chrX,73821071,73821121,XIST_0_421,.,-,chrX:73821071-73821121(-),GACTCAGTAACACCCCTTTCTTCAGCTGGGGATGGGGAATGGATTA...,CCAATAATCCATTCCCCATCCCCAGCTGAAGAAAGGGGTGTTACTG...,0.50,0.52,0.48,4,False
3,chrX,73821072,73821122,XIST_0_422,.,-,chrX:73821072-73821122(-),GGACTCAGTAACACCCCTTTCTTCAGCTGGGGATGGGGAATGGATT...,CAATAATCCATTCCCCATCCCCAGCTGAAGAAAGGGGTGTTACTGA...,0.50,0.52,0.48,4,False
4,chrX,73821073,73821123,XIST_0_423,.,-,chrX:73821073-73821123(-),TGGACTCAGTAACACCCCTTTCTTCAGCTGGGGATGGGGAATGGAT...,AATAATCCATTCCCCATCCCCAGCTGAAGAAAGGGGTGTTACTGAG...,0.48,0.48,0.48,4,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2491,chrX,73852691,73852741,XIST_5_11310,.,-,chrX:73852691-73852741(-),AAAGCGCTGCAATTCGCTGCTGCAGCCATATTTCTTACTCTCTCGG...,AGCCCCGAGAGAGTAAGAAATATGGCTGCAGCAGCGAATTGCAGCG...,0.52,0.48,0.56,4,False
2492,chrX,73852692,73852742,XIST_5_11311,.,-,chrX:73852692-73852742(-),TAAAGCGCTGCAATTCGCTGCTGCAGCCATATTTCTTACTCTCTCG...,GCCCCGAGAGAGTAAGAAATATGGCTGCAGCAGCGAATTGCAGCGC...,0.52,0.52,0.52,4,False
2493,chrX,73852693,73852743,XIST_5_11312,.,-,chrX:73852693-73852743(-),TTAAAGCGCTGCAATTCGCTGCTGCAGCCATATTTCTTACTCTCTC...,CCCCGAGAGAGTAAGAAATATGGCTGCAGCAGCGAATTGCAGCGCT...,0.50,0.48,0.52,4,True
2494,chrX,73852694,73852744,XIST_5_11313,.,-,chrX:73852694-73852744(-),CTTAAAGCGCTGCAATTCGCTGCTGCAGCCATATTTCTTACTCTCT...,CCCGAGAGAGTAAGAAATATGGCTGCAGCAGCGAATTGCAGCGCTT...,0.50,0.48,0.52,3,False


In [30]:
## Filter for required nucleotides
roi_kmers_filtered = roi_kmers_filtered[roi_kmers_filtered['has_required_nts']==True].reset_index(drop=True)

In [31]:
## Candidate kmers
roi_kmers_filtered

Unnamed: 0,seqname,start,end,name,score,strand,kmer_coord,transcript_seq,probe_seq,GC_content_full,GC_content_LHS,GC_content_RHS,longest_homopolymer,has_required_nts
0,chrX,73821073,73821123,XIST_0_423,.,-,chrX:73821073-73821123(-),TGGACTCAGTAACACCCCTTTCTTCAGCTGGGGATGGGGAATGGAT...,AATAATCCATTCCCCATCCCCAGCTGAAGAAAGGGGTGTTACTGAG...,0.48,0.48,0.48,4,True
1,chrX,73821085,73821135,XIST_0_435,.,-,chrX:73821085-73821135(-),TGTGGTATCAGCTGGACTCAGTAACACCCCTTTCTTCAGCTGGGGA...,CCCATCCCCAGCTGAAGAAAGGGGTGTTACTGAGTCCAGCTGATAC...,0.54,0.60,0.48,4,True
2,chrX,73821087,73821137,XIST_0_437,.,-,chrX:73821087-73821137(-),TGTGTGGTATCAGCTGGACTCAGTAACACCCCTTTCTTCAGCTGGG...,CATCCCCAGCTGAAGAAAGGGGTGTTACTGAGTCCAGCTGATACCA...,0.52,0.56,0.48,4,True
3,chrX,73821088,73821138,XIST_0_438,.,-,chrX:73821088-73821138(-),ATGTGTGGTATCAGCTGGACTCAGTAACACCCCTTTCTTCAGCTGG...,ATCCCCAGCTGAAGAAAGGGGTGTTACTGAGTCCAGCTGATACCAC...,0.50,0.52,0.48,4,True
4,chrX,73821091,73821141,XIST_0_441,.,-,chrX:73821091-73821141(-),TCAATGTGTGGTATCAGCTGGACTCAGTAACACCCCTTTCTTCAGC...,CCCAGCTGAAGAAAGGGGTGTTACTGAGTCCAGCTGATACCACACA...,0.50,0.52,0.48,4,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
521,chrX,73852654,73852704,XIST_5_11273,.,-,chrX:73852654-73852704(-),CTCTCTCGGGGCTGGAAGCTTCCTGACTGAAGATCTCTCTGCACTT...,CCCCAAGTGCAGAGAGATCTTCAGTCAGGAAGCTTCCAGCCCCGAG...,0.58,0.52,0.64,4,True
522,chrX,73852663,73852713,XIST_5_11282,.,-,chrX:73852663-73852713(-),TATTTCTTACTCTCTCGGGGCTGGAAGCTTCCTGACTGAAGATCTC...,CAGAGAGATCTTCAGTCAGGAAGCTTCCAGCCCCGAGAGAGTAAGA...,0.48,0.48,0.48,4,True
523,chrX,73852687,73852737,XIST_5_11306,.,-,chrX:73852687-73852737(-),CGCTGCAATTCGCTGCTGCAGCCATATTTCTTACTCTCTCGGGGCT...,TTCCAGCCCCGAGAGAGTAAGAAATATGGCTGCAGCAGCGAATTGC...,0.54,0.48,0.60,4,True
524,chrX,73852689,73852739,XIST_5_11308,.,-,chrX:73852689-73852739(-),AGCGCTGCAATTCGCTGCTGCAGCCATATTTCTTACTCTCTCGGGG...,CCAGCCCCGAGAGAGTAAGAAATATGGCTGCAGCAGCGAATTGCAG...,0.56,0.48,0.64,4,True


In [32]:
roi_kmers_filtered.to_csv((out_dir + 'kmers_candidates_filtered.csv'))

In [33]:
## We are exporting the transcript sequence as that's the one that has to be blasted against the human transcriptome
write_fasta(roi_kmers_filtered['name'], roi_kmers_filtered['transcript_seq'], (out_dir + 'kmers_candidates_filtered_transcript_seqs.fa'))

In [34]:
command = [
    (blast_exec_path + 'blastn'),
    '-query', (out_dir + 'kmers_candidates_filtered_transcript_seqs.fa'), ## Sequences of targeted regions
    '-db', blast_db,                     # BLAST database
    '-out', (out_dir + 'kmers_candidates_filtered_blast_output.txt'),  # Output file
    '-outfmt', '6',
    '-strand', 'plus',
    '-evalue', '1e-3',  # More lenient e-value to detect potential off-targets
    '-dust', 'no'       # Turn off low-complexity filter
]

# Run the command
result = subprocess.run(command, capture_output=True, text=True)

In [35]:
## Read in the blast output
blast_columns = [
    "name",   # Query Seq-id
    "sseqid",   # Subject Seq-id
    "pident",   # Percentage of identical matches
    "length",   # Alignment length
    "mismatch", # Number of mismatches
    "gapopen",  # Number of gap openings
    "qstart",   # Start of alignment in query
    "qend",     # End of alignment in query
    "sstart",   # Start of alignment in subject
    "send",     # End of alignment in subject
    "evalue",   # Expect value
    "bitscore"  # Bit score
]

blast_res = pd.read_csv(
    (out_dir + 'kmers_candidates_filtered_blast_output.txt'),
    sep='\t', 
    header=None, 
    names=blast_columns
    )

In [36]:
## Extract the gene ID
blast_res['sgeneid'] = blast_res['sseqid'].str.split('::').str[0]

In [37]:
blast_res['sgeneid'].value_counts()

sgeneid
XIST    611
Name: count, dtype: int64

In [38]:
blast_res['evalue'].describe()

count    6.110000e+02
mean     1.116134e-12
std      1.632751e-11
min      1.300000e-18
25%      1.300000e-18
50%      1.300000e-18
75%      1.300000e-18
max      2.840000e-10
Name: evalue, dtype: float64

In [39]:
offtargets = blast_res['name'][blast_res['sgeneid']!=gene_ID].unique()

In [40]:
offtargets

array([], dtype=object)

In [41]:
## Remove off targets
roi_kmers_filtered = roi_kmers_filtered[roi_kmers_filtered['name'].isin(offtargets)==False].reset_index(drop=True)

In [42]:
roi_kmers_filtered 

Unnamed: 0,seqname,start,end,name,score,strand,kmer_coord,transcript_seq,probe_seq,GC_content_full,GC_content_LHS,GC_content_RHS,longest_homopolymer,has_required_nts
0,chrX,73821073,73821123,XIST_0_423,.,-,chrX:73821073-73821123(-),TGGACTCAGTAACACCCCTTTCTTCAGCTGGGGATGGGGAATGGAT...,AATAATCCATTCCCCATCCCCAGCTGAAGAAAGGGGTGTTACTGAG...,0.48,0.48,0.48,4,True
1,chrX,73821085,73821135,XIST_0_435,.,-,chrX:73821085-73821135(-),TGTGGTATCAGCTGGACTCAGTAACACCCCTTTCTTCAGCTGGGGA...,CCCATCCCCAGCTGAAGAAAGGGGTGTTACTGAGTCCAGCTGATAC...,0.54,0.60,0.48,4,True
2,chrX,73821087,73821137,XIST_0_437,.,-,chrX:73821087-73821137(-),TGTGTGGTATCAGCTGGACTCAGTAACACCCCTTTCTTCAGCTGGG...,CATCCCCAGCTGAAGAAAGGGGTGTTACTGAGTCCAGCTGATACCA...,0.52,0.56,0.48,4,True
3,chrX,73821088,73821138,XIST_0_438,.,-,chrX:73821088-73821138(-),ATGTGTGGTATCAGCTGGACTCAGTAACACCCCTTTCTTCAGCTGG...,ATCCCCAGCTGAAGAAAGGGGTGTTACTGAGTCCAGCTGATACCAC...,0.50,0.52,0.48,4,True
4,chrX,73821091,73821141,XIST_0_441,.,-,chrX:73821091-73821141(-),TCAATGTGTGGTATCAGCTGGACTCAGTAACACCCCTTTCTTCAGC...,CCCAGCTGAAGAAAGGGGTGTTACTGAGTCCAGCTGATACCACACA...,0.50,0.52,0.48,4,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
521,chrX,73852654,73852704,XIST_5_11273,.,-,chrX:73852654-73852704(-),CTCTCTCGGGGCTGGAAGCTTCCTGACTGAAGATCTCTCTGCACTT...,CCCCAAGTGCAGAGAGATCTTCAGTCAGGAAGCTTCCAGCCCCGAG...,0.58,0.52,0.64,4,True
522,chrX,73852663,73852713,XIST_5_11282,.,-,chrX:73852663-73852713(-),TATTTCTTACTCTCTCGGGGCTGGAAGCTTCCTGACTGAAGATCTC...,CAGAGAGATCTTCAGTCAGGAAGCTTCCAGCCCCGAGAGAGTAAGA...,0.48,0.48,0.48,4,True
523,chrX,73852687,73852737,XIST_5_11306,.,-,chrX:73852687-73852737(-),CGCTGCAATTCGCTGCTGCAGCCATATTTCTTACTCTCTCGGGGCT...,TTCCAGCCCCGAGAGAGTAAGAAATATGGCTGCAGCAGCGAATTGC...,0.54,0.48,0.60,4,True
524,chrX,73852689,73852739,XIST_5_11308,.,-,chrX:73852689-73852739(-),AGCGCTGCAATTCGCTGCTGCAGCCATATTTCTTACTCTCTCGGGG...,CCAGCCCCGAGAGAGTAAGAAATATGGCTGCAGCAGCGAATTGCAG...,0.56,0.48,0.64,4,True


In [43]:
## Sort in increasing homopolymer length
roi_kmers_filtered = roi_kmers_filtered.sort_values('longest_homopolymer', ascending=True).reset_index(drop=True)

In [44]:
roi_kmers_filtered

Unnamed: 0,seqname,start,end,name,score,strand,kmer_coord,transcript_seq,probe_seq,GC_content_full,GC_content_LHS,GC_content_RHS,longest_homopolymer,has_required_nts
0,chrX,73827199,73827249,XIST_0_6549,.,-,chrX:73827199-73827249(-),GTGCAGAACGTCGGCTATCAGAGCAAGCATTGTGGAGCGGTTCCTT...,GCATAAGGAACCGCTCCACAATGCTTGCTCTGATAGCCGACGTTCT...,0.54,0.52,0.56,2,True
1,chrX,73851704,73851754,XIST_5_10323,.,-,chrX:73851704-73851754(-),AGATGGATGATAGCAGGTCAGGCAGAGGAAGTCATGTGCATTGCAT...,GCTCATGCAATGCACATGACTTCCTCTGCCTGACCTGCTATCATCC...,0.50,0.48,0.52,2,True
2,chrX,73850595,73850645,XIST_5_9214,.,-,chrX:73850595-73850645(-),TGCTCCAGGCCTGCTTGGTGTGGACATGGTGGTGAGCCGTGGCAAG...,GGTCCTTGCCACGGCTCACCACCATGTCCACACCAAGCAGGCCTGG...,0.64,0.64,0.64,2,True
3,chrX,73850597,73850647,XIST_5_9216,.,-,chrX:73850597-73850647(-),AGTGCTCCAGGCCTGCTTGGTGTGGACATGGTGGTGAGCCGTGGCA...,TCCTTGCCACGGCTCACCACCATGTCCACACCAAGCAGGCCTGGAG...,0.62,0.60,0.64,2,True
4,chrX,73826123,73826173,XIST_0_5473,.,-,chrX:73826123-73826173(-),CAGATGTGCCAGACTTCTGAGAAGCACCTGCCAGCAACAGCTTCCT...,AAGAAGGAAGCTGTTGCTGGCAGGTGCTTCTCAGAAGTCTGGCACA...,0.52,0.52,0.52,2,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
521,chrX,73852326,73852376,XIST_5_10945,.,-,chrX:73852326-73852376(-),GGGGCTGCGGATACCTGGTTTTATTATTTTTTCTTTGCCCAACGGG...,CGGCCCCGTTGGGCAAAGAAAAAATAATAAAACCAGGTATCCGCAG...,0.52,0.52,0.52,6,True
522,chrX,73850763,73850813,XIST_5_9382,.,-,chrX:73850763-73850813(-),TTCCTCTGTATTGGTGCCTCACCTAAGGCTAAGTATACCTCCcccc...,gtggggggGGAGGTATACTTAGCCTTAGGTGAGGCACCAATACAGA...,0.54,0.60,0.48,6,True
523,chrX,73850758,73850808,XIST_5_9377,.,-,chrX:73850758-73850808(-),CTGTATTGGTGCCTCACCTAAGGCTAAGTATACCTCCccccccacc...,ggggggtggggggGGAGGTATACTTAGCCTTAGGTGAGGCACCAAT...,0.60,0.68,0.52,6,True
524,chrX,73850757,73850807,XIST_5_9376,.,-,chrX:73850757-73850807(-),TGTATTGGTGCCTCACCTAAGGCTAAGTATACCTCCccccccaccc...,tggggggtggggggGGAGGTATACTTAGCCTTAGGTGAGGCACCAA...,0.58,0.68,0.48,6,True


In [45]:
selected_probes_list = []
df = roi_kmers_filtered.copy()
i = 0
while i < 3:
    selected_probes_list.append(df.iloc[0:1,:])
    ## Going for a high offset as we have a large span:
    df = remove_overlapping_probes(df,
                                   df['name'][0], 
                                   offset=1000).reset_index(drop=True).copy()
    i +=1

  start = int(probe_df['start'].iloc[i])
  end = int(probe_df['end'].iloc[i])


In [46]:
selected_probes_df = pd.concat(selected_probes_list, axis=0).reset_index(drop=True)

In [47]:
selected_probes_df

Unnamed: 0,seqname,start,end,name,score,strand,kmer_coord,transcript_seq,probe_seq,GC_content_full,GC_content_LHS,GC_content_RHS,longest_homopolymer,has_required_nts
0,chrX,73827199,73827249,XIST_0_6549,.,-,chrX:73827199-73827249(-),GTGCAGAACGTCGGCTATCAGAGCAAGCATTGTGGAGCGGTTCCTT...,GCATAAGGAACCGCTCCACAATGCTTGCTCTGATAGCCGACGTTCT...,0.54,0.52,0.56,2,True
1,chrX,73851704,73851754,XIST_5_10323,.,-,chrX:73851704-73851754(-),AGATGGATGATAGCAGGTCAGGCAGAGGAAGTCATGTGCATTGCAT...,GCTCATGCAATGCACATGACTTCCTCTGCCTGACCTGCTATCATCC...,0.5,0.48,0.52,2,True
2,chrX,73850595,73850645,XIST_5_9214,.,-,chrX:73850595-73850645(-),TGCTCCAGGCCTGCTTGGTGTGGACATGGTGGTGAGCCGTGGCAAG...,GGTCCTTGCCACGGCTCACCACCATGTCCACACCAAGCAGGCCTGG...,0.64,0.64,0.64,2,True


In [48]:
for seq in selected_probes_df['transcript_seq']:
    print(seq)

GTGCAGAACGTCGGCTATCAGAGCAAGCATTGTGGAGCGGTTCCTTATGC
AGATGGATGATAGCAGGTCAGGCAGAGGAAGTCATGTGCATTGCATGAGC
TGCTCCAGGCCTGCTTGGTGTGGACATGGTGGTGAGCCGTGGCAAGGACC


In [49]:
## Make split probes
selected_probes_df['probe_seq_LHS'] = [seq[0:split_nt] for seq in selected_probes_df['probe_seq']]
selected_probes_df['probe_seq_RHS'] = [seq[split_nt: probe_length] for seq in selected_probes_df['probe_seq']]

In [50]:
## Also add gene_ID for completeness
selected_probes_df['gene_ID'] = gene_ID

In [51]:
## Export selected probes as dataframe:
selected_probes_df.to_csv((out_dir + 'kmers_selected_probes.csv'))