In [1]:
import os 
import sys
import subprocess
import pandas as pd
import numpy as np
import pybedtools
from Bio.Seq import Seq
from Bio.SeqUtils import gc_fraction
from gene2probe import *

In the previous tutorials, we have seen how to generate probes starting from a [gene](https://github.com/Teichlab/gene2probe/blob/main/notebooks/002_make_exon_probes_XIST.ipynb) or a [specific region](https://github.com/Teichlab/gene2probe/blob/main/notebooks/003_make_probes_for_custom_region_CD45RA.ipynb).

But what if we are interested in designing probes against a sequence that is not included in the human genome (e.g., GFP)?
We can achieve that by directly providing the sequence as input and following all subsequent steps.

We note that we don't recommend this approach for the detection of bacterial or viral reads for metagenomics analyses, as there are many additional considerations that have to be taken into account to ensure that a probe is specifically detecting a certain species.

### 1. Specify parameters

In [2]:
## Identifier of the target: used below to name the output directory, passed to the region/off-target helpers and stored in the `gene_ID` column of the final table
gene_ID = 'EGFP'
## Custom target sequence (lowercase nucleotides) provided directly as input
## NOTE(review): no later cell visible in this notebook reads `sequence` - confirm where it is meant to enter the pipeline
sequence = 'gtgcccatcctggtcgagctggacggcgacgtaaacggccacaagttcagcgtgtccggcgagggcgagggcgatgccacctacggcaagctgaccctgaagttcatctgcaccaccggcaagctgcccgtgccctggcccaccctcgtgaccaccctgacctacggcgtgcagtgcttcagccgctaccccgaccacatgaagcagcacgacttcttcaagtccgccatgcccgaaggctacgtccaggagcgcaccatcttcttcaaggacgacggcaactacaagacccgcgccgaggtgaagttcgagggcgacaccctggtgaaccgcatcgagctgaagggcatcgacttcaaggaggacggcaacatcctggggcacaagctggagtacaactacaacagccacaacgtctatatcatggccgacaagcagaagaacggcatcaaggtgaacttcaagatccgccacaacatcgaggacggcagcgtgcagctcgccgaccactaccagcagaacacccccatcggcgacggccccgtgctgctgcccgacaaccactacctgagcacccagtccgccctgagcaaagaccccaacgagaagcgcgatcacatggtcctgctggagttcgtgaccgccgccgggatcactctcggcatggacgagctgtacaagtaa'

## All results go to a per-target folder; create it up front (no error if it already exists)
out_dir = f'../sample_run/probeDesign_{gene_ID}/'
os.makedirs(out_dir, exist_ok=True)

Additionally, we need to provide the path to several resource files. Many of these files can be obtained from [UCSC table browser](https://genome.ucsc.edu/cgi-bin/hgTables).

We also need a blast database, such as the one we generated in the [previous tutorial](https://github.com/Teichlab/gene2probe/blob/main/notebooks/001_make_blast_database.ipynb).

In [3]:
## Required resources (most can be downloaded from the UCSC table browser)
gtf = '../hg38_resources/hg38.ncbiRefSeq.gtf' ## Gene annotation in gtf file
## We recommend using RefSeq as this is manually curated and more likely to contain an isoform that is present across most cell types
## Alternatively, one can filter based on RNA-seq data for a cell type/tissue of interest
fasta = '../hg38_resources/hg38.fa' ## Genome in fasta file
snp_db = '../hg38_resources/hg38_snp151Common.bed' ## Database of known SNPs and small indels
repeats = '../hg38_resources/hg38_rmsk.bed' ## bed file with repeats/low complexity regions to be excluded
## NOTE(review): `gaps` below is set to the same path as `repeats` - looks like a copy-paste slip; confirm and point it at the bed file of assembly gaps
gaps = '../hg38_resources/hg38_rmsk.bed' ## bed file with gaps in the genome assembly to be excluded
blast_db = '../hg38_resources/blastdb/hg38_ncbiRefSeq_transcripts_db' ## Database of all human transcripts to blast against

In [4]:
## Directory holding the BLAST binaries ('blastn' is appended to this value when BLAST is run below).
## Set the GENE2PROBE_BLAST_BIN environment variable to the bin/ folder of your own conda environment.
## Set it to an empty string if you started the jupyter session from within the gene2probe conda
## environment (the bare 'blastn' command is then used - presumably resolved through PATH).
## If the variable is unset, the original hard-coded location is used, so existing setups keep working.
blast_exec_path = os.path.join(
    os.environ.get('GENE2PROBE_BLAST_BIN', '/nfs/team205/is10/miniconda/envs/gene2probe_env/bin/'),
    '')  ## joining with '' guarantees a trailing path separator and leaves an empty string untouched

Finally, we need to provide a set of parameters related to our probe's length, at which nucleotide it's split (if at all), the acceptable range for GC content and any specific requirements for individual nucleotides.

Here we are following the [recommendations of 10x Genomics for custom probes for VisiumHD/VisiumFFPE/Flex](https://cdn.10xgenomics.com/image/upload/v1697739385/support-documents/CG000621_CustomProbeDesign_TechNote_RevC.pdf).

In [5]:
## Probe design parameters: what a probe should look like and how probes are selected

## Length of probe in nucleotides
probe_length = 50
## Index of nucleotide to split the probe at (start of RHS) - set to None if splitting probe is not needed
split_nt = 25
## Minimum and maximum GC content for probe (if split probe, applied to both LHS and RHS)
min_GC, max_GC = 0.44, 0.72
## Dictionary of index (0-based) for required nts - by default, 25th nucleotide must be a T - set to None if no requirements
required_nts = {24: 'T'}
## Minimum distance between probes - 10 bp is the recommended minimum by 10x, this can also be adjusted depending on how many probes pass other cutoffs
probe_offset = 100
## Number of probes to be designed.
n_desired_probes = 3
## Minimum number of mismatches (in at least LHS or RHS) - here we require in both to be more conservative
min_mismatches = 5

In [6]:
## Optional adapters to be added to the probes - leave as empty strings if you don't want to use them.
## The values below are an example for visiumHD.

## LHS probe: the prefix is added to its 5' end, the suffix to its 3' end
LHS_pref, LHS_suff = 'CCTTGGCACCCGAGAATTCCA', ''
## RHS probe: the prefix is added to its 5' end, the suffix to its 3' end
RHS_pref, RHS_suff = '/5Phos/', 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA'

### 2. Generate k-mers

Now we can start by reading the gene annotation and filtering for our gene of interest.

In [7]:
## Read the gene annotation (gtf file) into a table with one row per annotated feature (transcript, exon, ...)
gene_anno = read_gtf(gtf)

In [8]:
gene_anno

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attribute
0,chrM,ncbiRefSeq.2022-10-28,transcript,15956,16023,.,-,.,"gene_id ""TRNP""; transcript_id ""rna-TRNP""; gen..."
1,chrM,ncbiRefSeq.2022-10-28,exon,15956,16023,.,-,.,"gene_id ""TRNP""; transcript_id ""rna-TRNP""; exon..."
2,chrM,ncbiRefSeq.2022-10-28,transcript,15888,15953,.,+,.,"gene_id ""TRNT""; transcript_id ""rna-TRNT""; gen..."
3,chrM,ncbiRefSeq.2022-10-28,exon,15888,15953,.,+,.,"gene_id ""TRNT""; transcript_id ""rna-TRNT""; exon..."
4,chrM,ncbiRefSeq.2022-10-28,transcript,14747,15887,.,+,.,"gene_id ""CYTB""; transcript_id ""rna-CYTB""; gen..."
...,...,...,...,...,...,...,...,...,...
4886697,chr1,ncbiRefSeq.2022-10-28,exon,29321,29370,.,-,.,"gene_id ""WASH7P""; transcript_id ""NR_024540.1"";..."
4886698,chr1,ncbiRefSeq.2022-10-28,transcript,11874,14409,.,+,.,"gene_id ""DDX11L1""; transcript_id ""NR_046018.2""..."
4886699,chr1,ncbiRefSeq.2022-10-28,exon,11874,12227,.,+,.,"gene_id ""DDX11L1""; transcript_id ""NR_046018.2""..."
4886700,chr1,ncbiRefSeq.2022-10-28,exon,12613,12721,.,+,.,"gene_id ""DDX11L1""; transcript_id ""NR_046018.2""..."


In [9]:
## Extract regions corresponding to gene of interest (symbol: gene_name, Ensembl ID: gene_ID), subset to feature of interest and convert to bed style dataframe.
## The feature type is passed explicitly: the `mode` variable previously used here is never assigned in this notebook,
## so the cell failed with a NameError on a fresh kernel. 'exon' matches the exon-level regions shown in the output below.
roi_bed = get_region_of_interest(gene_anno, gene_ID, gene_id_type='gene_name', feature='exon')

In [10]:
roi_bed

Unnamed: 0,seqname,start,end,name,score,strand
0,chrX,73820650,73827984,XIST_0,.,-
1,chrX,73829067,73829231,XIST_1,.,-
2,chrX,73831065,73831274,XIST_2,.,-
3,chrX,73833237,73833374,XIST_3,.,-
4,chrX,73837439,73837503,XIST_4,.,-
5,chrX,73841381,73852753,XIST_5,.,-


After extracting the coordinates of interest and converting to a bed-like format, we can generate all possible kmers that fall within these regions.

In [11]:
## Tile the regions of interest into windows of `probe_length` nucleotides (candidate k-mers), again as a bed style dataframe
kmers = generate_kmers(roi_bed, k=probe_length)

In [12]:
kmers

Unnamed: 0,seqname,start,end,name,score,strand
0,chrX,73820650,73820700,XIST_0_0,.,-
1,chrX,73820651,73820701,XIST_0_1,.,-
2,chrX,73820652,73820702,XIST_0_2,.,-
3,chrX,73820653,73820703,XIST_0_3,.,-
4,chrX,73820654,73820704,XIST_0_4,.,-
...,...,...,...,...,...,...
18981,chrX,73852699,73852749,XIST_5_11318,.,-
18982,chrX,73852700,73852750,XIST_5_11319,.,-
18983,chrX,73852701,73852751,XIST_5_11320,.,-
18984,chrX,73852702,73852752,XIST_5_11321,.,-


In [13]:
## Save the complete, unfiltered k-mer table for reference
kmers.to_csv(f'{out_dir}kmers_all.csv')

### 3. Exclude annotated repeats/polymorphism

We can next exclude kmers overlapping undesired regions (repeats, low complexity regions, common polymorphisms, gaps in the assembly) from further consideration.

Ideally, we will exclude everything that overlaps a repeat or polymorphism, but if too few kmers remain, we might need to relax these requirements (e.g., to only exclude kmers overlapping SNPs around the ligation junction, if the probes are split).

In [14]:
## For example, we could have removed all kmers overlapping a repeat/low complexity region within 5 nts of the ligation junction
## (core=[20,30] brackets the split position, split_nt = 25).
## The result is only displayed here - it is not assigned back to `kmers`:
remove_overlaps(kmers, repeats, core=[20,30])

Unnamed: 0,seqname,start,end,name,score,strand
0,chrX,73820650,73820700,XIST_0_0,.,-
1,chrX,73820651,73820701,XIST_0_1,.,-
2,chrX,73820652,73820702,XIST_0_2,.,-
3,chrX,73820653,73820703,XIST_0_3,.,-
4,chrX,73820654,73820704,XIST_0_4,.,-
...,...,...,...,...,...,...
17166,chrX,73852699,73852749,XIST_5_11318,.,-
17167,chrX,73852700,73852750,XIST_5_11319,.,-
17168,chrX,73852701,73852751,XIST_5_11320,.,-
17169,chrX,73852702,73852752,XIST_5_11321,.,-


In [15]:
## In this case we have a lot of possible kmers, so we apply the strict version (no `core` restriction):
## remove kmers that overlap a repeat/low complexity region in any part of the probe
kmers = remove_overlaps(kmers, repeats)

In [16]:
kmers 

Unnamed: 0,seqname,start,end,name,score,strand
0,chrX,73820650,73820700,XIST_0_0,.,-
1,chrX,73820651,73820701,XIST_0_1,.,-
2,chrX,73820652,73820702,XIST_0_2,.,-
3,chrX,73820653,73820703,XIST_0_3,.,-
4,chrX,73820654,73820704,XIST_0_4,.,-
...,...,...,...,...,...,...
16853,chrX,73852699,73852749,XIST_5_11318,.,-
16854,chrX,73852700,73852750,XIST_5_11319,.,-
16855,chrX,73852701,73852751,XIST_5_11320,.,-
16856,chrX,73852702,73852752,XIST_5_11321,.,-


In [17]:
## Doing the same for gaps in the assembly (very unlikely since we are starting with annotated exons)
## NOTE(review): in the resources cell `gaps` is set to the same file as `repeats`, so as written this repeats the previous filter - confirm the intended gap file
kmers = remove_overlaps(kmers, gaps)

In [18]:
## And more importantly, remove kmers overlapping common polymorphisms (SNPs, short indels) listed in `snp_db`
kmers = remove_overlaps(kmers, snp_db)

In [19]:
kmers

Unnamed: 0,seqname,start,end,name,score,strand
0,chrX,73820650,73820700,XIST_0_0,.,-
1,chrX,73820651,73820701,XIST_0_1,.,-
2,chrX,73820652,73820702,XIST_0_2,.,-
3,chrX,73820653,73820703,XIST_0_3,.,-
4,chrX,73820654,73820704,XIST_0_4,.,-
...,...,...,...,...,...,...
15711,chrX,73852696,73852746,XIST_5_11315,.,-
15712,chrX,73852697,73852747,XIST_5_11316,.,-
15713,chrX,73852698,73852748,XIST_5_11317,.,-
15714,chrX,73852699,73852749,XIST_5_11318,.,-


### 4. Filter for desirable sequence features

Having excluded undesirable kmers based on intersection with genomic annotations, the next step is to consider their sequence features.

For this, we first extract the sequences of each k-mer, and then estimate features such as GC content and the presence of desired nucleotides in specific positions.

In [20]:
## Get the DNA sequence of every k-mer from the genome fasta
kmers_bed = pybedtools.BedTool.from_dataframe(kmers)
## s=True makes the extraction strand-aware (features on the minus strand are returned reverse-complemented)
kmers_seq = kmers_bed.sequence(fi=fasta, s=True) 

## We can read in the sequences and simultaneously monitor GC content and count the longest homopolymer stretch
## (`kmers_seq.seqfn` is the path of the fasta file written by the call above)
kmers_seq_stats = get_sequence_stats(kmers_seq.seqfn, probe_length, split_nt)

In [21]:
## Attach the per-sequence statistics to the k-mer table, matching rows on their index
kmers = kmers.merge(kmers_seq_stats, left_index=True, right_index=True)

In [22]:
kmers

Unnamed: 0,seqname,start,end,name,score,strand,kmer_coord,transcript_seq,probe_seq,GC_content_full,longest_homopolymer,GC_content_LHS,GC_content_RHS
0,chrX,73820650,73820700,XIST_0_0,.,-,chrX:73820650-73820700(-),GGGGAGTTGGTTGCTATTGTAAAATAAAATATACTGTTTTGAAAAC...,CAAAGTTTTCAAAACAGTATATTTTATTTTACAATAGCAACCAACT...,0.30,4,0.20,0.40
1,chrX,73820651,73820701,XIST_0_1,.,-,chrX:73820651-73820701(-),TGGGGAGTTGGTTGCTATTGTAAAATAAAATATACTGTTTTGAAAA...,AAAGTTTTCAAAACAGTATATTTTATTTTACAATAGCAACCAACTC...,0.28,4,0.16,0.40
2,chrX,73820652,73820702,XIST_0_2,.,-,chrX:73820652-73820702(-),CTGGGGAGTTGGTTGCTATTGTAAAATAAAATATACTGTTTTGAAA...,AAGTTTTCAAAACAGTATATTTTATTTTACAATAGCAACCAACTCC...,0.30,4,0.16,0.44
3,chrX,73820653,73820703,XIST_0_3,.,-,chrX:73820653-73820703(-),ACTGGGGAGTTGGTTGCTATTGTAAAATAAAATATACTGTTTTGAA...,AGTTTTCAAAACAGTATATTTTATTTTACAATAGCAACCAACTCCC...,0.30,4,0.16,0.44
4,chrX,73820654,73820704,XIST_0_4,.,-,chrX:73820654-73820704(-),AACTGGGGAGTTGGTTGCTATTGTAAAATAAAATATACTGTTTTGA...,GTTTTCAAAACAGTATATTTTATTTTACAATAGCAACCAACTCCCC...,0.30,4,0.16,0.44
...,...,...,...,...,...,...,...,...,...,...,...,...,...
15711,chrX,73852696,73852746,XIST_5_11315,.,-,chrX:73852696-73852746(-),TTCTTAAAGCGCTGCAATTCGCTGCTGCAGCCATATTTCTTACTCT...,CGAGAGAGTAAGAAATATGGCTGCAGCAGCGAATTGCAGCGCTTTA...,0.46,3,0.44,0.48
15712,chrX,73852697,73852747,XIST_5_11316,.,-,chrX:73852697-73852747(-),GTTCTTAAAGCGCTGCAATTCGCTGCTGCAGCCATATTTCTTACTC...,GAGAGAGTAAGAAATATGGCTGCAGCAGCGAATTGCAGCGCTTTAA...,0.46,3,0.44,0.48
15713,chrX,73852698,73852748,XIST_5_11317,.,-,chrX:73852698-73852748(-),AGTTCTTAAAGCGCTGCAATTCGCTGCTGCAGCCATATTTCTTACT...,AGAGAGTAAGAAATATGGCTGCAGCAGCGAATTGCAGCGCTTTAAG...,0.44,3,0.44,0.44
15714,chrX,73852699,73852749,XIST_5_11318,.,-,chrX:73852699-73852749(-),CAGTTCTTAAAGCGCTGCAATTCGCTGCTGCAGCCATATTTCTTAC...,GAGAGTAAGAAATATGGCTGCAGCAGCGAATTGCAGCGCTTTAAGA...,0.46,3,0.44,0.48


In [23]:
## Keep only k-mers that carry the required nucleotide(s) at the requested positions
if required_nts is not None:
    kmers['has_required_nts'] = check_for_required_nts(kmers, required_nts)
    ## Report how many k-mers pass/fail before the failures are dropped
    print(kmers['has_required_nts'].value_counts())
    passes_nt_check = kmers['has_required_nts'].eq(True)
    kmers = kmers.loc[passes_nt_check].reset_index(drop=True)

has_required_nts
False    11525
True      4191
Name: count, dtype: int64


In [24]:
## Save the candidates as they stand before the GC content filter is applied
kmers.to_csv(f'{out_dir}kmers_candidates_unfiltered.csv')

In [25]:
kmers

Unnamed: 0,seqname,start,end,name,score,strand,kmer_coord,transcript_seq,probe_seq,GC_content_full,longest_homopolymer,GC_content_LHS,GC_content_RHS,has_required_nts
0,chrX,73820650,73820700,XIST_0_0,.,-,chrX:73820650-73820700(-),GGGGAGTTGGTTGCTATTGTAAAATAAAATATACTGTTTTGAAAAC...,CAAAGTTTTCAAAACAGTATATTTTATTTTACAATAGCAACCAACT...,0.30,4,0.20,0.40,True
1,chrX,73820652,73820702,XIST_0_2,.,-,chrX:73820652-73820702(-),CTGGGGAGTTGGTTGCTATTGTAAAATAAAATATACTGTTTTGAAA...,AAGTTTTCAAAACAGTATATTTTATTTTACAATAGCAACCAACTCC...,0.30,4,0.16,0.44,True
2,chrX,73820653,73820703,XIST_0_3,.,-,chrX:73820653-73820703(-),ACTGGGGAGTTGGTTGCTATTGTAAAATAAAATATACTGTTTTGAA...,AGTTTTCAAAACAGTATATTTTATTTTACAATAGCAACCAACTCCC...,0.30,4,0.16,0.44,True
3,chrX,73820654,73820704,XIST_0_4,.,-,chrX:73820654-73820704(-),AACTGGGGAGTTGGTTGCTATTGTAAAATAAAATATACTGTTTTGA...,GTTTTCAAAACAGTATATTTTATTTTACAATAGCAACCAACTCCCC...,0.30,4,0.16,0.44,True
4,chrX,73820655,73820705,XIST_0_5,.,-,chrX:73820655-73820705(-),AAACTGGGGAGTTGGTTGCTATTGTAAAATAAAATATACTGTTTTG...,TTTTCAAAACAGTATATTTTATTTTACAATAGCAACCAACTCCCCA...,0.28,4,0.12,0.44,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4186,chrX,73852664,73852714,XIST_5_11283,.,-,chrX:73852664-73852714(-),ATATTTCTTACTCTCTCGGGGCTGGAAGCTTCCTGACTGAAGATCT...,AGAGAGATCTTCAGTCAGGAAGCTTCCAGCCCCGAGAGAGTAAGAA...,0.46,4,0.44,0.48,True
4187,chrX,73852680,73852730,XIST_5_11299,.,-,chrX:73852680-73852730(-),ATTCGCTGCTGCAGCCATATTTCTTACTCTCTCGGGGCTGGAAGCT...,AGGAAGCTTCCAGCCCCGAGAGAGTAAGAAATATGGCTGCAGCAGC...,0.52,4,0.60,0.44,True
4188,chrX,73852687,73852737,XIST_5_11306,.,-,chrX:73852687-73852737(-),CGCTGCAATTCGCTGCTGCAGCCATATTTCTTACTCTCTCGGGGCT...,TTCCAGCCCCGAGAGAGTAAGAAATATGGCTGCAGCAGCGAATTGC...,0.54,4,0.48,0.60,True
4189,chrX,73852689,73852739,XIST_5_11308,.,-,chrX:73852689-73852739(-),AGCGCTGCAATTCGCTGCTGCAGCCATATTTCTTACTCTCTCGGGG...,CCAGCCCCGAGAGAGTAAGAAATATGGCTGCAGCAGCGAATTGCAG...,0.56,4,0.48,0.64,True


In [26]:
## Filter for GC content, using the min_GC / max_GC bounds from the parameter cell
## (per that cell, for split probes the bounds are applied to both LHS and RHS)
kmers = filter_by_GC_content(kmers, min_GC, max_GC)

In [27]:
kmers

Unnamed: 0,seqname,start,end,name,score,strand,kmer_coord,transcript_seq,probe_seq,GC_content_full,longest_homopolymer,GC_content_LHS,GC_content_RHS,has_required_nts
0,chrX,73821073,73821123,XIST_0_423,.,-,chrX:73821073-73821123(-),TGGACTCAGTAACACCCCTTTCTTCAGCTGGGGATGGGGAATGGAT...,AATAATCCATTCCCCATCCCCAGCTGAAGAAAGGGGTGTTACTGAG...,0.48,4,0.48,0.48,True
1,chrX,73821085,73821135,XIST_0_435,.,-,chrX:73821085-73821135(-),TGTGGTATCAGCTGGACTCAGTAACACCCCTTTCTTCAGCTGGGGA...,CCCATCCCCAGCTGAAGAAAGGGGTGTTACTGAGTCCAGCTGATAC...,0.54,4,0.60,0.48,True
2,chrX,73821087,73821137,XIST_0_437,.,-,chrX:73821087-73821137(-),TGTGTGGTATCAGCTGGACTCAGTAACACCCCTTTCTTCAGCTGGG...,CATCCCCAGCTGAAGAAAGGGGTGTTACTGAGTCCAGCTGATACCA...,0.52,4,0.56,0.48,True
3,chrX,73821088,73821138,XIST_0_438,.,-,chrX:73821088-73821138(-),ATGTGTGGTATCAGCTGGACTCAGTAACACCCCTTTCTTCAGCTGG...,ATCCCCAGCTGAAGAAAGGGGTGTTACTGAGTCCAGCTGATACCAC...,0.50,4,0.52,0.48,True
4,chrX,73821091,73821141,XIST_0_441,.,-,chrX:73821091-73821141(-),TCAATGTGTGGTATCAGCTGGACTCAGTAACACCCCTTTCTTCAGC...,CCCAGCTGAAGAAAGGGGTGTTACTGAGTCCAGCTGATACCACACA...,0.50,4,0.52,0.48,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
529,chrX,73852654,73852704,XIST_5_11273,.,-,chrX:73852654-73852704(-),CTCTCTCGGGGCTGGAAGCTTCCTGACTGAAGATCTCTCTGCACTT...,CCCCAAGTGCAGAGAGATCTTCAGTCAGGAAGCTTCCAGCCCCGAG...,0.58,4,0.52,0.64,True
530,chrX,73852663,73852713,XIST_5_11282,.,-,chrX:73852663-73852713(-),TATTTCTTACTCTCTCGGGGCTGGAAGCTTCCTGACTGAAGATCTC...,CAGAGAGATCTTCAGTCAGGAAGCTTCCAGCCCCGAGAGAGTAAGA...,0.48,4,0.48,0.48,True
531,chrX,73852687,73852737,XIST_5_11306,.,-,chrX:73852687-73852737(-),CGCTGCAATTCGCTGCTGCAGCCATATTTCTTACTCTCTCGGGGCT...,TTCCAGCCCCGAGAGAGTAAGAAATATGGCTGCAGCAGCGAATTGC...,0.54,4,0.48,0.60,True
532,chrX,73852689,73852739,XIST_5_11308,.,-,chrX:73852689-73852739(-),AGCGCTGCAATTCGCTGCTGCAGCCATATTTCTTACTCTCTCGGGG...,CCAGCCCCGAGAGAGTAAGAAATATGGCTGCAGCAGCGAATTGCAG...,0.56,4,0.48,0.64,True


In [28]:
## Candidate kmers
kmers

Unnamed: 0,seqname,start,end,name,score,strand,kmer_coord,transcript_seq,probe_seq,GC_content_full,longest_homopolymer,GC_content_LHS,GC_content_RHS,has_required_nts
0,chrX,73821073,73821123,XIST_0_423,.,-,chrX:73821073-73821123(-),TGGACTCAGTAACACCCCTTTCTTCAGCTGGGGATGGGGAATGGAT...,AATAATCCATTCCCCATCCCCAGCTGAAGAAAGGGGTGTTACTGAG...,0.48,4,0.48,0.48,True
1,chrX,73821085,73821135,XIST_0_435,.,-,chrX:73821085-73821135(-),TGTGGTATCAGCTGGACTCAGTAACACCCCTTTCTTCAGCTGGGGA...,CCCATCCCCAGCTGAAGAAAGGGGTGTTACTGAGTCCAGCTGATAC...,0.54,4,0.60,0.48,True
2,chrX,73821087,73821137,XIST_0_437,.,-,chrX:73821087-73821137(-),TGTGTGGTATCAGCTGGACTCAGTAACACCCCTTTCTTCAGCTGGG...,CATCCCCAGCTGAAGAAAGGGGTGTTACTGAGTCCAGCTGATACCA...,0.52,4,0.56,0.48,True
3,chrX,73821088,73821138,XIST_0_438,.,-,chrX:73821088-73821138(-),ATGTGTGGTATCAGCTGGACTCAGTAACACCCCTTTCTTCAGCTGG...,ATCCCCAGCTGAAGAAAGGGGTGTTACTGAGTCCAGCTGATACCAC...,0.50,4,0.52,0.48,True
4,chrX,73821091,73821141,XIST_0_441,.,-,chrX:73821091-73821141(-),TCAATGTGTGGTATCAGCTGGACTCAGTAACACCCCTTTCTTCAGC...,CCCAGCTGAAGAAAGGGGTGTTACTGAGTCCAGCTGATACCACACA...,0.50,4,0.52,0.48,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
529,chrX,73852654,73852704,XIST_5_11273,.,-,chrX:73852654-73852704(-),CTCTCTCGGGGCTGGAAGCTTCCTGACTGAAGATCTCTCTGCACTT...,CCCCAAGTGCAGAGAGATCTTCAGTCAGGAAGCTTCCAGCCCCGAG...,0.58,4,0.52,0.64,True
530,chrX,73852663,73852713,XIST_5_11282,.,-,chrX:73852663-73852713(-),TATTTCTTACTCTCTCGGGGCTGGAAGCTTCCTGACTGAAGATCTC...,CAGAGAGATCTTCAGTCAGGAAGCTTCCAGCCCCGAGAGAGTAAGA...,0.48,4,0.48,0.48,True
531,chrX,73852687,73852737,XIST_5_11306,.,-,chrX:73852687-73852737(-),CGCTGCAATTCGCTGCTGCAGCCATATTTCTTACTCTCTCGGGGCT...,TTCCAGCCCCGAGAGAGTAAGAAATATGGCTGCAGCAGCGAATTGC...,0.54,4,0.48,0.60,True
532,chrX,73852689,73852739,XIST_5_11308,.,-,chrX:73852689-73852739(-),AGCGCTGCAATTCGCTGCTGCAGCCATATTTCTTACTCTCTCGGGG...,CCAGCCCCGAGAGAGTAAGAAATATGGCTGCAGCAGCGAATTGCAG...,0.56,4,0.48,0.64,True


In [29]:
## Save the candidates that passed the sequence-based filters
kmers.to_csv(f'{out_dir}kmers_candidates_filtered.csv')

### 5. Remove probes with potential off-targets

Having identified a set of kmers that fulfill our sequence requirements, we can next proceed with testing whether they are specific to our transcript/exon of interest.

For this we rely on using BLAST. 

We recommend blasting against transcripts (i.e., exons and introns combined) to be as conservative as possible in terms of off-targets. 
However, in cases where it is not possible to obtain enough suitable kmers (e.g., for short transcripts), it is reasonable to relax this requirement by BLASTing against exons only (a much smaller search space).

At this step, we also want to consider whether our probes are split (as in the current specifications for VisiumHD) or a single oligo. If probes are split, it's best to BLAST each side separately, to make sure that both sides are specific.

In [30]:
## BLAST takes fasta input, so start by exporting the candidate sequences.
## The transcript sequence is exported, as that is the one that has to be blasted against the human transcriptome.
write_fasta(kmers['name'], kmers['transcript_seq'], out_dir + 'kmers_candidates_filtered_transcript_seqs.fa')

## Split probes are additionally blasted one half at a time.
## Note that the LHS/RHS in the transcript are reversed compared to the probe (i.e., the LHS of the transcript is complementary to the RHS of the probe)
if split_nt is not None:
    ## Cut every transcript sequence at the split position
    kmers['transcript_seq_LHS'] = kmers['transcript_seq'].map(lambda full_seq: full_seq[:split_nt])
    kmers['transcript_seq_RHS'] = kmers['transcript_seq'].map(lambda full_seq: full_seq[split_nt:probe_length])

    ## One fasta file per half
    write_fasta(kmers['name'], kmers['transcript_seq_LHS'], out_dir + 'kmers_candidates_filtered_transcript_seqs_LHS.fa')
    write_fasta(kmers['name'], kmers['transcript_seq_RHS'], out_dir + 'kmers_candidates_filtered_transcript_seqs_RHS.fa')

In [31]:
## BLAST searches to run, as (label, query fasta, output table) - file names are relative to out_dir.
## The full probe is always blasted; if the probe is split, each side is additionally blasted separately.
blast_jobs = [('full',
               'kmers_candidates_filtered_transcript_seqs.fa',
               'kmers_candidates_filtered_blast_output.txt')]
if split_nt is not None:
    blast_jobs += [('LHS',
                    'kmers_candidates_filtered_transcript_seqs_LHS.fa',
                    'kmers_candidates_filtered_blast_output_LHS.txt'),
                   ('RHS',
                    'kmers_candidates_filtered_transcript_seqs_RHS.fa',
                    'kmers_candidates_filtered_blast_output_RHS.txt')]

## Run the searches in order and keep each result under its label
blast_res = {}
for blast_label, query_fa, hits_txt in blast_jobs:
    blast_res[blast_label] = run_blast(fasta=out_dir + query_fa,
                                       blastdb=blast_db,
                                       path2blastn=blast_exec_path + 'blastn',
                                       outfile=out_dir + hits_txt)

In [32]:
## For every BLAST run, list the ten genes that were hit most often
for blast_mode, hits in blast_res.items():
    print("The following genes were detected in mode: " + blast_mode)
    print(hits['sgeneid'].value_counts().head(10))

The following genes were detected in mode: full
sgeneid
XIST       1228
MYT1L       174
ANO4        102
SLC45A4      60
BRAF         57
JARID2       56
FAAH2        54
LRRC8A       45
PACSIN2      40
SORBS2       38
Name: count, dtype: int64
The following genes were detected in mode: LHS
sgeneid
XIST      887
KIFC3      78
AGPAT3     54
ELAVL4     54
NHSL1      52
TLR5       44
MYT1L      29
NR2C2      25
ANKS1B     21
JARID2     20
Name: count, dtype: int64
The following genes were detected in mode: RHS
sgeneid
XIST       803
NT5C2      108
TRERF1      52
DOCK9       43
MAP7        39
PLEKHA7     38
GSE1        29
NHSL1       26
COBL        22
FGF14       22
Name: count, dtype: int64


In [33]:
blast_res['full']

Unnamed: 0,name,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,sgeneid
0,XIST_0_423,XIST::chrX:73820650-73852753(-),100.0,50,0,0,1,50,31631,31680,8.510000e-17,91.5,XIST
1,XIST_0_435,XIST::chrX:73820650-73852753(-),100.0,50,0,0,1,50,31619,31668,8.510000e-17,91.5,XIST
2,XIST_0_437,XIST::chrX:73820650-73852753(-),100.0,50,0,0,1,50,31617,31666,8.510000e-17,91.5,XIST
3,XIST_0_438,XIST::chrX:73820650-73852753(-),100.0,50,0,0,1,50,31616,31665,8.510000e-17,91.5,XIST
4,XIST_0_441,XIST::chrX:73820650-73852753(-),100.0,50,0,0,1,50,31613,31662,8.510000e-17,91.5,XIST
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2364,XIST_5_11273,XIST::chrX:73820650-73852753(-),100.0,50,0,0,1,50,50,99,8.510000e-17,91.5,XIST
2365,XIST_5_11282,XIST::chrX:73820650-73852753(-),100.0,50,0,0,1,50,41,90,8.510000e-17,91.5,XIST
2366,XIST_5_11306,XIST::chrX:73820650-73852753(-),100.0,50,0,0,1,50,17,66,8.510000e-17,91.5,XIST
2367,XIST_5_11308,XIST::chrX:73820650-73852753(-),100.0,50,0,0,1,50,15,64,8.510000e-17,91.5,XIST


Not all BLAST hits will be off-targets. Hopefully, our gene of interest is included in the BLAST output. We therefore need to filter for hits with different gene IDs.

In [34]:
## Collect the off-targets reported for each BLAST run (full probe and, if split, each side)
offtargets = []
for hits in blast_res.values():
    offtargets.extend(detect_offtargets(hits, gene_ID, min_mismatches=min_mismatches))
## The same k-mer can be flagged in several runs - keep each one once
offtargets = list(set(offtargets))

In [35]:
len(offtargets)

162

In [36]:
## Keep only the k-mers that were not flagged as off-targets
kmers = kmers.loc[~kmers['name'].isin(offtargets)].reset_index(drop=True)

In [37]:
kmers 

Unnamed: 0,seqname,start,end,name,score,strand,kmer_coord,transcript_seq,probe_seq,GC_content_full,longest_homopolymer,GC_content_LHS,GC_content_RHS,has_required_nts,transcript_seq_LHS,transcript_seq_RHS
0,chrX,73821073,73821123,XIST_0_423,.,-,chrX:73821073-73821123(-),TGGACTCAGTAACACCCCTTTCTTCAGCTGGGGATGGGGAATGGAT...,AATAATCCATTCCCCATCCCCAGCTGAAGAAAGGGGTGTTACTGAG...,0.48,4,0.48,0.48,True,TGGACTCAGTAACACCCCTTTCTTC,AGCTGGGGATGGGGAATGGATTATT
1,chrX,73821085,73821135,XIST_0_435,.,-,chrX:73821085-73821135(-),TGTGGTATCAGCTGGACTCAGTAACACCCCTTTCTTCAGCTGGGGA...,CCCATCCCCAGCTGAAGAAAGGGGTGTTACTGAGTCCAGCTGATAC...,0.54,4,0.60,0.48,True,TGTGGTATCAGCTGGACTCAGTAAC,ACCCCTTTCTTCAGCTGGGGATGGG
2,chrX,73821087,73821137,XIST_0_437,.,-,chrX:73821087-73821137(-),TGTGTGGTATCAGCTGGACTCAGTAACACCCCTTTCTTCAGCTGGG...,CATCCCCAGCTGAAGAAAGGGGTGTTACTGAGTCCAGCTGATACCA...,0.52,4,0.56,0.48,True,TGTGTGGTATCAGCTGGACTCAGTA,ACACCCCTTTCTTCAGCTGGGGATG
3,chrX,73821088,73821138,XIST_0_438,.,-,chrX:73821088-73821138(-),ATGTGTGGTATCAGCTGGACTCAGTAACACCCCTTTCTTCAGCTGG...,ATCCCCAGCTGAAGAAAGGGGTGTTACTGAGTCCAGCTGATACCAC...,0.50,4,0.52,0.48,True,ATGTGTGGTATCAGCTGGACTCAGT,AACACCCCTTTCTTCAGCTGGGGAT
4,chrX,73821091,73821141,XIST_0_441,.,-,chrX:73821091-73821141(-),TCAATGTGTGGTATCAGCTGGACTCAGTAACACCCCTTTCTTCAGC...,CCCAGCTGAAGAAAGGGGTGTTACTGAGTCCAGCTGATACCACACA...,0.50,4,0.52,0.48,True,TCAATGTGTGGTATCAGCTGGACTC,AGTAACACCCCTTTCTTCAGCTGGG
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
367,chrX,73852654,73852704,XIST_5_11273,.,-,chrX:73852654-73852704(-),CTCTCTCGGGGCTGGAAGCTTCCTGACTGAAGATCTCTCTGCACTT...,CCCCAAGTGCAGAGAGATCTTCAGTCAGGAAGCTTCCAGCCCCGAG...,0.58,4,0.52,0.64,True,CTCTCTCGGGGCTGGAAGCTTCCTG,ACTGAAGATCTCTCTGCACTTGGGG
368,chrX,73852663,73852713,XIST_5_11282,.,-,chrX:73852663-73852713(-),TATTTCTTACTCTCTCGGGGCTGGAAGCTTCCTGACTGAAGATCTC...,CAGAGAGATCTTCAGTCAGGAAGCTTCCAGCCCCGAGAGAGTAAGA...,0.48,4,0.48,0.48,True,TATTTCTTACTCTCTCGGGGCTGGA,AGCTTCCTGACTGAAGATCTCTCTG
369,chrX,73852687,73852737,XIST_5_11306,.,-,chrX:73852687-73852737(-),CGCTGCAATTCGCTGCTGCAGCCATATTTCTTACTCTCTCGGGGCT...,TTCCAGCCCCGAGAGAGTAAGAAATATGGCTGCAGCAGCGAATTGC...,0.54,4,0.48,0.60,True,CGCTGCAATTCGCTGCTGCAGCCAT,ATTTCTTACTCTCTCGGGGCTGGAA
370,chrX,73852689,73852739,XIST_5_11308,.,-,chrX:73852689-73852739(-),AGCGCTGCAATTCGCTGCTGCAGCCATATTTCTTACTCTCTCGGGG...,CCAGCCCCGAGAGAGTAAGAAATATGGCTGCAGCAGCGAATTGCAG...,0.56,4,0.48,0.64,True,AGCGCTGCAATTCGCTGCTGCAGCC,ATATTTCTTACTCTCTCGGGGCTGG


### 6. Select non-overlapping probes

At this point, we have effectively acquired a set of usable probes. They don't overlap undesirable regions (repeats/polymorphism), have desirable sequence features (GC content, specific nucleotides) and are specific to our gene of interest.

In this particular case, we still have a lot of possible k-mers (much more than the number of probes we intend to design). We can therefore choose to prioritise k-mers with shorter homopolymer stretches, as these are also discouraged by the [10x recommendations](https://cdn.10xgenomics.com/image/upload/v1697739385/support-documents/CG000621_CustomProbeDesign_TechNote_RevC.pdf).

However, at this stage you might want to consider ranking probes in a different way, depending on your application.

After having ranked our k-mers in whatever way we think is reasonable at this stage, we can proceed with selecting the top probe, then removing all overlapping/adjacent probes (within a window determined by `probe_offset`).

Since we have so many available probes, we will be increasing `probe_offset` from `100 (default)` to `1000`.

In [38]:
## Rank k-mers so that those with the shortest homopolymer stretch come first
kmers = kmers.sort_values(by='longest_homopolymer').reset_index(drop=True)

In [39]:
kmers

Unnamed: 0,seqname,start,end,name,score,strand,kmer_coord,transcript_seq,probe_seq,GC_content_full,longest_homopolymer,GC_content_LHS,GC_content_RHS,has_required_nts,transcript_seq_LHS,transcript_seq_RHS
0,chrX,73827196,73827246,XIST_0_6546,.,-,chrX:73827196-73827246(-),CAGAACGTCGGCTATCAGAGCAAGCATTGTGGAGCGGTTCCTTATG...,CTGGCATAAGGAACCGCTCCACAATGCTTGCTCTGATAGCCGACGT...,0.54,2,0.52,0.56,True,CAGAACGTCGGCTATCAGAGCAAGC,ATTGTGGAGCGGTTCCTTATGCCAG
1,chrX,73827173,73827223,XIST_0_6523,.,-,chrX:73827173-73827223(-),GCATTGTGGAGCGGTTCCTTATGCCAGGCTGCCATGTGAGATGATC...,CTTGGATCATCTCACATGGCAGCCTGGCATAAGGAACCGCTCCACA...,0.54,2,0.52,0.56,True,GCATTGTGGAGCGGTTCCTTATGCC,AGGCTGCCATGTGAGATGATCCAAG
2,chrX,73827178,73827228,XIST_0_6528,.,-,chrX:73827178-73827228(-),AGCAAGCATTGTGGAGCGGTTCCTTATGCCAGGCTGCCATGTGAGA...,ATCATCTCACATGGCAGCCTGGCATAAGGAACCGCTCCACAATGCT...,0.52,2,0.52,0.52,True,AGCAAGCATTGTGGAGCGGTTCCTT,ATGCCAGGCTGCCATGTGAGATGAT
3,chrX,73827189,73827239,XIST_0_6539,.,-,chrX:73827189-73827239(-),TCGGCTATCAGAGCAAGCATTGTGGAGCGGTTCCTTATGCCAGGCT...,TGGCAGCCTGGCATAAGGAACCGCTCCACAATGCTTGCTCTGATAG...,0.56,2,0.60,0.52,True,TCGGCTATCAGAGCAAGCATTGTGG,AGCGGTTCCTTATGCCAGGCTGCCA
4,chrX,73850525,73850575,XIST_5_9144,.,-,chrX:73850525-73850575(-),GTTGGCCAACAGGTGGCAGAAGAGGAATTCCTGCCTTCCTCAAGAG...,GTTCCTCTTGAGGAAGGCAGGAATTCCTCTTCTGCCACCTGTTGGC...,0.54,2,0.48,0.60,True,GTTGGCCAACAGGTGGCAGAAGAGG,AATTCCTGCCTTCCTCAAGAGGAAC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
367,chrX,73845767,73845817,XIST_5_4386,.,-,chrX:73845767-73845817(-),CACTCCCAGTCTTCCTTTCCCTTCCAGCAGGGAGTGCCCCCTCCAT...,TCTTATGGAGGGGGCACTCCCTGCTGGAAGGGAAAGGAAGACTGGG...,0.58,5,0.60,0.56,True,CACTCCCAGTCTTCCTTTCCCTTCC,AGCAGGGAGTGCCCCCTCCATAAGA
368,chrX,73852326,73852376,XIST_5_10945,.,-,chrX:73852326-73852376(-),GGGGCTGCGGATACCTGGTTTTATTATTTTTTCTTTGCCCAACGGG...,CGGCCCCGTTGGGCAAAGAAAAAATAATAAAACCAGGTATCCGCAG...,0.52,6,0.52,0.52,True,GGGGCTGCGGATACCTGGTTTTATT,ATTTTTTCTTTGCCCAACGGGGCCG
369,chrX,73850758,73850808,XIST_5_9377,.,-,chrX:73850758-73850808(-),CTGTATTGGTGCCTCACCTAAGGCTAAGTATACCTCCccccccacc...,ggggggtggggggGGAGGTATACTTAGCCTTAGGTGAGGCACCAAT...,0.60,6,0.68,0.52,True,CTGTATTGGTGCCTCACCTAAGGCT,AAGTATACCTCCccccccacccccc
370,chrX,73850763,73850813,XIST_5_9382,.,-,chrX:73850763-73850813(-),TTCCTCTGTATTGGTGCCTCACCTAAGGCTAAGTATACCTCCcccc...,gtggggggGGAGGTATACTTAGCCTTAGGTGAGGCACCAATACAGA...,0.54,6,0.60,0.48,True,TTCCTCTGTATTGGTGCCTCACCTA,AGGCTAAGTATACCTCCccccccac


In [40]:
## Plenty of candidate probes are left, so space them out more widely:
## this overrides the probe_offset from the parameter cell with 1000 bp
probe_offset = 1_000

In [41]:
## Greedy selection: take the top-ranked remaining k-mer, then remove the k-mers that
## overlap it or lie within `probe_offset` of it, and repeat until enough probes are picked.
selected_probes_list = []
df = kmers.copy()
i = 0
## Also stop when no candidates are left: otherwise empty slices would be appended and
## looking up the first name of an empty dataframe would raise an error.
while i < n_desired_probes and not df.empty:
    selected_probes_list.append(df.iloc[0:1, :])
    ## Going for a high offset as we have a large span.
    ## `.iloc[0]` picks the first row by position, independent of the index labels.
    df = remove_overlapping_probes(df,
                                   df['name'].iloc[0],
                                   offset=probe_offset).reset_index(drop=True)
    i += 1

## Make it visible when fewer probes than requested could be selected
if i < n_desired_probes:
    print('Only ' + str(i) + ' of ' + str(n_desired_probes) + ' desired probes could be selected - '
          'consider lowering probe_offset or relaxing the earlier filters.')

  start = int(probe_df['start'].iloc[i])
  end = int(probe_df['end'].iloc[i])


In [42]:
## Stack the single-row selections into one table with a fresh 0..n-1 index
selected_probes_df = pd.concat(selected_probes_list, ignore_index=True)

In [43]:
selected_probes_df

Unnamed: 0,seqname,start,end,name,score,strand,kmer_coord,transcript_seq,probe_seq,GC_content_full,longest_homopolymer,GC_content_LHS,GC_content_RHS,has_required_nts,transcript_seq_LHS,transcript_seq_RHS
0,chrX,73827196,73827246,XIST_0_6546,.,-,chrX:73827196-73827246(-),CAGAACGTCGGCTATCAGAGCAAGCATTGTGGAGCGGTTCCTTATG...,CTGGCATAAGGAACCGCTCCACAATGCTTGCTCTGATAGCCGACGT...,0.54,2,0.52,0.56,True,CAGAACGTCGGCTATCAGAGCAAGC,ATTGTGGAGCGGTTCCTTATGCCAG
1,chrX,73850525,73850575,XIST_5_9144,.,-,chrX:73850525-73850575(-),GTTGGCCAACAGGTGGCAGAAGAGGAATTCCTGCCTTCCTCAAGAG...,GTTCCTCTTGAGGAAGGCAGGAATTCCTCTTCTGCCACCTGTTGGC...,0.54,2,0.48,0.6,True,GTTGGCCAACAGGTGGCAGAAGAGG,AATTCCTGCCTTCCTCAAGAGGAAC
2,chrX,73844799,73844849,XIST_5_3418,.,-,chrX:73844799-73844849(-),TGACCGCAGCCATGCACCTTGGTCAATAATGTGTGTAACTGCACAC...,GGCCGTGTGCAGTTACACACATTATTGACCAAGGTGCATGGCTGCG...,0.54,2,0.48,0.6,True,TGACCGCAGCCATGCACCTTGGTCA,ATAATGTGTGTAACTGCACACGGCC


In [44]:
## Print the transcript sequence targeted by each selected probe, one per line
for transcript_seq in selected_probes_df['transcript_seq'].tolist():
    print(transcript_seq)

CAGAACGTCGGCTATCAGAGCAAGCATTGTGGAGCGGTTCCTTATGCCAG
GTTGGCCAACAGGTGGCAGAAGAGGAATTCCTGCCTTCCTCAAGAGGAAC
TGACCGCAGCCATGCACCTTGGTCAATAATGTGTGTAACTGCACACGGCC


And we are done! We have now selected three potential probes for our gene.

If our probes are meant to be split, we can additionally generate these columns:

In [45]:
if split_nt is not None:
    ## Cut each probe at the split position and wrap both halves in their adapters (if provided)
    probe_seqs = selected_probes_df['probe_seq']
    selected_probes_df['probe_seq_LHS'] = LHS_pref + probe_seqs.str.slice(0, split_nt) + LHS_suff
    selected_probes_df['probe_seq_RHS'] = RHS_pref + probe_seqs.str.slice(split_nt, probe_length) + RHS_suff

In [46]:
## Record which target the probes were designed for (same value on every row)
selected_probes_df = selected_probes_df.assign(gene_ID=gene_ID)

In [47]:
## Write the final table of selected probes to the output directory
selected_probes_df.to_csv(f'{out_dir}kmers_selected_probes.csv')

We always recommend additionally performing a manual BLAST of these probe sequences to make sure that there are no off-target effects.

In [48]:
selected_probes_df

Unnamed: 0,seqname,start,end,name,score,strand,kmer_coord,transcript_seq,probe_seq,GC_content_full,longest_homopolymer,GC_content_LHS,GC_content_RHS,has_required_nts,transcript_seq_LHS,transcript_seq_RHS,probe_seq_LHS,probe_seq_RHS,gene_ID
0,chrX,73827196,73827246,XIST_0_6546,.,-,chrX:73827196-73827246(-),CAGAACGTCGGCTATCAGAGCAAGCATTGTGGAGCGGTTCCTTATG...,CTGGCATAAGGAACCGCTCCACAATGCTTGCTCTGATAGCCGACGT...,0.54,2,0.52,0.56,True,CAGAACGTCGGCTATCAGAGCAAGC,ATTGTGGAGCGGTTCCTTATGCCAG,CCTTGGCACCCGAGAATTCCACTGGCATAAGGAACCGCTCCACAAT,/5Phos/GCTTGCTCTGATAGCCGACGTTCTGAAAAAAAAAAAAAA...,XIST
1,chrX,73850525,73850575,XIST_5_9144,.,-,chrX:73850525-73850575(-),GTTGGCCAACAGGTGGCAGAAGAGGAATTCCTGCCTTCCTCAAGAG...,GTTCCTCTTGAGGAAGGCAGGAATTCCTCTTCTGCCACCTGTTGGC...,0.54,2,0.48,0.6,True,GTTGGCCAACAGGTGGCAGAAGAGG,AATTCCTGCCTTCCTCAAGAGGAAC,CCTTGGCACCCGAGAATTCCAGTTCCTCTTGAGGAAGGCAGGAATT,/5Phos/CCTCTTCTGCCACCTGTTGGCCAACAAAAAAAAAAAAAA...,XIST
2,chrX,73844799,73844849,XIST_5_3418,.,-,chrX:73844799-73844849(-),TGACCGCAGCCATGCACCTTGGTCAATAATGTGTGTAACTGCACAC...,GGCCGTGTGCAGTTACACACATTATTGACCAAGGTGCATGGCTGCG...,0.54,2,0.48,0.6,True,TGACCGCAGCCATGCACCTTGGTCA,ATAATGTGTGTAACTGCACACGGCC,CCTTGGCACCCGAGAATTCCAGGCCGTGTGCAGTTACACACATTAT,/5Phos/TGACCAAGGTGCATGGCTGCGGTCAAAAAAAAAAAAAAA...,XIST
