# Oligopaints based MERFISH probe sequence design pipeline 
  
The goal of this pipeline is to take a set of genes and use [OligoPaints](https://oligopaints.hms.harvard.edu/genome-files) for encoding probes, [DNA barcodes](https://elledge.hms.harvard.edu/?page_id=638) for readout probes/primers, and [5x5 bDNA sequences](https://static-content.springer.com/esm/art%3A10.1038%2Fs41598-019-43943-8/MediaObjects/41598_2019_43943_MOESM2_ESM.xlsx) for signal amplification. Signal amplification is necessary because the number of probes for many of our genes of interest is significantly lower than the 92 probes used in the original MERFISH papers. The rules for our pipeline are assembled from all of the various MERFISH publications by Rory Kruithoff.  
  
Written on Ubuntu 18.04 LTS (both native and using the Windows Subsystem for Linux). This code requires a working local BLAST install, a working local BEDtools install, and some work to get cruzdb running with Python 3.x. The install procedure will be written up in a separate markdown file once everything is finalized.

Current external dependencies:
- local BLAST install
- local BEDTOOLS install
  
Current library dependencies:
- python = 3.6
- cruzdb (requires some effort to install in python 3.6)
- pandas  
- pybedtools  
- biopython  
- numpy  
- os  
  
Current external data dependencies:
- hg38 OligoPaints BED files  
- hg38 transcriptome fasta file  
- hg38 ncRNA fasta file  
- Elledge lab 240k list of 25-mer sequences
- Zhuang lab modified hamming codes  
- Wollman lab modified hamming codes
- Zhuang lab standard readout sequences
- Moffitt & Zhuang lab amplified readout sequences 

Douglas Shepherd, PhD  
Quantitative Imaging and Inference Lab (qi2lab)  
Center for Biological Physics and Department of Physics  
Arizona State University  
04.2020

## Imports

In [1]:
# cruzdb imports
# # https://github.com/brentp/cruzdb/tree/pull16/cruzdb
from cruzdb import Genome 

# pandas imports
# https://pandas.pydata.org/
import pandas as pd 

# pybedtools imports
# https://daler.github.io/pybedtools/
# relies on a local BEDTOOLS installation
import pybedtools 

# biopython imports
# https://biopython.org/
# relies on a local BLAST+ installation (blastn and makeblastdb are called below)
from Bio import SeqIO
from Bio import SearchIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import generic_dna
from Bio.SeqUtils import MeltingTemp as mt
from Bio.SeqUtils import GC as gcCheck
from Bio.Blast.Applications import NcbimakeblastdbCommandline
from Bio.Blast.Applications import NcbiblastnCommandline as blastn
from Bio.Blast import NCBIXML

# numpy imports
import numpy as np

# os imports
import os

## Encoding probe design 
1. Define genes using refGene name from UCSC browser
2. Pull all isoforms from UCSC associated with refGene name
3. Parse out exons for each isoform.
4. Using OligoPaints 'balanced' database to select probes for each isoform.
5. Find unique probes that span all isoforms. Save probes in Pandas structure and a BLAST database
6. If fewer than 30 probes are found, tag the gene for multiple copies of readout sequences.

In [22]:
# function to take a dataframe of chromosome, exon locations, and strandness for one gene isoform and return a BED file 
def dfToBEDisoform(df_isoform, gene_id=None):
    """Convert the exon table of one gene isoform into a pybedtools BedTool.

    Parameters
    ----------
    df_isoform : pandas.DataFrame
        One row per exon. The columns 'chromosome', 'start', 'stop', 'exon'
        and 'strand' are read; 'gene' is read as well when gene_id is omitted.
    gene_id : str, optional
        Gene name used to label every exon. When omitted, the 'gene' value of
        each row is used, so the function no longer depends on a global
        variable called gene_id.

    Returns
    -------
    pybedtools.BedTool
        One six-field record per exon: CHR START STOP NAME SCORE STRAND,
        where NAME is '<gene>_exon_<exon>' and SCORE is always 0.
    """

    # loop over each exon and create list of string tuples with format
    # CHR START STOP NAME SCORE STRAND
    bed_record = []
    for _, row in df_isoform.iterrows():
        exon_gene = gene_id if gene_id is not None else row['gene']
        # every field is passed as a string so the record is built from a
        # homogeneous list of text fields
        bed_record.append((str(row['chromosome']), str(row['start']), str(row['stop']),
                           str(exon_gene) + '_exon_' + str(row['exon']), '0',
                           str(row['strand'])))

    # convert list of tuples to BED record and return it
    return pybedtools.BedTool(bed_record)

In [29]:
# function to take an isoform and return a set of probes
def findProbesFromIsoform(gene_id,df_isoform,isoform_number):
    """Return the OligoPaints encoding probes that lie inside one isoform's exons.

    Parameters
    ----------
    gene_id : str
        Gene name stored in the 'gene' column of the result.
    df_isoform : pandas.DataFrame
        Exon table for one isoform; the 'chromosome' and 'strand' columns are
        read here and the whole frame is handed to dfToBEDisoform.
    isoform_number : int
        Identifier stored in the 'isoform' column of the result.

    Returns
    -------
    pandas.DataFrame
        Columns 'gene', 'isoform', 'probe' (running index within this
        isoform) and 'sequence'. Empty when the isoform has no exons or no
        probe is fully contained in an exon.
    """
    probe_columns = ['gene','isoform','probe','sequence']

    # an isoform without exons cannot contain probes; returning early also
    # avoids indexing into an empty unique() result below
    if df_isoform.empty:
        return pd.DataFrame(columns=probe_columns)

    # convert dataframe to BED record
    BED_isoform = dfToBEDisoform(df_isoform)

    # load OligoPaints BED file corresponding to chromosome for gene
    chromosome = str(df_isoform.chromosome.unique()[0])
    BED_all_probes = pybedtools.BedTool('oligodb/hg38b/hg38_'+chromosome+'b.bed')

    # find probes using intersect; f=1 keeps only probes whose full length
    # overlaps an exon
    encoding_probes = BED_all_probes.intersect(BED_isoform,f=1)

    # the strand is a property of the isoform, so look it up once
    on_plus_strand = df_isoform.strand.unique()[0]=='+'

    # if +, store probe (the BED name field holds the probe sequence)
    # if -, take reverse complement before storing probe
    probe_rows = []
    for interval in encoding_probes:
        if on_plus_strand:
            probe_seq = interval.name
        else:
            probe_seq = str(Seq(interval.name, generic_dna).reverse_complement())
        probe_rows.append({'gene': gene_id, 'isoform': isoform_number,
                           'probe': len(probe_rows), 'sequence': probe_seq})

    # build the dataframe in one step instead of appending row by row
    return pd.DataFrame(probe_rows, columns=probe_columns)

In [47]:
# function to retrieve exons from UCSC for a refGene ID
def generateEncodingProbes(database, gene_ids):
    """Collect unique encoding probes for every refGene isoform of the given genes.

    Parameters
    ----------
    database : str
        UCSC assembly name passed to cruzdb's Genome (e.g. 'hg38'). The
        argument is now actually used; previously a global named `genome`
        was read instead.
    gene_ids : iterable of str
        refGene `name2` values (gene symbols) to look up.

    Returns
    -------
    pandas.DataFrame
        Columns 'gene', 'sequence' and 'isoform', one row per distinct probe
        sequence. Duplicates are removed across all isoforms and all genes,
        keeping the first occurrence.
    """

    # open connection to UCSC database
    g = Genome(db=database)

    probe_columns = ['gene','isoform','probe','sequence']
    exon_columns = ['gene','chromosome','exon','start','stop','strand']

    # start with an empty frame so the concat below also works when no
    # gene or isoform is found
    isoform_frames = [pd.DataFrame(columns=probe_columns)]

    # loop over all genes
    for gene_id in gene_ids:

        # pull all entries for a given gene from UCSC
        gene_entry_all = g.refGene.filter_by(name2=gene_id).all()

        # loop over all gene entries
        for isoform_number, isoform in enumerate(gene_entry_all):

            # place each exon of this entry in a dataframe
            exon_rows = [{'gene': gene_id, 'chromosome': isoform.chrom,
                          'exon': exon_number, 'start': exon[0], 'stop': exon[1],
                          'strand': isoform.strand}
                         for exon_number, exon in enumerate(isoform.exons)]
            df_exons = pd.DataFrame(exon_rows, columns=exon_columns)

            isoform_frames.append(findProbesFromIsoform(gene_id,df_exons,isoform_number))

    # combine once at the end rather than appending inside the loops
    df_encoding_across_all_isoforms = pd.concat(isoform_frames,ignore_index=True)

    # keep the first occurrence of every probe sequence
    df_encoding_unique = df_encoding_across_all_isoforms.drop_duplicates(['sequence'])
    df_encoding_unique = df_encoding_unique.reset_index(drop=True)

    # the per-isoform probe counter is meaningless after de-duplication
    return df_encoding_unique[['gene','sequence','isoform']]

In [48]:
# define target genome (UCSC assembly name handed to generateEncodingProbes)
genome = 'hg38'

# define target genes
# use the refGene names (gene symbols) listed in the UCSC browser
# TO DO: create function to parse & load output of gene selection software
# the string literal below is the full panel, currently disabled
'''
gene_ids=['ACTA1','ACTA2','NOS3','VEGFA','VEGFB',
          'VEGFC','VEGFD','KDR','HIF1A','EPAS1','HIF3A',
          'IGF1','IGF1R','HGF','ALK','COL18A1',
          'BMPR1A','CD34','VWF','KRT18','ACE2',
          'RPTOR','RICTOR','MTOR','PIK3CA','SFTPB',
          'SFTPC','EPCAM','MYRF','TMPRSS2','CTSL',
          'POL1A','POL2A']
'''

# two-gene subset used while developing the pipeline
gene_ids = ['VEGFA','NOS3']

# build the encoding probe table for the selected genes
df_encoding=generateEncodingProbes(genome, gene_ids)

## Parse non-coding RNA FASTA
1. [Download hg38 ncRNA fasta](ftp://ftp.ensembl.org/pub/release-99/fasta/homo_sapiens/ncrna/Homo_sapiens.GRCh38.ncrna.fa.gz)
2. Parse out 'tRNA', 'Mt-tRNA', 'rRNA'
3. Create blast database

In [None]:
def fastaToBlastDBncRNA(ncrna_fasta='/home/dps/merfish/blastdb/tRNA/Homo_sapiens.GRCh38.ncrna.fa',
                        parsed_fasta='/home/dps/merfish/blastdb/tRNA/tRNA_parsed.fa',
                        blast_db='/home/dps/merfish/blastdb/tRNA/db/tRNA'):
    """Build a nucleotide BLAST database from the tRNA/rRNA entries of an ncRNA FASTA.

    Parameters
    ----------
    ncrna_fasta : str
        Input FASTA file with non-coding RNA records.
    parsed_fasta : str
        Output FASTA file receiving the selected records. It is overwritten
        on every call so that re-running does not duplicate records.
    blast_db : str
        Output prefix handed to makeblastdb.

    The defaults reproduce the paths that were previously hard-coded.
    """

    # keep every record whose header mentions 'tRNA' or 'rRNA'
    # (substring match, so e.g. 'Mt_tRNA' and 'Mt_rRNA' are included as well)
    with open(ncrna_fasta, 'r') as handle:
        records_to_keep = [record for record in SeqIO.parse(handle, 'fasta')
                           if ('tRNA' in record.description) or ('rRNA' in record.description)]

    # 'w' rather than 'a': the file must contain exactly one copy of each record
    with open(parsed_fasta, 'w') as output_handle:
        SeqIO.write(records_to_keep, output_handle, 'fasta')

    cline = NcbimakeblastdbCommandline(dbtype='nucl', input_file=parsed_fasta,
                                       title='tRNA', out=blast_db)
    cline()

In [None]:
# build the tRNA/rRNA BLAST database used by the readout and primer filters
fastaToBlastDBncRNA()

## Denovo readout probe design
1. Create set of all potential 20-mer readout probes from [known set](https://doi.org/10.1073/pnas.0812506106) of 240,000 25-mers.
2. Select probes with only 'A', 'T', and 'C'.
3. Select probes without 'CCC', 'AAA', and 'TTT'.
4. Select probes with 40-50% GC content.
5. BLAST 20-mers against transcriptome for species of interest. Select those with less than 11 contiguous base homology.
6. BLAST 20-mers against tRNA, rRNA for species of interest and mitochondria. Select those with less than 11 contiguous base homology.
7. BLAST 20-mers against other selected 20-mers. Select those with less than 11 contiguous base homology.

In [None]:
def generateDenovoReadoutProbes():
    """Design de novo 20-mer readout probes from the 240k set of 25-mers.

    Stages (each stage only sees the survivors of the previous one):
      1. slide a 20-nt window over every 25-mer in 'bc25mer.240k.fasta'
      2. keep windows without 'G' and drop exact duplicates
      3. drop windows containing 'CCC', 'TTT' or 'AAA'
      4. keep windows with 40-50 % GC content
      5. drop windows with BLAST homology to the hg38 database
      6. drop windows with BLAST homology to the tRNA/rRNA database
      7. drop windows with BLAST homology to any other surviving window

    "Homology" means an HSP whose longest gap-free run of identical bases is
    at least MIN_HOMOLOGY. The previous code tested `hit.seq_len`, which is
    the length of the database sequence, so stage 7 could never reject
    anything (every candidate is 20 nt long).

    Side effects: writes temporary FASTA/XML files to the working directory,
    overwrites the readout candidate FASTA and rebuilds its BLAST database.

    Returns
    -------
    pandas.DataFrame
        Columns 'probe' (running index) and 'sequence' (20-mer string).
    """
    K = 20
    # NOTE(review): the notebook text says "less than 11 contiguous bases" is
    # acceptable while the code rejected at >= 10; the code's value is kept.
    MIN_HOMOLOGY = 10

    def longestExactRun(hsp):
        # longest stretch of consecutive identical, ungapped alignment columns
        longest = run = 0
        query_aln = str(hsp.query.seq).upper()
        hit_aln = str(hsp.hit.seq).upper()
        for query_base, hit_base in zip(query_aln, hit_aln):
            if query_base == hit_base and query_base != '-':
                run += 1
                longest = max(longest, run)
            else:
                run = 0
        return longest

    def hasHomology(sequence, seq_id, query_fasta, blast_db, result_xml, ignore_self=False):
        # BLAST one sequence against blast_db and report whether any HSP
        # reaches MIN_HOMOLOGY; with ignore_self a full-length perfect match
        # (the sequence finding itself in the database) is skipped
        record = SeqRecord(Seq(sequence,generic_dna),id=seq_id)
        with open(query_fasta,'w') as output_handle:
            SeqIO.write(record,output_handle,'fasta')

        blastn_cline = blastn(query=query_fasta,db=blast_db,out=result_xml,
                              dust='no',word_size=10,outfmt=5)
        blastn_cline()

        with open(result_xml,'r') as input_handle:
            blast_qresult = SearchIO.read(input_handle, 'blast-xml')

        for hit in blast_qresult:
            for hsp in hit:
                run_length = longestExactRun(hsp)
                if ignore_self and run_length >= len(sequence):
                    continue
                if run_length >= MIN_HOMOLOGY:
                    return True
        return False

    empty_result = pd.DataFrame(columns=['probe','sequence'])

    # stage 1 + 2: all G-free 20-nt windows, first occurrence only
    big_list_25mers = [str(record.seq) for record in SeqIO.parse('bc25mer.240k.fasta','fasta')]
    big_list_20mers = []
    for trial_25mer in big_list_25mers:
        for start in range(len(trial_25mer) - K + 1):
            trial_20mer = trial_25mer[start:start + K]
            if 'G' not in trial_20mer:
                big_list_20mers.append(trial_20mer)
    # duplicates would make a probe look homologous to "another" candidate
    big_list_20mers = list(dict.fromkeys(big_list_20mers))

    # stage 3: no homopolymer triplets
    pass_list = [probe for probe in big_list_20mers
                 if not (('CCC' in probe) or ('TTT' in probe) or ('AAA' in probe))]

    # stage 4: GC content window
    pass_list2 = [probe for probe in pass_list if 40 <= gcCheck(probe) <= 50]

    # stage 5: hg38 database
    pass_list3 = [probe for index, probe in enumerate(pass_list2)
                  if not hasHomology(probe, probe+'_'+str(index), 'readout_test_h3g8.fasta',
                                     '/home/dps/merfish/blastdb/hg38/GRCh38',
                                     'readout_check_hg38.xml')]

    # stage 6: tRNA / rRNA database
    pass_list4 = [probe for index, probe in enumerate(pass_list3)
                  if not hasHomology(probe, probe+'_'+str(index), 'readout_test_ncRNA.fasta',
                                     '/home/dps/merfish/blastdb/tRNA/db/tRNA',
                                     'readout_check_ncRNA.xml')]

    # makeblastdb cannot build a database from an empty FASTA file
    if not pass_list4:
        return empty_result

    # stage 7: build a database of the candidates themselves ...
    candidate_records = [SeqRecord(Seq(probe,generic_dna),id=probe+'_'+str(index),
                                   description="potential readout "+str(index))
                         for index, probe in enumerate(pass_list4)]
    # 'w' rather than 'a' so that a re-run does not accumulate stale candidates
    with open('/home/dps/merfish/blastdb/readout/readout_candidates.fasta','w') as output_handle:
        SeqIO.write(candidate_records,output_handle,"fasta")

    cline = NcbimakeblastdbCommandline(dbtype='nucl',input_file='/home/dps/merfish/blastdb/readout/readout_candidates.fasta',
                                       title='readout',out='/home/dps/merfish/blastdb/readout/db/readout')
    cline()

    # ... and drop every candidate that cross-hybridises with another one
    pass_list5 = [probe for index, probe in enumerate(pass_list4)
                  if not hasHomology(probe, probe+'_'+str(index), 'readout_test_final.fasta',
                                     '/home/dps/merfish/blastdb/readout/db/readout',
                                     'readout_test_final.xml', ignore_self=True)]

    # place each probe in dataframe
    df_readout_probes = pd.DataFrame({'probe': list(range(len(pass_list5))),
                                      'sequence': pass_list5},
                                     columns=['probe','sequence'])

    return df_readout_probes

In [None]:
# run the de novo readout design; one BLAST search is launched per candidate
df_readout_probes=generateDenovoReadoutProbes()

## Published standard readout sequences
1. Load readout sequences used for 16-bit MHD4 code from [Moffitt et al 2018](https://doi.org/10.1073/pnas.1617699113).  
  
Moffitt et al used 'A','T','G' instead of the 'A','T','C' strategy used for our denovo readout sequence design.

In [None]:
def loadReadoutSequences():
    """Load the published standard readout sequences (not implemented yet).

    The original body stopped at an unfinished `pd.` expression, which is a
    syntax error. The location and format of the readout sequence file are
    not recorded anywhere in this notebook, so the function fails loudly
    instead of guessing.

    Raises
    ------
    NotImplementedError
        Always, until the loader for the standard readout file is written.
    """
    raise NotImplementedError('loadReadoutSequences: loader for the published standard '
                              'readout sequences has not been written yet')

## Published amplified readout sequences 
1. Load 5x5 amplified readout sequences for 16-bit MHD4 code from [Xia et al 2019](https://doi.org/10.1038/s41598-019-43943-8).

This is a clear area of opportunity. Look into orthogonal strategies used in [SABER](https://www.nature.com/articles/s41592-019-0404-0#Sec36). I agree with Moffitt that solid-phase amplifiers with a defined number of binding sites make more sense, as they will reduce variation, even if they cost a little extra. Also, Allen has not been able to reliably run the SABER reaction to get amplifiers.

In [None]:
def loadAmplifiedReadoutSequences():
    """Read the amplified 16-bit MHD4 readout sequences from the Excel workbook.

    header=1 tells pandas to take the column names from the second row of
    the sheet. Returns the resulting pandas.DataFrame unchanged.
    """
    return pd.read_excel('/home/dps/merfish/16bit_MHD4_amplify.xlsx',header=1)

## Published modified hamming codes
Load:
1. 14-bit modified hamming codes from [Zhuang lab example data #1](http://zhuang.harvard.edu/merfish.html).
2. 16-bit modified hamming codes from [Zhuang lab example data #1](http://zhuang.harvard.edu/merfish.html).
3. 18-bit modified hamming codes from [Wollman MERFISH github repo](https://github.com/wollmanlab/PySpots).
4. 24-bit modified hamming codes from [Wollman MERFISH github repo](https://github.com/wollmanlab/PySpots).  
  
This is a clear area of opportunity. We are working on alternative coding/decoding at ASU, but are not ready to roll it out yet.

In [None]:
def loadHammingFromDisk(bit):
    """Load a published modified Hamming codebook from disk.

    Parameters
    ----------
    bit : str
        One of '14-bit', '16-bit', '18-bit' or '24-bit'.

    Returns
    -------
    pandas.DataFrame
        For '14-bit' and '16-bit': one row per FASTA record, built from the
        whitespace-separated tokens of the record description, with columns
        'R0' ... 'R<n-1>'. For '18-bit' and '24-bit': whatever
        pandas.read_csv returns for the comma-separated file.

    Raises
    ------
    ValueError
        If `bit` is not one of the four supported keys (previously any
        unknown value silently loaded the 24-bit codebook).
    """
    # key -> (file on disk, number of readout rounds, on-disk format)
    codebook_files = {
        '14-bit': ('/home/dps/merfish/codebook/14bit_MHD2_codebook.fasta', 14, 'fasta'),
        '16-bit': ('/home/dps/merfish/codebook/16bit_MHD4_codebook.fasta', 16, 'fasta'),
        '18-bit': ('/home/dps/merfish/codebook/18bit_MHD4_codebook.fasta', 18, 'csv'),
        '24-bit': ('/home/dps/merfish/codebook/24bit_MHD4_codebook.fasta', 24, 'csv'),
    }

    if bit not in codebook_files:
        raise ValueError('unknown codebook ' + repr(bit) + '; expected one of '
                         + ', '.join(sorted(codebook_files)))

    codebook_path, n_bits, file_format = codebook_files[bit]

    with open(codebook_path,'r') as input_handle:
        if file_format == 'csv':
            # NOTE(review): the original created R0..R<n-1> column names and
            # then discarded them by reassigning the read_csv result; confirm
            # whether these files carry their own header row.
            return pd.read_csv(input_handle,delimiter=',')

        list_hamming = [record.description.split()
                        for record in SeqIO.parse(input_handle, 'fasta')]

    readout_columns = ['R'+str(readout_round) for readout_round in range(n_bits)]
    df_hamming_codes = pd.DataFrame(list_hamming,columns=readout_columns)

    return df_hamming_codes

In [None]:
# load the 16-bit MHD4 codebook; the loader defined above is named
# loadHammingFromDisk (loadCodebookFromDisk does not exist in this notebook)
df_hamming_codes = loadHammingFromDisk('16-bit')

## Denovo primer design
1. Read (or create) set of 20-mers from known set of 25-mers
2. BLAST 20-mers against encoding+readout probes. Select all probes with less than 10 hits.
3. BLAST 20-mers against lincRNA, rRNA, tRNA, mt-tRNA, mt-RNA for species of interest. Select all probes with less than 10 hits.
4. Select forward and reverse primer based on minimum number of off-target hits.
5. Save primers into Pandas structure separately and as sets with encoding+readout probes

In [None]:
def generatePrimers(df_probes):
    """Design de novo 20-mer primer candidates from the 240k set of 25-mers.

    Stages (each stage only sees the survivors of the previous one):
      1. slide a 20-nt window over every 25-mer in 'bc25mer.240k.fasta'
      2. keep windows with a G or C in one of the last two positions and
         drop exact duplicates
      3. drop windows containing 'CCC', 'TTT', 'AAA' or 'GGG'
      4. keep windows with 40-50 % GC content
      5. keep windows with a nearest-neighbour Tm of 70-80 C
      6. drop windows with BLAST homology to the hg38 database
      7. drop windows with BLAST homology to the tRNA/rRNA database
      8. drop windows with BLAST homology to any other surviving window

    "Homology" means an HSP whose longest gap-free run of identical bases is
    at least MIN_HOMOLOGY (see generateDenovoReadoutProbes for the same rule).

    Parameters
    ----------
    df_probes : pandas.DataFrame
        Encoding+readout probes the primers are meant for.
        NOTE(review): this argument is not used yet; screening the primers
        against these probes (step 2 of the notebook text) is still to do.

    Side effects: writes temporary FASTA/XML files to the working directory,
    overwrites the primer candidate FASTA and rebuilds its BLAST database.

    Returns
    -------
    pandas.DataFrame
        Columns 'probe' (running index) and 'sequence' (20-mer primer).
    """
    K = 20
    MIN_HOMOLOGY = 10

    def longestExactRun(hsp):
        # longest stretch of consecutive identical, ungapped alignment columns
        longest = run = 0
        query_aln = str(hsp.query.seq).upper()
        hit_aln = str(hsp.hit.seq).upper()
        for query_base, hit_base in zip(query_aln, hit_aln):
            if query_base == hit_base and query_base != '-':
                run += 1
                longest = max(longest, run)
            else:
                run = 0
        return longest

    def hasHomology(sequence, seq_id, query_fasta, blast_db, result_xml, ignore_self=False):
        # BLAST one sequence against blast_db and report whether any HSP
        # reaches MIN_HOMOLOGY; with ignore_self a full-length perfect match
        # (the sequence finding itself in the database) is skipped
        record = SeqRecord(Seq(sequence,generic_dna),id=seq_id)
        with open(query_fasta,'w') as output_handle:
            SeqIO.write(record,output_handle,'fasta')

        blastn_cline = blastn(query=query_fasta,db=blast_db,out=result_xml,
                              dust='no',word_size=10,outfmt=5)
        blastn_cline()

        with open(result_xml,'r') as input_handle:
            blast_qresult = SearchIO.read(input_handle, 'blast-xml')

        for hit in blast_qresult:
            for hsp in hit:
                run_length = longestExactRun(hsp)
                if ignore_self and run_length >= len(sequence):
                    continue
                if run_length >= MIN_HOMOLOGY:
                    return True
        return False

    empty_result = pd.DataFrame(columns=['probe','sequence'])

    # stage 1 + 2: windows with a GC clamp in the last two bases
    # (the original sliced the list `trial_20mers` instead of the string)
    big_list_25mers = [str(record.seq) for record in SeqIO.parse('bc25mer.240k.fasta','fasta')]
    big_list_20mers = []
    for trial_25mer in big_list_25mers:
        for start in range(len(trial_25mer) - K + 1):
            trial_20mer = trial_25mer[start:start + K]
            if trial_20mer[-1] in 'GC' or trial_20mer[-2] in 'GC':
                big_list_20mers.append(trial_20mer)
    # duplicates would make a primer look homologous to "another" candidate
    big_list_20mers = list(dict.fromkeys(big_list_20mers))

    # stage 3: no homopolymer triplets
    pass_list = [primer for primer in big_list_20mers
                 if not (('CCC' in primer) or ('TTT' in primer)
                         or ('AAA' in primer) or ('GGG' in primer))]

    # stage 4: GC content window
    pass_list2 = [primer for primer in pass_list if 40 <= gcCheck(primer) <= 50]

    # stage 5: melting temperature window
    # (the original appended to pass_list, leaving pass_list3 empty)
    pass_list3 = [primer for primer in pass_list2
                  if 70.0 <= mt.Tm_NN(primer,Na=300,dnac1=25,dnac2=0) <= 80.0]

    # stage 6: hg38 database
    pass_list4 = [primer for index, primer in enumerate(pass_list3)
                  if not hasHomology(primer, primer+'_'+str(index), 'primer_test_h3g8.fasta',
                                     '/home/dps/merfish/blastdb/hg38/GRCh38',
                                     'primer_check_hg38.xml')]

    # stage 7: tRNA / rRNA database
    pass_list5 = [primer for index, primer in enumerate(pass_list4)
                  if not hasHomology(primer, primer+'_'+str(index), 'primer_test_ncRNA.fasta',
                                     '/home/dps/merfish/blastdb/tRNA/db/tRNA',
                                     'primer_check_ncRNA.xml')]

    # makeblastdb cannot build a database from an empty FASTA file
    if not pass_list5:
        return empty_result

    # stage 8: build a database of the candidates themselves ...
    candidate_records = [SeqRecord(Seq(primer,generic_dna),id=primer+'_'+str(index),
                                   description="potential primer "+str(index))
                         for index, primer in enumerate(pass_list5)]
    # 'w' rather than 'a' so that a re-run does not accumulate stale candidates
    with open('/home/dps/merfish/blastdb/primer/primer_candidates.fasta','w') as output_handle:
        SeqIO.write(candidate_records,output_handle,"fasta")

    cline = NcbimakeblastdbCommandline(dbtype='nucl',input_file='/home/dps/merfish/blastdb/primer/primer_candidates.fasta',
                                       title='primer',out='/home/dps/merfish/blastdb/primer/db/primer')
    cline()

    # ... and drop every candidate that cross-hybridises with another one
    # (the original appended to the list it was iterating over)
    pass_list6 = [primer for index, primer in enumerate(pass_list5)
                  if not hasHomology(primer, primer+'_'+str(index), 'primer_test_final.fasta',
                                     '/home/dps/merfish/blastdb/primer/db/primer',
                                     'primer_test_final.xml', ignore_self=True)]

    # place each primer in dataframe
    df_primers = pd.DataFrame({'probe': list(range(len(pass_list6))),
                               'sequence': pass_list6},
                              columns=['probe','sequence'])

    return df_primers

In [None]:
# check if potential primers dataframe is saved to disk
# if true, load it
# if not, generate it (need to add checks?)

## Experimental design
1. Create experiment specific codebook using two-colors, number of genes, and known codes from Zhuang lab.
2. Assemble primers, encoding, and readout probes
3. Save experimental design into the Pandas structure.

In [None]:
def defineCodebook():
    """Build the experiment-specific codebook for the gene panel (not implemented yet).

    The original body contained only comments, which is a syntax error. The
    planned steps are kept below and the function fails loudly until they
    are written.

    Raises
    ------
    NotImplementedError
        Always.
    """

    # automatically assign low probe count genes (<30) to Alexa647

    # randomly assign remaining genes between Cy3B and Alexa647

    # loop over all genes

    # randomly assign probes for each gene to readout set A (readouts round 1 & 3) and readout set B (readout rounds 2 & 4)

    # create codebook for a given gene

    # return codebook for panel
    raise NotImplementedError('defineCodebook has not been written yet')

In [None]:
def constructEncodingWithReadoutProbes():
    """Join encoding probes with their readout sequences (not implemented yet).

    The original definition had no body, which is a syntax error.

    Raises
    ------
    NotImplementedError
        Always.
    """
    raise NotImplementedError('constructEncodingWithReadoutProbes has not been written yet')

In [None]:
def constructFullProbes():
    """Assemble the full probe sequences for ordering (not implemented yet).

    The original definition had no body, which is a syntax error.

    Raises
    ------
    NotImplementedError
        Always.
    """
    raise NotImplementedError('constructFullProbes has not been written yet')

In [None]:
# define barcode strategy
# 14-bit vs 16-bit
hamming_bit = '16-bit'

# define readout probe strategy
# standard (3 readout per probe) vs amplified (2 readout per probe)
# if standard, generate denovo probes or use known probes?
readout_strategy = 'amplified'

# define target genome
genome = 'hg38'

# define target genes
# use the refGene names (gene symbols) listed in the UCSC browser
# TO DO: isoform and/or exon specific targeting
# TO DO: create function to parse & load output of gene selection software
# NOTE(review): 'POL1A' and 'POL2A' may have been meant as 'POLR1A' and
# 'POLR2A' -- confirm against refGene before ordering probes
gene_ids=['ACTA1','ACTA2','NOS3','VEGFA','VEGFB',
          'VEGFC','VEGFD','KDR','HIF1A','EPAS1','HIF3A',
          'IGF1','IGF1R','HGF','ALK','COL18A1',
          'BMPR1A','CD34','VWF','KRT18','ACE2',
          'RPTOR','RICTOR','MTOR','PIK3CA','SFTPB',
          'SFTPC','EPCAM','MYRF','TMPRSS2','CTSL',
          'POL1A','POL2A']

# load modified hamming codes
# (the setting is stored in hamming_bit; `bit` was never defined)
df_barcodes = loadHammingFromDisk(hamming_bit)

# load (or generate) readout probe sequences
# TO DO: version tracking linked to specific readout probe orders
# (there is no loadReadoutProbes function; dispatch to the two loaders that exist)
if readout_strategy == 'amplified':
    df_readout_probes = loadAmplifiedReadoutSequences()
else:
    df_readout_probes = loadReadoutSequences()

# generate encoding probe sequences
# (the assembly name is stored in genome; `database` was never defined)
df_encoding_probes = generateEncodingProbes(genome,gene_ids)

# the four steps below were left as dangling assignments (syntax errors);
# they now call the functions this notebook defines for each step
# NOTE(review): several of these are still unfinished stubs -- confirm the
# intended arguments once they are implemented

# generate codebook
df_codebook = defineCodebook()

# generate encoding+readout probe sequences
df_encoding_readout_probes = constructEncodingWithReadoutProbes()

# generate forward and reverse primer sequences for this set of encoding+readout probes
df_primers = generatePrimers(df_encoding_readout_probes)

# generate full probe sequences for ordering
df_full_probes = constructFullProbes()

# save all dataframes to disk with unique identifiers and all settings


In [None]:
# scratch cell: fetch every refGene entry (isoform) of VEGFA from UCSC
db='hg38'
# pass the name defined on the line above; `database` is not defined here
g = Genome(db=db)
#obj = g.knownCanonical.filter_by(name=g.kgXref.geneSymbol('VEGFA')).first()
objs=g.refGene.filter_by(name2="VEGFA").all()

In [None]:
# scratch cell: find the exons that occur in every isoform in `objs`
# collect the exons of each isoform as hashable tuples
all_isoforms = [[tuple(exon) for exon in obj.exons] for obj in objs]

# intersection across all isoforms
# (the earlier pairwise loop also compared each isoform with itself, so it
# collected every exon of every isoform, i.e. the union)
if all_isoforms:
    shared_exons = set(all_isoforms[0]).intersection(*all_isoforms[1:])
else:
    shared_exons = set()
res_list = sorted(shared_exons)

# one row per shared exon; the list is already sorted and free of
# duplicates, so no np.unique call is needed (without an axis argument
# np.unique would flatten the exon tuples into single coordinates)
temp=np.array(res_list)
temp_uniq=temp
if len(temp_uniq) > 0:
    print(temp_uniq[0])
else:
    print('no exon is shared by all isoforms')