In [1]:
import pandas as pd
from Bio.SeqIO import parse
from Bio.Seq import reverse_complement
from Bio.pairwise2 import align
from Bio.pairwise2 import format_alignment

# Reference gene sequences keyed by FASTA record id. Built once, because the
# content does not depend on the pool layout. If both files share an id, the
# entry from fungicide_genes.fa wins (it is unpacked last).
ref = {
    **{r.id: str(r.seq) for r in parse('../data/reference/crl75_392_genes.fa', 'fasta')},
    **{r.id: str(r.seq) for r in parse('../data/reference/fungicide_genes.fa', 'fasta')},
}

# A primer orientation only moves an amplicon boundary when its best local
# alignment against the gene scores above this value (each match scores +1).
MIN_PRIMER_SCORE = 15

# For each pooling layout: load the primer table, derive gene/version/part
# from the primer name, estimate the amplified region of each gene from the
# primer alignments and write the annotated table back to disk.
for n_pools in [3, 4]:
    df = pd.read_excel(f'../data/primers/282_pairs_{n_pools}_pools.xlsx')
    # Split 'Name' at '_v' into a gene label and a version string
    # (assumes exactly one '_v' per name -- TODO confirm against the sheet).
    df[['gene', 'version']] = df['Name'].str.split('_v', expand=True).values
    # Labels starting with 'P' are kept whole; every other label is cut down
    # to the text before its first underscore.
    df['gene'] = [s.split('_')[0] if not s.startswith('P') else s for s in df.gene]
    # A single digit directly in front of '_v' is stored as the part; '-' when absent.
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    df['part'] = df.Name.str.extract(r'_(\d)_v').fillna('-')
    # Normalise headers to lower case with underscores ('Forward Sequence' -> 'forward_sequence').
    df.columns = ['_'.join(s.lower().split(' ')) for s in df]

    boundaries = []
    for i, (_, row) in enumerate(df.iterrows(), start=1):
        gene_seq = ref[row.gene].upper()
        f = row.forward_sequence
        r = row.reverse_sequence
        length = len(gene_seq)
        # Default to the whole gene; primer hits narrow it down below.
        start, end = 0, length
        # Try both primers in both orientations.
        for primer in [f, r, reverse_complement(f), reverse_complement(r)]:
            # Scoring: match +1, mismatch -1, gap open -1, gap extend -1.
            alignments = align.localms(gene_seq, primer, 1, -1, -1, -1)
            if not alignments:
                # No local alignment at all: treat it like a hit below the threshold.
                continue
            alignment = alignments[0]
            if alignment.score > MIN_PRIMER_SCORE:
                if alignment.start > length / 2:
                    # Hit in the second half of the gene: it marks the amplicon end.
                    end = alignment.start + len(primer)
                else:
                    # Hit in the first half of the gene: it marks the amplicon start.
                    start = alignment.start
        boundaries.append((start, end, length))
        # Lightweight progress indicator: print every tenth processed row.
        if not i % 10:
            print(i, end=' ')

    amplicon_boundaries = pd.DataFrame(boundaries, columns=['start', 'end', 'gene_length'])
    amplicon_boundaries['amplicon_length'] = amplicon_boundaries.end - amplicon_boundaries.start
    amplicon_boundaries['amplicon_pct'] = 100 * amplicon_boundaries.amplicon_length / amplicon_boundaries.gene_length
    # Both frames carry the same positional index, so a plain index join lines the rows up.
    df = df.join(amplicon_boundaries)
    # The output name encodes the number of pairs, distinct genes and the highest pool number.
    df.to_excel(f'../data/primers/{df.shape[0]}_pairs_{df.gene.nunique()}_genes_{df.pool.max()}_pools.xlsx', index=False)

10 20 30 40 50 60 70 80 90 100 110 120 130 140 150 160 170 180 190 200 210 220 230 240 250 260 270 280 10 20 30 40 50 60 70 80 90 100 110 120 130 140 150 160 170 180 190 200 210 220 230 240 250 260 270 280 

In [2]:
# Export the reference subset (FASTA + GFF) for the genes present in the primer table.
# NOTE: `df` and `ref` are taken from the previous cell; `df` is the table left
# behind by the last iteration of its pool loop.
genes = sorted(set(df.gene))
with open(f'../data/reference/{len(genes)}_genes.fa', 'w') as fasta:
    # Iterate over the unique gene ids. df.gene holds one entry per primer
    # pair, so looping over it would write a gene once per pair and produce
    # duplicate FASTA records in a file whose name promises len(genes) entries.
    for gene in genes:
        fasta.write('>' + gene + '\n' + ref[gene] + '\n')

# Stack both annotation files (tab-separated, no header row, '#' lines skipped).
gff = pd.concat([
    pd.read_csv('../data/reference/crl75_392_genes.gff', comment='#', sep='\t', header=None),
    pd.read_csv('../data/reference/fungicide_genes.gff3', comment='#', sep='\t', header=None),
])
# Keep only the rows whose first column is one of the selected gene ids,
# and write them without header or index.
gff[gff[0].isin(genes)].to_csv(f'../data/reference/{len(genes)}_genes.gff', sep='\t', header=False, index=False)