#  T-cell vaccine design
By optimising coverage of potential T-cell epitopes (PTEs)

In [None]:
from Bio import SeqIO

In [None]:
# User-configurable settings: edit these to match your own dataset.
k = 9  # k-mer (substring) length used when splitting the sequences
fasta_path = '../data/nucleoprotein/3_nuc_pro_uniq.fa'  # input FASTA file

In [None]:
def seqs_to_kmers(seqs, k=9):
    """
    Count every k-mer (substring of length k) found in a collection of sequences.

    Each sequence is scanned with a sliding window of width ``k`` that advances
    one position at a time. Counts are occurrence counts: a k-mer that appears
    several times within the same sequence is counted once per occurrence.
    Sequences shorter than ``k`` contribute nothing.

    :param seqs: List of amino acid sequences
    :param k: Integer for substring length (must be at least 1)
    :returns: Dictionary mapping each observed k-mer to its total count
    :raises ValueError: If k is smaller than 1
    """
    if k < 1:
        # A non-positive k would otherwise silently produce empty or
        # wrong-length slices rather than real k-mers.
        raise ValueError("k must be a positive integer, got {!r}".format(k))

    kmers = {}
    for seq in seqs:
        # range() is empty when the sequence is shorter than k.
        for start in range(len(seq) - k + 1):
            kmer = seq[start:start + k]
            kmers[kmer] = kmers.get(kmer, 0) + 1
    return kmers

## Load the FASTA sequences

In [None]:
# Read every record into a dict of {record ID: sequence string}.
# SeqIO.parse yields records lazily, so the dict is built while the file is
# still open; the `with` block then closes the handle once parsing is done.
# NOTE: records that share an ID overwrite one another (the last one wins).
with open(fasta_path) as fasta_handle:
    fasta_seqs = SeqIO.parse(fasta_handle, 'fasta')
    seqs = {fasta.id: str(fasta.seq) for fasta in fasta_seqs}

## Split into k-mers
Compute all possible k-mers of length `k` for the given sequences

In [None]:
# Count k-mers over the sequence strings only; the record IDs are not needed.
kmers = seqs_to_kmers([*seqs.values()], k=k)

In [None]:
# Emit one tab-separated "<k-mer>\t<count>" line per distinct k-mer.
for peptide, occurrences in kmers.items():
    print(peptide, occurrences, sep="\t")