#  T-cell vaccine design
Design vaccines to elicit a T-cell response by optimising coverage of potential T-cell epitopes (PTEs).

In this notebook the terms k-mer, epitope and node are used interchangeably, because each k-mer is a potential T-cell epitope (PTE) and is represented as a node in the k-mer graph.

In [None]:
from Bio import SeqIO
import igviz as ig
from itertools import product
import networkx as nx

In [None]:
# User settings - change these to match your data
# Path to the FASTA file that is parsed for the input sequences
fasta_path = '../data/nucleoprotein/3_nuc_pro_uniq.fa'
# Length of each k-mer, i.e. of each potential T-cell epitope (PTE)
k = 9

In [None]:
def seqs_to_kmers(seqs, k=9):
    """
    Returns a dictionary of all possible k-mers and their counts for a given list of sequences and value of k
    Every occurrence is counted, so a k-mer that occurs several times within a single sequence
    adds several to its count. Sequences shorter than k contribute no k-mers.
    :param seqs: List of amino acid sequences
    :param k: Integer for substring length (must be at least 1)
    :returns: Dictionary containing all possible k-mers and their counts
    :raises ValueError: If k is less than 1
    """
    # A window length below 1 would silently produce empty or arbitrary substrings
    if k < 1:
        raise ValueError("k must be a positive integer, got {!r}".format(k))
    kmers_dict = {}
    for seq in seqs:
        # Slide a window of length k along the sequence (no windows if the sequence is shorter than k)
        for i in range(len(seq) - k + 1):
            kmer = seq[i:i+k]
            # TODO: Only count a k-mer once per sequence
            # TODO: Calculate the epitope frequency instead of just a count
            kmers_dict[kmer] = kmers_dict.get(kmer, 0) + 1
    return kmers_dict


def P(G, e):
    """
    Looks up the nodes that have an edge pointing to node e in graph G
    :param G: Directed Graph containing k-mers
    :param e: String for a given potential T-cell epitope (PTE)
    :returns: List of the predecessor nodes of e
    """
    incoming = G.predecessors(e)
    # Materialise the result so that callers get a plain list
    return [node for node in incoming]


def S(G, e):
    """
    Looks up the nodes that node e has an edge pointing to in graph G
    :param G: Directed Graph containing k-mers
    :param e: String for a given potential T-cell epitope (PTE)
    :returns: List of the successor nodes of e
    """
    outgoing = G.successors(e)
    # Materialise the result so that callers get a plain list
    return [node for node in outgoing]


def f(G, e):
    """
    Reads the value stored under the 'Frequency' attribute of node e in graph G
    :param G: Directed Graph containing k-mers
    :param e: String for a given potential T-cell epitope (PTE)
    :returns: Integer for the epitope frequency
    """
    node_attributes = G.nodes[e]
    return node_attributes['Frequency']


def construct_graph(kmers_dict):
    """
    Return a Directed Graph with unique k-mers as nodes, where overlapping k-mers are connected by edges
    Each k-mer node stores its count in the 'Frequency' attribute. Two extra nodes, 'BEGIN' and 'END'
    (both with Frequency 0), are added: 'BEGIN' points to every k-mer without predecessors and every
    k-mer without successors points to 'END'.
    :param kmers_dict: Dictionary containing k-mers and their counts
    :returns: Directed Graph containing k-mers
    """
    # Create graph
    G = nx.DiGraph()
    # Add nodes - for each unique k-mer
    for kmer, count in kmers_dict.items():
        G.add_node(kmer, Frequency=count)
    # Index the k-mers by their first k-1 characters, so the successors of a k-mer can be
    # looked up directly instead of comparing every possible pair of nodes
    kmers_by_prefix = {}
    for kmer in kmers_dict:
        kmers_by_prefix.setdefault(kmer[:-1], []).append(kmer)
    # Add edges - where the last k-1 characters of e_a match the first k-1 characters of e_b
    for e_a in kmers_dict:
        for e_b in kmers_by_prefix.get(e_a[1:], ()):
            G.add_edge(e_a, e_b)
    # Select the first and last k-mers before the BEGIN and END nodes exist; otherwise these two
    # edgeless nodes would be selected as well, giving BEGIN->BEGIN, END->END and BEGIN->END edges
    begin_nodes = [e for e in G.nodes if not P(G, e)]
    end_nodes = [e for e in G.nodes if not S(G, e)]
    # Add begin and end nodes
    G.add_node('BEGIN', Frequency=0)
    G.add_node('END', Frequency=0)
    for n in begin_nodes:
        G.add_edge('BEGIN', n)
    for n in end_nodes:
        G.add_edge(n, 'END')
    return G

## Load the FASTA sequences

In [None]:
# Parse the FASTA file inside a context manager so that the file handle is closed afterwards
with open(fasta_path) as fasta_file:
    fasta_seqs = SeqIO.parse(fasta_file, 'fasta')
    # Map record ID -> sequence string; records sharing an ID overwrite each other (last one wins)
    seqs_dict = {fasta.id: str(fasta.seq) for fasta in fasta_seqs}
seqs = list(seqs_dict.values())

## Split into k-mers
Compute all possible k-mers of length `k` for the given sequences

In [None]:
# Count every k-mer of length k across all of the loaded sequences
kmers_dict = seqs_to_kmers(seqs, k)

In [None]:
# Print one line per k-mer: the k-mer and its count separated by a tab
for epitope, occurrences in kmers_dict.items():
    print(f"{epitope}\t{occurrences}")

## Construct the k-mer graph

In [None]:
# Build the directed k-mer graph; each node stores its k-mer count in the 'Frequency' attribute
G = construct_graph(kmers_dict)

## Plot the k-mer graph

In [None]:
# Plot the graph with igviz using the 'spring' layout; the 'Frequency' node attribute is passed
# as both the colour method and the node text
fig = ig.plot(G, color_method='Frequency', node_text=['Frequency'], layout='spring') # 'spiral'
fig.show()

In [None]:
# import matplotlib.pyplot as plt
# freq = list(nx.get_node_attributes(G,'Frequency').values())
# nx.draw(G, node_color=freq, node_size=10)
# plt.show()