#  T-cell vaccine design
Design vaccines to elicit a T-cell response by optimising coverage of potential T-cell epitopes (PTEs)

Here the term epitope `e` refers to a potential T-cell epitope (PTE), which is a short subsequence of k amino acids and is also represented as a node in the k-mer graph.

In [None]:
from Bio import SeqIO
import igviz as ig
from itertools import product
import networkx as nx

In [None]:
# Settings - change these to match your data
# Path to the input FASTA file that is loaded further down
fasta_path = '../data/nucleoprotein/3_nuc_pro_uniq.fa'
# Length k of the k-mers (potential T-cell epitopes) the sequences are split into
k = 9

In [None]:
def seqs_to_kmers(seqs, k=9):
    """
    Returns a dictionary of all k-mers found in a list of sequences, together with how many sequences contain them
    :param seqs: List of amino acid sequences
    :param k: Integer for substring length (must be at least 1)
    :returns: Dictionary mapping each k-mer to a dictionary with the keys
        'n' (number of sequences containing the k-mer),
        'pos' (1-based position of its first occurrence in the first sequence that contains it) and
        'freq' (n divided by the total number of sequences)
    :raises ValueError: If k is smaller than 1
    """
    if k < 1:
        raise ValueError('k must be a positive integer')
    kmers_dict = {}
    N = len(seqs)
    for seq in seqs:
        # Get a unique set of k-mers for each sequence with the position they were first found in
        seq_kmers = {}
        # Slide the window over the length of *this sequence* (not the number of sequences);
        # a sequence shorter than k gives an empty range and contributes no k-mers
        for i in range(len(seq) - k + 1):
            # setdefault keeps the position of the first occurrence only
            seq_kmers.setdefault(seq[i:i+k], i + 1)
        # Count the number of seqs for each k-mer (n)
        for e, pos in seq_kmers.items():
            if e in kmers_dict:
                kmers_dict[e]['n'] += 1
            else:
                kmers_dict[e] = {'n': 1, 'pos': pos}
    # Calculate epitope frequency (kmers_dict is empty when seqs is empty, so N is never 0 here)
    for e in kmers_dict:
        kmers_dict[e]['freq'] = kmers_dict[e]['n']/N
    return kmers_dict


def path_to_seq(path):
    """
    Returns the AA string spelled out by a list of overlapping epitopes (path)
    :param path: Non-empty list of epitope strings, each overlapping the previous one
    :returns: String made of the first epitope followed by the last character of every later epitope
    """
    # The first epitope is kept whole; each later epitope only adds its final character
    pieces = [path[0]]
    pieces.extend(epitope[-1] for epitope in path[1:])
    return ''.join(pieces)


def argmax(lst):
    """
    Returns the index of the largest value in a list (the first such index if there are ties)
    """
    # max() keeps the first maximal candidate, so ties resolve to the lowest index
    return max(range(len(lst)), key=lst.__getitem__)


def P(G, e):
    """
    Returns the predecessors P(e) of node e in graph G
    :param G: Directed Graph containing k-mers
    :param e: String for a given potential T-cell epitope (PTE)
    :returns: List of the nodes yielded by G.predecessors(e)
    """
    # Materialise the iterator so callers can index the result and test it for emptiness
    return [*G.predecessors(e)]


def S(G, e):
    """
    Returns the successors S(e) of node e in graph G
    :param G: Directed Graph containing k-mers
    :param e: String for a given potential T-cell epitope (PTE)
    :returns: List of the nodes yielded by G.successors(e)
    """
    # Materialise the iterator so callers can index the result and test it for emptiness
    return [*G.successors(e)]


def f(G, e, f='Frequency'):
    """
    Returns a stored feature of epitope e, eg its frequency in the population
    :param G: Directed Graph containing k-mers
    :param e: String for a given potential T-cell epitope (PTE)
    :param f: String for the name of the node feature to read (default = 'Frequency')
    :returns: Value stored under that feature name for the node (a KeyError is raised if it is missing)
    """
    node_features = G.nodes[e]
    return node_features[f]


def F(G, e):
    """
    Returns the maximum total frequency over all paths that end in e
    The result is cached on the node under the 'F(e)' attribute, so each epitope is only computed once
    :param G: Directed Graph containing k-mers
    :param e: String for a given potential T-cell epitope (PTE)
    :returns: Float for the maximum total epitope frequency
    """
    already_scored = 'F(e)' in G.nodes[e]
    if not already_scored:
        # Base case: with no predecessors, F(e) = f(e)
        score = f(G, e)
        incoming = P(G, e)
        if incoming:
            # Otherwise F(e) = f(e) + max(F(P(e))), computed recursively
            score = score + max(F(G, prev) for prev in incoming)
        # Store F(e) on the node so later calls reuse it
        nx.set_node_attributes(G, {e: score}, 'F(e)')
    return f(G, e, f='F(e)')


def backward(G, path=None):
    """
    Returns the path that achieves the maximal score
    Walks backwards from the first epitope of the path, repeatedly prepending the predecessor
    with the highest precomputed F(e), until an epitope whose first predecessor is 'BEGIN'
    (or that has no predecessors at all) is reached
    :param G: Directed Graph containing k-mers, with F(e) already stored on its nodes
    :param path: List of epitope strings to complete (default=None, which starts from the epitope
        with the highest F(e)); a non-empty list is extended in place
    :returns: List of epitope strings on path that achieve maximum score
    :raises ValueError: If no node has an F(e) value, or if the walk would revisit an epitope
    """
    # Get the precomputed F(e) from the graph for all epitopes
    Fe_dict = nx.get_node_attributes(G, 'F(e)')
    if not path:
        if not Fe_dict:
            raise ValueError("No 'F(e)' values found on the graph - run the forward loop first")
        # Get the epitope with the maximum F(e) as the final epitope in our optimal path
        path = [max(Fe_dict, key=Fe_dict.get)]
    # Iterate instead of recursing so that long paths cannot hit the recursion limit
    visited = set(path)
    while True:
        # Get the most recently added epitope e and its predecessors P(e)
        predecessors = P(G, path[0])
        # Stop at the start of the graph (also covers a node with no predecessors, eg 'BEGIN' itself)
        if not predecessors or predecessors[0] == 'BEGIN':
            break
        # Add the best (highest F(e)) predecessor P(e) of epitope e to our path
        best = predecessors[argmax([Fe_dict[pe] for pe in predecessors])]
        if best in visited:
            # Guard against looping forever if the graph contains a cycle
            raise ValueError('Cycle detected while building the path at epitope %r' % (best,))
        visited.add(best)
        path.insert(0, best)
    return path


def construct_graph(kmers_dict):
    """
    Return a Directed Graph with unique k-mers as nodes, where overlapping k-mers are connected by edges
    Two extra nodes, 'BEGIN' and 'END', are linked to every k-mer without predecessors / successors
    :param kmers_dict: Dictionary mapping each k-mer to a dictionary with at least the keys 'freq' and 'pos'
    :returns: Directed Graph containing k-mers (node attributes 'Frequency' and 'pos')
    """
    # Create graph
    G = nx.DiGraph()
    # Add nodes - for each unique k-mer
    for e, info in kmers_dict.items():
        freq = info['freq']
        # 'pos' is stored as a (position, frequency) pair
        G.add_node(e, Frequency=freq, pos=(info['pos'], freq))
    # Index the k-mers by their first k-1 characters so that overlaps are found by lookup
    # rather than by comparing every pair of nodes
    by_prefix = {}
    for eb in kmers_dict:
        by_prefix.setdefault(eb[:-1], []).append(eb)
    # Add edges - where the last k-1 characters of ea match the first k-1 characters of eb
    # (same edges, added in the same order, as an all-pairs scan over the nodes)
    for ea in kmers_dict:
        for eb in by_prefix.get(ea[1:], ()):
            G.add_edge(ea, eb)
    # Add begin and end nodes
    begin_nodes = [e for e in G.nodes if not P(G, e)]
    end_nodes = [e for e in G.nodes if not S(G, e)]
    # Place END one position after the last k-mer (default=0 keeps an empty input from failing)
    end_pos = max((pos[0] for pos in nx.get_node_attributes(G, 'pos').values()), default=0) + 1
    G.add_node('BEGIN', Frequency=0, pos=(0, 0))
    G.add_node('END', Frequency=0, pos=(end_pos, 0))
    for e in begin_nodes:
        G.add_edge('BEGIN', e)
    for e in end_nodes:
        G.add_edge(e, 'END')
    return G

## Simple example 

In [None]:
# Define the input epitopes: a small hand-written table of 3-mers, each with the
# 'freq' and 'pos' fields that construct_graph reads
# NOTE: 'freq' holds integers here (presumably counts) rather than fractions
kmers_dict = {
    'MSA': {'freq':6, 'pos':1},
    'SAM': {'freq':2, 'pos':2},
    'AMQ': {'freq':2, 'pos':3},
    'MQL': {'freq':2, 'pos':4},
    'SAR': {'freq':4, 'pos':2},
    'MGA': {'freq':3, 'pos':1},
    'GAR': {'freq':7, 'pos':2},
    'ARQ': {'freq':4, 'pos':3},
    'RQL': {'freq':4, 'pos':4},
}

# Construct the graph
G = construct_graph(kmers_dict)

# Forward loop - compute F(e) for every node (F stores the result on the node as 'F(e)')
for e in G.nodes:
    F(G, e)

# Backward loop - build the path that achieves the maximal score
path = backward(G)
# Collapse the overlapping epitopes on the path into a single sequence and print it
print(path_to_seq(path))

# Plot the results, colouring nodes by 'Frequency' and showing 'Frequency' and 'F(e)' as node text
fig = ig.plot(G, color_method='Frequency', node_text=['Frequency', 'F(e)']) # , layout='spectral'
fig.show()

In [None]:
# Backward loop - build the path that achieves the maximal score
# (uses the F(e) values already stored on G)
path = backward(G)
# Collapse the path into a sequence; this is the cell's last expression
path_to_seq(path)

## Load the FASTA sequences

In [None]:
# Read the FASTA file inside a context manager so the file handle is always closed
with open(fasta_path) as fasta_handle:
    fasta_seqs = SeqIO.parse(fasta_handle, 'fasta')
    # Map record id -> sequence string (records that share an id overwrite each other)
    seqs_dict = {fasta.id: str(fasta.seq) for fasta in fasta_seqs}
# Plain list of sequence strings used for the k-mer split
seqs = list(seqs_dict.values())

## Split into k-mers
Compute all possible k-mers of length `k` for the given sequences

In [None]:
# Build the k-mer table (keys 'n', 'pos' and 'freq' per k-mer) from the loaded sequences
kmers_dict = seqs_to_kmers(seqs, k)

## Construct the k-mer graph

In [None]:
# Build the directed k-mer graph (replaces any graph G created in an earlier cell)
G = construct_graph(kmers_dict)

## Plot the k-mer graph

In [None]:
# Plot the graph, colouring nodes by 'Frequency' and showing 'Frequency' and 'pos' as node text
fig = ig.plot(G, color_method='Frequency', node_text=['Frequency', 'pos']) # layout='spiral','spring'
fig.show()

## Assembly 
Take a path through the graph to optimise epitope frequency

In [None]:
# TODO: Add decycling

# Forward loop - compute F(e) for every node (F stores the result on the node as 'F(e)')
for e in G.nodes:
    F(G, e)

# Backward loop - build the path that achieves the maximal score
path = backward(G)
# Collapse the overlapping epitopes on the path into a single sequence
path_to_seq(path)

# TODO: Investigate why the final sequence is so short (due to the lack of decycling?)
# TODO: Add additional measures to scoring function eg binding affinity prediction