In [1]:
# Notebook configuration: input directory, which protein family to align, and
# the file paths derived from them. All paths are relative to the notebook's
# working directory.
root_dir = "../../Downloads/GENOME540/"
protein = "ferritins" # other input names noted by the author: "bacterioferritins" "transthyretins" "hemoglobins" "insulins"
# FASTA input is expected at <root_dir><protein>.txt
fastas_path = root_dir + protein + ".txt"
score_matrix_path = root_dir + "blosum62.txt"
# NOTE(review): the output name does not include `protein`, so every protein
# family shares the same "_wdag.txt" file -- confirm this is intended.
output_path = root_dir + "_wdag.txt"

In [2]:
def read_fastas(filename):
    """Read a FASTA file and return its sequences plus the symbols they use.

    The file is split on '>' into records. The first line of each record is
    its title; everything after it is the sequence, with all whitespace
    (line breaks, trailing blanks, blank lines) removed.

    Args:
        filename: path of the FASTA file to read.

    Returns:
        A ``(sequences, alphabet)`` tuple. ``sequences`` is the list of
        sequence strings in file order; records sharing a title collapse into
        one entry holding the last sequence seen for that title. ``alphabet``
        is the sorted list of distinct characters occurring in the sequences,
        followed by the gap symbol ``"-"``.
    """
    records = {}
    with open(filename, 'r') as f:
        for chunk in f.read().split('>'):
            if not chunk.strip():
                # Whitespace before the first '>' (or between two '>') is not
                # a record; treating it as one would add a bogus empty
                # sequence at the front of the list.
                continue
            # partition() tolerates a header with no line break after it
            # (e.g. a title on the very last line); such a record simply gets
            # an empty sequence instead of raising on tuple unpacking.
            title, _, body = chunk.partition('\n')
            records[title] = "".join(body.split())
    fastas = list(records.values())
    # Sort the alphabet so it is the same on every run; plain set iteration
    # order for strings changes between interpreter sessions.
    return fastas, sorted(set("".join(fastas))) + ["-"]

# Load the sequences and the alphabet they use; the bare `fastas` expression
# makes the notebook display the loaded sequences as the cell output.
fastas, aminos = read_fastas(fastas_path)
fastas

['MALAPSKVSTFSGFSPKPSVGGAQKNPTCSVSLSFLNEKLGSRNLRVCASTVPLTGVIFEPFEEVKKSELAVPTAPQVSLARQNYADECESAINEQINVEYNASYVYHSLFAYFDRDNVALKGFAKFFKESSEEEREHAEKLMKYQNTRGGRVVLHPIKNAPSEFEHVEKGDALYAMELALSLEKLVNEKLLNVHSVADRNNDPQMADFIESEFLSEQVESIKKISEYVAQLRRVGKGHGVWHFDQRLLD',
 'MLLKAAPAFALLNTQGENLSPLFSSSKSFSPKNGNRFVVSASKATNHKPLTGVVFEPFEELKKELMLVPAVPDTSLCRQKYSDDCEAAINEQINVEYNNSYVYHAMFAYFDRDNVALKGLAKFFKESSLEEREHAEKLMEFQNKRGGRVKLLSICAPPTEFDHCEKGDALYAMELALCLEKLTNQRLLNLHAVASRSNDVHLADFLESEFLVEQVDAIKKISEYVAQLRRVGQGHGVWQFDQMLLNEGAAA',
 'MMLRVSPSPAAAVPTQLSGAPATPAPVVRVAAPRGVASPSAGAACRAAGKGKEVLSGVVFQPFEEIKGELALVPQSPDKSLARHKFVDDCEAALNEQINVEYNASYAYHSLFAYFDRDNVALKGFAKFFKESSDEEREHAEKLMEYQNKRGGRVRLQSIVTPLTEFDHPEKGDALYAMELALALEKLVNEKLHNLHGVATRCNDPQLTDFIESEFLEEQGEAINKISKYVAQLRRVGKGHGVWHFDQMLLEEEA']

In [3]:
def read_score_matrix(filename):
    """Read a whitespace-separated substitution matrix into nested dicts.

    The first data line holds the column labels. Every following data line
    holds a row label followed by one integer score per column. Blank lines
    and lines whose first field starts with '#' are ignored, so a trailing
    empty line or a commented header does not break parsing.

    Args:
        filename: path of the matrix file.

    Returns:
        ``matrix`` such that ``matrix[row_label][column_label]`` is the
        integer score. An empty dict if the file has no data rows.

    Raises:
        IndexError: a row has fewer scores than there are column labels.
        ValueError: a score is not an integer.
    """
    score_matrix = {}
    column_labels = None
    with open(filename, 'r') as f:
        for raw_line in f:
            fields = raw_line.split()
            if not fields or fields[0].startswith('#'):
                continue
            if column_labels is None:
                # First data line: the column header.
                column_labels = fields
                continue
            # fields[0] is the row label, so column i's score is fields[i + 1].
            score_matrix[fields[0]] = {
                label: int(fields[position + 1])
                for position, label in enumerate(column_labels)
            }
    return score_matrix

# Scoring parameters used by the rest of the notebook: the substitution matrix
# read from `score_matrix_path`, and the gap penalty passed to get_weight.
blosum62 = read_score_matrix(score_matrix_path)
gap_pen = -6

In [4]:
def get_weight(edge, score_matrix, gap_penalty):
    """Score one three-symbol alignment column as the sum of its three pairs.

    Args:
        edge: three symbols (e.g. the string ``"AR-"``); ``"-"`` marks a gap.
        score_matrix: nested mapping, ``score_matrix[a][b]`` is the score of
            symbol ``a`` against symbol ``b``.
        gap_penalty: score of a pair made of one symbol and one gap.

    Returns:
        The sum over the pairs (1st, 2nd), (1st, 3rd) and (2nd, 3rd). A pair
        of two gaps contributes 0.
    """
    first, second, third = edge

    def pair_score(left, right):
        # Gap against gap carries no score; symbol against gap costs the
        # penalty; two symbols are looked up in the matrix.
        if left == "-" and right == "-":
            return 0
        if left == "-" or right == "-":
            return gap_penalty
        return score_matrix[left][right]

    pairs = ((first, second), (first, third), (second, third))
    return sum(pair_score(left, right) for left, right in pairs)

In [None]:
import itertools

def display_weights(letters, score_matrix, gap_penalty):
    """Print the weight of every possible three-symbol alignment column.

    Every ordered triple drawn from ``letters`` is listed in sorted order,
    one per line, as ``<column> <weight>``. The all-gap column ``---`` is not
    a usable edge and is left out.

    Args:
        letters: the symbols to combine (normally the alphabet plus ``"-"``).
        score_matrix: substitution scores, passed through to get_weight.
        gap_penalty: gap score, passed through to get_weight.
    """
    # Drop the all-gap column by what it is, not by where it sorts: slicing
    # off the first sorted entry would discard a real column whenever some
    # letter sorts before "-" (or when "-" is not among the letters at all).
    edges = sorted(
        "".join(column)
        for column in itertools.product(letters, repeat=3)
        if any(symbol != "-" for symbol in column)
    )
    print("Edge weights:")
    for edge in edges:
        print(edge, get_weight(edge, score_matrix, gap_penalty))

# Print the weight table for the loaded alphabet and scoring parameters.
display_weights(aminos, blosum62, gap_pen)

In [6]:
class Vertex:
    """A graph node: a label plus the list of edges attached to it."""

    def __init__(self, label):
        """Create a vertex called `label` with no edges yet."""
        self.label, self.edges = label, []

    def add_edge(self, edge):
        """Attach `edge` to this vertex (appended after the existing ones)."""
        self.edges.append(edge)

    def sort_edges(self):
        """Rebind `edges` to a new list ordered from heaviest to lightest.

        Edges of equal weight keep their relative order.
        """
        self.edges = sorted(self.edges, key=lambda e: e.weight, reverse=True)

class Edge:
    """A weighted, labelled connection from a `start` vertex to an `end` vertex."""

    def __init__(self, label, start, end, weight):
        """Store the edge's label, its two endpoints and its weight."""
        self.label, self.weight = label, weight
        self.start, self.end = start, end

    def __lt__(self, other):
        """Order edges by weight, so a lighter edge compares as smaller."""
        lighter = self.weight < other.weight
        return lighter

In [7]:
def create_graph(sequences, score_matrix, gap_penalty):
    """Build the three-sequence alignment graph and return its vertices.

    One vertex is created per index triple ``(i, j, k)`` with
    ``0 <= i <= len(seq1)`` and likewise for the other two sequences; it is
    labelled ``"i_j_k"``. From each vertex an edge leads to every neighbour
    reached by advancing any non-empty subset of the three indices by one.
    The edge label has one character per sequence: the consumed symbol for an
    advanced index, ``"-"`` otherwise. Its weight comes from get_weight.

    Args:
        sequences: indexable holding at least three sequences; only the
            first three are used.
        score_matrix: substitution scores, passed through to get_weight.
        gap_penalty: gap score, passed through to get_weight.

    Returns:
        Flat list of all vertices, ordered by i, then j, then k.
    """
    seq1, seq2, seq3 = sequences[0], sequences[1], sequences[2]
    n1, n2, n3 = len(seq1), len(seq2), len(seq3)

    grid = [
        [[Vertex(f"{i}_{j}_{k}") for k in range(n3 + 1)] for j in range(n2 + 1)]
        for i in range(n1 + 1)
    ]

    # Index steps in the order edges are attached to each vertex. The order
    # is significant: it decides which edge wins ties downstream and the
    # order of lines in the written graph file.
    moves = (
        (1, 0, 0),
        (0, 1, 0),
        (0, 0, 1),
        (1, 1, 0),
        (0, 1, 1),
        (1, 0, 1),
        (1, 1, 1),
    )

    flat = []
    for i in range(n1 + 1):
        for j in range(n2 + 1):
            for k in range(n3 + 1):
                source = grid[i][j][k]
                flat.append(source)
                for di, dj, dk in moves:
                    # Skip steps that would run past the end of a sequence.
                    if i + di > n1 or j + dj > n2 or k + dk > n3:
                        continue
                    column = (
                        (seq1[i] if di else "-")
                        + (seq2[j] if dj else "-")
                        + (seq3[k] if dk else "-")
                    )
                    cost = get_weight(column, score_matrix, gap_penalty)
                    target = grid[i + di][j + dj][k + dk]
                    source.add_edge(Edge(column, source, target, cost))

    return flat

# Build the alignment graph for the loaded sequences. create_graph allocates
# one vertex per (i, j, k) index triple, i.e. (n1+1)*(n2+1)*(n3+1) vertices.
verts = create_graph(fastas, blosum62, gap_pen)

In [8]:
def write_graph(vertices, filename, write=False):
    """Dump the graph to a text file, but only when `write` is true.

    Each vertex produces a ``V <label>`` line, followed by one
    ``E <start label> <end label> <weight>`` line per edge in its edge list.

    Args:
        vertices: iterable of vertices exposing `label` and `edges`.
        filename: path of the file to create (overwritten if present).
        write: when false (the default) the function does nothing at all.
    """
    if not write:
        return
    with open(filename, "w") as out:
        for vertex in vertices:
            out.write(f"V {vertex.label}\n")
            out.writelines(
                f"E {edge.start.label} {edge.end.label} {edge.weight}\n"
                for edge in vertex.edges
            )

# `write` is left at its default (False), so this call writes nothing; pass
# write=True to actually create the file at `output_path`.
write_graph(verts, output_path)

In [None]:
from collections import Counter

# Tally how many edges carry each label across the whole graph and list the
# labels in sorted order, one "<label> <count>" line each.
print("Edge counts:")
counts = sorted(
    Counter(edge.label for vertex in verts for edge in vertex.edges).items()
)
for count in counts:
    print(*count)

In [10]:
from collections import deque

def topological_sort(vertices):
    """Return the vertices ordered so that every edge points forward.

    Vertices with no incoming edges come first (in their input order); a
    vertex is emitted once all edges leading into it have been accounted for.

    Args:
        vertices: list of hashable vertices, each with an `edges` list whose
            items expose the target vertex as `end`.

    Returns:
        A new list containing the vertices in topological order.

    Raises:
        RuntimeError: not every vertex could be ordered (e.g. the graph has
            a cycle).
    """
    pending = dict.fromkeys(vertices, 0)
    for vertex in vertices:
        for edge in vertex.edges:
            pending[edge.end] += 1

    # `order` doubles as the FIFO work list: items before `cursor` have been
    # processed, items from `cursor` on are still waiting.
    order = [vertex for vertex in vertices if pending[vertex] == 0]
    cursor = 0
    while cursor < len(order):
        current = order[cursor]
        cursor += 1
        for edge in current.edges:
            target = edge.end
            pending[target] -= 1
            if pending[target] == 0:
                order.append(target)

    if len(order) != len(vertices):
        raise RuntimeError("bad graph")

    return order

def get_path(vertex, longest_paths):
    """Trace the best-scoring path that starts at `vertex`.

    From the current vertex the edge maximising
    ``longest_paths[edge.end] + edge.weight`` is followed (the first such
    edge on ties); the walk stops at the first vertex whose recorded score
    is 0.

    Args:
        vertex: vertex to start from.
        longest_paths: mapping from vertex to the best score obtainable from
            that vertex onwards.

    Returns:
        The edges walked, in reverse travel order (last edge first). Empty
        if the start vertex already has score 0.
    """
    def score_through(edge):
        return longest_paths[edge.end] + edge.weight

    current, walked = vertex, []
    while longest_paths[current] != 0:
        best = max(current.edges, key=score_through)
        walked.append(best)
        current = best.end
    walked.reverse()
    return walked

def highest_weight_path(vertices):
    """Find the highest-weight path in the graph; it may start and end anywhere.

    Vertices are processed in reverse topological order. For each one the
    best score of a path starting there is the larger of 0 (stop right away)
    and the best ``edge.weight + score of edge.end`` over its edges. The path
    is then traced from the first vertex in `vertices` that reaches the
    overall best score.

    Args:
        vertices: list of graph vertices (see topological_sort for the
            attributes they must expose).

    Returns:
        ``(score, labels)`` where ``labels`` are the edge labels of the best
        path from its first to its last edge. ``(0, [])`` when no path has a
        positive score, including when `vertices` is empty.

    Raises:
        RuntimeError: propagated from topological_sort for an invalid graph.
    """
    sorted_vertices = topological_sort(vertices)

    longest_paths = {v: 0 for v in vertices}

    for v in reversed(sorted_vertices):
        # Start from 0: the empty path is always available, so a vertex's
        # score never goes negative.
        best = 0
        for e in v.edges:
            best = max(best, longest_paths[e.end] + e.weight)
        longest_paths[v] = best

    # default=0 keeps an empty graph from raising ValueError; it then falls
    # through to the "no path" result below.
    highest_weight = max(longest_paths.values(), default=0)

    highest_path = []
    for v in vertices:
        if longest_paths[v] == highest_weight:
            highest_path = get_path(v, longest_paths)
            break

    if not highest_path:
        return 0, []
    # get_path hands the edges back last-first; flip them into travel order.
    return highest_weight, [edge.label for edge in reversed(highest_path)]

# Run the search on the full graph. `path` is the list of edge labels of the
# best path; each label is one alignment column, printed one per line.
score, path = highest_weight_path(verts)

print("Score:", score)
print("Local alignment:")
print("\n".join(path))

Score: 2325
Local alignment:
PPV
LLL
TTS
GGG
VVV
IVV
FFF
EEQ
PPP
FFF
EEE
EEE
VLI
KKK
KKG
S--
EEE
LLL
AMA
-LL
VVV
PPP
TAQ
AVS
PPP
QDD
VTK
SSS
LLL
ACA
RRR
QQH
NKK
YYF
ASV
DDD
EDD
CCC
EEE
SAA
AAA
IIL
NNN
EEE
QQQ
III
NNN
VVV
EEE
YYY
NNN
ANA
SSS
YYY
VVA
YYY
HHH
SAS
LML
FFF
AAA
YYY
FFF
DDD
RRR
DDD
NNN
VVV
AAA
LLL
KKK
GGG
FLF
AAA
KKK
FFF
FFF
KKK
EEE
SSS
SSS
ELD
EEE
EEE
RRR
EEE
HHH
AAA
EEE
KKK
LLL
MMM
KEE
YFY
QQQ
NNN
TKK
RRR
GGG
GGG
RRR
VVV
VKR
LLL
HLQ
PSS
III
KCV
NAT
APP
PPL
STT
EEE
FFF
EDD
HHH
VCP
EEE
KKK
GGG
DDD
AAA
LLL
YYY
AAA
MMM
EEE
LLL
AAA
LLL
SCA
LLL
EEE
KKK
LLL
VTV
NNN
EQE
KRK
LLL
LLH
NNN
VLL
HHH
SAG
VVV
AAA
DST
RRR
NSC
NNN
DDD
PVP
QHQ
MLL
AAT
DDD
FFF
ILI
EEE
SSS
EEE
FFF
LLL
SVE
EEE
QQQ
VVG
EDE
SAA
III
KKN
KKK
III
SSS
EEK
YYY
VVV
AAA
QQQ
LLL
RRR
RRR
VVV
GGG
KQK
GGG
HHH
GGG
VVV
WWW
HQH
FFF
DDD
QQQ
RMM
LLL
LLL
DNE


In [11]:
def extract_alignments(alignment):
    """Split alignment columns back into the three gap-free aligned strings.

    Args:
        alignment: sequence of columns, each holding three symbols (one per
            sequence) where ``'-'`` marks a gap.

    Returns:
        A list of three strings; string ``n`` is the concatenation of symbol
        ``n`` of every column, with gaps left out.
    """
    return [
        "".join(column[row] for column in alignment if column[row] != '-')
        for row in range(3)
    ]

# Recover, for each of the three sequences, the gap-free stretch covered by
# the alignment found above.
aligns = extract_alignments(path)

In [12]:
class bcolors:
    """Named ANSI terminal escape sequences for styling printed text.

    Emit one of the style codes before the text and ENDC after it to switch
    the styling off again. This notebook only uses OKGREEN and ENDC (in
    highlight_substring).
    """
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    # Resets all styling back to the terminal default.
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

def highlight_substring(full_string, substring):
    """Print `full_string` with the first occurrence of `substring` in green.

    If `substring` does not occur, the string is printed without any
    highlighting, so every call produces exactly one output line.

    Args:
        full_string: the text to print.
        substring: the part to highlight.

    Returns:
        None when the substring was found and highlighted; `full_string`
        itself when it was not found (kept for backward compatibility).
    """
    start_index = full_string.find(substring)
    if start_index == -1:
        # Previously this branch returned without printing, which silently
        # dropped the sequence from the displayed output.
        print(full_string)
        return full_string
    end_index = start_index + len(substring)

    print(f"{full_string[:start_index]}{bcolors.OKGREEN}{full_string[start_index:end_index]}{bcolors.ENDC}{full_string[end_index:]}")

def highlight_alignments(sequences, alignments):
    """Print each sequence with its aligned stretch highlighted.

    Sequences and alignments are matched up by position; pairing stops at
    the end of the shorter of the two.

    Args:
        sequences: the full sequences.
        alignments: for each sequence, the substring to highlight.
    """
    pairs = zip(sequences, alignments)
    for pair in pairs:
        highlight_substring(*pair)

# Show the three input sequences with the locally aligned region in green.
highlight_alignments(fastas, aligns)

MALAPSKVSTFSGFSPKPSVGGAQKNPTCSVSLSFLNEKLGSRNLRVCASTV[92mPLTGVIFEPFEEVKKSELAVPTAPQVSLARQNYADECESAINEQINVEYNASYVYHSLFAYFDRDNVALKGFAKFFKESSEEEREHAEKLMKYQNTRGGRVVLHPIKNAPSEFEHVEKGDALYAMELALSLEKLVNEKLLNVHSVADRNNDPQMADFIESEFLSEQVESIKKISEYVAQLRRVGKGHGVWHFDQRLLD[0m
MLLKAAPAFALLNTQGENLSPLFSSSKSFSPKNGNRFVVSASKATNHK[92mPLTGVVFEPFEELKKELMLVPAVPDTSLCRQKYSDDCEAAINEQINVEYNNSYVYHAMFAYFDRDNVALKGLAKFFKESSLEEREHAEKLMEFQNKRGGRVKLLSICAPPTEFDHCEKGDALYAMELALCLEKLTNQRLLNLHAVASRSNDVHLADFLESEFLVEQVDAIKKISEYVAQLRRVGQGHGVWQFDQMLLN[0mEGAAA
MMLRVSPSPAAAVPTQLSGAPATPAPVVRVAAPRGVASPSAGAACRAAGKGKE[92mVLSGVVFQPFEEIKGELALVPQSPDKSLARHKFVDDCEAALNEQINVEYNASYAYHSLFAYFDRDNVALKGFAKFFKESSDEEREHAEKLMEYQNKRGGRVRLQSIVTPLTEFDHPEKGDALYAMELALALEKLVNEKLHNLHGVATRCNDPQLTDFIESEFLEEQGEAINKISKYVAQLRRVGKGHGVWHFDQMLLE[0mEEA
