In [1]:
root_dir = "../../Downloads/GENOME540/"
test_wdag_path = root_dir + "test_wdag.txt"
wdag_path = root_dir + "wdag.txt"
wdag_unconstrained_path = root_dir + "wdag_unconstrained.txt"
score_scheme_path = root_dir + "scoring_scheme.txt"
test_fasta_path = root_dir + "CP003508.txt"
fasta_path = root_dir + "s_pyogenes.txt"
output_path = root_dir + "sequence_wdag.txt"

In [2]:
class Vertex:
    def __init__(self, label):
        self.label = label
        self.edges = []

    def add_edge(self, edge):
        self.edges.append(edge)

class Edge:
    def __init__(self, label, start, end, weight):
        self.label = label
        self.start = start
        self.end = end
        self.weight = weight

def read_input(filename):

    vertices = []
    start = None
    end = None
    with open(filename, 'r') as f:
        for line in f:
            line = line.split()

            if line[0] == 'V':
                label = line[1]
                vertex = Vertex(label)
                vertices.append(vertex)
                if 'START' in line:
                    start = label
                if 'END' in line:
                    end = label

            elif line[0] == 'E':
                label = line[1]
                start_label = line[2]
                end_label = line[3]
                weight = int(line[4])
                start_vertex = [v for v in vertices if v.label == start_label][0]
                end_vertex = [v for v in vertices if v.label == end_label][0]
                edge = Edge(label, start_vertex, end_vertex, weight)
                start_vertex.add_edge(edge)

    return vertices, start, end

v, s, e = read_input(wdag_path)

In [3]:
def highest_weight_path_constrained(vertices, start, end):

    def find_max_weight_path(v, scores, path_lists, visited=[]):

        if v.label == end:
            scores[v.label] = 0
            path_lists[v.label] = []
            return

        max_score = float('-inf')
        best_path_list = None
        for e in v.edges:
            if e.end.label in visited:
                continue
            if e.end.label not in scores:
                find_max_weight_path(e.end, scores, path_lists, visited + [v.label])
            if scores[e.end.label] + e.weight > max_score:
                max_score = scores[e.end.label] + e.weight
                best_path_list = path_lists[e.end.label] + [e.label]

        scores[v.label] = max_score
        path_lists[v.label] = best_path_list

    scores = {}
    path_lists = {}

    start_vertex = [v for v in vertices if v.label == start][0]
    find_max_weight_path(start_vertex, scores, path_lists)

    max_score = float('-inf')
    if scores[start] > max_score:
        max_score = scores[start]
    
    return max_score, s, e, path_lists[s][::-1]

score, start_node, end_node, path = highest_weight_path_constrained(v, s, e)

print("Score:", score)
print("Begin:", start_node)
print("End:", end_node)
print("Path:", "".join(path))

Score: 7
Begin: i
End: xiii
Path: CKP


In [4]:
from collections import deque

def highest_weight_path_unconstrained(vertices):
    highest_weight = 0
    highest_path = []
    memo = {}
    for vertex in vertices:
        queue = deque([(vertex, [], 0, vertex)])
        while queue:
            curr_vertex, path, weight, traceback = queue.popleft()
            if curr_vertex in memo and memo[curr_vertex][0] >= weight:
                continue
            memo[curr_vertex] = (weight, path, traceback)
            for edge in curr_vertex.edges:
                next_vertex = edge.end if edge.start == curr_vertex else edge.start
                new_path = path + [edge]
                new_weight = weight + edge.weight
                if new_weight < 0:
                    new_weight = 0
                    new_path = []
                    traceback = curr_vertex
                if new_weight > highest_weight:
                    highest_weight = new_weight
                    highest_path = new_path
                    start_node = traceback
                    end_node = next_vertex
                queue.append((next_vertex, new_path, new_weight, traceback))

    return highest_weight, start_node.label, end_node.label, [x.label for x in highest_path]

score, start_node, end_node, path = highest_weight_path_unconstrained(v)

print("Score:", score)
print("Begin:", start_node)
print("End:", end_node)
print("Path:", "".join(path))

Score: 13
Begin: ii
End: ix
Path: DFK


In [5]:
def read_scoring_scheme(filename):
    scoring_scheme = {}
    with open(filename, 'r') as f:
        for line in f:
            line = line.strip().split()
            scoring_scheme[line[0]] = float(line[1])
    return scoring_scheme

score_scheme = read_scoring_scheme(score_scheme_path)
score_scheme

{'A': -1.49, 'C': 0.74, 'G': 0.74, 'T': -1.49, 'N': 0.0}

In [6]:
def read_sequence(filename):
    
    sequence = ""
    base_counts = {"A": 0, "C": 0, "T": 0, "G": 0}
    invalid_chars = 0

    with open(filename, "r") as f:
        for line in f:
            if line[0] == ">":
                title = line
                continue
            for c in line:
                if c == " " or c == "\n":
                    continue
                try:
                    base_counts[c.upper()] += 1
                    sequence += c.upper()

                except:
                    invalid_chars += 1

        print("Fasta:", filename.split("/")[-1][:-4])
        print("Non-alphabetic characters:", invalid_chars)
        print(title.strip())
        print("*=" + str(len(sequence)))
        print("A=" + str(base_counts["A"]))
        print("C=" + str(base_counts["C"]))
        print("G=" + str(base_counts["G"]))
        print("T=" + str(base_counts["T"]))
        print("N=0")

    return sequence

seq = read_sequence(fasta_path)

Fasta: s_pyogenes
Non-alphabetic characters: 0
>NZ_LS483338.1 Streptococcus pyogenes strain NCTC12064 chromosome 1, complete sequence
*=1746380
A=537381
C=334523
G=338691
T=535785
N=0


In [7]:
def create_sequence_graph(sequence, scoring_scheme):
    vertices = []
    for i, residue in enumerate(sequence):
        label = str(i)
        vertex = Vertex(label)
        vertices.append(vertex)
        if i > 0:
            prev_vertex = vertices[i-1]
            edge = Edge(residue, prev_vertex, vertex, scoring_scheme[residue])
            prev_vertex.add_edge(edge)
    return vertices

verts = create_sequence_graph(seq, score_scheme)

In [8]:
def write_sequence_graph(vertices, filename):
    with open(filename, 'w') as f:
        for vertex in vertices:
            f.write('V {}\n'.format(vertex.label))
            for edge in vertex.edges:
                f.write('E {} {} {} {}\n'.format(edge.label, edge.start.label, edge.end.label, edge.weight))

write_sequence_graph(verts, output_path)

In [8]:
score, start_node, end_node, path = highest_weight_path_unconstrained(verts)

print("Score:", f'{score:.2f}')
print("Begin:", int(start_node)+2)
print("End:", int(end_node)+1)
print("Path:", "".join(path))

Score: 13.28
Begin: 235971
End: 236001
Path: GGCGGTGGCGGAGGAGGCGGCGGCGGTGCC


In [10]:
"Description: This sequence lies within the DQM35_RS01315 gene and encodes for DUF2207 domain-containing protein"

'Description: This sequence lies within the DQM35_RS01315 gene and encodes for DUF2207 domain-containing protein'