In [48]:
def find_closest_cluster(distances):
    cur_min = 10e8
    cur_cluster = (-1, -1)
    for key in distances:
        if key[0] != key[1]:
            dist = distances[(key[0], key[1])]
            if dist < cur_min:
                cur_min = dist
                cur_cluster = (key[0], key[1])
    return cur_cluster, cur_min


def upgma(current_distances, n, w=False):
    tree = {}
    clusters = [i for i in range(n)]
    for i in clusters:
        tree[i] = (-1, -1, 0, 1)  # left child, right child, weight, cluster size

    while len(clusters) != 2:
        (i, j), distance = find_closest_cluster(current_distances)
        i_size = tree[i][3]
        j_size = tree[j][3]
        tree[n] = (i, j, distance / 2, i_size + j_size)

        clusters.remove(i)
        clusters.remove(j)

        new_distances = {}
        for a in clusters:
            for b in clusters:
                new_distances[(a, b)] = current_distances[(a, b)]

        for c in clusters:
            new_distances[(n, c)] = (current_distances[(i, c)] * (1 if w else i_size) + current_distances[(j, c)] * (1 if w else j_size)) / (
                    (1 if w else i_size) + (1 if w else j_size))
            new_distances[(c, n)] = new_distances[(n, c)]

        new_distances[(n, n)] = 0

        clusters.append(n)
        current_distances = new_distances
        n += 1
    (i, j), distance = find_closest_cluster(current_distances)
    i_size = tree[i][3]
    j_size = tree[j][3]
    tree[n] = (i, j, distance / 2, i_size + j_size)

    return tree

## Newick format

In [49]:
def newick_dfs(edges, visited, names, current_node):
    visited[current_node] = True
    for key, value in edges.items():
        if key == current_node and value[0]:
            return '({}:{:.2f},{}:{:.2f})'.format(newick_dfs(edges, visited, names, value[1]), value[0], newick_dfs(edges, visited, names, value[3]), value[2])
    return names[current_node]
    

## Testing

In [50]:
from collections import defaultdict

### UPGMA. Test 1

In [51]:
matrix = [[0, 16, 16, 10], [16, 0, 8, 8], [16, 8, 0, 4], [10, 8, 4, 0]]
names = {0: 'A', 1: 'B', 2: 'C', 3: 'D'}

edges = {}
n = 4
for i in range(n):
    for j in range(n):
        edges[(i, j)] = matrix[i][j]
        
edges = upgma(edges, n)
graph = defaultdict(list)
for key, value in edges.items():
    if value[0] != -1:
        graph[key] = [value[2] - edges[value[0]][2], value[0], value[2] - edges[value[1]][2], value[1]]
vertices_count = max(edges.keys()) + 1

print(newick_dfs(graph, [False for _ in range(vertices_count)], names, vertices_count - 1))

(((C:2.00,D:2.00):2.00,B:4.00):3.00,A:7.00)


### UPGMA. Test 2

In [52]:
matrix = [[0, 5, 4, 7, 6, 8], [5, 0, 7, 10, 9, 11], [4, 7, 0, 7, 6, 8], 
          [7, 10, 7, 0, 5, 9], [6, 9, 6, 5, 0, 8], [8, 11, 8, 9, 8, 0]]
names = {0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E', 5: 'F'}

edges = {}
n = 6
for i in range(n):
    for j in range(n):
        edges[(i, j)] = matrix[i][j]
        
edges = upgma(edges, n)
graph = defaultdict(list)
for key, value in edges.items():
    if value[0] != -1:
        graph[key] = [value[2] - edges[value[0]][2], value[0], value[2] - edges[value[1]][2], value[1]]
vertices_count = max(edges.keys()) + 1

print(newick_dfs(graph, [False for _ in range(vertices_count)], names, vertices_count - 1))

(((B:3.00,(A:2.00,C:2.00):1.00):0.75,(D:2.50,E:2.50):1.25):0.65,F:4.40)


### WPGMA. Test 1

In [53]:
matrix = [[0, 16, 16, 10], [16, 0, 8, 8], [16, 8, 0, 4], [10, 8, 4, 0]]
names = {0: 'A', 1: 'B', 2: 'C', 3: 'D'}

edges = {}
n = 4
for i in range(n):
    for j in range(n):
        edges[(i, j)] = matrix[i][j]
        
edges = upgma(edges, n, w=True)
graph = defaultdict(list)
for key, value in edges.items():
    if value[0] != -1:
        graph[key] = [value[2] - edges[value[0]][2], value[0], value[2] - edges[value[1]][2], value[1]]
vertices_count = max(edges.keys()) + 1

print(newick_dfs(graph, [False for _ in range(vertices_count)], names, vertices_count - 1))

(((C:2.00,D:2.00):2.00,B:4.00):3.25,A:7.25)


### WPGMA. Test 2

In [55]:
matrix = [[0, 5, 4, 7, 6, 8], [5, 0, 7, 10, 9, 11], [4, 7, 0, 7, 6, 8], 
          [7, 10, 7, 0, 5, 9], [6, 9, 6, 5, 0, 8], [8, 11, 8, 9, 8, 0]]
names = {0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E', 5: 'F'}

edges = {}
n = 6
for i in range(n):
    for j in range(n):
        edges[(i, j)] = matrix[i][j]
        
edges = upgma(edges, n)
graph = defaultdict(list)
for key, value in edges.items():
    if value[0] != -1:
        graph[key] = [value[2] - edges[value[0]][2], value[0], value[2] - edges[value[1]][2], value[1]]
vertices_count = max(edges.keys()) + 1

print(newick_dfs(graph, [False for _ in range(vertices_count)], names, vertices_count - 1))

(((B:3.00,(A:2.00,C:2.00):1.00):0.75,(D:2.50,E:2.50):1.25):0.65,F:4.40)
