seqhac
ldhulipala committed Apr 21, 2021
1 parent 232549f commit 1ecf43c
Showing 12 changed files with 2,009 additions and 0 deletions.
29 changes: 29 additions & 0 deletions benchmarks/Clustering/SeqHAC/AvgLinkageUtils/BUILD
@@ -0,0 +1,29 @@
cc_library(
name = "ClusteredGraph",
hdrs = ["ClusteredGraph.h"],
deps = [
"//gbbs:gbbs",
"@PAM//pam:pam",
]
)

cc_library(
name = "HAC_configuration",
hdrs = ["HAC_configuration.h"],
deps = [
"//gbbs:macros",
]
)

cc_library(
name = "HeapBased",
hdrs = ["HeapBased.h"],
deps = [
":ClusteredGraph",
"//gbbs:gbbs",
]
)

package(
default_visibility = ["//visibility:public"],
)
339 changes: 339 additions & 0 deletions benchmarks/Clustering/SeqHAC/AvgLinkageUtils/ClusteredGraph.h
@@ -0,0 +1,339 @@
// This code is part of the project "Theoretically Efficient Parallel Graph
// Algorithms Can Be Fast and Scalable", presented at Symposium on Parallelism
// in Algorithms and Architectures, 2018.
// Copyright (c) 2018 Laxman Dhulipala, Guy Blelloch, and Julian Shun
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#pragma once

#include <queue>
#include <unordered_set>
#include <vector>

#include "gbbs/gbbs.h"
#include "pam/pam.h"

namespace gbbs {
namespace approx_average_linkage {

template <class Weights, class IW, template <class W> class w_vertex>
struct clustered_graph {

using orig_vertex = w_vertex<IW>;
using Graph = symmetric_graph<w_vertex, IW>;

using W = typename Weights::weight_type;
using internal_edge = std::pair<uintE, std::pair<uintE, W>>;
using edge = std::pair<uintE, W>;

struct neighbor_entry {
using key_t = uintE; // neighbor_id
using val_t = std::pair<uintE, W>; // (id * weight)
using aug_t = W; // aggregated weight
static inline bool comp(key_t a, key_t b) { return a < b; }
static aug_t get_empty() { return Weights::id(); }
static aug_t from_entry(key_t k, val_t v) { return v.second; } // (get weight)
// Used to select the min/max edge, depending on whether the clustering is
// based on similarity or dissimilarity.
static aug_t combine(aug_t a, aug_t b) { return Weights::augmented_combine(a, b); }
};

using neighbor_map = aug_map<neighbor_entry>;
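
// Illustrative note (not part of the original commit): the augmentation is
// what makes best-edge queries cheap. aug_val() folds every value in the map
// with Weights::augmented_combine, and aug_eq(v) returns an entry attaining
// v. Assuming a similarity-based Weights where augmented_combine is max:
//
//   neighbor_map m = ...;                // this cluster's incident edges
//   W best = m.aug_val();                // max weight over all neighbors
//   internal_edge e = *m.aug_eq(best);   // a neighbor achieving that weight
//
// This is exactly the pattern highest_priority_edge() uses below.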

struct clustered_vertex {

clustered_vertex() {}

clustered_vertex(uintE vtx_id, orig_vertex& vertex, const Weights& weights) {
auto cluster_size = vertex.out_degree();
staleness = cluster_size;
num_in_cluster = 1; // initially just this vertex
active = true;
current_id = vtx_id;

auto edges = sequence<internal_edge>::uninitialized(cluster_size);

size_t i = 0;
auto map_f = [&] (const uintE& u, const uintE& v, const IW& wgh) {
W true_weight = Weights::get_weight(u, v, wgh);
edges[i++] = std::make_pair(v, std::make_pair(v, true_weight));
};
vertex.out_neighbors().map(map_f, /* parallel = */false);

neighbors = neighbor_map(edges);
}

std::optional<edge> highest_priority_edge() {
if (neighbor_size() == 0) return {};
W m = neighbors.aug_val();
internal_edge entry = *neighbors.aug_eq(m);
assert(entry.second.second == m);
return entry.second;
}

uintE neighbor_size() {
return neighbors.size();
}

uintE size() {
return num_in_cluster;
}

bool is_active() {
return active;
}

uintE get_current_id() {
return current_id;
}

void set_current_id(uintE id) {
current_id = id;
}

bool is_stale(double epsilon) {
return ((staleness * (1 + epsilon)) < size());
}
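
// Worked example (illustrative): with epsilon = 0.5, a cluster whose weights
// were last refreshed at size staleness = 4 becomes stale once
// 4 * (1 + 0.5) = 6 < size(), i.e., once it holds at least 7 vertices. A
// larger epsilon therefore means fewer full weight refreshes and a coarser
// approximation of average linkage.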

// Tracks the last cluster update size.
uintE staleness;
// The "current" id of this cluster, updated upon a merge that keeps this cluster active.
uintE current_id;
// Number of vertices contained in this cluster.
uintE num_in_cluster;
// Active == false iff this cluster has already been merged into another
// (larger) cluster.
bool active;
// An augmented map storing our neighbors + weights.
neighbor_map neighbors;
};

Graph& G;
Weights& weights;
double epsilon;
uintE n;
uintE last_cluster_id;
uintE num_merges_performed;

parlay::sequence<clustered_vertex> clusters;
parlay::sequence<std::pair<uintE, W>> dendrogram;

// Returns whether this cluster is still active, or whether it has been merged
// into a _larger_ cluster.
bool is_active(uintE id) {
return clusters[id].is_active();
}

uintE new_cluster_id() {
uintE ret = last_cluster_id;
last_cluster_id++;
return ret;
}

uintE unite(uintE a, uintE b, W wgh) {
assert(is_active(a));
assert(is_active(b));
// Identify smaller/larger clusters (will merge smaller -> larger).
uintE d_a = clusters[a].neighbor_size();
uintE d_b = clusters[b].neighbor_size();
uintE smaller, larger;
if (d_a < d_b) {
smaller = a; larger = b;
} else {
larger = a; smaller = b;
}

// Deactivate smaller.
clusters[smaller].active = false;

// Merge smaller and larger's neighbors.
auto smaller_ngh = std::move(clusters[smaller].neighbors);
auto larger_ngh = std::move(clusters[larger].neighbors);

// Sanity asserts: we are merging an edge incident to both clusters, after all.
assert(smaller_ngh.size() > 0);
assert(larger_ngh.size() > 0);

// Remove larger's id from smaller, and vice versa.
assert(smaller_ngh.contains(larger));
assert(larger_ngh.contains(smaller));
auto small_pre_merge = neighbor_map::remove(std::move(smaller_ngh), larger);
auto large_pre_merge = neighbor_map::remove(std::move(larger_ngh), smaller);

auto smaller_keys = neighbor_map::keys(small_pre_merge);

// First merge (with no combine function) just to count the distinct
// neighbors of the merged cluster; the second union below actually folds
// parallel edges with the linkage function.
uintE merged_size;
{
auto first_merge = neighbor_map::map_union(small_pre_merge, large_pre_merge);
merged_size = first_merge.size();
}  // first_merge is destroyed at scope exit, not via an explicit destructor call.

size_t new_cluster_size = clusters[larger].num_in_cluster + clusters[smaller].num_in_cluster;
auto linkage = Weights::GetLinkage(clusters, new_cluster_size);
std::cout << "Performed first merge. New cluster size = " << new_cluster_size << std::endl;

auto merged = neighbor_map::map_union(
std::move(small_pre_merge),
std::move(large_pre_merge),
linkage);
assert(merged.size() == merged_size);

clusters[larger].neighbors = std::move(merged);

// Save that clusters a and b are merged.
uintE current_a = clusters[a].get_current_id();
uintE current_b = clusters[b].get_current_id();
uintE new_id = new_cluster_id(); // increments next_id
num_merges_performed++;

dendrogram[current_a] = {new_id, wgh};
dendrogram[current_b] = {new_id, wgh};

// Update the current id of the remaining cluster.
clusters[larger].current_id = new_id;

// Update the size of the remaining cluster.
clusters[larger].num_in_cluster = new_cluster_size;
std::cout << "Num in cluster = " << clusters[larger].num_in_cluster << std::endl;

// Map over _all_ of smaller's edges, and update its neighbors to point to
// larger. If the neighbor, w, also has an edge to larger (a
// smaller-larger-w triangle), then update the weight of this edge.
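// Concrete case (illustrative): suppose smaller = {s}, larger = {l}, and a
// third cluster w neighbors both. After the merge, w's edge to s is removed
// and re-inserted keyed by larger; since w already has an edge to larger,
// insert(..., linkage) folds the two weights with the linkage function
// instead of overwriting.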
for (size_t i=0; i<smaller_keys.size(); i++) {
uintE w = smaller_keys[i];
assert(clusters[w].neighbors.contains(smaller)); // Sanity.

auto w_zero = std::move(clusters[w].neighbors);
auto found_value = *(w_zero.find(smaller)); // value
auto w_one = neighbor_map::remove(std::move(w_zero), smaller);

// Insert larger, merging using Weights::linkage if it already exists in
// the tree.
found_value.first = larger;
auto new_value = Weights::UpdateWeight(clusters, found_value, new_cluster_size);
auto larger_ent = std::make_pair(larger, new_value);

w_one.insert(larger_ent, linkage);

// Move the neighbors back.
clusters[w].neighbors = std::move(w_one);
}


// Staleness check.
if (clusters[larger].is_stale(epsilon)) {
std::cout << "LARGER = " << larger << " is STALE" << std::endl;
// Update our own edges.
auto edges = std::move(clusters[larger].neighbors);
auto map_f = [&] (const auto& entry) {
return Weights::UpdateWeight(clusters, entry.second, new_cluster_size);
};
auto updated_edges = neighbor_map::map(edges, map_f);
clusters[larger].neighbors = std::move(updated_edges);

// Map over the edges, and update on our neighbors endpoints.
auto update_ngh_f = [&] (const auto& entry) {
uintE ngh_id = entry.first;
auto val = entry.second;
uintE val_id = val.first;
assert(ngh_id == val_id);

val.first = larger; // place our id
auto updated_val = Weights::UpdateWeight(clusters, val, new_cluster_size); // update weight
auto new_entry = std::make_pair(larger, updated_val);

// Now update our neighbor.
assert(clusters[ngh_id].neighbors.contains(larger));
clusters[ngh_id].neighbors.insert(new_entry);
};
neighbor_map::map_void(clusters[larger].neighbors, update_ngh_f);

// Update staleness.
clusters[larger].staleness = clusters[larger].size();
std::cout << "Finished update." << std::endl;
}


return larger;
}


clustered_graph(Graph& G, Weights& weights, double epsilon) : G(G), weights(weights), epsilon(epsilon) {
n = G.n;
last_cluster_id = n;
num_merges_performed = 0;
clusters = parlay::sequence<clustered_vertex>(n);
// 2n - 1 slots: n input vertices plus (n - 1) internal merge nodes; the
// root's slot keeps the sentinel value.
dendrogram = parlay::sequence<std::pair<uintE, W>>(2*n - 1, std::make_pair(UINT_E_MAX, W()));

parallel_for(0, n, [&] (size_t i) {
auto orig = G.get_vertex(i);
clusters[i] = clustered_vertex(i, orig, weights);
});
std::cout << "Built all vertices" << std::endl;
}

// extract dendrogram
sequence<std::pair<uintE, W>> get_dendrogram() {

std::cout << "num_merges_performed = " << num_merges_performed << std::endl;
std::cout << "n = " << n << std::endl;

if (num_merges_performed < n-1) {
size_t last_clust = last_cluster_id;
// Cluster ids in [0, last_clust) with no assigned parent are roots of
// still-separate components (last_clust itself is the next unused id, so it
// is excluded).
auto ids = parlay::delayed_seq<uintE>(last_clust, [&] (size_t i) {
if (dendrogram[i].first == UINT_E_MAX) return (uintE)i;
return UINT_E_MAX;
});
auto bad = parlay::filter(ids, [&] (const uintE& e) { return e != UINT_E_MAX; });

std::cout << "num bad = " << bad.size() << std::endl;

std::queue<uintE> bad_queue;
for (size_t i=0; i<bad.size(); i++) {
bad_queue.push(bad[i]);
}
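
// Illustrative trace: with three unmerged roots {r0, r1, r2}, the loop below
// first merges r0 and r1 under a fresh id x (at weight Weights::id()), then
// merges x with r2, leaving a single root and a connected dendrogram.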

while (bad_queue.size() > 1) {
uintE fst = bad_queue.front();
bad_queue.pop();
uintE snd = bad_queue.front();
bad_queue.pop();

uintE new_id = new_cluster_id(); // increments next_id
dendrogram[fst] = {new_id, Weights::id()};
dendrogram[snd] = {new_id, Weights::id()};

std::cout << "Merged components for: " << fst << " " << snd << std::endl;

bad_queue.push(new_id);
}
}

return std::move(dendrogram);
}
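
// Reading the output (illustrative): dendrogram[c] = (parent, w) records that
// cluster c was merged into internal node `parent` at linkage weight w; ids
// 0..n-1 are the input vertices, ids >= n are internal nodes created in merge
// order, and the final root keeps the sentinel (UINT_E_MAX, W()).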


};
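
// ---------------------------------------------------------------------------
// Usage sketch (illustrative only; the real driver lives in HeapBased.h).
// Assuming a Weights type with the interface used above, one greedy step
// takes a cluster's highest-priority incident edge and merges its endpoints:
template <class Weights, class IW, template <class W> class w_vertex>
inline void example_merge_step(
    clustered_graph<Weights, IW, w_vertex>& CG, uintE u) {
  if (!CG.is_active(u)) return;
  auto best = CG.clusters[u].highest_priority_edge();
  if (!best.has_value()) return;  // u has no remaining neighbors
  uintE v = best->first;          // best neighbor's cluster id
  auto wgh = best->second;        // its (approximate) linkage weight
  if (CG.is_active(v)) {
    CG.unite(u, v, wgh);          // returns the surviving cluster's id
  }
}
// After n - 1 such merges (or fewer, if the graph is disconnected),
// CG.get_dendrogram() stitches any remaining components together and returns
// the full merge tree.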


} // namespace approx_average_linkage
} // namespace gbbs