In [72]:
import os
import pickle
import random

import networkx as nx
import numpy as np
import pandas as pd
import plotly.graph_objects as go

# Set the TOKENIZERS_PARALLELISM environment variable to "false" for this process.
# NOTE(review): presumably this silences the Hugging Face `tokenizers` parallelism
# warning; nothing in this notebook visibly imports tokenizers — confirm it is
# still needed here.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

### Papermill parameters

In [73]:
# Name of the embedding model whose graph is plotted. It is used below to build
# the input/output file names and appears in the figure title. It sits under the
# "Papermill parameters" heading, so it is presumably overridden per run.
EMBEDDING_MODEL = "tfidf"

In [74]:
# Every artifact read or written by this notebook lives in the same directory.
_OUTPUTS_DIR = os.path.join("..", "artifacts", "outputs")

# Pickled graph for the selected embedding model (read in the "Load graph" cell).
INPUT_GRAPH_PATH = os.path.join(_OUTPUTS_DIR, EMBEDDING_MODEL + "_graph")

# CSV that is loaded into `pred_df`.
INPUT_CSV_PATH = os.path.join(
    _OUTPUTS_DIR, f"{EMBEDDING_MODEL}_graph_louvain_cluster.csv"
)

# Destination of the interactive HTML figure written by the last cell.
OUTPUT_GRAPH_PATH = os.path.join(_OUTPUTS_DIR, EMBEDDING_MODEL + "_graph_plot.html")

### Load graph

In [75]:
# Load the pickled graph inside a context manager so the file handle is closed
# as soon as loading finishes (a bare `pickle.load(open(...))` leaves it open).
# NOTE: unpickling can execute arbitrary code — only load graph files produced
# by a trusted pipeline.
with open(INPUT_GRAPH_PATH, "rb") as graph_file:
    graph = pickle.load(graph_file)

# Table read from INPUT_CSV_PATH; later cells use its "URL", "cluster_label"
# and "Combine Group ID" columns.
pred_df = pd.read_csv(INPUT_CSV_PATH)

# Threshold stored as a graph-level attribute; it is shown in the figure title.
threshold = graph.graph["threshold"]

In [76]:
# Edge weights keyed by (node, node) tuples; used later for the edge hover labels.
weight_values = nx.get_edge_attributes(graph, "weight")

# spring_layout starts from random initial positions, so without a seed every
# run produces a different picture. Fix the seed (same value the colour cell
# passes to random.seed) so a fresh Restart & Run All reproduces the figure.
positions = nx.spring_layout(graph, seed=123)

# Attach each node's (x, y) coordinates as its "position" attribute so the
# plotting cells can read them back from the graph.
nx.set_node_attributes(graph, name="position", values=positions)

### Edges

In [77]:
edge_x = []  # x coordinates of the drawn edges: start, end, None (line break)
edge_y = []  # y coordinates, same layout as edge_x
edge_text = []  # hover text aligned point-for-point with edge_x / edge_y
weights = []  # one hover label per drawn edge (reused by the midpoint trace)
avg_x, avg_y = [], []  # midpoint of every drawn edge

# Build the URL -> cluster_label lookup once instead of scanning pred_df with
# a boolean mask twice per edge. keep="first" mirrors the original `.values[0]`
# (first matching row wins). A URL missing from pred_df now raises KeyError
# rather than IndexError.
cluster_by_url = (
    pred_df.drop_duplicates(subset="URL", keep="first")
    .set_index("URL")["cluster_label"]
    .to_dict()
)

for source, target in graph.edges():
    source_url = graph.nodes[source]["URL"]
    target_url = graph.nodes[target]["URL"]

    # Only draw edges whose two endpoints share the same cluster_label.
    if cluster_by_url[source_url] != cluster_by_url[target_url]:
        continue

    # Coordinates of the two nodes joined by this edge
    x0, y0 = graph.nodes[source]["position"]
    x1, y1 = graph.nodes[target]["position"]

    # A None entry breaks the line so consecutive edges are not joined together
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])

    # Centre of the edge, where the invisible hover marker is placed
    avg_x.append(np.mean([x0, x1]))
    avg_y.append(np.mean([y0, y1]))

    # Hover label; the weight is looked up by the (node, node) pair of the edge
    label = f"{source}, {target}: {weight_values[(source, target)]}"
    weights.append(label)

    # The trace has three points per edge, so its text needs three entries per
    # edge as well; otherwise labels drift onto the wrong edges.
    edge_text.extend([label, label, ""])

edge_trace = go.Scatter(
    x=edge_x,
    y=edge_y,
    text=edge_text,
    opacity=0.7,
    line=dict(width=2, color="White"),
    hoverinfo="text",
    mode="lines",
)

### Nodes

In [78]:
# Map every ground-truth cluster id ("Combine Group ID") to a random hex colour.

random.seed(123)  # fixed seed for the colour draws below

# Unique ids, skipping missing values (which stringify as "nan")
node_cluster = [
    group_id
    for group_id in pred_df["Combine Group ID"].unique()
    if str(group_id) != "nan"
]

cluster_colors = {}
for cluster_id in set(node_cluster):
    # Three draws in a row give the red, green and blue channels
    rgb = tuple(random.randint(0, 255) for _ in range(3))
    cluster_colors[str(cluster_id)] = "#%02x%02x%02x" % rgb

In [79]:
# Layout coordinates of every node, in graph.nodes() order
node_positions = [graph.nodes[node]["position"] for node in graph.nodes()]
node_x = [x for x, _ in node_positions]
node_y = [y for _, y in node_positions]

# All markers share the same size (one entry per node)
sizes = [15 for _ in node_positions]

# Marker styling shared by all nodes. The colour list starts empty and is
# filled in by the following cell.
node_marker = dict(
    color=[],
    size=sizes,
    opacity=0.9,
    reversescale=False,
    line=dict(color="White", width=2),
)

# One marker per node; hover shows the text assigned in the following cell.
node_trace = go.Scatter(
    x=node_x,
    y=node_y,
    mode="markers",
    hoverinfo="text",
    marker=node_marker,
)

# Fully transparent markers placed at the midpoint of every drawn edge, so that
# hovering over the middle of an edge shows that edge's label from `weights`.
invisible_similarity_trace = go.Scatter(
    x=avg_x,
    y=avg_y,
    text=weights,
    mode="markers",
    hoverinfo="text",
    marker=dict(color=[], opacity=0),
)

In [80]:
node_cluster = []  # marker colour of each node, taken from its ground-truth cluster
node_text = []  # hover label of each node: node id plus ground-truth cluster id

# Colour used for nodes without a ground-truth cluster id. cluster_colors
# deliberately skips missing ids, so such a node would otherwise fail the
# colour lookup and the int() conversion below.
missing_cluster_color = "#808080"

# graph.nodes(data=...) yields (node, attribute value) pairs in node order and
# gives None when the attribute is absent.
for node_id, cluster_id in graph.nodes(data="Combine Group ID"):
    if pd.isna(cluster_id):
        node_cluster.append(missing_cluster_color)
        node_text.append(f"{node_id} [Ground Truth: N/A]")
        continue

    # cluster_colors is keyed by the stringified cluster id
    node_cluster.append(cluster_colors[str(cluster_id)])
    node_text.append(f"{node_id} [Ground Truth: {int(cluster_id)}]")

node_trace.marker.color = node_cluster
node_trace.text = node_text

In [81]:
# Assemble the figure: edges first, then nodes on top, then the invisible
# edge-midpoint markers that carry the per-edge hover labels.
fig = go.Figure(
    data=[edge_trace, node_trace, invisible_similarity_trace],
    layout=go.Layout(
        # `titlefont_size` is a deprecated layout attribute that newer Plotly
        # releases no longer accept; set the size through title.font instead.
        title=dict(
            text=f"Network Graph of Document Embeddings ({EMBEDDING_MODEL}, Threshold={threshold})",
            font=dict(size=20),
        ),
        template="plotly_dark",
        showlegend=False,
        hovermode="closest",
        margin=dict(b=20, l=5, r=5, t=40),
        annotations=[
            dict(
                text="Adapted from: <a href='https://plotly.com/ipython-notebooks/network-graphs/'> https://plotly.com/ipython-notebooks/network-graphs/</a>",
                showarrow=False,
                xref="paper",
                yref="paper",
                x=0.005,
                y=-0.002,
            )
        ],
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
    ),
)

# Kept from the original cell: make sure cluster_visualise/<model> exists.
# exist_ok=True replaces the racy "check, then create" pair.
# NOTE(review): nothing in this notebook writes into this directory — confirm
# whether it is still needed.
visualise_path = os.path.join("cluster_visualise", EMBEDDING_MODEL)
os.makedirs(visualise_path, exist_ok=True)

# Make sure the directory the figure is actually written to exists.
filename = OUTPUT_GRAPH_PATH
output_dir = os.path.dirname(filename)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

fig.write_html(filename)