In [17]:
# !pip install pyvis networkx seaborn

In [18]:
import pandas as pd
import numpy as np
from pyvis.network import Network
import networkx as nx
import seaborn as sns

In [7]:
df_concepts = pd.read_csv('./output_dir/concepts.csv', sep= "|")
print(df_concepts.shape)

(393, 5)


In [8]:
df_concepts.head()

Unnamed: 0,entity,importance,category,chunk_id,type
0,alaska,5,organisation,902292e89f814a7daa128be1540458fe,concept
1,google summer of code,5,event,902292e89f814a7daa128be1540458fe,concept
2,mentor,3,occupation,902292e89f814a7daa128be1540458fe,concept
3,open-source,3,concept,902292e89f814a7daa128be1540458fe,concept
4,software,3,concept,902292e89f814a7daa128be1540458fe,concept


### Graph Dataframe is connection of two nodes in each row !

In [9]:
dfne_join = pd.merge(
    df_concepts, df_concepts, how="inner", on="chunk_id", suffixes=("_L", "_R")
)

## Remove self Loops
self_loops_drop = dfne_join[dfne_join["entity_L"] == dfne_join["entity_R"]].index
dfg = dfne_join.drop(index=self_loops_drop).reset_index()

## This is our graph dataframe
print("Total number of nodes = ", dfg.shape[0])
dfg.head()

Total number of nodes =  5248


Unnamed: 0,index,entity_L,importance_L,category_L,chunk_id,type_L,entity_R,importance_R,category_R,type_R
0,1,alaska,5,organisation,902292e89f814a7daa128be1540458fe,concept,google summer of code,5,event,concept
1,2,alaska,5,organisation,902292e89f814a7daa128be1540458fe,concept,mentor,3,occupation,concept
2,3,alaska,5,organisation,902292e89f814a7daa128be1540458fe,concept,open-source,3,concept,concept
3,4,alaska,5,organisation,902292e89f814a7daa128be1540458fe,concept,software,3,concept,concept
4,5,alaska,5,organisation,902292e89f814a7daa128be1540458fe,concept,development,3,concept,concept


### Graph Dataframe is too big to visualize!

In [12]:
## Drop nodes which are less important
less_important_nodes = dfg[(dfg["importance_L"] < 2)].index
## Drop edges where both the nodes are less important than 5
less_important_edges = dfg[(dfg["importance_L"] < 2) & (dfg["importance_R"] < 2)].index
drops = less_important_nodes.union(less_important_edges)

print(
    "Less important Nodes = ",
    less_important_nodes.shape[0],
    "\nLess Important Edges = ",
    less_important_edges.shape[0],
)

## Remove these rows from the graph dataframe
dfg_vis = dfg.drop(index=drops).reset_index()

Less important Nodes =  130 
Less Important Edges =  42


### Combining similar edges

In [13]:
## Group and aggregate edges.
dfg_vis = (
    dfg_vis.groupby(["entity_L", "entity_R"])
    .agg(
        {
            "importance_L": "mean",
            "importance_R": "mean",
            "chunk_id": [",".join, "count"],
        }
    )
    .reset_index()
)
dfg_vis.columns = [
    "entity_L",
    "entity_R",
    "importance_L",
    "importance_R",
    "chunks",
    "count",
]

print("Final Number of Edges in the Visualisation Graph = ", dfg_vis.shape[0])

dfg_vis.head()

Final Number of Edges in the Visualisation Graph =  4595


Unnamed: 0,entity_L,entity_R,importance_L,importance_R,chunks,count
0,adapt,alaska,3.0,3.0,88d8c61fddbc4d14b8f562aa688f7f94,1
1,adapt,arctic,3.0,3.0,88d8c61fddbc4d14b8f562aa688f7f94,1
2,adapt,change,3.0,3.0,88d8c61fddbc4d14b8f562aa688f7f94,1
3,adapt,climate change,3.0,4.0,88d8c61fddbc4d14b8f562aa688f7f94,1
4,adapt,coastal erosion,3.0,4.0,88d8c61fddbc4d14b8f562aa688f7f94,1


### Creating NetworkX Graph

In [14]:
# Here I am grouping the graph dataframe by left node and calculating the mean importance. 
# This way we will end up with only the unique nodes from the graph dataframe along with their weights.

# nodes = df_graph["entity_L"].unique()
nodes = dfg_vis.groupby(["entity_L"]).agg({"importance_L": "mean"}).reset_index()
nodes.head()

Unnamed: 0,entity_L,importance_L
0,adapt,3.0
1,advocacy,3.0
2,aerial image,4.0
3,alaska,3.075581
4,alaska developer alliance,3.0


In [19]:
G = nx.Graph()
for index, row in nodes.iterrows():
    G.add_node(row["entity_L"])

for index, row in dfg_vis.iterrows():
    G.add_edge(str(row["entity_L"]), str(row["entity_R"]))

In [20]:
# Community Detection using Girvan Newman Algorithm

communities_generator = nx.community.girvan_newman(G)
top_level_communities = next(communities_generator)
next_level_communities = next(communities_generator)
communities = sorted(map(sorted, next_level_communities))
print("Number of Communities = ", len(communities))

Number of Communities =  4


In [21]:
palette = "hls"


## Now add these colors to communities and make another dataframe
def colors2Community(communities) -> pd.DataFrame:
    ## Define a color palette
    p = sns.color_palette(palette, len(communities)).as_hex()
    rows = []
    group = 0
    for community in communities:
        color = p.pop()
        group += 1
        for node in community:
            rows += [{"entity_L": node, "color": color, "group": group}]
    df_colors = pd.DataFrame(rows)
    return df_colors


colors = colors2Community(communities)

df_nodes_colors = pd.merge(
    nodes, colors, how="left", on="entity_L", suffixes=("_N", "_C")
)
# nodes.head()
df_nodes_colors.head()

Unnamed: 0,entity_L,importance_L,color,group
0,adapt,3.0,#a157db,1
1,advocacy,3.0,#a157db,1
2,aerial image,4.0,#a157db,1
3,alaska,3.075581,#a157db,1
4,alaska developer alliance,3.0,#a157db,1


In [23]:
G = nx.Graph()
node_size_multiple = 6

for index, row in df_nodes_colors.iterrows():
    G.add_node(
        row["entity_L"],
        size=row["importance_L"] * node_size_multiple,
        title=row["entity_L"],
        color=row["color"],
    )

for index, row in dfg_vis.iterrows():
    G.add_edge(
        str(row["entity_L"]),
        str(row["entity_R"]),
        weight=row["count"],
        name=row["chunks"],
        )

In [25]:
# Visualization

graph_output_directory = "./visuals/index.html"

net = Network(
    notebook=False,
    bgcolor="#1a1a1a",
    cdn_resources="remote",
    height="900px",
    width="100%",
    select_menu=True,
    font_color="#cccccc",
    # filter_menu=True,
)

net.from_nx(G)
net.repulsion(node_distance=150, spring_length=400)
# net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)
net.show_buttons(filter_=["physics"])

net.show(graph_output_directory, notebook=False)

./visuals/index.html
