# Data prep/Create Graphs

In [106]:
import networkx as nx
import pandas as pd
import numpy as np

In [107]:
# Load the raw edge table from Excel; the edge-list cell reads its
# "origin", "destination" and "weight_scaled" columns.
df = pd.read_excel(io="../data/edgelist/UN_full.xlsx")

In [108]:
# Turn every row of the raw table into an (origin, destination, attributes)
# triple, storing the scaled weight under the "weight" edge attribute.
edgelist = []
for origin, destination, weight in df[["origin","destination","weight_scaled"]].values:
    edgelist.append((origin, destination, {"weight": weight}))

In [109]:
# Build the directed graph: start empty, then load every weighted edge.
G = nx.DiGraph()
G.add_edges_from(edgelist)

In [110]:
# Adjacency matrix of G, first in sparse form and then densified into a
# 2-D numpy array (rows are edge sources, columns are edge targets).
adjacency_sparse = nx.adjacency_matrix(G)
A = adjacency_sparse.toarray()

In [111]:
# Row-normalise A so that every node's outgoing weights sum to 1.
# Rows that sum to zero (nodes without outgoing weight) are left as all zeros:
# dividing them would yield NaN plus a RuntimeWarning, and the positive-weight
# filter applied to the table built from A discards those entries either way.
row_sums = A.sum(axis=1, keepdims=True)
A = np.divide(
    A,
    row_sums,
    out=np.zeros(A.shape, dtype=float),  # float result even if A holds integers
    where=row_sums != 0,                 # skip rows that cannot be normalised
)

In [112]:
# Reshape the matrix into a long table with one row per (source, target)
# index pair, then keep only the pairs that carry a positive weight.
# NOTE: this rebinds `df`, replacing the raw table loaded from Excel, so the
# edge-list cell cannot be re-run afterwards without reloading the file.
stacked = pd.DataFrame(A).stack()
long_table = stacked.reset_index()
long_table.columns = ("source", "target", "weight")
has_weight = long_table["weight"] > 0
df = long_table[has_weight]

In [113]:
# Lookup from positional index to node label, following the order of G.nodes().
indexNodes = dict(enumerate(G.nodes()))

## Remove by graph
Remove the X% of edges with the lowest weight, using a single threshold computed over all edges in the graph.

In [114]:
# Quartiles of the edge weights, used as the global cut-offs for pruning.
# describe() is evaluated once and indexed three times instead of rebuilding
# the whole summary for each quartile.
weight_summary = df["weight"].describe()
quat25 = weight_summary["25%"]
quat50 = weight_summary["50%"]
quat75 = weight_summary["75%"]

print(quat25,quat50,quat75)

0.000171368592421224 0.0013355643910900972 0.009280984410031808


In [115]:
# Global pruning: for each quartile cut-off keep only the edges whose weight is
# strictly above it, translating positional indices back to node labels.
# NOTE(review): the "0" entry reuses the original `edgelist`, whose weights are
# the raw "weight_scaled" values, while the pruned lists take their weights
# from `df` -- confirm that mixing the two scales is intended.
thresholds = {"25": quat25, "50": quat50, "75": quat75}
edgelists = {"0": edgelist}
for perc, lim in thresholds.items():
    kept_rows = df[df["weight"] > lim].values
    edgelists[perc] = [
        # columns are (source, target, weight); indices arrive as floats
        (indexNodes[int(row[0])], indexNodes[int(row[1])], {"weight": row[2]})
        for row in kept_rows
    ]

In [116]:
# Report how many edges each pruning level retains.
for level in edgelists:
    print(level, len(edgelists[level]))

0 8328
25 6222
50 4148
75 2074


In [117]:
# Write one directed graph per pruning level to its own GEXF file.
# Each graph is built from its edge list alone, so a node left without any
# edge does not appear in the corresponding file.
for perc in edgelists:
    G_ = nx.DiGraph()
    G_.add_edges_from(edgelists[perc])
    nx.write_gexf(G_, f"../data/Graphs/bygraph_{perc}_directed.gexf")

## Remove by node
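For each source node, keep only its highest-weight outgoing edges, so that roughly the lowest-weight X% of every node's edges are removed.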

In [118]:
# Per-node pruning: for every source node keep only the top fraction of its
# outgoing edges, ranked by weight (largest first).
# Each node's edges are grouped and ranked once up front; the original
# re-filtered the whole frame and re-sorted for every node at every percentage.
# sort=False keeps the groups in order of first appearance, which is the same
# order df["source"].unique() produced.
ranked_by_node = [
    df_node.sort_values("weight", ascending=False)
    for _, df_node in df.groupby("source", sort=False)
]

edgelists = {"0":edgelist}
for (perc, limit) in [("25",0.75), ("50", 0.5), ("75",0.25)]:
    edgelist_ = []
    for ranked in ranked_by_node:
        # int() truncates, so a node with fewer than 1/limit outgoing edges
        # keeps none of them -- NOTE(review): confirm this is intended.
        top_n = int(len(ranked)*limit)
        edgelist_.extend(
            (indexNodes[int(origin)], indexNodes[int(destination)], {"weight": weight})
            for origin, destination, weight in ranked.head(top_n).values
        )
    edgelists[perc] = edgelist_

In [119]:
# Report how many edges survive at each per-node pruning level.
for level, edges in edgelists.items():
    print(level, len(edges))

0 8328
25 6132
50 4090
75 1984


In [120]:
# Write one directed graph per per-node pruning level to its own GEXF file.
# Each graph is built from its edge list alone, so a node left without any
# edge does not appear in the corresponding file.
for perc in edgelists:
    G_ = nx.DiGraph()
    G_.add_edges_from(edgelists[perc])
    nx.write_gexf(G_, f"../data/Graphs/bynode_{perc}_directed.gexf")