In [1]:
import pandas as pd
import networkx as nx
from pyvis.network import Network
import matplotlib.pyplot as plt
import community

In [2]:
# Load the node table. 'Id' is read as a string, the same dtype used for the
# edge endpoints below.
nodes_df = pd.read_csv('nodes.csv', dtype={'Id': str})

# Load the edge table; both endpoints are read as strings.
edges_df = pd.read_csv('edges.csv', dtype={'Source': str, 'Target': str})

# Record-style views of both tables (one dict per row).
nodes_dict = nodes_df.to_dict(orient='records')
# edges_dict is not used by the graph construction below; it is kept so that
# any cell relying on it keeps working.
edges_dict = edges_df.to_dict(orient='records')

# Undirected graph: (a, b) and (b, a) refer to the same edge.
G = nx.Graph()

# One graph node per row of nodes.csv, carrying the scholar's attributes.
for node in nodes_dict:
    G.add_node(
        node['Id'],
        label=node['Label'],
        institution=node['INSTITUTION'],
        first_name=node['FIRST'],
        middle_name=node['MIDDLE'],
        last_name=node['LAST'],
        degree=node['DEGREE'],
        hindex=node['H_INDEX'],
    )

# Every row of edges.csv contributes 1 to the weight of its (Source, Target)
# pair, so repeated rows accumulate on a single edge. The two columns are
# iterated directly instead of via DataFrame.iterrows(), which would build a
# Series per row just to read two fields.
for source, target in zip(edges_df['Source'], edges_df['Target']):
    if G.has_edge(source, target):
        # Edge already exists: aggregate the weight.
        G[source][target]['weight'] += 1
    else:
        # First occurrence of this pair: start at weight 1.
        G.add_edge(source, target, weight=1)


In [3]:
# Report how many nodes and edges the graph currently holds.
node_count, edge_count = G.number_of_nodes(), G.number_of_edges()
print(
    "This network has", node_count,
    "nodes and", edge_count, "edges.",
)

This network has 2465 nodes and 29759 edges.


In [4]:
# Drop every node that has no incident edge, then report how many remain.
# NOTE: this mutates G in place.
isolated_nodes = [n for n, deg in G.degree() if deg == 0]
G.remove_nodes_from(isolated_nodes)

new_node_count = G.number_of_nodes()
print("Now this network has", new_node_count, "nodes.")

Now this network has 2274 nodes.


In [5]:
# Per-node network statistics. Each centrality call returns a mapping keyed
# by node id; `degree` is a view that can be indexed by node id.
degree = G.degree()
eigenvector_centrality = nx.eigenvector_centrality(G, max_iter=500)
betweenness_centrality = nx.betweenness_centrality(G)
closeness_centrality = nx.closeness_centrality(G)

# Assemble one row per node, looking each metric up by node id so that every
# column is aligned with the 'Node' column.
node_ids = list(G.nodes)
metric_by_column = {
    'Degree': degree,
    'Eigenvector Centrality': eigenvector_centrality,
    'Betweenness Centrality': betweenness_centrality,
    'Closeness Centrality': closeness_centrality,
}
centrality_df = pd.DataFrame({
    'Node': node_ids,
    **{
        column: [metric[n] for n in node_ids]
        for column, metric in metric_by_column.items()
    },
})

centrality_df

Unnamed: 0,Node,Degree,Eigenvector Centrality,Betweenness Centrality,Closeness Centrality
0,6602108168,7,0.000399,0.000125,0.296989
1,6602906779,9,0.001003,0.000137,0.300952
2,6603462653,7,0.000619,0.000396,0.297341
3,7004413850,29,0.005087,0.001732,0.360575
4,7006634482,22,0.004254,0.000926,0.347624
...,...,...,...,...,...
2269,57052587900,7,0.001987,0.000014,0.320318
2270,57090880700,23,0.007625,0.000390,0.343284
2271,57194722008,5,0.000319,0.000021,0.282940
2272,57218199095,2,0.000031,0.000000,0.244926


In [6]:
# Perform Louvain community detection and store each node's community on the
# graph as the 'community_id' attribute.
# The algorithm is randomized: without a fixed random_state the partition (and
# anything derived from community_id) can differ between runs of this cell.
LOUVAIN_SEED = 42
partition = community.best_partition(G, random_state=LOUVAIN_SEED)
nx.set_node_attributes(G, partition, name='community_id')

In [7]:
# Due to the large size of the network, only scholars from two institutions
# are visualized, as a small sample for demonstration purposes.
institution1 = "Johns Hopkins University"
institution2 = "Vanderbilt University Medical Center"
selected_institutions = {institution1, institution2}

# Node size is the eigenvector centrality multiplied by this factor.
NODE_SIZE_SCALE = 1000

# Create a Pyvis network
net = Network(
    notebook=True,
    cdn_resources="remote",
    select_menu=True,
    filter_menu=True,
    height='750px',
    width='100%'
)

# Set solver to "forceAtlas2Based"
net.set_options("""
var options = {
  "physics": {
    "solver": "forceAtlas2Based"
  }
}
""")


# Define a mapping from community IDs to colors
community_color_mapping = {
    0: '#1f77b4',
    1: '#ff7f0e',
    2: '#2ca02c',
    3: '#d62728',
    4: '#9467bd',
    5: '#8c564b',
    6: '#e377c2',
    7: '#7f7f7f',
    8: '#bcbd22',
    9: '#17becf',
    10: '#1a55ff',
    11: '#ff6347',
    12: '#7fff00',
    13: '#dda0dd',
    14: '#20b2aa',
    15: '#ff8c00',
    16: '#9932cc',
    17: '#008080',
    18: '#8b0000',
    19: '#8a2be2'
}

# Add nodes only for scholars from the specified institutions.
# Attributes are read from the node's data dict after the institution filter,
# and no variable named `degree` is rebound here, so the degree view computed
# by the statistics cell is left intact.
for node_id, attrs in G.nodes(data=True):
    # .get() so that a node lacking an 'institution' attribute is skipped
    # instead of raising KeyError.
    if attrs.get('institution') not in selected_institutions:
        continue

    net.add_node(
        node_id,
        label=attrs['label'],
        size=eigenvector_centrality[node_id] * NODE_SIZE_SCALE,
        # Default to gray if community_id is not in the mapping
        color=community_color_mapping.get(attrs['community_id'], 'gray'),
        institution=attrs['institution'],
        degree=attrs['degree'],
        hindex=attrs['hindex'],
        labelHighlightBold=True,
        # vis.js node option names are lowercase; a capitalised `Physics`
        # key would be stored on the node but not read as the physics option.
        physics=True,
        font={'size': 40},
    )

# Add edges only when both endpoints belong to the specified institutions.
# Edge weights stored on G are not passed to the visualization.
for source, target in G.edges:
    source_selected = G.nodes[source].get('institution') in selected_institutions
    target_selected = G.nodes[target].get('institution') in selected_institutions
    if source_selected and target_selected:
        net.add_edge(source, target)


# Visualize the network
net.show("sample_network.html")

sample_network.html
