In [80]:
# Libraries
import pandas as pd
import networkx as nx
from itertools import combinations
import json
from networkx.readwrite import json_graph

# Read the publication records from data_scopus.csv into a DataFrame.
data = pd.read_csv("data_scopus.csv")

# A row is kept only when every one of these columns holds a value; a row
# missing any of them is dropped.
required_columns = ['Year', 'Authors with affiliations', 'Author(s) ID']
data_cleaned = data.dropna(subset=required_columns, how='any')

# Build an undirected co-authorship graph: one node per author ID and one
# edge between every pair of authors that appear on the same publication.
G = nx.Graph()

# Only the author-ID column is read here, so iterate over it directly rather
# than constructing a full row Series per publication with iterrows().
for raw_ids in data_cleaned['Author(s) ID']:
    # IDs are semicolon-separated. Strip each token *before* testing it for
    # emptiness so a whitespace-only token (e.g. after a trailing "; ") does
    # not turn into a bogus '' node.
    tokens = (token.strip() for token in raw_ids.split(';'))

    # De-duplicate while keeping first-seen order: an ID repeated within one
    # publication would otherwise make combinations() yield (a, a) and add a
    # self-loop to the graph.
    author_ids = list(dict.fromkeys(token for token in tokens if token))

    # Register the nodes explicitly so that single-author publications,
    # which produce no pairs below, still contribute their author.
    G.add_nodes_from(author_ids)

    # Link every pair of co-authors on this publication. nx.Graph keeps at
    # most one edge per pair, so repeated collaborations do not add more.
    G.add_edges_from(combinations(author_ids, 2))

# Summarise the size of the graph: how many nodes and how many edges it holds.
num_nodes, num_edges = G.number_of_nodes(), G.number_of_edges()

# Report both counts, one per line.
print(
    f"Number of nodes (authors): {num_nodes}",
    f"Number of edges (co-authorships): {num_edges}",
    sep="\n",
)

# Serialise the graph into node-link form (the structure intended for D3.js).
data_for_d3 = json_graph.node_link_data(G)

# Persist the node-link data as JSON in network_graph.json, replacing any
# existing file of that name.
serialized_graph = json.dumps(data_for_d3)
with open('network_graph.json', 'w') as out_file:
    out_file.write(serialized_graph)

Number of nodes (authors): 1552
Number of edges (co-authorships): 3049
