# In [2]:
import pandas as pd
import networkx as nx
import json
from networkx.readwrite import json_graph

# Read the CSV data
data = pd.read_csv("./data/data_scopus.csv")

# Drop columns that are not used below, then remove rows that lack any field
# the graph construction relies on. 'Author(s) ID' is included because the
# node/edge loops call .split() on it and would crash on NaN.
columns_to_remove = ['Abstract', 'Conference name', 'Conference date', 'Abbreviated Source Title']
data = data.drop(columns=columns_to_remove)
data = data.dropna(subset=['Authors', 'Author(s) ID', 'Authors with affiliations', 'Year', 'Publisher'])

# Treat a missing citation count as zero. Use a frame-level fillna that returns
# a new DataFrame: calling fillna(inplace=True) on a column selection is
# chained assignment, which is deprecated and has no effect under pandas
# Copy-on-Write.
data = data.fillna({'Cited by': 0})

# Function to extract the affiliation country of every author on a paper
def get_first_author_affiliation(x):
    """Return the affiliation countries of all listed authors, comma-joined.

    Despite the name, this does not stop at the first author: it yields one
    entry per ';'-separated affiliation in ``x``.

    Args:
        x: Affiliations string with one entry per author separated by ';'.
            The country is taken to be the last ','-separated field of each
            entry (assumed layout -- confirm against the CSV export).

    Returns:
        A ','-joined string of the stripped last fields, in input order.
        An empty string if ``x`` is not a string (e.g. NaN), so callers can
        always ``.split(',')`` the result.
    """
    if not isinstance(x, str):
        # Keep the original diagnostic output, but return a string instead of
        # the implicit None that used to break downstream .split() calls.
        print(x)
        return ""
    return ','.join(aff.split(",")[-1].strip() for aff in x.split(";"))

# Derive the 'Author_Countries' column by running every affiliations string
# through get_first_author_affiliation
affiliation_strings = data['Authors with affiliations']
data['Author_Countries'] = affiliation_strings.apply(get_first_author_affiliation)

# Create a graph
G = nx.Graph()

# Add one node per author ID and accumulate per-author statistics over all rows
for _, row in data.iterrows():
    # Drop empty entries: an ID list ending in ';' would otherwise produce a
    # phantom '' author and inflate the co-author count below by one.
    author_ids = [author_id for author_id in row['Author(s) ID'].split(';') if author_id]
    countries = row['Author_Countries'].split(',')
    names = row['Authors'].split(',')

    # Convert to a plain int so the attribute stays JSON-serializable whatever
    # the column dtype is; a missing count contributes nothing.
    cited_by = row['Cited by']
    citations = 0 if pd.isna(cited_by) else int(cited_by)

    # Number of other authors on this paper.
    num_coauthors = len(author_ids) - 1

    # zip() stops at the shortest list, so an author is only recorded when a
    # name and a country exist at the same position.
    for author_id, name, country in zip(author_ids, names, countries):
        if author_id not in G:
            G.add_node(author_id,
                       Name=name.strip(),
                       country=country,
                       num_publications=0,
                       num_citations=0,
                       degree=0)
        attrs = G.nodes[author_id]
        attrs['num_publications'] += 1
        attrs['num_citations'] += citations
        # NOTE(review): this sums co-author slots per paper, so repeated
        # collaborators are counted each time; it can exceed the graph degree.
        attrs['degree'] += num_coauthors

# Add an undirected edge between every pair of authors sharing a paper
for _, row in data.iterrows():
    # Split on ';' and discard empty entries instead of blindly slicing off
    # the last character: that slice truncated the final ID (and produced an
    # edge to a '' node) whenever the string did not end with ';'.
    authors = [author_id for author_id in row['Author(s) ID'].split(';') if author_id]
    # Rows with a single author yield no pairs and therefore no edges.
    for i, author1 in enumerate(authors):
        for author2 in authors[i + 1:]:
            G.add_edge(author1, author2)

# Export the graph in node-link form to a JSON file
with open("publication_network.json", 'w') as json_file:
    node_link_payload = json_graph.node_link_data(G)
    json.dump(node_link_payload, json_file)

# Export the same graph in GML format
nx.write_gml(G, "network.gml")
