In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Load the full table from disk, then keep an independent copy of just the
# identifier and author columns so later cells cannot modify `df` through it.
df = pd.read_csv('data.csv')
df_id_authors = df.loc[:, ['ID', 'Authors']].copy()

# Bare last expression -> rich (auto-truncated) display of the frame.
df_id_authors

Unnamed: 0,ID,Authors
0,1,"Janet M. Weisenberger, Arnold F. Heidbreder, J..."
1,2,"Stephen Brewster, Joanna Lumsden, Marek Bell, ..."
2,3,"Christian Metzger, Matt Anderson, Thad Starner"
3,4,"Vincent Buil, Gerard Hollemans"
4,5,"Vincent Buil, Gerard Hollemans, Sander van de..."
...,...,...
113,114,"Yong Wang, Tianyu Yang, Chunxiao Wang, Feng Li..."
114,115,"Yilin Wang, Zi Wang, Jie Yang"
115,116,"Yanze Xie, Mengzhen Gao, Xiaoning Liu, Shuo Hu..."
116,117,"Yongjie Yang, Tao Chen, Yujing Huang, Xiuzhen ..."


In [8]:
# Exact co-author adjacency matrix, initialised to all zeros and labelled by
# paper ID on both axes (IDs 1..118 inclusive). It is filled in further down
# from the comma-separated author strings.
# NOTE(review): the size is hard-coded; the preview of `df_id_authors` above
# ends at ID 117 -- confirm that 118 is the intended number of papers.
ids = np.arange(1, 119)
coauthor_matrix = pd.DataFrame(
    data=0,
    index=np.arange(1, 119),
    columns=np.arange(1, 119),
)

def normalize_name(name: str) -> str:
    """Return ``name`` in a canonical form for exact-match comparison.

    Leading/trailing whitespace is removed, the text is lowercased, and every
    internal run of whitespace is collapsed to a single space.

    Args:
        name: Raw author name.

    Returns:
        The normalized name; an empty string if ``name`` is blank.
    """
    lowered = name.strip().lower()
    # split() with no argument splits on runs of whitespace, so re-joining
    # with a single space collapses any internal padding.
    words = lowered.split()
    return ' '.join(words)

def to_author_set(value) -> set:
    """Convert an ``Authors`` cell into a set of normalized author names.

    Args:
        value: Either a single comma-separated string of author names, or a
            list of individual names. Any other value (e.g. NaN/None for a
            missing cell) is treated as "no authors".

    Returns:
        A set of names as produced by ``normalize_name``. Blank entries are
        dropped for BOTH input forms, and non-string list elements are
        skipped, so an empty or missing name can never appear in the set and
        make two unrelated papers look like they share an author.
    """
    if isinstance(value, str):
        # Commas separate authors within a single string.
        candidates = value.split(',')
    elif isinstance(value, list):
        candidates = value
    else:
        return set()

    # Normalize first, then filter: a whitespace-only entry normalizes to ''
    # and is discarded regardless of which branch produced it.
    normalized = (normalize_name(c) for c in candidates if isinstance(c, str))
    return {name for name in normalized if name}

# Map ID -> normalized author set. IDs absent from the data are handled below
# via `.get(..., set())`, so they simply stay unconnected in the matrix.
# Zipping the two columns avoids the per-row Series that iterrows() builds.
id_to_authors = {
    int(paper_id): to_author_set(authors)
    for paper_id, authors in zip(df_id_authors['ID'], df_id_authors['Authors'])
}

# Only connect papers sharing at least one EXACT author name (distance == 0).
# Each unordered pair is visited once by walking `ids` itself, so the loop no
# longer relies on the IDs being exactly 1..len(ids).
for pos, id_i in enumerate(ids):
    authors_i = id_to_authors.get(int(id_i), set())
    if not authors_i:
        continue  # a paper with no known authors cannot share one
    for id_j in ids[pos + 1:]:
        authors_j = id_to_authors.get(int(id_j), set())
        # isdisjoint() is True for an empty set, so papers without authors
        # are never linked.
        if not authors_i.isdisjoint(authors_j):
            # .at is the scalar accessor; write both cells to keep symmetry.
            coauthor_matrix.at[id_i, id_j] = 1
            coauthor_matrix.at[id_j, id_i] = 1

# Create the output directory first so a fresh checkout can run this cell.
output_path = Path('interconnections_datasets/coauthor_matrix.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
coauthor_matrix.to_csv(output_path)
coauthor_matrix.head(10)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,109,110,111,112,113,114,115,116,117,118
99,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
100,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
101,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
102,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
103,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
104,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
105,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
106,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
107,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
108,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
