In [1]:
import json
from collections import Counter
from itertools import combinations

import pandas as pd

In [4]:
# Load the scraped paper records. lines=True makes pandas read the file as
# JSON Lines, i.e. one JSON object per line.
df = pd.read_json("./scrape.json", lines=True)

In [9]:
# Check the inferred column types. The nested columns ('subject-area',
# 'authors', 'affiliations') are reported as plain `object`.
df.dtypes

title             object
id                object
year               int64
citation_count     int64
subject-area      object
authors           object
affiliations      object
dtype: object

In [10]:
# Peek at the first record to see what each column holds.
df.head(1)

Unnamed: 0,title,id,year,citation_count,subject-area,authors,affiliations
0,Measurement of the WZ production cross section...,SCOPUS_ID:85017527143,2017,43,"[Engineering (miscellaneous), Physics and Astr...","[{'@_fa': 'true', '@auid': '6701387290', '@seq...",[{'@href': 'https://api.elsevier.com/content/a...


In [32]:
# Inspect the author records of the first paper. In the records shown below,
# 'affiliation' is a JSON-encoded *string* (not a dict), and 'ce:given-name'
# appears only nested under 'preferred-name'.
df.authors[0]

[{'@_fa': 'true',
  '@auid': '6701387290',
  '@seq': '1',
  'affiliation': '{"@id":"60070862","@href":"https://api.elsevier.com/content/affiliation/affiliation_id/60070862"}',
  'author-url': 'https://api.elsevier.com/content/author/author_id/6701387290',
  'ce:indexed-name': 'Khachatryan V.',
  'ce:initials': 'V.',
  'ce:surname': 'Khachatryan',
  'preferred-name': {'ce:given-name': 'V.',
   'ce:indexed-name': 'Khachatryan V.',
   'ce:initials': 'V.',
   'ce:surname': 'Khachatryan'}},
 {'@_fa': 'true',
  '@auid': '16239550900',
  '@seq': '2',
  'affiliation': '{"@id":"60070862","@href":"https://api.elsevier.com/content/affiliation/affiliation_id/60070862"}',
  'author-url': 'https://api.elsevier.com/content/author/author_id/16239550900',
  'ce:indexed-name': 'Sirunyan A.M.',
  'ce:initials': 'A.M.',
  'ce:surname': 'Sirunyan',
  'preferred-name': {'ce:given-name': 'A. M.',
   'ce:indexed-name': 'Sirunyan A.M.',
   'ce:initials': 'A.M.',
   'ce:surname': 'Sirunyan'}},
 {'@_fa': 'true',

In [33]:
# Inspect the affiliation records of the first paper. Each entry shown below
# carries '@href', '@id' and 'affilname'.
df.affiliations[0]

[{'@href': 'https://api.elsevier.com/content/affiliation/affiliation_id/60016653',
  '@id': '60016653',
  'affilname': 'Rheinisch-Westfälische Technische Hochschule Aachen'},
 {'@href': 'https://api.elsevier.com/content/affiliation/affiliation_id/60031855',
  '@id': '60031855',
  'affilname': 'Tata Institute of Fundamental Research, Mumbai'},
 {'@href': 'https://api.elsevier.com/content/affiliation/affiliation_id/60196721',
  '@id': '60196721',
  'affilname': 'Faculty of Science, University of Split'},
 {'@href': 'https://api.elsevier.com/content/affiliation/affiliation_id/60122606',
  '@id': '60122606',
  'affilname': 'Purdue University Northwest'},
 {'@href': 'https://api.elsevier.com/content/affiliation/affiliation_id/60119141',
  '@id': '60119141',
  'affilname': 'Rutgers University–New Brunswick'},
 {'@href': 'https://api.elsevier.com/content/affiliation/affiliation_id/60115114',
  '@id': '60115114',
  'affilname': 'Academy of Scientific Research &amp; Technology, Cairo'},
 {'@hre

In [37]:
# Build the affiliation co-occurrence network:
#   nodes-affiliation.csv  -> one row per affiliation id (Id, Label)
#   edges-affiliation.csv  -> one row per unordered pair of affiliations that
#                             share at least one paper (Source, Target, Weight)

# Prepare nodes from affiliations. The first name seen for an id is kept.
affiliations_dict = {}
for aff_list in df.affiliations.dropna():
    for aff in aff_list:
        affiliations_dict.setdefault(aff['@id'], aff['affilname'])

nodes = pd.DataFrame(list(affiliations_dict.items()), columns=['Id', 'Label'])
nodes.to_csv('nodes-affiliation.csv', index=False)


def _parse_affiliation(raw):
    """Normalise an author's 'affiliation' value to a list of dicts.

    In the scraped data the value is a JSON-encoded string such as
    '{"@id": "...", "@href": "..."}', so it has to be decoded before it can
    be indexed. Already-decoded dicts are accepted too. A list of dicts is
    handled defensively (not observed in the sample output -- TODO confirm
    whether multi-affiliation authors occur in the scrape).

    Returns an empty list for anything that cannot be interpreted.
    """
    if isinstance(raw, str):
        try:
            raw = json.loads(raw)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            return []
    if isinstance(raw, dict):
        return [raw]
    if isinstance(raw, list):
        return [entry for entry in raw if isinstance(entry, dict)]
    return []


def get_affiliation_ids(author):
    """Return every affiliation id attached to ``author`` (possibly empty)."""
    if not isinstance(author, dict):
        return []
    return [
        entry['@id']
        for entry in _parse_affiliation(author.get('affiliation'))
        if entry.get('@id') is not None
    ]


def get_affiliation_id(author):
    """Return the author's (first) affiliation id, or None if unavailable.

    Previously this indexed the raw 'affiliation' value directly; because
    that value is a JSON string, the lookup always raised TypeError and the
    function always returned None, leaving the edge list empty.
    """
    ids = get_affiliation_ids(author)
    return ids[0] if ids else None


# Prepare edges. Weight = number of papers on which both affiliations appear.
# Pairs are stored in sorted order so (a, b) and (b, a) share one counter
# entry; the counter replaces a linear scan of the edge list per pair.
edge_weights = Counter()
for authors in df['authors']:
    # Skip missing values (NaN) and anything that is not a list of authors.
    if not isinstance(authors, list):
        continue
    paper_affiliations = {
        aff_id for author in authors for aff_id in get_affiliation_ids(author)
    }
    # combinations() yields each unordered pair exactly once, so a paper
    # contributes 1 (not 2) to the weight of every pair it connects.
    for pair in combinations(sorted(paper_affiliations), 2):
        edge_weights[pair] += 1

edges_list = [
    {'Source': source, 'Target': target, 'Weight': weight}
    for (source, target), weight in edge_weights.items()
]
# Explicit columns keep the CSV header intact even when no edge was found.
edges = pd.DataFrame(edges_list, columns=['Source', 'Target', 'Weight'])
edges.to_csv('edges-affiliation.csv', index=False)


In [39]:
from itertools import combinations

# Build the subject-area co-occurrence network:
#   nodes-subject-area.csv -> one row per subject area (Label, Id)
#   edges-subject-area.csv -> one row per unordered pair of areas that are
#                             listed together on at least one paper

subject_lists = df['subject-area'].dropna()

# Prepare nodes: unique subject areas.
# Sorting makes the Id assignment reproducible; iterating a set of strings
# can yield a different order (and therefore different Ids) on every kernel
# restart.
subject_areas = sorted({area for areas in subject_lists for area in areas})

# Mapping from subject area to node Id
area_to_id = {area: idx for idx, area in enumerate(subject_areas)}

nodes = pd.DataFrame({'Label': subject_areas, 'Id': range(len(subject_areas))})

# Prepare edges. Weight = number of papers listing both areas.
edge_weights = Counter()
for areas in subject_lists:
    # set(): an area repeated within one paper must not create a self-loop
    # or be counted twice. sorted(): gives every pair a canonical order so
    # (a, b) and (b, a) share one counter entry.
    for area1, area2 in combinations(sorted(set(areas)), 2):
        edge_weights[(area_to_id[area1], area_to_id[area2])] += 1

edges_list = [
    {'Source': source, 'Target': target, 'Weight': weight}
    for (source, target), weight in edge_weights.items()
]
# Explicit columns keep the CSV header intact even when no edge was found.
edges = pd.DataFrame(edges_list, columns=['Source', 'Target', 'Weight'])

# Export CSV files
nodes.to_csv('nodes-subject-area.csv', index=False)
edges.to_csv('edges-subject-area.csv', index=False)


In [None]:
# Build the co-authorship network:
#   nodes-author-collaboration.csv -> one row per author id (Id, Label)
#   edges-author-collaboration.csv -> one row per unordered pair of authors
#                                     who share at least one paper


def author_label(author):
    """Return a display name ("<given> <surname>") for an author record.

    In the scraped records 'ce:given-name' is nested under 'preferred-name'
    rather than stored at the top level, so indexing it directly on the
    author raises KeyError. Look in both places, fall back to the initials,
    then to the indexed name, and finally to the author id.
    """
    preferred = author.get('preferred-name')
    if not isinstance(preferred, dict):
        preferred = {}

    given = (
        author.get('ce:given-name')
        or preferred.get('ce:given-name')
        or author.get('ce:initials')
        or preferred.get('ce:initials')
    )
    surname = author.get('ce:surname') or preferred.get('ce:surname')

    full_name = ' '.join(part for part in (given, surname) if part)
    return (
        full_name
        or author.get('ce:indexed-name')
        or preferred.get('ce:indexed-name')
        or str(author['@auid'])
    )


# Only rows whose 'authors' value is a list can be processed; missing values
# (NaN) would otherwise fail on iteration and len().
author_lists = [authors for authors in df['authors'] if isinstance(authors, list)]

# Prepare nodes: unique authors. A later record for the same id overwrites
# the earlier label.
authors_dict = {}
for authors in author_lists:
    for author in authors:
        authors_dict[author['@auid']] = author_label(author)

nodes = pd.DataFrame(list(authors_dict.items()), columns=['Id', 'Label'])
nodes.to_csv('nodes-author-collaboration.csv', index=False)

# Prepare edges. Weight = number of papers the two authors share.
# A Counter keyed by the sorted id pair replaces a linear scan of the edge
# list for every pair, which grew quadratically with the number of edges.
edge_weights = Counter()
for authors in author_lists:
    # set(): an author listed twice on one paper must not create a self-loop.
    # sorted(): canonical order so (a, b) and (b, a) share one counter entry.
    auids = sorted({author['@auid'] for author in authors})
    for pair in combinations(auids, 2):
        edge_weights[pair] += 1

edges_list = [
    {'Source': source, 'Target': target, 'Weight': weight}
    for (source, target), weight in edge_weights.items()
]
# Explicit columns keep the CSV header intact even when no edge was found.
edges = pd.DataFrame(edges_list, columns=['Source', 'Target', 'Weight'])
edges.to_csv('edges-author-collaboration.csv', index=False)
