In [1]:
import xml.etree.ElementTree as ET

_ELSEVIER_NS = {
    'ce': 'http://www.elsevier.com/xml/common/dtd',
    'sa': 'http://www.elsevier.com/xml/common/struct-aff/dtd'
}


def _child_text(parent, path, default=""):
    """Return the text of the first ``path`` child of ``parent``.

    ``default`` is returned when ``parent`` is None or has no matching child.
    The result can still be None when the child exists but carries no text.
    """
    # Explicit None checks: an Element's truth value reflects its child count
    # (and testing it is deprecated), so it must not be used as "was found".
    if parent is None:
        return default
    node = parent.find(path, _ELSEVIER_NS)
    return default if node is None else node.text


def _parse_affiliation(aff):
    """Convert one ``ce:affiliation`` element into a ``{'text', 'sa_details'}`` dict."""
    structured = aff.find('sa:affiliation', _ELSEVIER_NS)
    return {
        'text': _child_text(aff, 'ce:textfn'),
        'sa_details': {
            'organization': _child_text(structured, 'sa:organization'),
            'city': _child_text(structured, 'sa:city'),
            'country': _child_text(structured, 'sa:country'),
        },
    }


def _flatten_affiliation(aff):
    """Flatten a parsed affiliation into the per-author record shape."""
    details = aff['sa_details']
    return {
        'text': aff['text'],
        'organization': details['organization'],
        'city': details['city'],
        'country': details['country'],
    }


def _name_part(author, tag):
    """Return the text of a name element, or "Unknown" when it is missing or empty."""
    return _child_text(author, tag, default=None) or "Unknown"


def authorship(xml_path):
    """Extract author, affiliation and corresponding-author data from an article XML.

    Parameters
    ----------
    xml_path : str or file object
        Source handed to ``ET.parse``. The document is searched for
        ``ce:author``, ``ce:affiliation`` and ``ce:correspondence`` elements.

    Returns
    -------
    dict
        ``'Total Authors'``: number of ``ce:author`` elements.
        ``'Corresponding Authors'``: names of authors with a cross-ref whose
        ``refid`` equals the ``id`` of a ``ce:correspondence`` element.
        ``'Primary Affiliation'``: ``{'text', 'sa_details': {'organization',
        'city', 'country'}}`` for the first author, or None when the document
        has no authors (or no affiliation at all).
        ``'Authors and Affiliations'``: one ``{'name', 'affiliations'}`` dict
        per author, each affiliation flattened to ``text``/``organization``/
        ``city``/``country``.

    Raises
    ------
    xml.etree.ElementTree.ParseError
        If the document is not well-formed XML.
    """
    root = ET.parse(xml_path).getroot()

    authors = root.findall('.//ce:author', _ELSEVIER_NS)
    affiliations = root.findall('.//ce:affiliation', _ELSEVIER_NS)
    correspondences = root.findall('.//ce:correspondence', _ELSEVIER_NS)

    # Parse every affiliation once; the first one in document order doubles as
    # the fallback for authors whose cross-refs resolve to nothing.
    parsed_affiliations = [_parse_affiliation(aff) for aff in affiliations]
    affiliation_map = {
        aff.get('id'): parsed
        for aff, parsed in zip(affiliations, parsed_affiliations)
    }
    fallback_affiliation = parsed_affiliations[0] if parsed_affiliations else None

    correspondence_refs = {
        corr.get('id') for corr in correspondences if corr.get('id') is not None
    }

    author_details = []
    corresponding_authors = []
    primary_affiliation = None

    for position, author in enumerate(authors):
        name = f"{_name_part(author, 'ce:given-name')} {_name_part(author, 'ce:surname')}"
        refids = [ref.get('refid') for ref in author.findall('ce:cross-ref', _ELSEVIER_NS)]

        # Affiliation cross-refs are recognised by their "af" id prefix; ids
        # that do not match a ce:affiliation in this document are dropped.
        resolved = [
            affiliation_map[refid]
            for refid in refids
            if refid and refid.startswith('af') and refid in affiliation_map
        ]

        if position == 0:
            # First *resolvable* affiliation of the first author. The fallback
            # applies only when none resolves, matching the per-author rule below.
            primary_affiliation = resolved[0] if resolved else fallback_affiliation

        if not resolved and fallback_affiliation is not None:
            resolved = [fallback_affiliation]

        author_details.append({
            'name': name,
            'affiliations': [_flatten_affiliation(aff) for aff in resolved],
        })

        if any(refid in correspondence_refs for refid in refids):
            corresponding_authors.append(name)

    results = {
        'Total Authors': len(authors),
        'Corresponding Authors': corresponding_authors,
        'Primary Affiliation': primary_affiliation,
        'Authors and Affiliations': author_details
    }
    return results

In [2]:
import pandas as pd
import os
from tqdm import tqdm
from utils import *
# Root of the downloaded rr-measure-dataset. Point the RR_DATASET_ROOT
# environment variable at your own copy; when it is unset, the original
# hard-coded location is used.
dataset_root = os.environ.get('RR_DATASET_ROOT', '/Users/junyi/Work/RR/rr-measure-dataset')
full_data_folder = os.path.join(dataset_root, 'journal-full-text')
meta_data_folder = os.path.join(dataset_root, 'journal-meta')
result_data_folder = os.path.join(dataset_root, 'journal-results')

# Paper-level metadata table; the processing loop reads its 'issn' and
# 'unique_id' columns to locate each paper's XML file.
data = pd.read_csv(os.path.join(meta_data_folder, 'full-meta-dataset.csv'))

In [None]:
# Fill per-paper authorship columns in `data` and store each paper's full
# authorship() result as JSON under <result_data_folder>/authorship.
# The "coresponding_..." column names keep their existing spelling so the
# exported CSV schema does not change.
data['number_of_authors'] = 0
data['country'] = ""
data['institution'] = ""
data['coresponding_author_name'] = ""
data['coresponding_author_country'] = ""
data['primary_author_name'] = ""

# Create the output folder if it is not there yet.
authorship_dir = os.path.join(result_data_folder, 'authorship')
os.makedirs(authorship_dir, exist_ok=True)

# For a quick debugging run, iterate over data.index[:10] instead.
for i in tqdm(data.index):
    journal_path = os.path.join(full_data_folder, data.loc[i, 'issn'])  # Path to the journal folder
    paper_path = os.path.join(journal_path, data.loc[i, 'unique_id'] + '.xml')  # Path to the paper XML file
    results = authorship(paper_path)
    data.loc[i, "number_of_authors"] = results['Total Authors']

    primary = results['Primary Affiliation']
    if primary is not None and primary['sa_details']["country"] is not None:
        data.loc[i, "country"] = primary['sa_details']["country"]
        data.loc[i, "institution"] = primary['sa_details']["organization"]
        data.loc[i, "primary_author_name"] = results['Authors and Affiliations'][0]['name']

    # When several authors are corresponding, the last one in document order
    # is the one recorded.
    corresponding_names = set(results['Corresponding Authors'])
    for author in results['Authors and Affiliations']:
        if author['name'] not in corresponding_names:
            continue
        countries = [aff['country'] for aff in author['affiliations'] if aff['country'] is not None]
        if not countries:
            continue
        # authorship() reports a missing country as "" rather than None, so
        # prefer the last non-empty value: an affiliation without a country
        # must not blank out one already found for this author.
        non_empty = [country for country in countries if country]
        data.loc[i, "coresponding_author_country"] = non_empty[-1] if non_empty else countries[-1]
        data.loc[i, "coresponding_author_name"] = author['name']

    # save the results to a json file
    save_json(results, os.path.join(authorship_dir, data.loc[i, 'unique_id'] + '.json'))

In [None]:
# Display the DataFrame as this cell's output for inspection.
data

In [8]:
# Write the DataFrame to a separate CSV in the metadata folder, without the
# index column.
authorship_csv_path = os.path.join(meta_data_folder, 'full-meta-dataset_authorship.csv')
data.to_csv(authorship_csv_path, index=False)