In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
from tqdm import tqdm
import json
from pybliometrics.scopus import AuthorRetrieval
from pybliometrics.scopus import AbstractRetrieval
from pybliometrics.scopus import PlumXMetrics

from dicttoxml import dicttoxml
from xml.dom.minidom import parseString

# Geocoder used to translate a (city, country) pair into latitude/longitude.
from geopy.geocoders import Nominatim
# Create the Nominatim client once so the retrieval code below can reuse it.
# NOTE(review): "MyApp" is a generic user agent - presumably it should identify
# this project; confirm against the geocoding service's usage policy.
geolocator = Nominatim(user_agent="MyApp")


import warnings
# Suppresses every warning for the rest of the session, including ones that
# might point at real problems (e.g. deprecations in pandas or requests).
warnings.filterwarnings("ignore")


from itertools import chain

import pyalex
# NOTE(review): the contact e-mail is left empty here - presumably it is meant
# to be filled in before the OpenAlex requests are sent; confirm.
pyalex.config.email = "" 

def flatten_chain(matrix):
    """Flatten exactly one level of nesting.

    ``matrix`` is an iterable of iterables; the elements of each inner
    iterable are returned, in order, as one flat list. Deeper nesting is
    left untouched.
    """
    return [element for row in matrix for element in row]


# --- API credentials -------------------------------------------------------
# The config files are read through context managers so the file handles are
# closed again, and each name is only ever bound to the parsed dict.
with open('orcid_config.json') as config_file:
    client_orcid = json.load(config_file)

with open('scopus_config.json') as config_file:
    client_scopus = json.load(config_file)

# --- People to process -----------------------------------------------------
# keep_default_na=False keeps missing IDs as '' instead of NaN; the retrieval
# loops compare the ID columns against '' to decide whether to query an API.
df_names = pd.read_csv('people_in_S4S_pureFiltered_withAuthorIDs.csv',keep_default_na=False,index_col=[0])

#orcid start up
client_id = client_orcid['client_id']
client_secret = client_orcid['client_secret']

# Client-credentials token request for the public ORCID read scope.
resp = requests.post(url="https://orcid.org/oauth/token",
                     headers={'Accept':'application/json'},
                     data={'client_id':client_id, 'client_secret':client_secret,'grant_type': 'client_credentials', 'scope':'/read-public'},
                     timeout=30,  # do not hang the notebook if the service is unreachable
                     )
# Fail with the HTTP error itself rather than a KeyError on 'access_token'.
resp.raise_for_status()
access_token = resp.json()['access_token']


# --- Scopus subject classification lookup -----------------------------------
# Indexed by the 'code' column so individual codes can be looked up with .loc.
df_subjectAreas = pd.read_xml('scopus_subject_classification.xml').set_index('code')


In [42]:
#scopus data retrieval
# For every person in df_names that has a Scopus author ID this cell
#   * downloads the author profile and stores affiliation, location and metrics,
#   * downloads all publications since 2020 and writes one JSON file per paper,
#   * counts co-authors and subject areas and writes one JSON file per author.
# Geocoding results are cached in saved_locations.json between runs.

SCOPUS_SEARCH_URL = "https://api.elsevier.com/content/search/scopus"
# The search requests do not set `count`; paging advances in steps of 25
# results (presumably the API's default page size - confirm).
SCOPUS_PAGE_STEP = 25


def _scopus_get(url, view, params=None):
    """GET a Scopus endpoint with the stored credentials and return the decoded JSON.

    Raises requests.HTTPError for non-2xx answers so that a failed call is
    reported as such instead of surfacing later as a KeyError.
    """
    # NOTE(review): `view` is sent as a request header, exactly as before;
    # confirm against the Elsevier documentation whether it has any effect
    # there or should be a query parameter.
    headers = {'X-ELS-APIKey':client_scopus['apikey'], 'X-ELS-Insttoken':client_scopus['insttoken'], 'accept':'application/json','view':view}
    response = requests.get(url=url, headers=headers, params=params)
    response.raise_for_status()
    return response.json()


def _as_list(value):
    """Return `value` as a list: None -> [], a list unchanged, anything else wrapped."""
    if value is None:
        return []
    return value if isinstance(value, list) else [value]


def _nested(mapping, *keys):
    """Follow `keys` through nested dicts; return None if a level is missing or not a dict."""
    for key in keys:
        if not isinstance(mapping, dict) or key not in mapping:
            return None
        mapping = mapping[key]
    return mapping


def _pluck(entries, key):
    """Return [entry[key], ...] for a collection, or None if it is absent or malformed."""
    if entries is None:
        return None
    try:
        return [entry[key] for entry in _as_list(entries)]
    except (KeyError, TypeError):
        return None


def _affiliation_fields(affiliation):
    """Return (institute, country, city) of one Scopus affiliation entry."""
    doc = affiliation['ip-doc']
    return doc['afdispname'], doc['address']['country'], doc['address']['city']


with open('saved_locations.json') as fd:
    saved_locations = json.load(fd)

for i in tqdm(df_names.index):
    scopusId = df_names['scopusID'][i]
    if scopusId == '':
        continue  # nothing to look up for this person

    subjectAreas_dict = {}
    coAuthor_dict = {}

    # Parse the author record once instead of re-decoding the response for every field.
    author_record = _scopus_get(f"https://api.elsevier.com/content/author/author_id/{scopusId}/", 'ENHANCED')['author-retrieval-response'][0]
    profile = author_record['author-profile']
    coredata = author_record['coredata']

    author_dict = {}
    author_dict['S4Sid'] = i
    author_dict['firstname'] = df_names['firstname'][i]
    author_dict['lastname'] = df_names['lastname'][i]
    author_dict['scopusID'] = df_names['scopusID'][i]
    author_dict['orcID'] = df_names['orcID'][i]

    #affiliation and location
    current_affiliation = profile['affiliation-current']['affiliation']
    if isinstance(current_affiliation, list):
        #multiple current affiliations: the first one is used
        institute, country, city = _affiliation_fields(current_affiliation[0])
    else:
        try:
            institute, country, city = _affiliation_fields(current_affiliation)
        except (KeyError, TypeError, IndexError):
            # Incomplete current affiliation: fall back to the first entry of the history.
            institute, country, city = _affiliation_fields(_as_list(profile['affiliation-history']['affiliation'])[0])

    author_dict['current_affiliation_institute'] = institute
    author_dict['current_affiliation_country'] = country
    author_dict['current_affiliation_city'] = city

    location_abb = f"{city}, {country}"

    if location_abb in saved_locations:
        cached_location = saved_locations[location_abb]
        latitude, longitude = cached_location['lat'], cached_location['lon']
    else:
        # Geocode the same city/country that forms the cache key. The previous
        # code read them from `scopus_record`, a name that is never defined in
        # this notebook, so every uncached location raised a NameError.
        location = geolocator.geocode(query={'country': country, 'city': city})
        if location is not None:
            latitude, longitude = location.latitude, location.longitude
        else:
            # Cache the miss as well so the lookup is not repeated.
            latitude, longitude = '', ''
        saved_locations[location_abb] = {'lat': latitude, 'lon': longitude}

    author_dict['current_affiliation_lat'] = latitude
    author_dict['current_affiliation_lon'] = longitude

    # Stored as (end, start); the order is kept as in the existing output files.
    author_dict['publicationRange'] = (profile['publication-range']['@end'], profile['publication-range']['@start'])
    author_dict['citationCount'] = coredata['citation-count']
    author_dict['citedByCount'] = coredata['cited-by-count']
    author_dict['documentCount'] = coredata['document-count']

    # Publications since 2020: first page, then the remaining pages by offset.
    search_query = f'AU-ID({scopusId}) AND PUBYEAR > 2019'
    search_results = _scopus_get(SCOPUS_SEARCH_URL, 'enhanced', {'query': search_query, 'cursor': '*'})['search-results']
    author_docs = list(search_results.get('entry', []))

    num_results = int(search_results['opensearch:totalResults'])
    for start in range(SCOPUS_PAGE_STEP, num_results, SCOPUS_PAGE_STEP):
        page_results = _scopus_get(SCOPUS_SEARCH_URL, 'enhanced', {'query': search_query, 'start': str(start)})['search-results']
        author_docs.extend(page_results.get('entry', []))

    # An empty result set comes back as a single entry carrying an 'error' key.
    # Check the collected entries themselves rather than whichever response
    # happened to be fetched last.
    has_docs = bool(author_docs) and 'error' not in author_docs[0]

    author_dict['publishedArticles'] = [item['eid'] for item in author_docs] if has_docs else ''

    if has_docs:
        for item in author_docs:
            paper_dict = {}
            paper_dict['eid'] = item['eid']
            paper_dict['doi'] = item['prism:doi'] if 'prism:doi' in item else ''

            record = _scopus_get(item['prism:url'], 'enhanced')['abstracts-retrieval-response']
            head = record['item']['bibrecord']['head']
            paper_coredata = record['coredata']
            authors = _as_list(_nested(record, 'authors', 'author'))

            paper_dict['title'] = head['citation-title']
            paper_dict['authorCount'] = len(authors)

            # Date is assembled as "day.month.year." from whichever parts exist
            # (including the trailing dot, as in the existing output files).
            publication_date = _nested(head, 'source', 'publicationdate')
            if not isinstance(publication_date, dict):
                publication_date = {}
            paper_dict['releaseDate'] = ''.join(
                publication_date[part] + '.'
                for part in ('day', 'month', 'year')
                if isinstance(publication_date.get(part), str)
            )
            paper_dict['citationCount'] = paper_coredata['citedby-count']

            paper_dict['authorNames'] = [str(author['preferred-name']['ce:given-name']) + ' ' + str(author['preferred-name']['ce:surname']) for author in authors]
            paper_dict['authorScopusIds'] = [author['@auid'] for author in authors]

            for author_id in paper_dict['authorScopusIds']:
                coAuthor_dict[author_id] = coAuthor_dict.get(author_id, 0) + 1

            # Optional fields. Each one is read on its own: previously a single
            # try/except wrapped all of them, so one missing field (e.g. no
            # author keywords) silently dropped the abstract and subject areas
            # as well. A key is still only written when its data is available.
            if 'subtypeDescription' in paper_coredata:
                paper_dict['type'] = paper_coredata['subtypeDescription']

            author_keywords = _pluck(_nested(record, 'authkeywords', 'author-keyword'), '$')
            if author_keywords is not None:
                paper_dict['authorKeywords'] = author_keywords

            index_terms = _pluck(_nested(record, 'idxterms', 'mainterm'), '$')
            if index_terms is not None:
                paper_dict['idxterms'] = index_terms

            if 'dc:description' in paper_coredata:
                paper_dict['abstract'] = paper_coredata['dc:description']

            subject_codes = _pluck(_nested(record, 'subject-areas', 'subject-area'), '@code')
            if subject_codes is not None:
                paper_dict['subjectAreas'] = subject_codes
                for code in subject_codes:
                    subjectAreas_dict[code] = subjectAreas_dict.get(code, 0) + 1

            # Outer double quotes keep the f-string valid on Python < 3.12;
            # the `with` block closes the file, no explicit close() needed.
            paper_path = f"scopus_publication_files_16042024/scopus_publication_{paper_dict['eid']}.json"
            with open(paper_path, "w", encoding='utf16') as outfile:
                json.dump(paper_dict, outfile, ensure_ascii=False)

        # Remove the author from their own co-author counts. The key used to be
        # misspelled ('scopusid'), so the lookup always failed silently and the
        # author was never removed. pop() tolerates papers that do not list them.
        coAuthor_dict.pop(str(author_dict['scopusID']), None)
        author_dict['coauthorCount'] = coAuthor_dict

        author_dict['subjectAreaCount_detailed'] = subjectAreas_dict

        # Roll the detailed codes up to their general classification.
        # The codes arrive as strings, while df_subjectAreas is presumably
        # indexed by integer codes (TODO confirm); give the count frame an
        # integer index too so the index join below actually matches rows.
        area_counts = pd.DataFrame.from_dict(subjectAreas_dict, orient='index', columns=['count'])
        area_counts.index = area_counts.index.astype(int)
        area_counts = area_counts.join(df_subjectAreas.loc[area_counts.index, 'subject-classification'])
        general_counts = area_counts.groupby('subject-classification')['count'].sum()
        author_dict['subjectAreaCount_general'] = dict(zip(general_counts.index.to_list(), general_counts.to_list()))

    author_path = f"scopus_author_files_16042024/scopus_author_information_{author_dict['scopusID']}.json"
    with open(author_path, "w", encoding='utf16') as outfile:
        json.dump(author_dict, outfile, ensure_ascii=False)

    # Persist the geocoding cache after every author so an interrupted run keeps its lookups.
    with open('saved_locations.json', "w") as outfile:
        json.dump(saved_locations, outfile)


100%|██████████| 203/203 [47:01<00:00, 13.90s/it] 


In [26]:
#openAlex publication retrieval
# For every person in df_names that has an OpenAlex author ID this cell
#   * downloads the author record (affiliation, yearly counts, totals),
#   * downloads all works published since 2020-01-01 and writes one JSON file per work,
#   * counts topics, subfields, fields, concepts and co-authors over those works
#     and writes one JSON file per author.

# OpenAlex and DOI identifiers are delivered as full URLs; the stored files use
# the short form. These prefixes replace the former slice offsets 21 and 16.
OPENALEX_URL_PREFIX = "https://openalex.org/"
DOI_URL_PREFIX = "https://doi.org/"
# Topics and concepts are only kept when their score exceeds this value.
MIN_SCORE = 0.5


def _count_occurrences(counter, keys):
    """Increase counter[key] by one for every key in `keys` (missing keys start at 0)."""
    for key in keys:
        counter[key] = counter.get(key, 0) + 1


# Loaded as before so the name stays defined for other cells. This cell never
# modifies the cache, so it is no longer written back after every author.
with open('saved_locations.json') as fd:
    saved_locations = json.load(fd)

# Iterate over the index labels (as the Scopus cell does); the lookups below are
# label-based, so positional range(len(...)) only worked for a 0..n-1 index.
for i in tqdm(df_names.index):
    openAlexID = df_names['openAlexID'][i]
    if openAlexID == '':
        continue  # nothing to look up for this person

    topics_dict = {}
    subfields_dict = {}
    fields_dict = {}
    concepts_dict = {}
    coAuthor_dict = {}

    openAlex_record = pyalex.Authors()[f"{openAlexID}"]

    author_dict = {}
    author_dict['S4Sid'] = i
    author_dict['firstname'] = df_names['firstname'][i]
    author_dict['lastname'] = df_names['lastname'][i]
    author_dict['scopusID'] = df_names['scopusID'][i]
    author_dict['orcID'] = df_names['orcID'][i]
    author_dict['openAlexID'] = df_names['openAlexID'][i]

    #affiliation and location: the first listed affiliation is used
    affiliations = openAlex_record['affiliations']
    first_institution = affiliations[0]['institution'] if len(affiliations) > 0 else None
    author_dict['current_affiliation_institute'] = first_institution['display_name'] if first_institution is not None else ''
    author_dict['current_affiliation_country'] = first_institution['country_code'] if first_institution is not None else ''

    #metrics
    author_dict['counts_by_year'] = openAlex_record['counts_by_year']
    author_dict['citedByCount'] = openAlex_record['cited_by_count']
    author_dict['documentCount'] = openAlex_record['works_count']

    #fetch publications of author
    query = pyalex.Works().filter(authorships={'author.id':openAlexID}, from_publication_date='2020-01-01')
    author_dict['publications'] = []

    # chain.from_iterable consumes the pages lazily; chain(*pages) had to
    # download every page before the first work could be processed.
    for item in chain.from_iterable(query.paginate(per_page=200)):
        author_dict['publications'].append(item)

        paper_dict = {}
        paper_dict['openAlexID'] = item['id'].removeprefix(OPENALEX_URL_PREFIX)
        paper_dict['doi'] = item['doi'].removeprefix(DOI_URL_PREFIX) if item['doi'] is not None else ''
        paper_dict['title'] = item['title'] if item['title'] is not None else ''
        paper_dict['type'] = item['type']

        # Primary location and its source may each be missing; default to ''.
        primary_location = item['primary_location']
        source = primary_location['source'] if primary_location is not None else None
        paper_dict['landingPageURL'] = primary_location['landing_page_url'] if primary_location is not None else ''
        paper_dict['source'] = source['display_name'] if source is not None else ''
        paper_dict['hostOrganization'] = source['host_organization_name'] if source is not None else ''

        paper_dict['releaseDate'] = item['publication_date']
        paper_dict['citationCount'] = item['cited_by_count']
        paper_dict['countsPerYear'] = item['counts_by_year']

        paper_dict['authorNames'] = [authorship['author']['display_name'] for authorship in item['authorships']]
        paper_dict['authorOpenAlexIDs'] = [authorship['author']['id'].removeprefix(OPENALEX_URL_PREFIX) for authorship in item['authorships']]
        paper_dict['authorCount'] = len(paper_dict['authorNames'])
        paper_dict['numberCountries'] = item['countries_distinct_count']
        paper_dict['numberInstitutions'] = item['institutions_distinct_count']

        paper_dict['SDGs'] = [goal['display_name'] for goal in item['sustainable_development_goals']]

        #primary topic retrieval
        # Works without a primary topic now get the same keys with empty
        # values. The old `else ''` branches sat inside the not-None check and
        # could never run, so those works had no topic keys at all.
        primary_topic = item['primary_topic']
        if primary_topic is not None:
            paper_dict['primaryTopic'] = primary_topic['display_name']
            paper_dict['primaryTopicID'] = primary_topic['id'].removeprefix(OPENALEX_URL_PREFIX)
            paper_dict['primarySubfield'] = primary_topic['subfield']['display_name']
            paper_dict['primaryField'] = primary_topic['field']['display_name']
            # As before, topics are only collected for works with a primary topic.
            scored_topics = [topic for topic in item['topics'] if topic['score'] > MIN_SCORE]
        else:
            paper_dict['primaryTopic'] = ''
            paper_dict['primaryTopicID'] = ''
            paper_dict['primarySubfield'] = ''
            paper_dict['primaryField'] = ''
            scored_topics = []

        #all topics / subfields / fields above the score threshold
        paper_dict['topics'] = [topic['display_name'] for topic in scored_topics]
        # NOTE(review): topic IDs are stored as full URLs here, unlike
        # primaryTopicID above which is shortened - kept as is; confirm intent.
        paper_dict['topicsID'] = [topic['id'] for topic in scored_topics]
        paper_dict['subfields'] = [topic['subfield']['display_name'] for topic in scored_topics]
        paper_dict['fields'] = [topic['field']['display_name'] for topic in scored_topics]

        _count_occurrences(topics_dict, paper_dict['topics'])
        _count_occurrences(subfields_dict, paper_dict['subfields'])
        _count_occurrences(fields_dict, paper_dict['fields'])

        #concepts are deprecated, we still gather them (tolerating their absence)
        paper_dict['concepts'] = [concept['display_name'] for concept in (item.get('concepts') or []) if concept['score'] > MIN_SCORE]
        _count_occurrences(concepts_dict, paper_dict['concepts'])

        #store co-Authors
        _count_occurrences(coAuthor_dict, paper_dict['authorOpenAlexIDs'])

        #gather reference information (IDs only)
        paper_dict['references'] = [ref.removeprefix(OPENALEX_URL_PREFIX) for ref in item['referenced_works']]
        paper_dict['refCount'] = item['referenced_works_count']

        #gather related work information (IDs only)
        paper_dict['relatedWorks'] = [ref.removeprefix(OPENALEX_URL_PREFIX) for ref in item['related_works']]

        #save publication (paper_dict) as .json in specified folder
        # Outer double quotes keep the f-string valid on Python < 3.12;
        # the `with` block closes the file, no explicit close() needed.
        paper_path = f"openAlex_publication_files_23032024/openAlex_publication_{paper_dict['openAlexID']}.json"
        with open(paper_path, "w", encoding='utf16') as outfile:
            json.dump(paper_dict, outfile, ensure_ascii=False)

    # Remove the author from their own co-author counts (no-op if not listed).
    coAuthor_dict.pop(author_dict['openAlexID'], None)

    author_dict['coauthorCount'] = coAuthor_dict

    author_dict['topicsCount'] = topics_dict
    author_dict['subfieldsCount'] = subfields_dict
    author_dict['fieldsCount'] = fields_dict

    author_dict['conceptsCount'] = concepts_dict

    #save author information (author_dict) as .json in specified folder
    author_path = f"openAlex_author_files_23032024/openAlex_author_information_{author_dict['openAlexID']}.json"
    with open(author_path, "w", encoding='utf16') as outfile:
        json.dump(author_dict, outfile, ensure_ascii=False)


100%|██████████| 204/204 [08:25<00:00,  2.48s/it]
