# Add Google Scholar IDs to SNF researchers' records in SNFp3

This notebook is a sandbox to test approaches and propose solutions.

## Setup

In [64]:
import os
import glob
import pandas as pd
import pickle
import re
from scholarly import scholarly

# Directory holding the input data, resolved relative to the current working directory
datadir = os.path.join(os.getcwd(), "..", "data")

# Load SNF data tables
# Paths of all the SNF CSV exports found on disk
snfdir = glob.glob(os.path.join(datadir, "snf/*.csv"))
# Loaded SNF tables, keyed by CSV file name
snf = {}

# Set to True to load all the SNF data tables.
# An explicit toggle replaces the former string-literal "comment", which was
# dead code that could silently go stale.
LOAD_ALL_SNF_TABLES = False
if LOAD_ALL_SNF_TABLES:
    for fp in snfdir:
        print(os.path.basename(fp))
        snf[os.path.basename(fp)] = pd.read_csv(fp, sep=";")

# Load the SNF authors table
fp = os.path.join(datadir, 'snf', 'P3_PersonExport.csv')
print(os.path.basename(fp))
snf[os.path.basename(fp)] = pd.read_csv(fp, sep=";")


P3_PersonExport.csv


In [9]:
# Preview the first rows of the SNF authors table to check the columns that were loaded
snf["P3_PersonExport.csv"].head()

Unnamed: 0,Last Name,First Name,Gender,Institute Name,Institute Place,Person ID SNSF,OCRID,Projects as responsible Applicant,Projects as Applicant,Projects as Partner,Projects as Practice Partner,Projects as Employee,Projects as Contact Person
0,a Marca,Davide,male,,,53856,,,,,,36549.0,
1,a Marca,Andrea,male,,,132628,,67368.0,,,,,
2,A. Jafari,Golnaz,female,Universität Luzern,Luzern,747886,,191432.0,,,,,
3,Aaberg,Johan,male,,,575257,,,,,,119868.0,
4,Aahman,Josefin,female,,,629557,,,,,,141014.0,



## Extract authors Google Scholar IDs and add them to the SNF authors record

WARNING: Google Scholar limits the number of requests! Tests can therefore only be run on a small number of records. A solution must be found if more requests are needed.


In [None]:
# Test the Scholarly API search by author name

search_query = scholarly.search_author('Alexis Rapin')
# Keep the first hit only; default to None so that an empty result set does
# not abort the cell with StopIteration
author = next(search_query, None)

In [15]:
# Display the first author record returned by the test search
author

{'container_type': 'Author',
 'filled': set(),
 'source': <AuthorSource.SEARCH_AUTHOR_SNIPPETS: 2>,
 'scholar_id': 'ZgCd-OgAAAAJ',
 'url_picture': 'https://scholar.google.com/citations?view_op=medium_photo&user=ZgCd-OgAAAAJ',
 'name': 'Alexis Rapin',
 'affiliation': 'EPFL',
 'email_domain': '@epfl.ch',
 'interests': ['bioinformatics', 'open science', 'data science'],
 'citedby': 328}

In [None]:
# Add retrieved Google Scholar ID(s) to the SNF authors record when available
# Use a subset of the records only to avoid robot-behavior flagging
N_TEST_RECORDS = 35

# Work on an explicit copy: adding a column to the result of head() would
# otherwise raise a SettingWithCopyWarning and may write to a view of the
# full authors table
snf_person_head = snf["P3_PersonExport.csv"].head(N_TEST_RECORDS).copy()
# Empty string means "no Google Scholar profile found"
snf_person_head['Google Scholar ID'] = ""
for index, row in snf_person_head.iterrows():
    # pandas reads empty CSV fields as NaN (a float), which cannot be
    # concatenated with a string: skip records with an incomplete name
    if pd.isna(row['First Name']) or pd.isna(row['Last Name']):
        continue
    search_query = scholarly.search_author(row['First Name'] + " " + row['Last Name'])
    gs_record = list(search_query)
    # If multiple records are found, put together a comma-separated list of GS IDs
    if gs_record:
        snf_person_head.loc[index, 'Google Scholar ID'] = ','.join([e['scholar_id'] for e in gs_record])

In [29]:
# Checkpoint: serialize the annotated subset of authors to disk
with open('snf_person_head_dump', 'wb') as checkpoint_file:
    pickle.dump(snf_person_head, checkpoint_file)

In [30]:
# Preview the first rows of the subset, including the added 'Google Scholar ID' column
snf_person_head.head()

Unnamed: 0,Last Name,First Name,Gender,Institute Name,Institute Place,Person ID SNSF,OCRID,Projects as responsible Applicant,Projects as Applicant,Projects as Partner,Projects as Practice Partner,Projects as Employee,Projects as Contact Person,Google Scholar ID
0,a Marca,Davide,male,,,53856,,,,,,36549.0,,7ThYw2IAAAAJ
1,a Marca,Andrea,male,,,132628,,67368.0,,,,,,
2,A. Jafari,Golnaz,female,Universität Luzern,Luzern,747886,,191432.0,,,,,,
3,Aaberg,Johan,male,,,575257,,,,,,119868.0,,YUBv7p8AAAAJ
4,Aahman,Josefin,female,,,629557,,,,,,141014.0,,



## Handle ambiguous identities


In [22]:
# Flag authors for which more than one GS ID was retrieved:
# IDs are comma-separated, so a comma means at least two IDs
is_amb = [',' in gs_ids for gs_ids in snf_person_head['Google Scholar ID']]

# Number of ambiguous records
sum(is_amb)

3

In [23]:
# Show the ambiguous records, i.e. the rows whose 'Google Scholar ID' holds
# more than one comma-separated ID
snf_person_head[is_amb]

Unnamed: 0,Last Name,First Name,Gender,Institute Name,Institute Place,Person ID SNSF,OCRID,Projects as responsible Applicant,Projects as Applicant,Projects as Partner,Projects as Practice Partner,Projects as Employee,Projects as Contact Person,Google Scholar ID
25,Abankwa,Daniel,male,Centre for Biotechnology University of Turku,Turku,523490,0000-0003-2769-0745,111446.0,,,,,,"XeOgll8AAAAJ,voeFGVMAAAAJ"
32,Abate,Antonio,male,Laboratoire de photonique et interfaces EPFL -...,Lausanne,642410,,,,,,153952;153990,,"QcgVxfIAAAAJ,K17owBYAAAAJ"
34,Abatista,Angela,female,,,774289,,,,,,181083,,"fnPaKuUAAAAJ,sZdJ114AAAAJ,qrYqYO8AAAAJ,2l4x5aA..."


In the case of "Abankwa Daniel", the existing ORCID record (column "OCRID" in the authors table) can be used to disambiguate the author's identity.

In [76]:
# Repeat the search for one ambiguous author to inspect the candidate records.
# NOTE(review): the name is given as "Last First" here, whereas the batch
# lookup queries "First Last" — confirm this is intended.
search_query = scholarly.search_author('Abate Antonio')

In [77]:
# Consume the search results to display every candidate record
list(search_query)

[{'container_type': 'Author',
  'filled': set(),
  'source': <AuthorSource.SEARCH_AUTHOR_SNIPPETS: 2>,
  'scholar_id': 'QcgVxfIAAAAJ',
  'url_picture': 'https://scholar.google.com/citations?view_op=medium_photo&user=QcgVxfIAAAAJ',
  'name': 'Antonio Abate',
  'affiliation': 'Helmholtz-Zentrum Berlin',
  'email_domain': '@helmholtz-berlin.de',
  'interests': ['Hybrid solar cells',
   'Perovskite solar cells',
   'Perovskites',
   'Materials',
   'Supramolecular chemistry'],
  'citedby': 30316},
 {'container_type': 'Author',
  'filled': set(),
  'source': <AuthorSource.SEARCH_AUTHOR_SNIPPETS: 2>,
  'scholar_id': 'K17owBYAAAAJ',
  'url_picture': 'https://scholar.google.com/citations?view_op=medium_photo&user=K17owBYAAAAJ',
  'name': 'Maurizio Giuseppe Abrignani',
  'affiliation': 'UO di Cardiologia, POS Antonio Abate di Trapani, Italy',
  'email_domain': '@asptrapani.it',
  'interests': ['epidemiologia',
   'fattori di rischio',
   'ipertensione arteriosa',
   'dislipidemie',
   'cardiopa

In this case, verifying that the searched name matches the name of the returned record, and not its affiliation, is sufficient to disambiguate the identity.

In [80]:
# Repeat the search for another ambiguous author to inspect the candidate records
search_query = scholarly.search_author('Angela Abatista')

In [81]:
# Consume the search results to display every candidate record
list(search_query)

[{'container_type': 'Author',
  'filled': set(),
  'source': <AuthorSource.SEARCH_AUTHOR_SNIPPETS: 2>,
  'scholar_id': 'fnPaKuUAAAAJ',
  'url_picture': 'https://scholar.google.com/citations?view_op=medium_photo&user=fnPaKuUAAAAJ',
  'name': 'Ângela Giovana Batista',
  'affiliation': 'University of Santa Maria - UFSM',
  'email_domain': '@ufsm.br',
  'interests': ['bioactive compounds',
   'compostos bioativos',
   'polyphenols',
   'obesity'],
  'citedby': 1130},
 {'container_type': 'Author',
  'filled': set(),
  'source': <AuthorSource.SEARCH_AUTHOR_SNIPPETS: 2>,
  'scholar_id': 'sZdJ114AAAAJ',
  'url_picture': 'https://scholar.google.com/citations?view_op=medium_photo&user=sZdJ114AAAAJ',
  'name': 'Miguel Angel Herrera Batista',
  'affiliation': 'Universidad Autónoma Metropolitana',
  'email_domain': '@azc.uam.mx',
  'interests': ['Educacion',
   'Diseño',
   'e-learning',
   'aprendizaje virtual',
   'tecnología y educación'],
  'citedby': 575},
 {'container_type': 'Author',
  'fill

In this case, comparing the names using a metric such as the "edit distance" would help disambiguate the author's identity.

Alternatively, matching existing SNF records against the information available on Google Scholar can help. This could include:
 - Institution names
 - Bibliography records from P3_PublicationExport.csv
 - Keywords extracted from P3_PublicationExport.csv and P3_GrantExport.csv

In [67]:
# Read the SNF publications export and register it under its file name
fp = os.path.join(datadir, 'snf', 'P3_PublicationExport.csv')
table_name = os.path.basename(fp)
print(table_name)
snf[table_name] = pd.read_csv(fp, sep=";")

P3_PublicationExport.csv


In [87]:
# Preview the first five rows of the SNF publications table
snf['P3_PublicationExport.csv'].head(5)

Unnamed: 0,Publication ID SNSF,Project Number,Peer Review Status,Type of Publication,Title of Publication,Authors,Status,Publication Year,ISBN,DOI,...,Publisher,Editors,Journal Title,Volume,Issue / Number,Page from,Page to,Proceeding Title,Proceeding Place,Abstract
0,{3001D662-AD1B-43DA-8BA2-5ADC25C9C347},1094,Peer-reviewed,Book (peer-reviewed),Platos Idee des Guten,Ferber Rafael,Published,2015.0,978-3-89665-666-7,,...,"Academia Verlag ,St. Augustin",,,,,,,,,At the centre of the monograph (1984 first ed...
1,{3F5669B1-C09F-4486-87FF-21A561C15B8A},20108,Peer-reviewed,Original article (peer-reviewed),"MICROSTRUCTURE, LATTICE-PARAMETERS, AND SUPERC...","Xu Y. W., Suenaga M., Tafto J., Sabatini R....",Published,1989.0,,10.1103/PhysRevB.39.6667 ...,...,,,Physical Review B,39.0,10.0,6667.0,6680.0,Physical Review B,,
2,{12293018-B2F8-4320-A5C8-24C21A2AB7D3},20108,Peer-reviewed,Original article (peer-reviewed),NEUTRON-POWDER-DIFFRACTION STUDY OF NUCLEAR AN...,"Zolliker P., Cox D. E., Tranquada J. M., Sh...",Published,1988.0,,10.1103/PhysRevB.38.6575 ...,...,,,Physical Review B,38.0,10.0,6575.0,6582.0,Physical Review B,,
3,{DD1E2D93-7B51-4B32-8062-2AF42F5E4E94},25095,Peer-reviewed,Original article (peer-reviewed),NEUTRON AND SYNCHROTRON X-RAY POWDER-DIFFRACTI...,"Zolliker P., Cox D. E., Parise J. B., McCar...",Published,1990.0,,10.1103/PhysRevB.42.6332 ...,...,,,Physical Review B,42.0,10.0,6332.0,6341.0,Physical Review B,,
4,{C4B57BD5-668D-47A6-92DA-8CE1962D87DE},25095,Peer-reviewed,Original article (peer-reviewed),HEXAMAGNESIUM DICOBALT UNDECADEUTERIDE {Mg$_6$...,"Cerny R., Bonhomme F., Yvon K., Fischer P.,...",Published,1992.0,,10.1016/0925-8388(92)90537-j ...,...,,,Journal of Alloys and Compounds,187.0,1.0,233.0,241.0,Journal of Alloys and Compounds,,


The "DOI" records can be used to match the authors' bibliographies in Google Scholar.

In [88]:
# Read the SNF grants export and register it under its file name
fp = os.path.join(datadir, 'snf', 'P3_GrantExport.csv')
table_name = os.path.basename(fp)
print(table_name)
snf[table_name] = pd.read_csv(fp, sep=";")

P3_GrantExport.csv


In [89]:
# Preview the first five rows of the SNF grants table
snf['P3_GrantExport.csv'].head(5)

Unnamed: 0,Project Number,Project Number String,Project Title,Project Title English,Responsible Applicant,Funding Instrument,Funding Instrument Hierarchy,Institution,Institution Country,University,Discipline Number,Discipline Name,Discipline Name Hierarchy,All disciplines,Start Date,End Date,Approved Amount,Keywords
0,1,1000-000001,Schlussband (Bd. VI) der Jacob Burckhardt-Biog...,,Kaegi Werner,Project funding (Div. I-III),Project funding,,,Unassignable - NA,10302,Swiss history,Humanities and Social Sciences;Theology & reli...,10302,01.10.1975,30.09.1976,11619.0,
1,4,1000-000004,Batterie de tests à l'usage des enseignants po...,,Massarenti Léonard,Project funding (Div. I-III),Project funding,FPSE Université de Genève,Switzerland,University of Geneva - GE,10104,"Education and learning sciences, subject-speci...","Humanities and Social Sciences;Psychology, edu...",10104,01.10.1975,30.09.1976,41022.0,
2,5,1000-000005,Kritische Erstausgabe der 'Evidentiae contra D...,,Kommission für das Corpus philosophorum medii ...,Project funding (Div. I-III),Project funding,Kommission für das Corpus philosophorum medii ...,Switzerland,"Non-profit organisations (libraries, museums, ...",10101,Philosophy,Humanities and Social Sciences;Linguistics and...,10101,01.03.1976,28.02.1985,79732.0,
3,6,1000-000006,Katalog der datierten Handschriften in der Sch...,,Burckhardt Max,Project funding (Div. I-III),Project funding,Abteilung Handschriften und Alte Drucke Univer...,Switzerland,University of Basel - BS,10302,Swiss history,Humanities and Social Sciences;Theology & reli...,10302,01.10.1975,30.09.1976,52627.0,
4,7,1000-000007,Wissenschaftliche Mitarbeit am Thesaurus Lingu...,,Schweiz. Thesauruskommission,Project funding (Div. I-III),Project funding,Schweiz. Thesauruskommission,Switzerland,"Non-profit organisations (libraries, museums, ...",10303,Ancient history and Classical studies,Humanities and Social Sciences;Theology & reli...,10303,01.01.1976,30.04.1978,120042.0,


The "Discipline Name Hierarchy" records can be used to extract keywords matching the authors' interests in Google Scholar.