# Libraries

In [2]:
# Google Scholar
# https://scholar.google.com/citations?user=sKSTKAoAAAAJ&hl=en
# https://scholar.google.com/citations?view_op=view_org&hl=en&org=1896398670060433590

import pickle
import numpy as np
import pandas as pd

from scholarly import scholarly
# https://github.com/scholarly-python-package/scholarly
# https://www.scraperapi.com/blog/best-google-scholar-apis-proxies/

from random import randint
from time import sleep

from collections import Counter
from itertools import combinations

# Scraper API
# https://www.scraperapi.com/blog/best-google-scholar-apis-proxies/
# https://dev.to/iankerins/build-your-own-google-scholar-api-with-python-scrapy-4p73


# Initial Scrapping

### Load list of faculty

In [3]:
df_listFaculty = pd.read_csv('Data_FacultyAreas.csv')
df_listFaculty


Unnamed: 0,ID,Title,Area
0,1762,Amir Yacoby,Applied Physics
1,2626,Ann Pearson,Environmental Science & Engineering
2,1795,Ariel Amir,Applied Mathematics
3,13261,Ariel Procaccia,Computer Science
4,1523,Barbara J. Grosz,Computer Science
...,...,...,...
120,1713,Yaron Singer,Computer Science
121,1519,Yiling Chen,Computer Science
122,1651,Yue Lu,Electrical Engineering
123,1553,Zhigang Suo,Materials Science & Mechanical Engineering


In [4]:
listFaculty = list(df_listFaculty['Title'])
listFaculty


['Amir Yacoby',
 'Ann Pearson',
 'Ariel Amir',
 'Ariel Procaccia',
 'Barbara J. Grosz',
 'Boaz Barak',
 'Boris Kozinsky',
 'Brian F. Farrell',
 'Cengiz Pehlevan',
 'Cherry Murray',
 'Christin Y. Sander',
 'Christopher Joseph Lombardo',
 'Christopher Rycroft',
 'Conor J Walsh',
 'Cynthia Dwork',
 'Cynthia Friend',
 'Daniel J. Jacob',
 'Daniel Needleman',
 'Daniel P. Schrag',
 'David A. Weitz',
 'David Brooks',
 'David C. Bell',
 'David Clarke',
 'David J. Malan',
 'David Keith',
 'David Mooney',
 'David Parkes',
 'David R. Nelson',
 'Demba Ba',
 'Doeke Romke Hekstra',
 'Donald Elliot Ingber',
 'Donhee Ham',
 'Eddie Kohler',
 'Efthimios Kaxiras',
 'Elena Leah Glassman',
 'Eli Tziperman',
 'Elsie M Sunderland',
 'Eric Mazur',
 'Evelyn Hu',
 'Federico Capasso',
 'Finale Doshi-Velez',
 'Flavio P. Calmon',
 'Frank J. Doyle',
 'Frank N Keutsch',
 'Frans A. Spaepen',
 'Gu-Yeon Wei',
 'H.T. Kung',
 'Hanspeter Pfister',
 'Harry R. Lewis',
 'James G. Anderson',
 'James H. Waldo',
 'James Mickens'

### Keep only first and last names

In [5]:
def firstLastName(name):
    splitName = name.split()
    numName = len(splitName)
    
    firstName = splitName[0]
    lastName = splitName[numName-1]
    
    return firstName + ' ' + lastName
    

In [6]:
listFaculty_FirstLastName = [firstLastName(x) for x in listFaculty]
listFaculty_FirstLastName


['Amir Yacoby',
 'Ann Pearson',
 'Ariel Amir',
 'Ariel Procaccia',
 'Barbara Grosz',
 'Boaz Barak',
 'Boris Kozinsky',
 'Brian Farrell',
 'Cengiz Pehlevan',
 'Cherry Murray',
 'Christin Sander',
 'Christopher Lombardo',
 'Christopher Rycroft',
 'Conor Walsh',
 'Cynthia Dwork',
 'Cynthia Friend',
 'Daniel Jacob',
 'Daniel Needleman',
 'Daniel Schrag',
 'David Weitz',
 'David Brooks',
 'David Bell',
 'David Clarke',
 'David Malan',
 'David Keith',
 'David Mooney',
 'David Parkes',
 'David Nelson',
 'Demba Ba',
 'Doeke Hekstra',
 'Donald Ingber',
 'Donhee Ham',
 'Eddie Kohler',
 'Efthimios Kaxiras',
 'Elena Glassman',
 'Eli Tziperman',
 'Elsie Sunderland',
 'Eric Mazur',
 'Evelyn Hu',
 'Federico Capasso',
 'Finale Doshi-Velez',
 'Flavio Calmon',
 'Frank Doyle',
 'Frank Keutsch',
 'Frans Spaepen',
 'Gu-Yeon Wei',
 'H.T. Kung',
 'Hanspeter Pfister',
 'Harry Lewis',
 'James Anderson',
 'James Waldo',
 'James Mickens',
 'James Rice',
 'Jennifer Lewis',
 'Jenny Hoffman',
 'Jerry Mitrovica',
 '

### Scholarly test with single entries

In [7]:
search_query = scholarly.search_author('Amir Yacoby Harvard')
author = scholarly.fill(next(search_query))


In [8]:
author['scholar_id']

'slFLfLYAAAAJ'

In [9]:
author

{'container_type': 'Author',
 'filled': {'basics', 'coauthors', 'counts', 'indices', 'publications'},
 'source': <AuthorSource.SEARCH_AUTHOR_SNIPPETS: 2>,
 'scholar_id': 'slFLfLYAAAAJ',
 'url_picture': 'https://scholar.google.com/citations?view_op=medium_photo&user=slFLfLYAAAAJ',
 'name': 'Amir Yacoby',
 'affiliation': 'Professor of Physics, Harvard University',
 'email_domain': '@g.harvard.edu',
 'interests': ['Experimental Condensed Matter Physics'],
 'citedby': 31269,
 'coauthors': [],
 'publications': [{'container_type': 'Publication',
   'source': <PublicationSource.AUTHOR_PUBLICATION_ENTRY: 2>,
   'bib': {'title': 'Coherent manipulation of coupled electron spins in semiconductor quantum dots',
    'pub_year': '2005'},
   'filled': False,
   'author_pub_id': 'slFLfLYAAAAJ:u5HHmVD_uO8C',
   'num_citations': 3465},
  {'container_type': 'Publication',
   'source': <PublicationSource.AUTHOR_PUBLICATION_ENTRY: 2>,
   'bib': {'title': 'Nanoscale magnetic sensing with an individual elect

In [10]:
fake_query = scholarly.search_author('Fake Author Harvard')
fake_author = scholarly.fill(next(fake_query))


StopIteration: 

In [11]:
fake_query

<generator object Navigator.search_authors at 0x120e3ec10>

In [12]:
try:
    fake_query = scholarly.search_author('Fake Author Harvard')
    fake_author = scholarly.fill(next(fake_query))
except:
    print("NO AUTHOR")

NO AUTHOR


### Scholarly run with faculty list

In [13]:
def get_id(list_titles):
    list_ids = []
    list_containers = []
    
    for title in list_titles:
        sleep(randint(5,30))
        
        try:
            query = scholarly.search_author(title+' Harvard')
            author = scholarly.fill(next(query))
            author_id = author['scholar_id']
            
            print(title)
            list_ids.append(author_id)
            list_containers.append(author)
        except:
            print("NO PROFILE: "+title)
            list_ids.append('None')
            list_containers.append('None')
            
    return list_ids, list_containers


In [14]:
list1, list2 = get_id(listFaculty_FirstLastName)


Amir Yacoby
Ann Pearson
NO PROFILE: Ariel Amir
Ariel Procaccia
NO PROFILE: Barbara Grosz
Boaz Barak
Boris Kozinsky
Brian Farrell
Cengiz Pehlevan
NO PROFILE: Cherry Murray
Christin Sander
NO PROFILE: Christopher Lombardo
Christopher Rycroft
Conor Walsh
Cynthia Dwork
Cynthia Friend
Daniel Jacob
NO PROFILE: Daniel Needleman
Daniel Schrag
David Weitz
David Brooks
David Bell
David Clarke
NO PROFILE: David Malan
David Keith
David Mooney
David Parkes
David Nelson
Demba Ba
Doeke Hekstra
Donald Ingber
NO PROFILE: Donhee Ham
Eddie Kohler
Efthimios Kaxiras
Elena Glassman
Eli Tziperman
Elsie Sunderland
Eric Mazur
NO PROFILE: Evelyn Hu
Federico Capasso
Finale Doshi-Velez
Flavio Calmon
NO PROFILE: Frank Doyle
NO PROFILE: Frank Keutsch
NO PROFILE: Frans Spaepen
Gu-Yeon Wei
H.T. Kung
Hanspeter Pfister
NO PROFILE: Harry Lewis
NO PROFILE: James Anderson
James Waldo
James Mickens
James Rice
Jennifer Lewis
Jenny Hoffman
Jerry Mitrovica
Jia Liu
Joanna Aizenberg
John Shaw
NO PROFILE: John Holdren
Jonathan Z

### Append lists to df

In [16]:
df_listFaculty['Scholar ID'] = list1
df_listFaculty


Unnamed: 0,ID,Title,Area,Scholar ID
0,1762,Amir Yacoby,Applied Physics,slFLfLYAAAAJ
1,2626,Ann Pearson,Environmental Science & Engineering,Hwk4sP0AAAAJ
2,1795,Ariel Amir,Applied Mathematics,
3,13261,Ariel Procaccia,Computer Science,8ZpV-lkAAAAJ
4,1523,Barbara J. Grosz,Computer Science,
...,...,...,...,...
120,1713,Yaron Singer,Computer Science,j-MBXNMAAAAJ
121,1519,Yiling Chen,Computer Science,x_7xA0UAAAAJ
122,1651,Yue Lu,Electrical Engineering,84FUioAAAAAJ
123,1553,Zhigang Suo,Materials Science & Mechanical Engineering,GlBl-VUAAAAJ


### Export data

In [18]:
df_listFaculty.to_csv('df_listFaculty.csv', index=False)


In [19]:
readCSV_listFaculty = pd.read_csv('df_listFaculty.csv')
readCSV_listFaculty


Unnamed: 0,ID,Title,Area,Scholar ID
0,1762,Amir Yacoby,Applied Physics,slFLfLYAAAAJ
1,2626,Ann Pearson,Environmental Science & Engineering,Hwk4sP0AAAAJ
2,1795,Ariel Amir,Applied Mathematics,
3,13261,Ariel Procaccia,Computer Science,8ZpV-lkAAAAJ
4,1523,Barbara J. Grosz,Computer Science,
...,...,...,...,...
120,1713,Yaron Singer,Computer Science,j-MBXNMAAAAJ
121,1519,Yiling Chen,Computer Science,x_7xA0UAAAAJ
122,1651,Yue Lu,Electrical Engineering,84FUioAAAAAJ
123,1553,Zhigang Suo,Materials Science & Mechanical Engineering,GlBl-VUAAAAJ


In [20]:
with open('output_scholarly.pkl', 'wb') as output:
    pickle.dump(list2, output, pickle.HIGHEST_PROTOCOL)
    