In [8]:
from scholarly import scholarly
import pandas as pd
import requests
import re
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [4]:
# Load the full faculty table from the CSV snapshot.
df = pd.read_csv("./faculty_data/Faculty_CS_ECE-20230806.csv")

# Keep only the rows whose Department_Id is 0, renumber them from zero, and
# add empty placeholder columns that the lookup steps are meant to fill in.
df_cs_original = (
    df.loc[df['Department_Id'] == 0]
    .reset_index(drop=True)
    .assign(scholar_id=None, openalex_id=None, image_url=None)
)

# Working copy, so that df_cs_original stays untouched as a reference.
df_cs = df_cs_original.copy()
# To inspect the frame, end the cell with: df_cs

In [5]:
def possible_name_list(name):
    """Build an ordered list of search-friendly variants of a person's name.

    The variants, most specific first, are:
      1. the name with the degree markers "PhD" / "MS" removed,
      2. that name with standalone single-letter initials (e.g. "A.") removed,
      3. every individual word of the name that is longer than one letter.

    Args:
        name: Raw name string, e.g. "John A. Smith, PhD".

    Returns:
        list[str]: Unique, non-empty candidates in the order above. An empty
        list is returned when ``name`` is not a string or contains nothing
        but degree markers / separators.
    """
    if not isinstance(name, str):
        return []

    # Remove degree markers only when they are whole words, so that names
    # merely containing the letters (e.g. "WILLIAMS") are left intact.
    cleared_name = re.sub(r'\b(?:PhD|MS)\b', '', name)
    # Collapse runs of whitespace and drop separators left dangling at the
    # ends (e.g. the comma in "John Smith, PhD").
    cleared_name = re.sub(r'\s+', ' ', cleared_name).strip(' ,')
    if not cleared_name:
        return []

    candidates = [cleared_name]

    # Drop standalone initials. The lookarounds keep a single letter that is
    # glued to the rest of a word by an apostrophe or hyphen ("O'Brien").
    without_initials = re.sub(r"(?<![\w'-])[A-Za-z](?![\w'-])[ .]*", '', cleared_name)
    candidates.append(re.sub(r'\s+', ' ', without_initials).strip(' ,'))

    # Individual words of the name; punctuation is stripped first so that an
    # initial written as "A." is recognised as a single letter and skipped.
    for part in cleared_name.split():
        part = part.strip('.,')
        if len(part) > 1:
            candidates.append(part)

    # De-duplicate while preserving order, and drop empty candidates.
    return [candidate for candidate in dict.fromkeys(candidates) if candidate]

def google_scholar_get_one_author(name, affiliation, email):
    """Look up one person on Google Scholar via ``scholarly``.

    Every name variant from ``possible_name_list`` is searched together with
    the affiliation first. If none of those searches yields an author and an
    email is available, the variants are searched again together with the
    email (only the part after '@' when the email contains one). The first
    author returned by the first successful search is used; no further
    validation of the hit is performed.

    Args:
        name: Person's name as stored in the faculty table.
        affiliation: Institution name appended to each search query. A
            non-string value causes the affiliation pass to be skipped.
        email: Email address, the literal sentinel 'Missing', or a
            non-string value; the latter two disable the email pass.

    Returns:
        tuple: ``(scholar_id, url_picture)`` read from the matched author
        record, or ``(None, None)`` when no search returns an author.
    """
    alternative_name_list = possible_name_list(name)

    # Normalise the email into a search qualifier. Non-string values (for
    # example the NaN pandas yields for a blank CSV cell) are treated like
    # the 'Missing' sentinel instead of raising on the '@' membership test.
    if not isinstance(email, str) or email == 'Missing':
        email0 = None
    elif '@' in email:
        email0 = email.split('@')[1]  # only keep the part after '@'
    else:
        email0 = email

    # Qualifiers are tried in priority order: affiliation first, then email.
    qualifiers = []
    if isinstance(affiliation, str):
        qualifiers.append(affiliation)
    if email0 is not None:
        qualifiers.append(email0)

    for qualifier in qualifiers:
        for possible_name in alternative_name_list:
            search_query = possible_name + ' ' + qualifier
            author = next(scholarly.search_author(search_query), None)
            if author:
                return (author.get('scholar_id'), author.get('url_picture'))

    return (None, None)


In [7]:
# Google Scholar smoke test: look up a single faculty row.
i = 0  # row label in df_cs to try; edit to test a different faculty member
name, affiliation, email = df_cs.loc[i, ['Name', 'University_Name', 'Email']]
# NOTE(review): `id` shadows the Python builtin of the same name for the rest
# of the notebook; the name is kept here so other cells see the same globals.
(id, image_url) = google_scholar_get_one_author(name, affiliation, email)
print(id, image_url)

yyDMlesAAAAJ https://scholar.google.com/citations?view_op=medium_photo&user=yyDMlesAAAAJ


In [9]:
def openalex_get_one_author(name, institution, min_score=80, timeout=30):
    """Find the OpenAlex author record that best matches a name and institution.

    Each name variant from ``possible_name_list`` is sent to the OpenAlex
    ``/authors`` endpoint (restricted to authors whose last known institution
    is in the US). Among the returned authors that have an institution name,
    the one whose institution is the closest fuzzy match to ``institution``
    is returned if its ``fuzz.token_sort_ratio`` score is above ``min_score``.
    If a variant produces no acceptable match, the next variant is tried.

    Args:
        name: Person's name as stored in the faculty table.
        institution: Institution name to compare against each candidate's
            ``last_known_institution.display_name``.
        min_score: Score a candidate's institution must exceed to be
            accepted. Defaults to 80.
        timeout: Seconds to wait for each HTTP request. Defaults to 30.

    Returns:
        dict | None: The matching author object from the API response, or
        ``None`` when no name variant yields a sufficiently close match.

    Raises:
        requests.RequestException: Propagated from ``requests.get`` on
            connection problems or when ``timeout`` elapses.
    """
    # NOTE(review): this relies on the `last_known_institution` field/filter;
    # confirm it is still served by the current OpenAlex API version.
    for candidate_name in possible_name_list(name):
        # ',' separates filters in the filter string below, so a comma inside
        # the name would split the filter; replace it with a space.
        search_term = candidate_name.replace(',', ' ')
        response = requests.get(
            "https://api.openalex.org/authors",
            # Passing params lets requests URL-encode spaces and punctuation.
            params={
                "filter": "display_name.search:{},last_known_institution.country_code:US".format(search_term),
                "per-page": 200,
            },
            timeout=timeout,
        )
        if response.status_code != 200:
            # This variant could not be queried; fall through to the next one.
            continue

        results = response.json().get('results') or []

        # Keep only authors that actually carry an institution name, paired
        # with that name, so the fuzzy matcher never sees None entries.
        affiliated = [
            (author, author['last_known_institution']['display_name'])
            for author in results
            if author.get('last_known_institution')
            and author['last_known_institution'].get('display_name')
        ]
        if not affiliated:
            continue

        inst_names = [inst_name for _, inst_name in affiliated]
        best_match = process.extractOne(institution, inst_names, scorer=fuzz.token_sort_ratio)
        if best_match is not None and best_match[1] > min_score:
            # The first author listed with the best-matching institution wins.
            return affiliated[inst_names.index(best_match[0])][0]

    return None

In [10]:
# OpenAlex smoke test: look up a single faculty row.
i = 0  # row label in df_cs to try; edit to test a different faculty member
name, affiliation = df_cs.loc[i, ['Name', 'University_Name']]
author = openalex_get_one_author(name, affiliation)
print(author)

{'id': 'https://openalex.org/A5006981411', 'orcid': None, 'display_name': 'Josh Alman', 'display_name_alternatives': ['Josh Alman', 'Alman Josh'], 'relevance_score': 1014.1382, 'works_count': 58, 'cited_by_count': 427, 'summary_stats': {'2yr_mean_citedness': 3.6666666666666665, 'h_index': 9, 'i10_index': 8}, 'ids': {'openalex': 'https://openalex.org/A5006981411'}, 'last_known_institution': {'id': 'https://openalex.org/I78577930', 'ror': 'https://ror.org/00hj8s172', 'display_name': 'Columbia University', 'country_code': 'US', 'type': 'education'}, 'x_concepts': [{'id': 'https://openalex.org/C33923547', 'wikidata': 'https://www.wikidata.org/wiki/Q395', 'display_name': 'Mathematics', 'level': 0, 'score': 98.3}, {'id': 'https://openalex.org/C114614502', 'wikidata': 'https://www.wikidata.org/wiki/Q76592', 'display_name': 'Combinatorics', 'level': 1, 'score': 89.7}, {'id': 'https://openalex.org/C41008148', 'wikidata': 'https://www.wikidata.org/wiki/Q21198', 'display_name': 'Computer science'