# 1. Importing libraries

In [2]:
import pandas as pd

# collections.Callable has been moved to collections.abc.Callable in python 3.10+.
# Hence, the below lines are added.
# Without it, there will be an error: AttributeError: module 'collections' has no attribute 'Callable'
import collections
collections.Callable = collections.abc.Callable
from collections import Counter

import bs4 # imported to compare if certain bs4 object are a certain bs4 type
from bs4 import BeautifulSoup
import requests
import math
import re
from scholarly import scholarly
from pprint import pprint
import urllib
import urllib.parse
from urllib.parse import urljoin, urlparse, parse_qs
from urllib.request import urlopen
from urllib.error import HTTPError
import json
from thefuzz import process
from tqdm.notebook import trange, tqdm
import ast

# 2. Utility Functions

In [2]:
def get_api_result(query_url):
    """
    Return API result by using query_url.

    Input:
    - query_url (string): URL to query to API.

    Output:
    - result (Dict): Dictionary of results from the API (the decoded JSON body).
                     If the request fails, either with an HTTP error status or
                     because the URL could not be opened at all, return a
                     dictionary with a single "error" key describing the failure.
    """
    # Imported locally so this cell does not need an extra top-level import.
    from urllib.error import URLError

    try:
        # The context manager closes the connection even if decoding fails.
        with urlopen(query_url) as response:
            return json.loads(response.read().decode('utf-8'))
    except HTTPError as e:
        # Handle any HTTP error by returning a custom error message.
        # HTTPError is a subclass of URLError, so it must be caught first.
        return {"error": f"HTTP error {e.code}: {e.reason}"}
    except URLError as e:
        # Connection-level failures (DNS, refused connection, missing file, ...)
        # are reported the same way so callers only need the 'error' check.
        return {"error": f"URL error: {e.reason}"}

In [3]:
def get_most_similar_index(find_string, string_list, min_score=0):
    """
    Return the index of the string (in string_list), that is the most similar with find_string.

    Input:
    - find_string (string): String to find in string_list.
    - string_list (List(string)): List of strings.
    - min_score (int): Minimum similarity score a candidate must reach to be
                       accepted. It is passed to thefuzz as score_cutoff.
                       Defaults to 0, i.e. the best candidate is always accepted.

    Output:
    - similar_index (int): Index of the string (in string_list) that is most similar to find_string.
                           If several entries are equal to the best match, the index of the
                           first one is returned.
                           Return None if string_list is empty, or if no string reaches min_score.
    """

    # Nothing to compare against
    if not string_list:
        return None

    # Get the name that is most similar; extractOne yields None when no
    # candidate reaches the cutoff.
    best_match = process.extractOne(find_string, string_list, score_cutoff=min_score)
    if best_match is None:
        return None

    # First element of the result is the matched string itself
    return string_list.index(best_match[0])

In [4]:
def have_words(input_string, at_least_num):
    """
    Tell whether input_string is made of at least at_least_num words.

    Words are the whitespace-separated pieces of the string.

    Input:
    - input_string (string): String whose words are counted.
    - at_least_num (int): Minimum number of words required.

    Output:
    - (bool): True when the word count is at_least_num or more, False otherwise.
    """
    word_count = len(input_string.split())
    return word_count >= at_least_num

In [5]:
def find_list_with_value(list_of_lists, target_value):
    """
    Locate the first inner list that holds target_value.

    Input:
    - list_of_lists ( List( List(string) ) ): List of lists.
    - target_value (string): String to search for in the inner lists.

    Output:
    - index (int): Position of the first inner list containing target_value,
                   or -1 when none of them contains it.
    """
    matching_positions = (
        position
        for position, candidates in enumerate(list_of_lists)
        if target_value in candidates
    )
    # The generator is lazy, so the scan stops at the first hit
    return next(matching_positions, -1)

# 3. Functions for DR-NTU

In [12]:
def get_research_interest_from_drNTU(drNTU_link):
    """
    Return the list of tags of the researcher in their DR-NTU page.

    Input:
    - drNTU_link (string): DR-NTU profile URL of a SCSE faculty.

    Output:
    - tag_list (List(string)) : List of research interest found on DR-NTU profile page of that faculty.
                                The generic 'Computer Science and Engineering' tag is left out.
                                Return nan if not available (the page has no interest section,
                                or the section holds no other tag).
    """
    soup_source = requests.get(drNTU_link).text
    soup = BeautifulSoup(soup_source, 'lxml')

    # find() gives None when the profile has no interest section; check for it
    # explicitly rather than hiding every possible error behind a bare except.
    taxonomy_div = soup.find('div', id='taxonomyDiv', class_='dynaFieldValue')
    if taxonomy_div is None:
        return float('nan')

    tag_list = []
    for tag in taxonomy_div.find_all('span', class_='rkeyword'):
        tag_name = tag.text.strip()
        if tag_name != 'Computer Science and Engineering':
            tag_list.append(tag_name)

    # If no tags available
    if not tag_list:
        return float('nan')

    return tag_list

In [7]:
# From Individual Assignment 1
def get_cleaned_pub_list(unprocessed_pub_list):
    """
    Turn the raw contents of the DR-NTU publication tab into a list of citation strings.

    The raw items are joined into one string per publication. A publication is
    closed on every second <br/> element met while walking the list (the counter
    is not reset by the items in between). Entries that are section labels or
    empty are dropped, and bold/italic markup is stripped from what remains.

    Input:
        - unprocessed_pub_list (list): List of publication details extracted by BeautifulSoup.
                                       It also contains elements that are tags, without any text.

    Output:
        - (List(string)): One cleaned citation string per publication.
    """
    line_break = '<br/>'

    citations = []
    fragments = []   # pieces of the publication currently being assembled
    breaks_seen = 0  # running <br/> counter, cycles 1 -> 2 -> (3 becomes 1) -> 2 ...

    for element in unprocessed_pub_list:
        rendered = str(element)
        is_text = isinstance(element, str)

        # Text nodes are kept as they are; other nodes are kept in their
        # rendered form unless they are a line break.
        if is_text:
            fragments.append(element)
        elif rendered != line_break:
            fragments.append(rendered)

        if rendered != line_break:
            continue

        breaks_seen += 1
        if breaks_seen == 2:
            # Second break: the current publication is complete
            citations.append(''.join(fragments))
            fragments = []
        elif breaks_seen > 2:
            # Start counting again, this break being the first of the next pair
            breaks_seen = 1

    # Whatever is left after the last break pair is a publication too
    if fragments:
        citations.append(''.join(fragments))

    # Drop section labels / helper texts and empty entries
    unwanted_markers = ['<br/>', 'Highly Cited:', 'Click', 'Recent Publication:']
    citations = [
        entry
        for entry in citations
        if entry != '' and not any(marker in entry for marker in unwanted_markers)
    ]

    # Strip the formatting tags, one after the other, from every citation
    stripped_citations = []
    for entry in citations:
        for markup in ['<b>', '</b>', '<i>', '</i>']:
            entry = entry.replace(markup, '')
        stripped_citations.append(entry)

    return stripped_citations

In [8]:
def get_unprocessed_pub_list(drNTU_link):
    """
    Fetch the raw publication nodes from a faculty's DR-NTU publication tab.

    Input:
    -  drNTU_link (string): DR-NTU profile link of a SCSE faculty; the
                            '/selectedPublications.html' suffix is added here.

    Output:
    - unprocessed_pub_list (list): Contents of the second child of the
                                   "facultyjournalDiv" element, as parsed by BeautifulSoup.
                                   It mixes text with tag elements.
                                   Empty list when the page has no such element.
    """
    publications_url = drNTU_link + '/selectedPublications.html'
    page_html = requests.get(publications_url).text
    parsed_page = BeautifulSoup(page_html, 'lxml')

    # The "Articles (Journal)" section is absent for some faculty
    journal_div = parsed_page.find('div', id="facultyjournalDiv")
    if not journal_div:
        return []

    return journal_div.contents[1].contents

In [9]:
def get_doi_list_from_drNTU(drNTU_link):
    """
    Collect the DOIs quoted in the publication citations of a DR-NTU profile.

    Input:
    - drNTU_link (string): DR-NTU profile link of a SCSE faculty.

    Output:
    - doi_list (List(string)) : One DOI per citation that contains a 'doi:' marker
                                followed by whitespace, in page order.
    """
    # Raw page nodes -> one cleaned citation string per publication
    citations = get_cleaned_pub_list(get_unprocessed_pub_list(drNTU_link))

    # The DOI is the run of non-space characters after 'doi:' and some whitespace
    doi_regex = re.compile(r'doi:\s+(\S+)')
    search_hits = (doi_regex.search(citation) for citation in citations)

    return [hit.group(1) for hit in search_hits if hit]

In [10]:
# Modified code from Assignment 1
def get_pub_list_from_article(drNTU_link):
    """
    List the publication titles found in the "Articles (Journal)" tab of a DR-NTU profile.

    A citation contributes a title only when one could be extracted from it
    and that title has at least 3 words.

    Input:
    - drNTU_link (string): DR-NTU profile link of a SCSE faculty.

    Output:
    - pub_title_list (List(string)): Titles extracted from the faculty's publication tab.
                                     Empty when the tab is missing.
    """
    # Raw page nodes -> one cleaned citation string per publication
    citations = get_cleaned_pub_list(get_unprocessed_pub_list(drNTU_link))

    candidate_titles = (get_one_pub_title(citation) for citation in citations)

    # Keep the titles that were found and are long enough to be a real title
    return [
        title
        for title in candidate_titles
        if title and have_words(title, 3)
    ]

In [11]:
# Modified code from Assignment 1
def get_one_pub_title(pub_citation):
    """
    Return publication title for one publication citation.
    The citation will not be a full citation.
    Return None if title could not be retrieved.

    Two strategies are tried in order:
    1. If the citation contains a pair of double inverted commas (straight or
       curly), the text between the first pair is the title.
    2. Otherwise, the title is the text that follows the first 4-digit year
       (and its closing punctuation) up to the next comma or full stop.

    Input:
    - pub_citation (string): Citation of a paper retrieved from publication tab
                              in DR-NTU profile page of a SCSE faculty.

    Output:
    - pub_title (string): Title of the publication retrieved from the citation.
    """

    # If there is double inverted commas in the citation, that is the title.
    if '"' in pub_citation or '“' in pub_citation or '”' in pub_citation:
        # Replace the other variation of double inverted commas
        # to the default ones
        pub_citation = pub_citation.replace("“", '"').replace("”", '"')

        quoted_parts = re.findall(r'"(.*?)"', pub_citation)
        if quoted_parts:
            return quoted_parts[0]
        # A lone, unbalanced quote (e.g. an inch mark) encloses nothing:
        # fall through to the year-based extraction instead of crashing.

    # For papers that do not have (a pair of) double inverted commas:
    pattern = r'(\d{4}),*\s*\w*[),.]+\s*(.*?)[,.]'
    matches = re.findall(pattern, pub_citation)

    # If no match found
    if len(matches) == 0:
        return None

    # Each match is (year, title); keep the title of the first one
    return matches[0][1]

# 4. Functions for OpenAlex API

In [12]:
def _pick_closest_authorship(author_name, authorships):
    """Return the entry of authorships whose author display name is most similar to author_name."""
    display_names = [authorship['author']['display_name'] for authorship in authorships]
    closest_index = get_most_similar_index(author_name, display_names)
    return authorships[closest_index]


def _search_computer_science_author(author_name, filter_prefix=''):
    """
    Search OpenAlex authors by name, restricted to the 'Computer science' concept.

    filter_prefix is put in front of the concept filter (it must end with a
    comma when not empty) to narrow the search further.
    Return the first candidate whose 'Computer science' concept score is above 70,
    the error dictionary if the API call failed, or [] if no candidate qualifies.
    """
    # Convert the name for query url
    query_converted_name = urllib.parse.quote_plus(author_name)

    # x_concepts.id for 'Computer science' included in url
    query_url = 'https://api.openalex.org/authors?search=' + query_converted_name \
                + '&filter=' + filter_prefix \
                + 'x_concepts.id:C41008148&sort=relevance_score:desc'
    result = get_api_result(query_url)

    # If API gave error
    if 'error' in result:
        return result

    # Candidates come sorted by relevance, so the first one passing the
    # 'Computer science' score check wins.
    for candidate in result['results']:
        for concept in candidate['x_concepts']:
            if concept['display_name'] == 'Computer science' and concept['score'] > 70:
                return candidate

    # If no results or no possible candidate
    return []


def get_author_info_from_OpenAlexAPI(author_name, keyword, mode):
    """
    Return details of an author from the OpenAlex API, looked up in the way selected by mode.

    Input:
    - author_name (string): Name of the author retrieved from DR-NTU.
                            It will not be used when mode is 'api_id' or 'orcid',
                            as only the keyword is needed in those modes.
    - keyword (string): Term related to the author, which is used to search the author's details.
                        When mode is 'institution' or 'name', keyword is not used (the institution ID
                        of NTU and the concept ID are defined in this function). Hence, it will
                        usually be left as an empty string for those modes.
    - mode (string): To indicate search method of author.
                     - if 'api_id', search author based on author's OpenAlex id.
                     - if 'doi', search author based on name and Digital Object Identifier (DOI) of a publication,
                       with author_name as one of the authors.
                     - if 'orcid', search author based on their ORCID profile link.
                     - if 'pub', search author based on name and publication title written by the author.
                       Only the first publication returned by the search is considered.
                     - if 'institution', search author based on last institution, name and x_concept.
                     - any other value (normally 'name'), search author based on name and x_concept.
                       This should be used only when no other option is viable.

    Output:
    - author_info (dict): Dictionary containing the details of the author by OpenAlex API.
                          For 'doi' and 'pub' it is the authorship entry of the publication
                          (the author's own fields are under its 'author' key); for the
                          other modes it is the author record itself.
                          If the API call failed, the dictionary holding the 'error' key is returned.
                          If nothing suitable was found, an empty list is returned.
    """

    if mode in ('api_id', 'orcid'):
        # Both identifiers are resolved by the same authors endpoint
        return get_api_result('https://api.openalex.org/authors/' + keyword)

    if mode == 'doi':
        # API Query to search for pub with the specified doi
        result = get_api_result("https://api.openalex.org/works/https://doi.org/" + keyword)

        # If API gave error
        if 'error' in result:
            return result

        # If no authors written
        if len(result['authorships']) == 0:
            return []

        return _pick_closest_authorship(author_name, result['authorships'])

    if mode == 'pub':
        # Get details by searching with publication title
        query_converted_pub = urllib.parse.quote_plus(keyword)
        result = get_api_result("https://api.openalex.org/works?search=" + query_converted_pub)

        # If API gave error
        if 'error' in result:
            return result

        # If no results or no authors written
        if len(result['results']) == 0 or len(result['results'][0]['authorships']) == 0:
            return []

        # Authors are taken from the first candidate publication only
        return _pick_closest_authorship(author_name, result['results'][0]['authorships'])

    if mode == 'institution':
        # Same name search, additionally restricted to the last known institution (NTU)
        return _search_computer_science_author(author_name, 'last_known_institution.id:I172675005,')

    # mode == 'name': search by name only
    return _search_computer_science_author(author_name)

# 5. Add more info into original csv

## 5.1. Load dataframe from Assignment 1

In [26]:
# Load the faculty table produced in Assignment 1 and preview its first 20 rows
df = pd.read_csv(filepath_or_buffer='Submission/Takesawa_Saori.csv')
df.head(n=20)

Unnamed: 0,Name,Email,dr_ntu_link,website_link,dblp_link,citations_all_num
0,A S Madhukumar,asmadhukumar@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00083,"['http://www3.ntu.edu.sg/home/asmadhukumar/', ...",https://dblp.org/pid/66/549,2907.0
1,Alexei Sourin,assourin@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00274,"['http://www3.ntu.edu.sg/home/assourin/', 'htt...",https://dblp.org/pid/15/3108,2939.0
2,Anupam Chattopadhyay,anupam@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp01076,['https://scholar.google.co.in/citations?user=...,https://dblp.org/pid/99/4535,6226.0
3,Anwitaman Datta,anwitaman@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00706,"['https://personal.ntu.edu.sg/anwitaman/', 'ht...",https://dblp.org/pid/d/AnwitamanDatta,8047.0
4,Arvind Easwaran,arvinde@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00687,"['https://cps-research-group.github.io/', 'htt...",https://dblp.org/pid/73/1708,2817.0
5,Bo An,boan@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00698,"['https://personal.ntu.edu.sg/boan/', 'https:/...",https://dblp.org/pid/42/6178-1,6957.0
6,Cham Tat Jen,astjcham@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp01067,['https://personal.ntu.edu.sg/astjcham/index.h...,https://dblp.org/pid/29/3808,5533.0
7,Chan Syin,asschan@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00691,"['http://www3.ntu.edu.sg/home/asschan/', 'http...",https://dblp.org/pid/80/2106,266.0
8,Chee Wei Tan,cheewei.tan@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp02029,,,
9,Chen Change Loy,ccloy@ntu.edu.sg,https://dr.ntu.edu.sg/cris/rp/rp00574,"['https://personal.ntu.edu.sg/ccloy', 'https:/...",https://dblp.org/pid/01/5855,62816.0


## 5.2. Get all ORCID IDs if possible from csv. Also delete ORCID link in website_link column.

In [159]:
orcid_link_list = []

for raw_links in df['website_link']:
    # A missing website_link cell (nan) cannot hold an ORCID link
    if not isinstance(raw_links, str):
        orcid_link_list.append(float('nan'))
        continue

    # The cell holds the text form of a Python list; turn it back into a list
    parsed_links = ast.literal_eval(raw_links)

    # Keep the first ORCID link, or nan when the faculty lists none
    first_orcid_link = next(
        (url for url in parsed_links if 'https://orcid.org/' in url),
        float('nan'),
    )
    orcid_link_list.append(first_orcid_link)

df['orcid_link'] = orcid_link_list

In [160]:
# Parse every website_link cell into a real list and leave the ORCID links out,
# since they now live in their own column. Missing cells (nan) stay untouched.
website_link_list = [
    [url for url in ast.literal_eval(raw_links) if 'https://orcid.org' not in url]
    if isinstance(raw_links, str)
    else raw_links
    for raw_links in df['website_link']
]

df['website_link'] = website_link_list

## 5.3. Get all Google Scholar link if possible from csv. Also delete Google Scholar link in website_link column.

In [161]:
google_scholar_link_list = []

for links in df['website_link']:
    # Default when website_link is nan or holds no Google Scholar link
    google_scholar_link = float('nan')

    if isinstance(links, list):
        # Match every Google Scholar domain (scholar.google.com, scholar.google.co.in, ...).
        # The clean-up cell below removes all 'https://scholar.google' links from
        # website_link, so matching only '.com' here would lose the other ones.
        google_scholar_link = next(
            (link for link in links if 'https://scholar.google' in link),
            float('nan'),
        )

    google_scholar_link_list.append(google_scholar_link)

df['google_scholar_link'] = google_scholar_link_list

In [163]:
# Remove Google Scholar links from website_link if present, to clean up the csv.
# Cells that are not lists (nan) are kept as they are.
website_link_list = [
    [url for url in links if 'https://scholar.google' not in url]
    if isinstance(links, list)
    else links
    for links in df['website_link']
]

df['website_link'] = website_link_list

In [165]:
# df.to_csv('Takesawa_Saori_updated.csv', index=False)

## 5.4. Retrieve missing ORCID id for each faculty.

In [29]:
# Add ORCID link into the df for faculty that do not have one yet.
# The author is looked up in the OpenAlex API with progressively weaker evidence:
#   1. DOI of a publication listed on the DR-NTU page
#   2. title of a publication listed on the DR-NTU page
#   3. name + last known institution
#   4. name only

# Get the list of orcid links
orcid_link_list = list(df['orcid_link'])

for i in tqdm(range(len(df))):

    # Faculty that already have an ORCID link need no lookup
    if isinstance(orcid_link_list[i], str):
        continue

    # Per-faculty state. It must be reset on every iteration, otherwise the
    # flag of the previous faculty (or no flag at all) would be used.
    orcid_link = None
    author_details_found = 0

    # Retrieve the doi
    doi_list = get_doi_list_from_drNTU(df['dr_ntu_link'][i])

    # If DOI obtained, use that to search for the faculty in the API
    for doi in doi_list:
        author_details = get_author_info_from_OpenAlexAPI(df['Name'][i], doi, 'doi')

        # If API gave error or the publication lists no authors, go to next DOI
        if 'error' in author_details or len(author_details) == 0:
            continue

        author_details_found = 1

        # The ORCID may be missing/None even when the author was found
        if author_details['author'].get('orcid'):
            orcid_link = author_details['author']['orcid']
        break

    # If cannot retrieve any DOI or still have not found author, use publication title to search
    if author_details_found == 0:
        pub_list = get_pub_list_from_article(df['dr_ntu_link'][i])

        for pub in pub_list:
            author_details = get_author_info_from_OpenAlexAPI(df['Name'][i], pub, 'pub')

            # If API gave error or no results, go to next pub
            if 'error' in author_details or len(author_details) == 0:
                continue

            author_details_found = 1

            if author_details['author'].get('orcid'):
                orcid_link = author_details['author']['orcid']
            break

    # If still have not found author, use name and institution to search,
    # then name only as the last resort
    for fallback_mode in ('institution', 'name'):
        if author_details_found == 1:
            break

        author_details = get_author_info_from_OpenAlexAPI(df['Name'][i], '', fallback_mode)

        # If API did not give error and have results
        if not ('error' in author_details or len(author_details) == 0):
            author_details_found = 1

            # In these modes the ORCID sits directly on the author record
            if author_details.get('orcid'):
                orcid_link = author_details['orcid']

    # orcid_link is only set when an author was found and has an ORCID
    if orcid_link:
        orcid_link_list[i] = orcid_link


df['orcid_link'] = orcid_link_list

  0%|          | 0/86 [00:00<?, ?it/s]

## 5.5. Get the research interests of each faculty member from DR-NTU

In [None]:
# Scrape the research interests of every faculty from their DR-NTU profile page
# (one request per row, with a progress bar)
interest_list = [
    get_research_interest_from_drNTU(profile_link)
    for profile_link in tqdm(df['dr_ntu_link'])
]

In [15]:
# Attach the scraped research interests (a list of tags, or nan when unavailable) as a new column
df['Interests'] = interest_list

## 5.6. Get profile image of each faculty from DR-NTU

In [42]:
img_link_list = []

for i in tqdm( range(len(df)) ):
    drNTU_link = df['dr_ntu_link'][i]
    soup_source = requests.get(drNTU_link).text
    soup = BeautifulSoup(soup_source,'lxml')

    picture_tag = soup.find('img', id="picture")

    # A profile without a picture must not abort the whole scrape (and would
    # leave the list shorter than df); record nan for it instead.
    if picture_tag is None or not picture_tag.get('src'):
        img_link_list.append(float('nan'))
        continue

    # Extract the profile image link
    # also, replace the single space in 'src' to '%20' to convert to proper url format
    img_link = 'https://dr.ntu.edu.sg' + picture_tag['src'].replace(' ', '%20')

    img_link_list.append(img_link)

  0%|          | 0/86 [00:00<?, ?it/s]

In [44]:
# Store the profile image links collected in the previous cell as a new column
df['img_link'] = img_link_list

In [46]:
# Save the enriched table; index=False keeps the row index out of the file
df.to_csv('Takesawa_Saori_updated.csv', index=False)