In [1]:
import json
import os
import datetime
import math
import numpy as np
from utils import get_h_index
import pandas as pd
from tqdm import tqdm

In [2]:
# --- Inputs ------------------------------------------------------------------
# Raw DR-NTU data: one `<dr_ntu_id>.json` file per faculty member.
raw_dr_ntu_dir = './raw_dr_ntu'
raw_faculty_db = 'scse_profile'

# Processed faculty table, read from `<process_faculty_db_dir>/<process_faculty_db>.csv`.
process_faculty_db_dir = './processed'
process_faculty_db = 'scse_profile'
process_co_author_db = 'google_scholar_co_author'

# Raw Google Scholar profiles: one `<google_scholar_id>.json` file per author.
raw_google_scholar_dir = './raw_google_scholar'
raw_google_search_dir = './google_search'

# Per-author publication lists, keyed by Google Scholar id.
process_publications_dir = './processed_google_scholar_publications'

# Per-author topic assignments that are merged onto publications by `link`.
research_interest_dir = './research_interest'

# Per-faculty education records, keyed by DR-NTU id.
education_output_dir = './dr_ntu_education'

# --- Output ------------------------------------------------------------------
# One merged `<dr_ntu_id>.json` profile per faculty member is written here.
profile_dir = './profile'
os.makedirs(profile_dir, exist_ok=True)

In [3]:
# Load the processed faculty table; later cells iterate over it row by row,
# treating each row as one faculty member.
faculty_csv_path = os.path.join(process_faculty_db_dir, process_faculty_db + '.csv')
faculties = pd.read_csv(faculty_csv_path)

# Keep only the rows whose `google_scholar` column is populated.
has_google_scholar = faculties['google_scholar'].notna()
google_scholar_faculties = faculties[has_google_scholar]

# DR NTU

In [4]:
# Keys of the raw DR-NTU record that are copied into each faculty profile;
# every other key of the raw record is ignored.
dr_ntu_information = [
    'designation',
    'biography',
    'orcid',
    'other_websites',
    'image_path',
]

In [5]:
# Create the base profile JSON for every faculty member: identity/contact
# columns from the faculty table plus the whitelisted DR-NTU fields.
for _, row in tqdm(faculties.iterrows()):
    # Named `dr_ntu_id` (as in the later cells) instead of `id`, so the loop no
    # longer rebinds the built-in `id()` in the notebook's global namespace.
    dr_ntu_id = row['dr_ntu_id']

    # NOTE(review): cells missing in the CSV arrive here as float NaN, which
    # `json.dump` writes as the non-standard token `NaN` -- confirm that every
    # consumer of the profile files tolerates it.
    profile = {
        'full_name': row['full_name'],
        'email': row['email'],
        'google_scholar': row['google_scholar'],
        'dr_ntu': row['dr_ntu'],
    }

    with open(f'{raw_dr_ntu_dir}/{dr_ntu_id}.json', 'r') as f:
        dr_ntu = json.load(f)

    # Copy only the whitelisted keys from the raw DR-NTU record.
    profile.update({k: v for k, v in dr_ntu.items() if k in dr_ntu_information})

    # Opened in 'w' mode: an existing profile file for this id is overwritten.
    with open(f"{profile_dir}/{dr_ntu_id}.json", 'w') as f:
        json.dump(profile, f)

86it [00:00, 4015.83it/s]


# Merging Education

In [6]:
# Merge each faculty member's education record (when one exists) into the
# profile JSON created earlier.
for _, row in tqdm(faculties.iterrows()):
    # Named `dr_ntu_id` (as in the later cells) instead of `id`, so the loop no
    # longer rebinds the built-in `id()` in the notebook's global namespace.
    dr_ntu_id = row['dr_ntu_id']

    education_file = f"{education_output_dir}/{dr_ntu_id}.json"
    if not os.path.exists(education_file):
        # No education data for this faculty member: leave the profile as is.
        continue

    with open(education_file, 'r') as f:
        education = json.load(f)

    profile_file = f"{profile_dir}/{dr_ntu_id}.json"
    with open(profile_file, 'r') as f:
        profile = json.load(f)

    # Keys of the education record overwrite same-named profile keys.
    profile.update(education)

    with open(profile_file, 'w') as f:
        json.dump(profile, f)

86it [00:00, 5604.83it/s]


# All Publications

In [7]:
# Attach the full publication list (year, title, link, topic, citation total,
# description) to the profile of every faculty member with a Google Scholar id.
for _, row in tqdm(google_scholar_faculties.iterrows()):
    google_scholar_id, dr_ntu_id = row['google_scholar_id'], row['dr_ntu_id']

    publication_file = f"{process_publications_dir}/{google_scholar_id}.json"
    research_interest_file = f"{research_interest_dir}/{google_scholar_id}.json"

    # Without a publication file there is nothing to attach. Skipping here is
    # essential: otherwise `pubs_df` left over from the previous faculty member
    # would be written into this profile (or raise NameError on the first row).
    if not os.path.exists(publication_file):
        continue

    with open(publication_file, 'r') as f:
        pubs_df = pd.DataFrame(json.load(f))

    if os.path.exists(research_interest_file):
        with open(research_interest_file, 'r') as f:
            research_interests_df = pd.DataFrame(json.load(f))
        # Left join keeps every publication; those without a topic row get 'Others'.
        pubs_df = pubs_df.merge(research_interests_df, how='left', on='link')
        topics = pubs_df['topic'].fillna('Others').to_list()
    else:
        topics = ['Others'] * len(pubs_df)

    # Missing years become the label 'Unknown'; known years become digit strings.
    publication_years = (
        pubs_df['publication_year'].fillna(0).astype(int).replace(0, 'Unknown').astype(str)
    )
    # Lifetime citations = sum of the per-year counts (entries of any year label).
    total_citations = pubs_df['citations_by_year'].apply(lambda c: sum(c['num_citations']))

    publications = {
        'Publication Year': publication_years.to_list(),
        'Title': pubs_df['title'].to_list(),
        'Link': pubs_df['link'].to_list(),
        'Topic': topics,
        '# of Citations': total_citations.to_list(),
        'Description': pubs_df['description'].to_list(),
    }

    profile_file = f'{profile_dir}/{dr_ntu_id}.json'
    with open(profile_file, 'r') as f:
        profile = json.load(f)
    profile['publications'] = publications

    with open(profile_file, 'w') as f:
        json.dump(profile, f)


65it [00:00, 232.55it/s]


# Co-authors

In [8]:
# Lookup table: Google Scholar id -> faculty full name, built from the faculty
# rows that have a Google Scholar entry. If an id repeats, the last row wins.
scse_google_scholar = {
    row['google_scholar_id']: row['full_name']
    for _, row in tqdm(google_scholar_faculties.iterrows())
}

65it [00:00, 55121.26it/s]


In [9]:
# Google Scholar organisation id of NTU. Used as the key of the canonical
# organisation name and to spot NTU among a co-author's affiliations.
NTU_ORG_ID = '3012140508424117850'

# Organisation id -> display name. Grows while the loop runs so that every
# co-author of one organisation is reported under the first name seen for it.
google_scholar_org_id = {NTU_ORG_ID: 'Nanyang Technological University'}


def _co_author_affiliation(co_author_id, affiliates):
    """Classify a co-author relative to the faculty list and to NTU.

    Args:
        co_author_id: Google Scholar id of the co-author.
        affiliates: the co-author profile's `affiliates` value; may be None or
            empty. Entries are read through their 'link' and 'name' keys.

    Returns:
        (affiliation_type, location), where affiliation_type is one of
        'NTU', 'Outside SCSE', 'Outside NTU' or 'Unknown'.
    """
    ntu_name = google_scholar_org_id[NTU_ORG_ID]

    # Anyone in the faculty lookup table is labelled 'NTU', whatever their
    # Scholar affiliation says.
    if co_author_id in scse_google_scholar:
        return 'NTU', ntu_name

    # `affiliates` may be None, so test truthiness instead of calling len().
    if not affiliates:
        return 'Unknown', 'Unknown'

    # Any NTU affiliation (not only the first one listed) marks the co-author
    # as an NTU member outside the faculty list.
    if any(f'org={NTU_ORG_ID}' in aff['link'].lower() for aff in affiliates):
        # Still register the first organisation, as the name table is shared.
        first = affiliates[0]
        org = first['link'].split('org=')[1].split('&')[0]
        google_scholar_org_id.setdefault(org, first['name'])
        return 'Outside SCSE', ntu_name

    first = affiliates[0]
    org = first['link'].split('org=')[1].split('&')[0]
    # setdefault keeps the first name recorded for an organisation id.
    return 'Outside NTU', google_scholar_org_id.setdefault(org, first['name'])


# For every faculty member, record one network entry per (co-author, shared
# publication) pair and store the result in the profile JSON.
for _, row in tqdm(faculties.iterrows()):
    google_scholar_id, dr_ntu_id = row['google_scholar_id'], row['dr_ntu_id']

    network = {'target': [], 'target_id': [], 'type': [], 'location': [], 'year': [], 'title': [], 'link': []}

    google_scholar_profile_file = f"{raw_google_scholar_dir}/{google_scholar_id}.json"
    # Faculty without a Scholar profile file keep the empty network above.
    # Previously the profile loaded for the *previous* faculty member was
    # silently reused for them.
    if os.path.exists(google_scholar_profile_file):
        with open(google_scholar_profile_file, 'r') as f:
            google_scholar_profile = json.load(f)

        # Lower-cased title -> first publication carrying that title, kept in
        # publication order so the output order is deterministic.
        first_pub_by_title = {}
        for pub in google_scholar_profile['publications'] or []:
            first_pub_by_title.setdefault(pub['title'].lower(), pub)

        for co_author in google_scholar_profile['co_authors'] or []:
            co_author_id = co_author['link'].split('user=')[1].split('&')[0]
            # Prefer the faculty table's spelling of the name when available.
            co_author_name = scse_google_scholar[co_author_id] if co_author_id in scse_google_scholar else co_author['name']

            with open(f"{raw_google_scholar_dir}/{co_author_id}.json", 'r') as f:
                co_author_profile = json.load(f)
            co_author_titles = {p['title'].lower() for p in co_author_profile['publications'] or []}

            # A collaboration is a title that appears in both publication lists.
            shared_pubs = [pub for title, pub in first_pub_by_title.items() if title in co_author_titles]
            if not shared_pubs:
                continue

            affiliation_type, location = _co_author_affiliation(co_author_id, co_author_profile['affiliates'])

            for pub in shared_pubs:
                year = pub['year']
                # A missing year may be stored as null or as NaN.
                if year is None or (isinstance(year, float) and math.isnan(year)):
                    publication_year = 'Unknown'
                else:
                    publication_year = int(year)

                network['target'].append(co_author_name)
                network['target_id'].append(co_author_id)
                network['type'].append(affiliation_type)
                network['location'].append(location)
                network['year'].append(publication_year)
                network['title'].append(pub['title'])
                network['link'].append(pub['title_link'])

    profile_file = f'{profile_dir}/{dr_ntu_id}.json'
    with open(profile_file, 'r') as f:
        profile = json.load(f)
    profile['collaboration_network'] = network

    with open(profile_file, 'w') as f:
        json.dump(profile, f)

86it [00:02, 36.77it/s]


# No. of publications by year

In [10]:
cur_year = datetime.datetime.now().year

# Store, per faculty member, how many publications appeared in each year from
# their earliest known publication year up to the current year.
for _, row in tqdm(faculties.iterrows()):
    google_scholar_id = row['google_scholar_id']
    dr_ntu_id = row['dr_ntu_id']

    publication_file = f"{process_publications_dir}/{google_scholar_id}.json"
    if not os.path.exists(publication_file):
        continue

    with open(publication_file, 'r') as f:
        pubs = json.load(f)
    pubs_df = pd.DataFrame(pubs)

    # Years become strings; a missing year becomes the label 'unknown'.
    year_labels = pubs_df['publication_year'].fillna(0).astype(int).replace(0, 'unknown').astype(str)
    pubs_df['publication_year'] = year_labels
    # String minimum: digit strings order before 'unknown', so this is the
    # earliest year label unless every label is 'unknown'.
    min_year = year_labels.min()

    # {year label: {'title': number of publications counted for that label}}
    publication_by_year = (
        pubs_df.groupby(by=['publication_year'])['title']
        .count()
        .reset_index()
        .set_index('publication_year')
        .to_dict('index')
    )

    # Years in the range without any publication are reported as 0.
    if min_year != 'unknown':
        years = [str(year) for year in range(int(min_year), cur_year + 1)]
    else:
        years = []
    counts = [publication_by_year.get(year, {'title': 0})['title'] for year in years]

    # Publications without a known year go into a trailing 'unknown' bucket.
    if 'unknown' in publication_by_year:
        years.append('unknown')
        counts.append(publication_by_year['unknown']['title'])

    final = {'Year': years, '# of Publications': counts}

    profile_path = f'{profile_dir}/{dr_ntu_id}.json'
    with open(profile_path, 'r') as f:
        profile = json.load(f)
    profile['published_by_year'] = final

    with open(profile_path, 'w') as f:
        json.dump(profile, f)


86it [00:00, 300.77it/s]


# No. of citations by year

In [11]:
cur_year = datetime.datetime.now().year

# Store, per faculty member, the total citations received in each year, summed
# over all of their publications.
for _, row in tqdm(faculties.iterrows()):
    google_scholar_id, dr_ntu_id = row['google_scholar_id'], row['dr_ntu_id']

    publication_file = f"{process_publications_dir}/{google_scholar_id}.json"
    if not os.path.exists(publication_file):
        continue

    with open(publication_file, 'r') as f:
        pubs = json.load(f)

    total_citations_by_year = {}
    # None until a citation entry with a known year is seen. The previous
    # float('inf') sentinel made range() raise TypeError for a faculty member
    # with no such entry.
    min_year = None
    for details in pubs:
        citations_by_year = details['citations_by_year']
        # 'year' and 'num_citations' are parallel lists.
        for year, citations in zip(citations_by_year['year'], citations_by_year['num_citations']):
            # Keys are strings so the 'unknown' label and real years coexist.
            key = str(year)
            total_citations_by_year[key] = total_citations_by_year.get(key, 0) + citations
            if year != 'unknown':
                min_year = year if min_year is None else min(min_year, year)

    final = {'Year': [], '# of Citations': []}
    if min_year is not None:
        # Years in the range without any citation are reported as 0.
        for year in range(min_year, cur_year + 1):
            final['Year'].append(str(year))
            final['# of Citations'].append(total_citations_by_year.get(str(year), 0))

    # Citations without a known year go into a trailing 'unknown' bucket.
    if 'unknown' in total_citations_by_year:
        final['Year'].append('unknown')
        final['# of Citations'].append(total_citations_by_year['unknown'])

    profile_file = f'{profile_dir}/{dr_ntu_id}.json'
    with open(profile_file, 'r') as f:
        profile = json.load(f)
    profile['citations_by_year'] = final

    with open(profile_file, 'w') as f:
        json.dump(profile, f)

86it [00:00, 304.62it/s]


# h-index, i10-index & i20-index

In [12]:
cur_year = datetime.datetime.now().year

# Store, per faculty member, the all-time h/i10/i20 indices plus several
# per-year breakdowns. Publications without a publication year and citation
# entries labelled 'unknown' are left out of every metric in this cell.
# `get_h_index` comes from `utils`; it is called with a list of citation counts
# (presumably returning the h-index of that list -- TODO confirm).
for _, row in tqdm(faculties.iterrows()):
    google_scholar_id, dr_ntu_id = row['google_scholar_id'], row['dr_ntu_id']

    publication_file = f"{process_publications_dir}/{google_scholar_id}.json"
    if not os.path.exists(publication_file):
        continue

    with open(publication_file, 'r') as f:
        pubs = json.load(f)

    citations_list = []            # final cumulative citations, one per publication
    by_year = {}                   # citation year -> cumulative counts of publications cited that year
    by_publication_year = {}       # publication year -> final cumulative counts
    by_year_since_published = {}   # publication year -> {citation year -> cumulative counts}

    # None until a publication with a known year is seen. The previous
    # float('inf') sentinel made range() raise TypeError when there was none.
    min_year = None
    for details in pubs:
        publication_year = details['publication_year']
        if publication_year is None:
            continue
        citations_by_year = details['citations_by_year']
        min_year = publication_year if min_year is None else min(min_year, publication_year)

        cur_year_since_published = by_year_since_published.setdefault(publication_year, {})

        # Walk the yearly counts in ascending order of the year's string form
        # so that the running total is cumulative over time.
        yearly_counts = sorted(
            zip(citations_by_year['year'], citations_by_year['num_citations']),
            key=lambda pair: str(pair[0]),
        )

        # NOTE(review): a publication only contributes to `by_year[y]` for years
        # that appear in its own 'year' list. If that list can skip years with
        # zero citations, the per-year h-index undercounts -- confirm the data.
        total_citations = 0
        for year, citations in yearly_counts:
            if year == 'unknown':
                continue
            total_citations += citations
            # Append in place instead of rebuilding the list on every entry.
            by_year.setdefault(year, []).append(total_citations)
            cur_year_since_published.setdefault(year, []).append(total_citations)

        citations_list.append(total_citations)
        by_publication_year.setdefault(publication_year, []).append(total_citations)

    h_index_by_year_df = {'Year': [], 'h-index': []}
    h_index_by_publication_year_df = {'Publication Year': [], 'h-index': []}
    avg_citations_by_publication_year_df = {'Publication Year': [], 'Avg Citations per Publication': []}
    h_index_by_year_since_published_df = {'Publication Year': [], 'Year': [], 'h-index': []}

    # Empty range (all series stay empty) when no publication year is known.
    years = range(min_year, cur_year + 1) if min_year is not None else range(0)
    for year in years:
        h_index_by_year_df['Year'].append(year)
        h_index_by_year_df['h-index'].append(get_h_index(by_year.get(year, [])))

        h_index_by_publication_year_df['Publication Year'].append(year)
        h_index_by_publication_year_df['h-index'].append(get_h_index(by_publication_year.get(year, [])))

        # A year without publications averages over [0], i.e. is reported as 0.
        avg_citations_by_publication_year_df['Publication Year'].append(year)
        avg_citations_by_publication_year_df['Avg Citations per Publication'].append(
            np.mean(by_publication_year.get(year, [0]))
        )

        # One row per (publication year, later-or-equal citation year) pair.
        for y in range(year, cur_year + 1):
            h_index_by_year_since_published_df['Publication Year'].append(year)
            h_index_by_year_since_published_df['Year'].append(y)
            h_index_by_year_since_published_df['h-index'].append(
                get_h_index(by_year_since_published.get(year, {}).get(y, []))
            )

    all_time_h_index = get_h_index(citations_list)
    # i10 / i20: number of publications with at least 10 / 20 citations.
    all_time_i10_index = sum(1 for c in citations_list if c >= 10)
    all_time_i20_index = sum(1 for c in citations_list if c >= 20)

    profile_file = f'{profile_dir}/{dr_ntu_id}.json'
    with open(profile_file, 'r') as f:
        profile = json.load(f)
    profile['all_time_h_index'] = all_time_h_index
    profile['all_time_i10_index'] = all_time_i10_index
    profile['all_time_i20_index'] = all_time_i20_index
    profile['h_index_by_year'] = h_index_by_year_df
    profile['h_index_by_publication_year'] = h_index_by_publication_year_df
    profile['avg_citations_by_publication_year'] = avg_citations_by_publication_year_df
    profile['h_index_by_years_from_publication_year'] = h_index_by_year_since_published_df

    with open(profile_file, 'w') as f:
        json.dump(profile, f)

86it [00:00, 216.19it/s]
