In [61]:
import json
import os
import datetime
import math
import numpy as np
from utils import get_h_index

In [15]:
# Collect the union of top-level keys used by the raw DR-NTU profile files.
# The folder is bound to `raw_dir` rather than `dir` so the builtin dir()
# remains usable in the kernel after this cell has run.
raw_dir = './raw_data'
files = [
    os.path.join(raw_dir, f)
    for f in os.listdir(raw_dir)
    if os.path.isfile(os.path.join(raw_dir, f)) and 'dr_ntu' in f
]

dr_ntu_keys = set()
for file in files:
    with open(file, 'r') as f:
        profile = json.load(f)

    # Accumulate every key seen in any profile file.
    dr_ntu_keys.update(profile.keys())

# Last expression of the cell: number of distinct keys found.
len(dr_ntu_keys)

21

In [16]:
# Display the set of top-level keys collected from the raw DR-NTU profile files.
dr_ntu_keys

{'articles',
 'bibliometrics',
 'biography',
 'book_chapters',
 'books',
 'conferences',
 'designation',
 'dr_ntu',
 'email',
 'full_name',
 'github',
 'google_scholar',
 'grants',
 'image_path',
 'interests',
 'keywords',
 'name_card',
 'orcid',
 'other_websites',
 'scopus',
 'web_of_science'}

In [17]:
# Whitelist of DR-NTU profile keys; one key per line so additions and
# removals show up as single-line diffs.
dr_ntu_information = [
    'full_name',
    'email',
    'designation',
    'biography',
    'dr_ntu',
    'google_scholar',
    'orcid',
    'other_websites',
    'image_path',
]

In [18]:
# Collect the union of top-level keys used by the raw Google Scholar profile
# files (file names containing 'google_scholar' but not 'publication').
# The folder is bound to `raw_dir` rather than `dir` so the builtin dir()
# remains usable in the kernel after this cell has run.
raw_dir = './raw_data'
files = [
    os.path.join(raw_dir, f)
    for f in os.listdir(raw_dir)
    if os.path.isfile(os.path.join(raw_dir, f))
    and 'google_scholar' in f and 'publication' not in f
]

google_scholar_keys = set()
for file in files:
    with open(file, 'r') as f:
        profile = json.load(f)

    # Accumulate every key seen in any profile file.
    google_scholar_keys.update(profile.keys())

# Last expression of the cell: number of distinct keys found.
len(google_scholar_keys)

5

In [19]:
# Display the set of top-level keys collected from the raw Google Scholar profile files.
google_scholar_keys

{'citation_statistics',
 'co_authors',
 'google_scholar',
 'interests',
 'publications'}

# DR NTU

In [21]:
# Build one processed profile JSON per raw DR-NTU file.
# Each output keeps only the keys listed in `dr_ntu_information` and is then
# overlaid with the matching Google Scholar profile-search file, if one exists.
raw_dir = './raw_data'  # not named `dir`, so the builtin dir() is left intact
dr_ntu_files = [
    os.path.join(raw_dir, f)
    for f in os.listdir(raw_dir)
    if os.path.isfile(os.path.join(raw_dir, f)) and 'dr_ntu' in f
]

output_dir = './processed_data'
# On a fresh checkout the output folder may not exist yet; create it so the
# first write below does not fail with FileNotFoundError.
os.makedirs(output_dir, exist_ok=True)

for dr_ntu_path in dr_ntu_files:
    with open(dr_ntu_path, 'r') as f:
        dr_ntu = json.load(f)
    name = dr_ntu['full_name']
    # Slug shared by the profile-search input name and the processed output name.
    slug = name.lower().replace(" ", "_")

    profile = {k: v for k, v in dr_ntu.items() if k in dr_ntu_information}

    # Overlay every key of the Google Scholar profile-search result (when the
    # file exists); keys it shares with the DR-NTU profile are overwritten.
    google_scholar_profile_search_file = os.path.join(raw_dir, f'google_scholar_profile_search_{slug}.json')

    if os.path.exists(google_scholar_profile_search_file):
        with open(google_scholar_profile_search_file, 'r') as f:
            google_scholar = json.load(f)

        profile.update(google_scholar)

    with open(os.path.join(output_dir, f'{slug}.json'), 'w') as f:
        json.dump(profile, f)

# Merging Education

In [22]:
# Merge the education JSON (one file per person in `gpt_dir`) into each
# processed profile, rewriting the processed file in place.
processed_dir = './processed_data'  # not named `dir`, so the builtin dir() is left intact
processed = [
    os.path.join(processed_dir, f)
    for f in os.listdir(processed_dir)
    if os.path.isfile(os.path.join(processed_dir, f))
]
gpt_dir = './education_gpt_output_data'
gpt_prefix = 'education_v1'
for file in processed:
    with open(file, 'r') as f:
        profile = json.load(f)

    # Person slug = file name without folder and extension. Derived from the
    # path itself instead of a fixed character slice, so it stays correct if
    # the folder name or path separator changes.
    name = os.path.splitext(os.path.basename(file))[0]

    with open(f'{gpt_dir}/{gpt_prefix}_{name}.json', 'r') as f:
        education = json.load(f)

    # Drop the education file's own 'name' entry before merging so it is not
    # copied into the profile; tolerate files that do not carry the key.
    education.pop('name', None)
    profile.update(education)

    with open(file, 'w') as f:
        json.dump(profile, f)

# Publications and research-interest topics

In [87]:
# Attach a column-oriented publications table (year, title, link, topic,
# citation count, description) to each processed profile.
interest_dir = './research_interest'  # not named `dir`, so the builtin dir() is left intact
files = [
    os.path.join(interest_dir, f)
    for f in os.listdir(interest_dir)
    if os.path.isfile(os.path.join(interest_dir, f))
]

for file in files:
    with open(file, 'r') as f:
        pubs = json.load(f)

    # The processed profile has the same file name as the input file.
    # os.path.basename is separator-agnostic, unlike splitting on '/'.
    file_name = os.path.basename(file)

    publications = {
        # Missing years are shown as the literal string 'Unknown'.
        'Publication Year': [
            str(pub['publication_year']) if pub['publication_year'] is not None else 'Unknown'
            for pub in pubs
        ],
        'Title': [pub['title'] for pub in pubs],
        'Link': [pub['link'] for pub in pubs],
        # Publications without a final topic are grouped under 'Others'.
        'Topic': [
            pub['final_topic'] if pub['final_topic'] is not None else 'Others'
            for pub in pubs
        ],
        # Total citations = sum over the per-year citation counts.
        '# of Citations': [sum(pub['citations_by_year']['num_citations']) for pub in pubs],
        'Description': [pub['description'] for pub in pubs],
    }

    processed_path = f'./processed_data/{file_name}'
    with open(processed_path, 'r') as f:
        profile = json.load(f)
    profile['publications'] = publications

    with open(processed_path, 'w') as f:
        json.dump(profile, f)



# Co-authors

In [25]:
# Map Google Scholar user id -> full name for every processed profile that has
# a Google Scholar URL containing a `user=` query parameter.
processed_dir = './processed_data'  # not named `dir`, so the builtin dir() is left intact
files = [
    os.path.join(processed_dir, f)
    for f in os.listdir(processed_dir)
    if os.path.isfile(os.path.join(processed_dir, f))
]

ntu_url = {}
for file in files:
    with open(file, 'r') as f:
        profile = json.load(f)
    name = profile['full_name']
    # .get(): a profile may lack the 'google_scholar' key altogether.
    scholar_url = profile.get('google_scholar')
    # Skip missing/empty links and links without a user id instead of failing
    # with KeyError / IndexError part-way through the loop.
    if scholar_url and 'user=' in scholar_url:
        # `scholar_id`, not `id`, so the builtin id() is left intact.
        scholar_id = scholar_url.split('user=')[1].split('&')[0]
        ntu_url[scholar_id] = name

In [42]:
# Build the 'collaboration_network' table of every researcher: one row per
# publication title shared between the researcher's Google Scholar profile and
# a listed co-author's profile file in `co_author_dir`.
# Relies on `ntu_url` (Scholar user id -> full name) built in an earlier cell.
raw_dir = './raw_data'  # not named `dir`, so the builtin dir() is left intact
co_author_dir = './co_authors_raw_data'
google_scholar_files = [
    os.path.join(raw_dir, f)
    for f in os.listdir(raw_dir)
    if os.path.isfile(os.path.join(raw_dir, f))
    and 'google_scholar' in f and 'publication' not in f and 'profile' not in f
]

# Google Scholar organisation id -> display name. Seeded with NTU and extended
# with the first affiliation of each co-author processed below.
ntu_org_id = '3012140508424117850'
ntu_org_name = 'Nanyang Technological University'
google_scholar_org_id = {ntu_org_id: ntu_org_name}

for file in google_scholar_files:
    with open(file, 'r') as f:
        scholar_profile = json.load(f)

    # First publication seen for each lower-cased title; replaces a rescan of
    # the whole publication list for every shared title.
    pub_by_title = {}
    for pub in scholar_profile['publications']:
        pub_by_title.setdefault(pub['title'].lower(), pub)
    titles = set(pub_by_title)

    network = {'target': [], 'target_id': [], 'type': [], 'location': [], 'year': [], 'title': [], 'link': []}
    for co_author in scholar_profile['co_authors']:

        co_author_id = co_author['link'].split('user=')[1].split('&')[0]
        # Prefer the name from the processed profiles when the co-author is one of them.
        co_author_name = ntu_url.get(co_author_id, co_author['name'])
        co_author_file = f"{co_author_dir}/google_scholar_{co_author['name'].lower().replace(' ', '_').replace('/','_')}.json"

        with open(co_author_file, 'r') as f:
            co_author_profile = json.load(f)

        co_author_pubs = co_author_profile['publications']
        if co_author_pubs is not None:
            co_author_titles = {x['title'].lower() for x in co_author_pubs}
        else:
            co_author_titles = set()

        # Co-authors with no title in common contribute no rows.
        shared_titles = titles & co_author_titles
        if not shared_titles:
            continue

        # Classify the co-author. `collab_type` is used instead of `type` so
        # the builtin type() is left intact for later cells.
        affiliates = co_author_profile['affiliates']
        if co_author_id in ntu_url:
            collab_type = 'NTU'
            location = ntu_org_name
        elif affiliates:
            # Truthiness check covers both None and an empty list; the previous
            # len(affiliates) call raised TypeError when affiliates was None.
            first_affiliate = affiliates[0]
            org = first_affiliate['link'].split('org=')[1].split('&')[0]
            # Keep the first name recorded for an organisation id.
            location = google_scholar_org_id.setdefault(org, first_affiliate['name'])
            collab_type = 'Outside NTU'
            # Any affiliation link carrying the NTU organisation id marks the
            # co-author as NTU-affiliated although absent from `ntu_url`.
            if any(f'org={ntu_org_id}' in aff['link'].lower() for aff in affiliates):
                collab_type = 'Outside SCSE'
                location = ntu_org_name
        else:
            collab_type = 'Unknown'
            location = 'Unknown'

        for title in shared_titles:
            pub = pub_by_title[title]
            year = pub['year']
            # A missing year may be stored as NaN or as null/None.
            if year is None or math.isnan(year):
                publication_year = 'unknown'
            else:
                publication_year = int(year)

            network['target'].append(co_author_name)
            network['target_id'].append(co_author_id)
            network['type'].append(collab_type)
            network['location'].append(location)
            network['year'].append(publication_year)
            network['title'].append(pub['title'])
            network['link'].append(pub['title_link'])

    # Processed profile name = raw file name without the 'google_scholar_'
    # prefix and the '.json' suffix.
    slug = os.path.basename(file)[len('google_scholar_'):-len('.json')]
    processed_path = f'./processed_data/{slug}.json'
    with open(processed_path, 'r') as f:
        profile = json.load(f)
    profile['collaboration_network'] = network

    with open(processed_path, 'w') as f:
        json.dump(profile, f)

# No. of publications by year

In [54]:
# Count publications per year for each researcher and store the result as
# 'published_by_year' in the processed profile.
pubs_dir = './processed_publications'  # not named `dir`, so the builtin dir() is left intact
pubs_file = [
    os.path.join(pubs_dir, f)
    for f in os.listdir(pubs_dir)
    if os.path.isfile(os.path.join(pubs_dir, f))
]

cur_year = datetime.datetime.now().year
for file in pubs_file:
    with open(file, 'r') as f:
        pubs = json.load(f)
    # os.path.basename is separator-agnostic, unlike splitting on '/'.
    file_name = os.path.basename(file)

    publication_by_year = {}
    for details in pubs:
        publication_year = details['publication_year']
        if publication_year is None:
            publication_year = 'unknown'
        publication_by_year[publication_year] = publication_by_year.get(publication_year, 0) + 1

    # Earliest known publication year, or None when every year is missing.
    # The previous float('inf') sentinel made range() raise TypeError here.
    min_year = min((y for y in publication_by_year if y != 'unknown'), default=None)

    # Emit every year from the first publication to now, with 0 for years
    # that have no publications.
    final = {'Year': [], '# of Publications': []}
    if min_year is not None:
        for year in range(min_year, cur_year + 1):
            final['Year'].append(str(year))
            final['# of Publications'].append(publication_by_year.get(year, 0))

    if 'unknown' in publication_by_year:
        final['Year'].append('unknown')
        final['# of Publications'].append(publication_by_year['unknown'])

    processed_path = f'./processed_data/{file_name}'
    with open(processed_path, 'r') as f:
        profile = json.load(f)
    profile['published_by_year'] = final

    with open(processed_path, 'w') as f:
        json.dump(profile, f)


# No. of citations by year

In [84]:
# Sum citations per calendar year across all of a researcher's publications
# and store the result as 'citations_by_year' in the processed profile.
pubs_dir = './processed_publications'  # not named `dir`, so the builtin dir() is left intact
pubs_file = [
    os.path.join(pubs_dir, f)
    for f in os.listdir(pubs_dir)
    if os.path.isfile(os.path.join(pubs_dir, f))
]

cur_year = datetime.datetime.now().year
for file in pubs_file:
    with open(file, 'r') as f:
        pubs = json.load(f)

    # os.path.basename is separator-agnostic, unlike splitting on '/'.
    file_name = os.path.basename(file)

    total_citations_by_year = {}
    known_years = []
    for details in pubs:
        citations_by_year = details['citations_by_year']
        counts = citations_by_year['num_citations']
        for i, year in enumerate(citations_by_year['year']):
            # Keys are stringified so the 'unknown' bucket and numeric years
            # share one dictionary.
            key = str(year)
            total_citations_by_year[key] = total_citations_by_year.get(key, 0) + counts[i]
            if year != 'unknown':
                known_years.append(year)

    # Emit every year from the first cited year to now, with 0 for years
    # that have no citations. Skipped when no known year exists; the previous
    # float('inf') sentinel made range() raise TypeError in that case.
    final = {'Year': [], '# of Citations': []}
    if known_years:
        for year in range(min(known_years), cur_year + 1):
            final['Year'].append(str(year))
            final['# of Citations'].append(total_citations_by_year.get(str(year), 0))

    if 'unknown' in total_citations_by_year:
        final['Year'].append('unknown')
        final['# of Citations'].append(total_citations_by_year['unknown'])

    processed_path = f'./processed_data/{file_name}'
    with open(processed_path, 'r') as f:
        profile = json.load(f)
    profile['citations_by_year'] = final

    with open(processed_path, 'w') as f:
        json.dump(profile, f)

# h-index, i10-index & i20-index

In [73]:
# Compute citation indices for each researcher and store them in the processed
# profile: all-time h-index / i10 / i20, h-index per calendar year, h-index and
# average citations per publication year, and h-index per (publication year,
# calendar year) pair. Publications without a publication year are ignored.
pubs_dir = './processed_publications'  # not named `dir`, so the builtin dir() is left intact
pubs_file = [
    os.path.join(pubs_dir, f)
    for f in os.listdir(pubs_dir)
    if os.path.isfile(os.path.join(pubs_dir, f))
]

cur_year = datetime.datetime.now().year

for file in pubs_file:
    with open(file, 'r') as f:
        pubs = json.load(f)
    # os.path.basename is separator-agnostic, unlike splitting on '/'.
    file_name = os.path.basename(file)

    citations_list = []            # final cumulative citations, one entry per publication
    by_year = {}                   # calendar year -> cumulative citations of publications cited that year
    by_publication_year = {}       # publication year -> final cumulative citations per publication
    by_year_since_published = {}   # publication year -> {calendar year -> cumulative citations}

    for details in pubs:
        publication_year = details['publication_year']
        if publication_year is None:
            continue

        citations_by_year = details['citations_by_year']
        years = citations_by_year['year']
        counts = citations_by_year['num_citations']
        cohort = by_year_since_published.setdefault(publication_year, {})

        # Walk the citation history in year order (sorted on the string form,
        # as the year list may contain 'unknown') to build a running total.
        # NOTE(review): a publication only contributes to by_year[y] when its
        # history has an entry for y; if the data omits years with zero new
        # citations, the per-year h-index would undercount — confirm.
        total_citations = 0
        for i in sorted(range(len(years)), key=lambda idx: str(years[idx])):
            year = years[i]
            citations = counts[i]
            # Citations with an unknown year are excluded from all indices.
            if year != 'unknown':
                total_citations += citations
                by_year.setdefault(year, []).append(total_citations)
                cohort.setdefault(year, []).append(total_citations)

        citations_list.append(total_citations)
        by_publication_year.setdefault(publication_year, []).append(total_citations)

    # Earliest publication year, or None when no publication has a year. The
    # previous float('inf') sentinel made range() raise TypeError in that case.
    min_year = min(by_publication_year, default=None)
    year_span = range(min_year, cur_year + 1) if min_year is not None else range(0)

    h_index_by_year_df = {'Year': [], 'h-index': []}
    h_index_by_publication_year_df = {'Publication Year': [], 'h-index': []}
    avg_citations_by_publication_year_df = {'Publication Year': [], 'Avg Citations per Publication': []}
    h_index_by_year_since_published_df = {'Publication Year': [], 'Year': [], 'h-index': []}
    for year in year_span:
        h_index_by_year_df['Year'].append(year)
        h_index_by_year_df['h-index'].append(get_h_index(by_year.get(year, [])))

        h_index_by_publication_year_df['Publication Year'].append(year)
        h_index_by_publication_year_df['h-index'].append(get_h_index(by_publication_year.get(year, [])))

        # Years without publications average over [0], i.e. report 0.
        avg_citations_by_publication_year_df['Publication Year'].append(year)
        avg_citations_by_publication_year_df['Avg Citations per Publication'].append(np.mean(by_publication_year.get(year, [0])))

        for y in range(year, cur_year + 1):
            h_index_by_year_since_published_df['Publication Year'].append(year)
            h_index_by_year_since_published_df['Year'].append(y)
            h_index_by_year_since_published_df['h-index'].append(get_h_index(by_year_since_published.get(year, {}).get(y, [])))

    all_time_h_index = get_h_index(citations_list)
    # i10 / i20: number of publications with at least 10 / 20 citations.
    all_time_i10_index = len([i for i in citations_list if i >= 10])
    all_time_i20_index = len([i for i in citations_list if i >= 20])

    processed_path = f'./processed_data/{file_name}'
    with open(processed_path, 'r') as f:
        profile = json.load(f)
    profile['all_time_h_index'] = all_time_h_index
    profile['all_time_i10_index'] = all_time_i10_index
    profile['all_time_i20_index'] = all_time_i20_index
    profile['h_index_by_year'] = h_index_by_year_df
    profile['h_index_by_publication_year'] = h_index_by_publication_year_df
    profile['avg_citations_by_publication_year'] = avg_citations_by_publication_year_df
    profile['h_index_by_years_from_publication_year'] = h_index_by_year_since_published_df

    with open(processed_path, 'w') as f:
        json.dump(profile, f)