In [35]:
import json
import os
import datetime
import math

In [2]:
# Survey the DR-NTU dumps: gather every top-level key that occurs in any profile file.
dir = './raw_data'
files = []
for entry in os.listdir(dir):
    candidate = os.path.join(dir, entry)
    # Keep regular files whose file name mentions 'dr_ntu'.
    if 'dr_ntu' in entry and os.path.isfile(candidate):
        files.append(candidate)
names = []

dr_ntu_keys = set()
for file in files:
    with open(file, 'r') as f:
        profile = json.load(f)
    # Fold this profile's keys into the running union.
    dr_ntu_keys = dr_ntu_keys.union(profile.keys())

# Cell output: number of distinct keys seen.
len(dr_ntu_keys)

21

In [3]:
# Show the union of top-level keys found across the DR-NTU profile files.
dr_ntu_keys

{'articles',
 'bibliometrics',
 'biography',
 'book_chapters',
 'books',
 'conferences',
 'designation',
 'dr_ntu',
 'email',
 'full_name',
 'github',
 'google_scholar',
 'grants',
 'image_path',
 'interests',
 'keywords',
 'name_card',
 'orcid',
 'other_websites',
 'scopus',
 'web_of_science'}

In [4]:
# DR-NTU keys that are copied into the processed profile (all other keys are dropped).
dr_ntu_information = [
    # who the person is
    'full_name',
    'email',
    'designation',
    # free-text / structured background
    'biography',
    'grants',
    # profile links and picture
    'dr_ntu',
    'google_scholar',
    'github',
    'orcid',
    'scopus',
    'web_of_science',
    'other_websites',
    'image_path',
]

In [5]:
# Survey the Google Scholar dumps: gather every top-level key that occurs in any file.
dir = './raw_data'
files = []
for entry in os.listdir(dir):
    candidate = os.path.join(dir, entry)
    # Keep regular files that mention 'google_scholar' but not 'publication'.
    if os.path.isfile(candidate) and 'google_scholar' in entry and not ('publication' in entry):
        files.append(candidate)
names = []

google_scholar_keys = set()
for file in files:
    with open(file, 'r') as f:
        profile = json.load(f)
    # Fold this file's keys into the running union.
    google_scholar_keys = google_scholar_keys.union(profile.keys())

# Cell output: number of distinct keys seen.
len(google_scholar_keys)

5

In [6]:
# Show the union of top-level keys found across the Google Scholar files.
google_scholar_keys

{'citation_statistics',
 'co_authors',
 'google_scholar',
 'interests',
 'publications'}

In [7]:
# Google Scholar keys singled out from the key survey.
google_scholar_information = ['citation_statistics', 'interests']

In [8]:
# Build one merged profile per DR-NTU file and write it to ./processed_data/<slug>.json.
#
# For every DR-NTU profile:
#   1. keep only the keys listed in `dr_ntu_information` (defined in an earlier cell),
#   2. merge the matching google_scholar_profile_search_<slug>.json, if present,
#   3. copy 'interests' from the matching google_scholar_<slug>.json, if present,
#   4. dump the result as JSON.
dir = './raw_data'
dr_ntu_files = [
    os.path.join(dir, f)
    for f in os.listdir(dir)
    if os.path.isfile(os.path.join(dir, f)) and 'dr_ntu' in f
]

output_dir = './processed_data'
# open(..., 'w') does not create missing parent folders, so make sure the
# output folder exists before the first profile is written.
os.makedirs(output_dir, exist_ok=True)

for dr_ntu_path in dr_ntu_files:
    with open(dr_ntu_path, 'r') as f:
        dr_ntu = json.load(f)
    name = dr_ntu['full_name']
    # File-name slug shared by every per-person file: lower case, spaces -> underscores.
    slug = name.lower().replace(" ", "_")

    profile = {k: v for k, v in dr_ntu.items() if k in dr_ntu_information}

    # Merge the profile-search result. Every key in that file is copied, overriding
    # DR-NTU values (presumably this refreshes the google_scholar URL -- confirm).
    google_scholar_profile_search_file = os.path.join(dir, f'google_scholar_profile_search_{slug}.json')

    if os.path.exists(google_scholar_profile_search_file):
        with open(google_scholar_profile_search_file, 'r') as f:
            google_scholar = json.load(f)

        profile.update(google_scholar)

    # Copy the research interests from the full Google Scholar dump.
    google_scholar_file = os.path.join(dir, f'google_scholar_{slug}.json')

    if os.path.exists(google_scholar_file):
        with open(google_scholar_file, 'r') as f:
            google_scholar = json.load(f)

        # Not every dump is guaranteed to carry 'interests'; skip the field
        # rather than aborting the whole run with a KeyError.
        if 'interests' in google_scholar:
            profile['interests'] = google_scholar['interests']

    with open(os.path.join(output_dir, f'{slug}.json'), 'w') as f:
        json.dump(profile, f)

# Merging Education

In [9]:
# Merge the education records from ./education_gpt_output_data into each processed profile.
# Each processed file <slug>.json is paired with <gpt_dir>/<gpt_prefix>_<slug>.json and
# rewritten in place with the education keys added.
dir = './processed_data'
processed = [
    os.path.join(dir, f)
    for f in os.listdir(dir)
    # Only profile JSON files; stray files (e.g. OS metadata) would break json.load.
    if os.path.isfile(os.path.join(dir, f)) and f.endswith('.json')
]
gpt_dir = './education_gpt_output_data'
gpt_prefix = 'education_v1'
for file in processed:
    with open(file, 'r') as f:
        profile = json.load(f)

    # Slug = file name without folder and extension. Derived from the path itself
    # instead of fixed character offsets, so it survives a change of `dir`.
    name = os.path.splitext(os.path.basename(file))[0]

    # A missing education file still raises FileNotFoundError here, on purpose:
    # silently skipping it would leave the profile without education data.
    with open(f'{gpt_dir}/{gpt_prefix}_{name}.json', 'r') as f:
        education = json.load(f)

    # Drop the 'name' entry so it is not copied into the profile; tolerate its absence.
    education.pop('name', None)
    profile.update(education)

    with open(file, 'w') as f:
        json.dump(profile, f)

# Interests

In [10]:
# Tally how many Google Scholar profiles list each research interest.
dir = './raw_data'
google_scholar_files = []
for entry in os.listdir(dir):
    candidate = os.path.join(dir, entry)
    if not os.path.isfile(candidate):
        continue
    # Only the main Scholar dumps: skip publication and profile-search files.
    if 'google_scholar' not in entry or 'publication' in entry or 'profile' in entry:
        continue
    google_scholar_files.append(candidate)

interests = {}
for file in google_scholar_files:
    with open(file, 'r') as f:
        google_scholar = json.load(f)

    for interest in google_scholar['interests']:
        if interest in interests:
            interests[interest] += 1
        else:
            interests[interest] = 1

# Cell output: number of distinct interests.
len(interests)

189

# Co Authors

In [42]:
# Co-author statistics.
#
# For every Google Scholar profile in ./raw_data:
#   * find the co-authors that share at least one publication title with the profile owner,
#   * label each of them 'NTU' / 'Outside NTU' / 'Unknown' from their affiliation links,
#   * count shared publications per year, split into External / NTU / Unknown,
#   * store both results in the matching ./processed_data/<slug>.json.
dir = './raw_data'
co_author_dir = './co_authors_raw_data'
processed_dir = './processed_data'
NTU_ORG_ID = '3012140508424117850'
SCHOLAR_PREFIX = 'google_scholar_'

google_scholar_files = [
    os.path.join(dir, f)
    for f in os.listdir(dir)
    if os.path.isfile(os.path.join(dir, f))
    and 'google_scholar' in f and 'publication' not in f and 'profile' not in f
]

# Organisation id -> display name; grows as new organisations are encountered.
google_scholar_org_id = {NTU_ORG_ID: 'Nanyang Technological University'}


def _lowercase_titles(publications):
    """Return the set of lower-cased titles; a missing (None) list gives an empty set."""
    return {pub['title'].lower() for pub in publications or []}


def _classify_affiliation(affiliates, org_names):
    """Return (type, location) for a co-author's affiliation list.

    type is 'NTU', 'Outside NTU' or 'Unknown'. location is the organisation
    name, or 'Unknown' when there is no affiliation at all. Organisations seen
    for the first time are recorded in org_names (mutated in place).
    """
    # None and [] both mean "no affiliation data". The previous code checked for
    # None but then called len() on it anyway, which raised TypeError.
    if not affiliates:
        return 'Unknown', 'Unknown'

    primary = affiliates[0]
    if 'org=' in primary['link']:
        org = primary['link'].split('org=')[1].split('&')[0]
        location = org_names.setdefault(org, primary['name'])
    else:
        # Link carries no organisation id: fall back to the plain name.
        location = primary['name']

    # Any NTU affiliation, not just the first one, marks the co-author as NTU.
    if any(f'org={NTU_ORG_ID}' in aff['link'].lower() for aff in affiliates):
        return 'NTU', org_names[NTU_ORG_ID]
    return 'Outside NTU', location


def _publication_year(pub):
    """Return the publication year as int, or 'unknown' when it is missing/NaN/unparsable."""
    year = pub.get('year')
    if isinstance(year, str):
        try:
            year = float(year)
        except ValueError:
            return 'unknown'
    if not isinstance(year, (int, float)) or math.isnan(year) or math.isinf(year):
        return 'unknown'
    return int(year)


def _empty_year_counts():
    """Fresh per-year counter; a new dict each time so years never share state."""
    return {'External': 0, 'NTU': 0, 'Unknown': 0}


current_year = datetime.datetime.now().year

for file in google_scholar_files:
    with open(file, 'r') as f:
        scholar_profile = json.load(f)

    # First publication per lower-cased title. This replaces rescanning the whole
    # publication list for every shared title and keeps the "first match wins" rule.
    first_pub_by_title = {}
    for pub in scholar_profile['publications'] or []:
        first_pub_by_title.setdefault(pub['title'].lower(), pub)
    titles = set(first_pub_by_title)
    co_authors = scholar_profile['co_authors'] or []

    collaboration = {}
    collaboration_by_year = {}
    for co_author in co_authors:
        co_author_name = co_author['name']
        co_author_file = f"{co_author_dir}/google_scholar_{co_author_name.lower().replace(' ', '_').replace('/','_')}.json"
        with open(co_author_file, 'r') as f:
            co_author_profile = json.load(f)

        shared_titles = titles & _lowercase_titles(co_author_profile['publications'])
        if not shared_titles:
            continue

        # Named collab_type (not `type`) so the builtin stays usable in later cells.
        collab_type, location = _classify_affiliation(co_author_profile['affiliates'], google_scholar_org_id)
        collaboration[co_author_name] = {
            '# of Collaborations': len(shared_titles),
            'google_scholar': co_author['link'],
            'type': collab_type,
            'location': location,
        }

        bucket = {'Outside NTU': 'External', 'NTU': 'NTU'}.get(collab_type, 'Unknown')
        min_year = None
        for title in shared_titles:
            publication_year = _publication_year(first_pub_by_title[title])
            if publication_year != 'unknown':
                min_year = publication_year if min_year is None else min(min_year, publication_year)
            collaboration_by_year.setdefault(publication_year, _empty_year_counts())[bucket] += 1

        # Make sure every year from the earliest collaboration up to now has an
        # entry, so the series has no gaps.
        if min_year is not None:
            for year in range(min_year, current_year + 1):
                collaboration_by_year.setdefault(year, _empty_year_counts())

    # google_scholar_<slug>.json -> <slug>; basename/splitext instead of
    # split("/") and fixed offsets so the path separator does not matter.
    slug = os.path.splitext(os.path.basename(file))[0][len(SCHOLAR_PREFIX):]
    processed_path = f'{processed_dir}/{slug}.json'

    with open(processed_path, 'r') as f:
        profile = json.load(f)
    profile['co_authors'] = collaboration
    profile['collaboration_by_year'] = collaboration_by_year

    with open(processed_path, 'w') as f:
        json.dump(profile, f)