In [1]:
import os, json
import datetime
import pandas as pd

In [2]:
# Roll every Google Scholar publication file in ./raw_data up into per-year
# publication and citation totals, then store them in ./scse/profile.json
# under the 'by_year' key.
raw_data_dir = './raw_data'  # named so the builtin dir() is not shadowed
google_scholar_pub_files = [
    os.path.join(raw_data_dir, entry)
    for entry in os.listdir(raw_data_dir)
    if os.path.isfile(os.path.join(raw_data_dir, entry))
    and 'google_scholar' in entry
    and 'publication' in entry
]


def _publication_year(pub, latest_year):
    """Return the year a publication record is filed under.

    Args:
        pub: one publication record (a dict loaded from the JSON files).
        latest_year: the most recent year accepted as valid.

    Returns:
        The leading '/'-separated component of ``pub['publication_date']`` as
        an int, or the string 'unknown' when the key is absent, the component
        is not an integer, or the year lies after ``latest_year``.
    """
    if 'publication_date' not in pub:
        return 'unknown'
    try:
        year = int(str(pub['publication_date']).split('/')[0])
    except ValueError:
        # An unparseable date is bucketed like a missing one instead of
        # aborting the whole roll-up.
        return 'unknown'
    return year if year <= latest_year else 'unknown'


pubs_by_year = {}
citations_by_year = {}

# min_year keeps the +inf sentinel until a publication with a usable year is seen.
min_year, max_year = float('inf'), datetime.datetime.now().year
for file in google_scholar_pub_files:
    with open(file, 'r') as f:
        pubs = json.load(f)

    for pub in pubs.values():
        publication_year = _publication_year(pub, max_year)
        if publication_year != 'unknown':
            min_year = min(min_year, publication_year)

        pubs_by_year[publication_year] = pubs_by_year.get(publication_year, 0) + 1

        if 'total_citations' in pub:
            pub_citations = pub['total_citations']
            if publication_year == 'unknown':
                # No year to anchor on: lump all of this publication's citations together.
                citations_by_year['unknown'] = citations_by_year.get('unknown', 0) + sum(pub_citations.values())
            else:
                # Only years from the publication year through the current year are
                # read; entries keyed outside that span are not counted.
                for year in range(publication_year, max_year + 1):
                    citations_by_year[year] = citations_by_year.get(year, 0) + pub_citations.get(str(year), 0)

# range() needs integers; when no publication had a usable year min_year is
# still +inf, so fall back to an empty span instead of raising TypeError.
known_years = range(min_year, max_year + 1) if min_year <= max_year else range(0)

final = {'Year': [], '# of Publications': [], '# of Citations': []}
for year in known_years:
    final['Year'].append(year)
    final['# of Publications'].append(pubs_by_year.get(year, 0))
    final['# of Citations'].append(citations_by_year.get(year, 0))

# The 'unknown' bucket is appended last, and only when it is non-empty.
if 'unknown' in pubs_by_year:
    final['Year'].append('unknown')
    final['# of Publications'].append(pubs_by_year.get('unknown', 0))
    final['# of Citations'].append(citations_by_year.get('unknown', 0))


scse_profile = {'by_year': final}
os.makedirs('./scse', exist_ok=True)
with open('./scse/profile.json', 'w') as f:
    json.dump(scse_profile, f)

# Total number of faculty

In [3]:
# Store the number of rows in scse_profiles.csv (presumably one row per faculty
# member -- confirm against the CSV) in profile.json under '# of Faculty'.
scse = pd.read_csv('./raw_data/scse_profiles.csv')
total_profile = scse.shape[0]

profile_path = './scse/profile.json'

# Read-modify-write: load the existing profile, add the count, write it back.
with open(profile_path, 'r') as f:
    profile = json.loads(f.read())

profile.update({'# of Faculty': total_profile})

with open(profile_path, 'w') as f:
    f.write(json.dumps(profile))

# Top Faculty

In [6]:
# Build one table row per faculty member that has a Google Scholar link
# (publication count, citation count, average citations per publication,
# h-index) and store the table in profile.json under 'Top Faculty'.
processed_dir = './processed_data'  # named so the builtin dir() is not shadowed
# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# the written table reproducible between runs.
files = sorted(
    os.path.join(processed_dir, entry)
    for entry in os.listdir(processed_dir)
    if os.path.isfile(os.path.join(processed_dir, entry))
)

top_faculty = {'Name': [], '# of Publications': [], '# of Citations': [], 'Avg Citations per Publication': [], 'h-index': []}
for file in files:
    with open(file, 'r') as f:
        faculty_profile = json.load(f)

    # Only those with google scholar
    if faculty_profile['google_scholar'] is None:
        continue

    total_publications = sum(faculty_profile['published_by_year'].values())
    total_citations = sum(faculty_profile['citations_by_year']['# of Citations'])
    # A Scholar profile with no listed publications would otherwise raise
    # ZeroDivisionError and abort the whole table.
    avg_citations = total_citations / total_publications if total_publications else 0.0

    top_faculty['Name'].append(faculty_profile['full_name'])
    top_faculty['# of Publications'].append(total_publications)
    top_faculty['# of Citations'].append(total_citations)
    top_faculty['Avg Citations per Publication'].append(avg_citations)
    top_faculty['h-index'].append(faculty_profile['all_time_h_index'])

# Read-modify-write so the keys already stored in profile.json are preserved.
with open('./scse/profile.json', 'r') as f:
    profile = json.load(f)

profile['Top Faculty'] = top_faculty

with open('./scse/profile.json', 'w') as f:
    json.dump(profile, f)

# Collaboration Network

In [18]:
# Build an edge list (one row per faculty member / co-author pair) from the
# processed profiles and store it in profile.json under 'Collaboration Network'.
processed_dir = './processed_data'  # named so the builtin dir() is not shadowed
# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# the written edge list reproducible between runs.
files = sorted(
    os.path.join(processed_dir, entry)
    for entry in os.listdir(processed_dir)
    if os.path.isfile(os.path.join(processed_dir, entry))
)


def _scholar_user_id(url):
    """Return the text between 'user=' and the next '&' in a Google Scholar URL.

    Raises:
        IndexError: if ``url`` does not contain 'user='.
    """
    return url.split('user=')[1].split('&')[0]


# Parse every profile once; both passes below reuse the loaded dicts.
faculty_profiles = []
for file in files:
    with open(file, 'r') as f:
        faculty_profiles.append(json.load(f))

# Pass 1: map the Scholar id of every faculty member with a Scholar link to
# their full name. This pass must finish before pass 2 so that these names take
# precedence over the labels that co-author lists use for the same id.
ntu_url = {}
for faculty_profile in faculty_profiles:
    if faculty_profile['google_scholar'] is not None:
        ntu_url[_scholar_user_id(faculty_profile['google_scholar'])] = faculty_profile['full_name']

# Pass 2: emit one edge per (faculty member, co-author) pair.
network = {'source': [], 'target': [], '# of Collaboration': [], 'source_id': [], 'target_id': [], 'type': [], 'location': []}
for faculty_profile in faculty_profiles:
    if faculty_profile['google_scholar'] is None:
        continue

    profile_id = _scholar_user_id(faculty_profile['google_scholar'])
    profile_name = faculty_profile['full_name']

    for name, details in faculty_profile['co_authors'].items():
        co_author_id = _scholar_user_id(details['google_scholar'])
        # Remember NTU co-authors that were not registered in pass 1.
        if details['type'] == 'NTU' and co_author_id not in ntu_url:
            ntu_url[co_author_id] = name

        network['source'].append(profile_name)
        network['source_id'].append(profile_id)
        network['target'].append(name)
        network['target_id'].append(co_author_id)
        network['# of Collaboration'].append(details['# of Collaborations'])
        network['type'].append(details['type'])
        network['location'].append(details['location'])

# Read-modify-write so the keys already stored in profile.json are preserved.
with open('./scse/profile.json', 'r') as f:
    profile = json.load(f)

profile['Collaboration Network'] = network

with open('./scse/profile.json', 'w') as f:
    json.dump(profile, f)
            