In [1]:
import os, json
import datetime
import pandas as pd
from streamlit_extras.card import card
from tqdm import tqdm

In [2]:
# --- Path configuration -------------------------------------------------
# Directory / base-name constants shared by the cells below. Only
# process_faculty_db_dir, process_faculty_db and profile_dir are referenced
# by the cells visible in this notebook section; the rest are kept unchanged.

# Raw DR-NTU scrape location and faculty table base name.
raw_dr_ntu_dir = './raw_dr_ntu'
raw_faculty_db = 'scse_profile'

# Processed tables: '<process_faculty_db_dir>/<process_faculty_db>.csv' is
# the faculty CSV loaded by the next cell.
process_faculty_db_dir = './processed'
process_faculty_db = 'scse_profile'
process_co_author_db = 'google_scholar_co_author'

# Raw Google Scholar / Google Search dumps.
raw_google_scholar_dir = './raw_google_scholar'
raw_google_search_dir = './google_search'

process_publications_dir = './processed_google_scholar_publications'

research_interest_dir = './research_interest'

education_output_dir = './dr_ntu_education'

# Per-faculty '<dr_ntu_id>.json' profiles are read from here, and the
# aggregated 'scse.json' is written here.
profile_dir = './profile'


In [3]:
# Load the processed faculty table, then keep only the rows whose
# 'google_scholar' column is populated; the aggregation cells below iterate
# over that subset.
faculty_csv_path = os.path.join(process_faculty_db_dir, process_faculty_db+'.csv')
faculties = pd.read_csv(faculty_csv_path)

has_google_scholar = faculties['google_scholar'].notna()
google_scholar_faculties = faculties[has_google_scholar]

In [4]:
# Sum publications and citations per year over every faculty member that has
# a Google Scholar entry, then write the result to '<profile_dir>/scse.json'
# under the 'by_year' key (this cell creates/overwrites that file).
cur_year = datetime.datetime.now().year
total_pubs_by_year = {}
total_citations_by_year = {}

# Sentinel that sorts after any real year. Years are kept as strings here
# (the totals are looked up with str(year) below), so this is a string
# comparison -- assumes all numeric years have the same number of digits.
min_year = str(cur_year+2)

for _, faculty_row in tqdm(google_scholar_faculties.iterrows(), total=len(google_scholar_faculties)):
    dr_ntu_id = faculty_row['dr_ntu_id']
    profile_file = f"{profile_dir}/{dr_ntu_id}.json"

    with open(profile_file, 'r') as f:
        profile = json.load(f)
    citations_by_year = pd.DataFrame(profile['citations_by_year'])
    pubs_by_year = pd.DataFrame(profile['published_by_year'])

    # Only numeric years take part in the minimum. This skips the 'unknown'
    # bucket and avoids comparing a string with NaN when a profile has no
    # publication rows at all.
    known_years = [year for year in pubs_by_year['Year'] if str(year).isdigit()]
    if known_years:
        min_year = min(min_year, min(known_years))

    # Distinct loop variable names: the original inner loops reused the outer
    # loop's `i, row`, clobbering them mid-iteration.
    for year, n_citations in zip(citations_by_year['Year'], citations_by_year['# of Citations']):
        total_citations_by_year[year] = total_citations_by_year.get(year, 0) + n_citations
    for year, n_pubs in zip(pubs_by_year['Year'], pubs_by_year['# of Publications']):
        total_pubs_by_year[year] = total_pubs_by_year.get(year, 0) + n_pubs

# Emit one row per calendar year from the earliest publication year up to and
# including the current year; years with no data are reported as 0.
final = {'Year':[], '# of Publications': [], '# of Citations': []}
for year in range(int(min_year), cur_year+1):
    final['Year'].append(str(year))
    final['# of Publications'].append(total_pubs_by_year.get(str(year),0))
    final['# of Citations'].append(total_citations_by_year.get(str(year),0))

# Append the 'unknown' bucket when any profile contributed to it. This must
# test the aggregated totals: `'unknown' in pubs_by_year` would only check the
# column labels of the last faculty member's DataFrame, which never match.
if 'unknown' in total_pubs_by_year or 'unknown' in total_citations_by_year:
    final['Year'].append('unknown')
    final['# of Publications'].append(total_pubs_by_year.get('unknown',0))
    final['# of Citations'].append(total_citations_by_year.get('unknown',0))


scse_profile = {'by_year': final}
with open(f'{profile_dir}/scse.json', 'w') as f:
    json.dump(scse_profile, f)

0it [00:00, ?it/s]

64it [00:00, 505.15it/s]


# Total number of faculty

In [5]:
# Count the rows of the processed faculty table and store that count on the
# school-level profile under '# of Faculty' (read-modify-write of scse.json).
scse = pd.read_csv(f'{process_faculty_db_dir}/{process_faculty_db}.csv')
total_profile = len(scse.index)

with open(f'{profile_dir}/scse.json', 'r') as existing_json:
    profile = json.load(existing_json)

profile.update({'# of Faculty': total_profile})

with open(f'{profile_dir}/scse.json', 'w') as updated_json:
    json.dump(profile, updated_json)

# Top Faculty

In [6]:
# Build one summary row per Google Scholar faculty member (publication count,
# citation count, average citations per publication, h-index) and store the
# table in scse.json under 'Top Faculty'.
top_faculty = {'Name': [], '# of Publications': [], '# of Citations': [], 'Avg Citations per Publication': [],'h-index': []}

for _, faculty_row in tqdm(google_scholar_faculties.iterrows(), total=len(google_scholar_faculties)):
    dr_ntu_id = faculty_row['dr_ntu_id']
    profile_file = f"{profile_dir}/{dr_ntu_id}.json"

    with open(profile_file, 'r') as f:
        profile = json.load(f)

    total_publications = sum(profile['published_by_year']['# of Publications'])
    total_citations = sum(profile['citations_by_year']['# of Citations'])
    # A profile with no publications would otherwise raise ZeroDivisionError
    # and abort the whole cell; report an average of 0.0 for it instead.
    if total_publications:
        avg_citations = total_citations / total_publications
    else:
        avg_citations = 0.0
    h_index = profile['all_time_h_index']

    # The display name comes from the profile JSON, not from the CSV row.
    name = profile['full_name']

    top_faculty['Name'].append(name)
    top_faculty['# of Publications'].append(total_publications)
    top_faculty['# of Citations'].append(total_citations)
    top_faculty['Avg Citations per Publication'].append(avg_citations)
    top_faculty['h-index'].append(h_index)

# Read-modify-write so the keys written by earlier cells are preserved.
with open(f'{profile_dir}/scse.json', 'r') as f:
    profile = json.load(f)

profile['Top Faculty'] = top_faculty

with open(f'{profile_dir}/scse.json', 'w') as f:
    json.dump(profile, f)

64it [00:00, 1396.46it/s]


# Collaboration Network

In [7]:
# Concatenate every faculty member's 'collaboration_network' lists into one
# school-wide edge table and store it in scse.json under
# 'Collaboration Network'.
edge_columns = ('source', 'target', 'source_id', 'target_id', 'type', 'location', 'year')
network = {column: [] for column in edge_columns}
cur_year = datetime.datetime.now().year

for _, faculty in tqdm(google_scholar_faculties.iterrows()):
    name, google_scholar_id, dr_ntu_id = faculty['full_name'], faculty['google_scholar_id'], faculty['dr_ntu_id']

    with open(f"{profile_dir}/{dr_ntu_id}.json", 'r') as profile_json:
        profile = json.load(profile_json)

    co_authors = profile['collaboration_network']
    # The faculty member is the source of every edge, so their name and id
    # are repeated once per entry in the 'target' list.
    edge_count = len(co_authors['target'])
    network['source'].extend([name] * edge_count)
    network['source_id'].extend([google_scholar_id] * edge_count)
    # The remaining columns are copied across under the same key.
    for column in ('target', 'target_id', 'type', 'location', 'year'):
        network[column].extend(co_authors[column])

# Read-modify-write so the keys already present in scse.json are kept.
with open(f'{profile_dir}/scse.json', 'r') as existing_json:
    profile = json.load(existing_json)

profile['Collaboration Network'] = network

with open(f'{profile_dir}/scse.json', 'w') as updated_json:
    json.dump(profile, updated_json)
            

64it [00:00, 1379.19it/s]


# Top Publications

In [8]:
# Concatenate every faculty member's 'publications' lists into one table and
# store it in scse.json under 'All Publications'.

# Output column -> key of the per-faculty 'publications' dict it is filled from.
publication_fields = {
    'title': 'Title',
    'link': 'Link',
    'total_citations': '# of Citations',
    'publication_year': 'Publication Year',
    'topic': 'Topic',
}
top_pubs = {'name': []}
for output_column in publication_fields:
    top_pubs[output_column] = []
cur_year = datetime.datetime.now().year

for _, faculty in tqdm(google_scholar_faculties.iterrows()):
    name, google_scholar_id, dr_ntu_id = faculty['full_name'], faculty['google_scholar_id'], faculty['dr_ntu_id']

    with open(f"{profile_dir}/{dr_ntu_id}.json", 'r') as profile_json:
        profile = json.load(profile_json)
    pubs = profile['publications']

    # Repeat the faculty name once per entry in the 'Title' list.
    top_pubs['name'].extend([name] * len(pubs['Title']))
    for output_column, profile_key in publication_fields.items():
        top_pubs[output_column].extend(pubs[profile_key])

# Read-modify-write so the keys already present in scse.json are kept.
with open(f'{profile_dir}/scse.json', 'r') as existing_json:
    profile = json.load(existing_json)

profile['All Publications'] = top_pubs

with open(f'{profile_dir}/scse.json', 'w') as updated_json:
    json.dump(profile, updated_json)


64it [00:00, 1250.44it/s]
