In [1]:
import json, os, datetime
from tqdm import tqdm
import pandas as pd

In [2]:
# --- Faculty profile tables --------------------------------------------------
# Raw side: directory and table base name.
raw_dr_ntu_dir = "./raw_dr_ntu"
raw_faculty_db = "scse_profile"

# Processed side: directory plus base names (no file extension) of the tables.
process_faculty_db_dir = "./processed"
process_faculty_db = "scse_profile"
process_co_author_db = "google_scholar_co_author"

# --- Scraped inputs ----------------------------------------------------------
raw_google_scholar_dir = "./raw_google_scholar"
raw_google_search_dir = "./google_search"

# --- Output of this notebook -------------------------------------------------
# Created up front (idempotently) so later cells can write into it directly.
process_publications_dir = "./processed_google_scholar_publications"
os.makedirs(process_publications_dir, exist_ok=True)

# Calendar year at execution time (local clock).
cur_year = datetime.date.today().year

In [3]:
# Load the processed faculty table from "<process_faculty_db_dir>/<process_faculty_db>.csv".
faculty_csv_path = os.path.join(process_faculty_db_dir, f"{process_faculty_db}.csv")
faculties = pd.read_csv(faculty_csv_path)

# Keep only the rows that have a value in the 'google_scholar' column.
google_scholar_faculties = faculties.dropna(subset=['google_scholar'])

In [4]:
def _clean_publication(profile_pub, pubs, last_year):
    """Build the cleaned record for one publication listed on a profile.

    Parameters
    ----------
    profile_pub : dict
        One entry of the profile's 'publications' list; must provide
        'title_link' and 'title'.
    pubs : dict
        Scraped publication details keyed by the publication's title link.
        Each value may contain 'authors', 'description', 'publication_date'
        and 'total_citations' (a dict keyed by year as a string, with
        summable counts as values).
    last_year : int
        Last year to include in the per-year citation series; publication
        years later than this are treated as unknown.

    Returns
    -------
    dict or None
        The cleaned record, or None when no details were scraped for this
        publication (its link is not a key of ``pubs``).

    Raises
    ------
    KeyError
        If ``profile_pub`` lacks 'title_link' or 'title'.
    ValueError
        If the leading component of 'publication_date' is not an integer.
    """
    link = profile_pub['title_link']
    title = profile_pub['title']

    # Details not scraped
    if link not in pubs:
        return None
    pub_details = pubs[link]

    # The year is taken from the first '/'-separated component of the date
    # (presumably 'YYYY/M/D' -- TODO confirm against the scraper output).
    publication_year = None
    if 'publication_date' in pub_details:
        publication_year = int(pub_details['publication_date'].split('/')[0])
        if publication_year > last_year:
            # A year in the future cannot be right; treat it as unknown.
            publication_year = None

    citation_by_year = {'year': [], 'num_citations': []}
    if 'total_citations' in pub_details:
        # Read the citation dict from *this* publication on every path.
        # Previously the "no publication year" path used a variable that was
        # only assigned on the other path, so it either raised NameError or
        # reused the previous publication's citations.
        pub_citations = pub_details['total_citations']

        if publication_year is not None:
            for year in range(publication_year, last_year + 1):
                citation_by_year['year'].append(year)
                citation_by_year['num_citations'].append(pub_citations.get(str(year), 0))

        # Everything not attributed to a listed year goes into 'unknown'.
        # With no publication year, that is the full citation total.
        citation_by_year['year'].append('unknown')
        citation_by_year['num_citations'].append(
            sum(pub_citations.values()) - sum(citation_by_year['num_citations'])
        )

    return {
        'link': link,
        'title': title,
        'authors': pub_details.get('authors'),
        'description': pub_details.get('description'),
        'publication_year': publication_year,
        'citations_by_year': citation_by_year,
    }


# For every faculty member with a Google Scholar profile, merge the profile's
# publication list with the scraped per-publication details and write one
# cleaned JSON file per scholar id.
for _, row in tqdm(google_scholar_faculties.iterrows(), total=len(google_scholar_faculties)):
    # Named `scholar_id` rather than `id` so the builtin is not shadowed in
    # the notebook namespace.
    scholar_id = row['google_scholar_id']

    pub_file = f"{raw_google_scholar_dir}/{scholar_id}_publications.json"
    profile_file = f"{raw_google_scholar_dir}/{scholar_id}.json"

    with open(pub_file, 'r') as f:
        pubs = json.load(f)

    with open(profile_file, 'r') as f:
        profile = json.load(f)

    clean = []
    for profile_pub in profile['publications']:
        cleaned_pub = _clean_publication(profile_pub, pubs, cur_year)
        if cleaned_pub is not None:
            clean.append(cleaned_pub)

    with open(f"{process_publications_dir}/{scholar_id}.json", 'w') as f:
        json.dump(clean, f)

64it [00:00, 154.70it/s]
