In [6]:
import json
import os
import datetime

In [7]:
# Directory that is scanned for Google Scholar publication files.
dir = './raw_data'

# Keep regular files whose name mentions both 'google_scholar' and
# 'publication'; each kept entry is stored as a path joined onto `dir`.
google_scholar_pub_files = [
    path
    for name, path in ((entry, os.path.join(dir, entry)) for entry in os.listdir(dir))
    if 'google_scholar' in name and 'publication' in name and os.path.isfile(path)
]

In [8]:
# Union of every field name that appears in any publication record.
google_scholar_pubs_keys = set()

for file in google_scholar_pub_files:
    with open(file, 'r') as handle:
        pubs = json.load(handle)
    # Each value of the loaded mapping is one publication record; fold its
    # keys into the running union.
    for record in pubs.values():
        google_scholar_pubs_keys.update(record.keys())

# Number of distinct field names seen across all files.
len(google_scholar_pubs_keys)

19

In [9]:
# Display the union of field names collected from all publication records.
google_scholar_pubs_keys

{'application_number',
 'authors',
 'book',
 'conference',
 'description',
 'external_link',
 'institution',
 'inventors',
 'issue',
 'journal',
 'pages',
 'patent_number',
 'patent_office',
 'publication_date',
 'publisher',
 'report_number',
 'source',
 'total_citations',
 'volume'}

# No. of publications by year

In [10]:
# Running count, across all files, of publications that carry no 'publication_date'.
unknowns = 0

for file in google_scholar_pub_files:
    with open(file, 'r') as f:
        pubs = json.load(f)

    # Publication count per year; undated publications are tallied under 'unknown'.
    years = {}
    min_year, max_year = float('inf'), datetime.datetime.now().year
    for v in pubs.values():
        if 'publication_date' in v:
            # The year is the first '/'-separated component of the date string.
            year = int(v['publication_date'].split('/')[0])
            min_year = min(year, min_year)
        else:
            year = 'unknown'
            unknowns += 1
        years[year] = years.get(year, 0) + 1

    # Zero-fill the years between the earliest publication and the current year
    # that have no publications. Skipped when no publication is dated:
    # min_year is then still float('inf'), which range() cannot accept.
    if min_year != float('inf'):
        for year in range(min_year, max_year + 1):
            years.setdefault(year, 0)

    # The slice drops 15 leading and 18 trailing characters of the file name.
    # NOTE(review): assumes names look like
    # 'google_scholar_<profile>_publications.json' -- confirm.
    # os.path.basename is used instead of splitting on '/' so the name is
    # extracted correctly whatever separator os.path.join produced.
    profile_path = f'./processed_data/{os.path.basename(file)[15:-18]}.json'

    with open(profile_path, 'r') as f:
        profile = json.load(f)
    profile['num_published'] = years

    with open(profile_path, 'w') as f:
        json.dump(profile, f)
unknowns

615

# h-index by publication year

In [16]:
def h_index(citations):
    """Return the h-index of an iterable of per-publication citation counts.

    The counts are ranked from highest to lowest, and the result is the
    number of entries whose count is at least their 1-based rank.
    An empty iterable yields 0.
    """
    ranked = sorted(citations, reverse=True)
    return sum(1 for rank, count in enumerate(ranked, start=1) if count >= rank)

In [41]:
dir = './raw_data'
google_scholar_pub_files = [os.path.join(dir, f) for f in os.listdir(dir) if os.path.isfile(os.path.join(dir, f)) and 'google_scholar' in f and 'publication' in f]


for file in google_scholar_pub_files:
    with open(file, 'r') as f:
        pubs = json.load(f)

    # Publication year -> list with one citation total per publication of that year.
    years = {}
    min_year, max_year = float('inf'), datetime.datetime.now().year
    for v in pubs.values():
        # Only publications with both a date and citation data contribute.
        if 'publication_date' in v and 'total_citations' in v:
            publication_year = int(v['publication_date'].split('/')[0])
            min_year = min(publication_year, min_year)
            # Sum the publication's citation counts over all recorded years.
            years.setdefault(publication_year, []).append(sum(v['total_citations'].values()))

    # Collapse each year's list of citation totals into that year's h-index.
    years = {year: h_index(totals) for year, totals in years.items()}

    # Zero-fill years up to the current year that have no qualifying
    # publication. Skipped when nothing qualified: min_year is then still
    # float('inf'), and int(float('inf')) raises OverflowError.
    if min_year != float('inf'):
        for year in range(min_year, max_year + 1):
            years.setdefault(year, 0)

    # The slice drops 15 leading and 18 trailing characters of the file name.
    # NOTE(review): assumes names look like
    # 'google_scholar_<profile>_publications.json' -- confirm.
    # os.path.basename is used instead of splitting on '/' so the name is
    # extracted correctly whatever separator os.path.join produced.
    profile_path = f'./processed_data/{os.path.basename(file)[15:-18]}.json'

    with open(profile_path, 'r') as f:
        profile = json.load(f)
    profile['h_index_by_publication_year'] = years

    with open(profile_path, 'w') as f:
        json.dump(profile, f)

# h-index over time

In [50]:
dir = './raw_data'
google_scholar_pub_files = [os.path.join(dir, f) for f in os.listdir(dir) if os.path.isfile(os.path.join(dir, f)) and 'google_scholar' in f and 'publication' in f]


for file in google_scholar_pub_files:
    with open(file, 'r') as f:
        pubs = json.load(f)

    # Calendar year -> cumulative citation totals, one entry per publication
    # whose publication year is on or before that calendar year.
    years = {}
    max_year = datetime.datetime.now().year
    for v in pubs.values():
        # Only publications with both a date and citation data contribute.
        if 'publication_date' in v and 'total_citations' in v:
            publication_year = int(v['publication_date'].split('/')[0])

            pub_citations = v['total_citations']
            total_citations = 0
            # NOTE(review): counts stored under years earlier than
            # publication_year are never added to the running total --
            # confirm this is intended.
            for year in range(publication_year, max_year + 1):
                # Citation counts are looked up by the year as a string.
                total_citations += pub_citations.get(str(year), 0)
                years.setdefault(year, []).append(total_citations)

    # Collapse each year's list of cumulative totals into the h-index as of that year.
    years = {year: h_index(totals) for year, totals in years.items()}

    # The slice drops 15 leading and 18 trailing characters of the file name.
    # NOTE(review): assumes names look like
    # 'google_scholar_<profile>_publications.json' -- confirm.
    # os.path.basename is used instead of splitting on '/' so the name is
    # extracted correctly whatever separator os.path.join produced.
    profile_path = f'./processed_data/{os.path.basename(file)[15:-18]}.json'

    with open(profile_path, 'r') as f:
        profile = json.load(f)
    profile['h-index_over_time'] = years

    with open(profile_path, 'w') as f:
        json.dump(profile, f)

# h-index by year since publication

In [43]:
dir = './raw_data'
google_scholar_pub_files = [os.path.join(dir, f) for f in os.listdir(dir) if os.path.isfile(os.path.join(dir, f)) and 'google_scholar' in f and 'publication' in f]


for file in google_scholar_pub_files:
    with open(file, 'r') as f:
        pubs = json.load(f)

    # Publication year -> {calendar year -> cumulative citation totals, one
    # entry per publication from that publication year}.
    years = {}
    print(file)
    max_year = datetime.datetime.now().year
    for v in pubs.values():
        # Only publications with both a date and citation data contribute.
        if 'publication_date' in v and 'total_citations' in v:
            publication_year = int(v['publication_date'].split('/')[0])

            cur_year = years.setdefault(publication_year, {})
            pub_citations = v['total_citations']
            total_citations = 0
            for year in range(publication_year, max_year + 1):
                # Citation counts are looked up by the year as a string.
                total_citations += pub_citations.get(str(year), 0)
                cur_year.setdefault(year, []).append(total_citations)

    # Collapse every list of cumulative totals into an h-index, keeping the
    # publication-year -> calendar-year nesting.
    years = {
        publication_year: {year: h_index(totals) for year, totals in by_year.items()}
        for publication_year, by_year in years.items()
    }

    # NOTE(review): the nested result is computed but never stored or displayed.
    # The disabled code below would write it under 'h_index_by_publication_year',
    # the same key the "h-index by publication year" cell writes, so re-enabling
    # it as-is would overwrite that value -- pick a distinct key first.
    # with open(f'./processed_data/{file.split("/")[-1][15:-18]}.json', 'r') as f:
    #     profile = json.load(f)
    #     profile['h_index_by_publication_year'] = years
    
    # with open(f'./processed_data/{file.split("/")[-1][15:-18]}.json', 'w') as f:
    #     json.dump(profile, f)

./raw_data/google_scholar_cham_tat_jen_publications.json
{1999: {1999: 2, 2000: 4, 2001: 4, 2002: 4, 2003: 4, 2004: 4, 2005: 4, 2006: 4, 2007: 4, 2008: 4, 2009: 4, 2010: 4, 2011: 4, 2012: 4, 2013: 4, 2014: 4, 2015: 4, 2016: 4, 2017: 4, 2018: 4, 2019: 4, 2020: 4, 2021: 4, 2022: 4, 2023: 4}, 2019: {2019: 3, 2020: 3, 2021: 4, 2022: 4, 2023: 4}, 2015: {2015: 3, 2016: 3, 2017: 4, 2018: 5, 2019: 5, 2020: 5, 2021: 5, 2022: 6, 2023: 6}, 2007: {2007: 2, 2008: 3, 2009: 3, 2010: 3, 2011: 3, 2012: 4, 2013: 4, 2014: 4, 2015: 4, 2016: 4, 2017: 4, 2018: 4, 2019: 4, 2020: 4, 2021: 5, 2022: 5, 2023: 5}, 2018: {2018: 1, 2019: 2, 2020: 3, 2021: 4, 2022: 4, 2023: 4}, 2001: {2001: 1, 2002: 3, 2003: 4, 2004: 5, 2005: 5, 2006: 5, 2007: 5, 2008: 5, 2009: 5, 2010: 5, 2011: 5, 2012: 5, 2013: 5, 2014: 5, 2015: 5, 2016: 5, 2017: 5, 2018: 5, 2019: 5, 2020: 5, 2021: 5, 2022: 5, 2023: 5}, 2006: {2006: 2, 2007: 2, 2008: 3, 2009: 3, 2010: 3, 2011: 4, 2012: 4, 2013: 4, 2014: 4, 2015: 4, 2016: 4, 2017: 4, 2018: 4, 2019:

In [28]:
# NOTE(review): `cited` is not defined in any cell visible here, so this cell
# relies on kernel state from elsewhere -- confirm where it is built.
h_index(cited)

64

In [29]:
# NOTE(review): `cited` is not defined in any cell visible here, so this cell
# relies on kernel state from elsewhere -- confirm where it is built.
len(cited)

351