In [77]:
import json
import os
import datetime

In [78]:
# Collect the paths of the Google Scholar publication dumps in ./raw_data.
# NOTE: `dir` shadows the builtin of the same name; the name is kept because
# it is a notebook-level variable.
dir = './raw_data'
google_scholar_pub_files = [
    os.path.join(dir, name)
    for name in os.listdir(dir)
    # Cheap substring checks first, the filesystem check last.
    if 'google_scholar' in name
    and 'publication' in name
    and os.path.isfile(os.path.join(dir, name))
]

In [79]:
# Build the union of every field name that appears on any publication record.
# Each file is loaded as a JSON object whose values are the per-publication dicts.
google_scholar_pubs_keys = set()

for pub_path in google_scholar_pub_files:
    with open(pub_path, 'r') as handle:
        records = json.load(handle)
    for record in records.values():
        google_scholar_pubs_keys.update(record.keys())
len(google_scholar_pubs_keys)

19

In [80]:
# Display the collected field names (the bare expression is the cell's output).
google_scholar_pubs_keys

{'application_number',
 'authors',
 'book',
 'conference',
 'description',
 'external_link',
 'institution',
 'inventors',
 'issue',
 'journal',
 'pages',
 'patent_number',
 'patent_office',
 'publication_date',
 'publisher',
 'report_number',
 'source',
 'total_citations',
 'volume'}

# No. of publications by year

In [81]:
# Count publications per year for each profile and store the counts under
# 'published_by_year' in the profile's JSON file in ./processed_data.
unknowns = 0  # running total, across all files, of publications without a usable year

for file in google_scholar_pub_files:
    with open(file, 'r') as f:
        pubs = json.load(f)
    years = {}
    # min_year stays at +inf until a publication with a date is seen.
    min_year, max_year = float('inf'), datetime.datetime.now().year
    for v in pubs.values():
        if 'publication_date' in v:
            # The year is the first '/'-separated field of the date string.
            publication_year = int(v['publication_date'].split('/')[0])
            min_year = min(publication_year, min_year)
            if publication_year > max_year:
                # A year later than the current one is counted as unknown.
                publication_year = 'unknown'
                unknowns += 1
        else:
            publication_year = 'unknown'
            unknowns += 1
        years[publication_year] = years.get(publication_year, 0) + 1

    # Zero-fill the years in between that have no publications. The guard
    # skips the fill when no dated publication was found: min_year is then
    # still float('inf'), which range() rejects with a TypeError.
    if min_year <= max_year:
        for year in range(min_year, max_year + 1):
            years.setdefault(year, 0)

    # os.path.basename copes with whichever separator os.path.join produced,
    # unlike splitting on '/'. The [15:-18] slice presumably strips a
    # 'google_scholar_' prefix and a '_publications.json' suffix (the lengths
    # match) -- TODO confirm against the raw file naming.
    profile_path = f'./processed_data/{os.path.basename(file)[15:-18]}.json'

    with open(profile_path, 'r') as f:
        profile = json.load(f)
    profile['published_by_year'] = years

    with open(profile_path, 'w') as f:
        json.dump(profile, f)
unknowns

636

# \# of Citations by Year

In [96]:
def h_index(citations):
    """Return the h-index of an iterable of per-publication citation counts.

    The counts are ranked from highest to lowest; the result is the number
    of positions whose count is at least its 1-based rank. An empty input
    yields 0.
    """
    ranked = sorted(citations, reverse=True)
    h = 0
    for rank, count in enumerate(ranked, start=1):
        if count >= rank:
            h += 1
    return h

In [102]:
# Aggregate, per profile, the citations received in each calendar year and
# store them under 'citations_by_year' in the profile's JSON file in
# ./processed_data.
dir = './raw_data'
google_scholar_pub_files = [os.path.join(dir, f) for f in os.listdir(dir) if os.path.isfile(os.path.join(dir, f)) and 'google_scholar' in f and 'publication' in f]

for file in google_scholar_pub_files:
    with open(file, 'r') as f:
        pubs = json.load(f)
    years = {}
    # min_year stays at +inf until a publication with both fields is seen.
    min_year, max_year = float('inf'), datetime.datetime.now().year
    for v in pubs.values():
        if 'publication_date' in v and 'total_citations' in v:
            publication_year = int(v['publication_date'].split('/')[0])
            min_year = min(publication_year, min_year)
            # Mapping of citation year -> number of citations in that year.
            pub_citations = v['total_citations']

            if publication_year > max_year:
                # Publication year lies in the future: file all of its
                # citations under 'unknown' and stop here. Without the
                # `continue` they would be counted a second time by the
                # per-year loop below.
                years['unknown'] = years.get('unknown', 0) + sum(pub_citations.values())
                continue

            for year, cites in pub_citations.items():
                if int(year) <= max_year:
                    years[int(year)] = years.get(int(year), 0) + cites
                else:
                    # Citation year in the future.
                    years['unknown'] = years.get('unknown', 0) + cites

    # Emit every year from the earliest publication year to the current one,
    # using 0 for years without citations. The guard skips this when no
    # publication qualified: min_year is then still float('inf'), which
    # range() rejects with a TypeError.
    final = {'Year': [], '# of Citations': []}
    if min_year <= max_year:
        for year in range(min_year, max_year + 1):
            final['Year'].append(year)
            final['# of Citations'].append(years.get(year, 0))

    if 'unknown' in years:
        # Label spelled 'unknown' (was the typo 'unkown').
        final['Year'].append('unknown')
        final['# of Citations'].append(years['unknown'])

    # os.path.basename copes with whichever separator os.path.join produced,
    # unlike splitting on '/'. The [15:-18] slice presumably strips a
    # 'google_scholar_' prefix and a '_publications.json' suffix (the lengths
    # match) -- TODO confirm against the raw file naming.
    profile_path = f'./processed_data/{os.path.basename(file)[15:-18]}.json'

    with open(profile_path, 'r') as f:
        profile = json.load(f)
    profile['citations_by_year'] = final

    with open(profile_path, 'w') as f:
        json.dump(profile, f)

# All time h-index & i10-index

In [98]:
# Compute the all-time h-index and i10-index per profile and store them under
# 'all_time_h_index' / 'all_time_i10_index' in the profile's JSON file in
# ./processed_data. Relies on h_index() being defined earlier in the notebook.
dir = './raw_data'
google_scholar_pub_files = [os.path.join(dir, f) for f in os.listdir(dir) if os.path.isfile(os.path.join(dir, f)) and 'google_scholar' in f and 'publication' in f]

for file in google_scholar_pub_files:
    with open(file, 'r') as f:
        pubs = json.load(f)
    citations = []  # one citation total per qualifying publication
    max_year = datetime.datetime.now().year
    for v in pubs.values():
        if 'publication_date' in v and 'total_citations' in v:
            publication_year = int(v['publication_date'].split('/')[0])
            if publication_year > max_year:
                # Publication year lies in the future: skip the record.
                continue
            pub_citations = v['total_citations']
            # Only citations from the publication year up to the current year
            # are counted; the per-year counts are keyed by the year as a string.
            total_citations = sum(
                pub_citations.get(str(year), 0)
                for year in range(publication_year, max_year + 1)
            )
            citations.append(total_citations)

    all_time_h_index = h_index(citations)
    # i10-index: number of publications with at least 10 citations.
    all_time_i10_index = sum(1 for count in citations if count >= 10)

    # os.path.basename copes with whichever separator os.path.join produced,
    # unlike splitting on '/'. The [15:-18] slice presumably strips a
    # 'google_scholar_' prefix and a '_publications.json' suffix (the lengths
    # match) -- TODO confirm against the raw file naming.
    profile_path = f'./processed_data/{os.path.basename(file)[15:-18]}.json'

    with open(profile_path, 'r') as f:
        profile = json.load(f)
    profile['all_time_h_index'] = all_time_h_index
    profile['all_time_i10_index'] = all_time_i10_index

    with open(profile_path, 'w') as f:
        json.dump(profile, f)

# h-index by publication year

In [103]:
# For each profile, compute the h-index of the publications grouped by their
# publication year and store it under 'h_index_by_publication_year' in the
# profile's JSON file in ./processed_data. Relies on h_index() being defined
# earlier in the notebook.
dir = './raw_data'
google_scholar_pub_files = [os.path.join(dir, f) for f in os.listdir(dir) if os.path.isfile(os.path.join(dir, f)) and 'google_scholar' in f and 'publication' in f]


for file in google_scholar_pub_files:
    with open(file, 'r') as f:
        pubs = json.load(f)
    years = {}  # publication year -> list of citation totals, one per publication
    # min_year stays at +inf until a publication with both fields is seen.
    min_year, max_year = float('inf'), datetime.datetime.now().year
    for v in pubs.values():
        if 'publication_date' in v and 'total_citations' in v:
            publication_year = int(v['publication_date'].split('/')[0])
            min_year = min(publication_year, min_year)

            if publication_year > max_year:
                # Publication year lies in the future: skip the record.
                continue
            # Total citations from the publication year up to the current
            # year; the per-year counts are keyed by the year as a string.
            pub_citations = v['total_citations']
            total_citations = sum(
                pub_citations.get(str(year), 0)
                for year in range(publication_year, max_year + 1)
            )
            years.setdefault(publication_year, []).append(total_citations)

    # Replace each year's list of totals by its h-index.
    years = {year: h_index(totals) for year, totals in years.items()}

    # Years in between with no publications get an h-index of 0. The guard
    # skips the fill when no publication qualified: min_year is then still
    # float('inf'), and int(float('inf')) raises OverflowError.
    if min_year <= max_year:
        for year in range(min_year, max_year + 1):
            years.setdefault(year, 0)

    # os.path.basename copes with whichever separator os.path.join produced,
    # unlike splitting on '/'. The [15:-18] slice presumably strips a
    # 'google_scholar_' prefix and a '_publications.json' suffix (the lengths
    # match) -- TODO confirm against the raw file naming.
    profile_path = f'./processed_data/{os.path.basename(file)[15:-18]}.json'

    with open(profile_path, 'r') as f:
        profile = json.load(f)
    profile['h_index_by_publication_year'] = years

    with open(profile_path, 'w') as f:
        json.dump(profile, f)

# h-index by year

In [85]:
# For each profile, compute the h-index as of every calendar year (based on
# the citations accumulated up to that year) and store it under
# 'h_index_by_year' in the profile's JSON file in ./processed_data. Relies on
# h_index() being defined earlier in the notebook.
dir = './raw_data'
google_scholar_pub_files = [os.path.join(dir, f) for f in os.listdir(dir) if os.path.isfile(os.path.join(dir, f)) and 'google_scholar' in f and 'publication' in f]


for file in google_scholar_pub_files:
    with open(file, 'r') as f:
        pubs = json.load(f)
    years = {}  # calendar year -> cumulative citation totals, one per publication
    max_year = datetime.datetime.now().year
    for v in pubs.values():
        if 'publication_date' in v and 'total_citations' in v:
            publication_year = int(v['publication_date'].split('/')[0])
            if publication_year > max_year:
                # Publication year lies in the future: skip the record.
                continue
            pub_citations = v['total_citations']
            total_citations = 0
            for year in range(publication_year, max_year + 1):
                # Running total of this publication's citations up to `year`;
                # the per-year counts are keyed by the year as a string.
                total_citations += pub_citations.get(str(year), 0)
                years.setdefault(year, []).append(total_citations)

    # Replace each year's list of cumulative totals by its h-index.
    years = {year: h_index(totals) for year, totals in years.items()}

    # os.path.basename copes with whichever separator os.path.join produced,
    # unlike splitting on '/'. The [15:-18] slice presumably strips a
    # 'google_scholar_' prefix and a '_publications.json' suffix (the lengths
    # match) -- TODO confirm against the raw file naming.
    profile_path = f'./processed_data/{os.path.basename(file)[15:-18]}.json'

    with open(profile_path, 'r') as f:
        profile = json.load(f)
    profile['h_index_by_year'] = years

    with open(profile_path, 'w') as f:
        json.dump(profile, f)

# h-index by year since publication

In [104]:
# For each profile and each publication-year cohort, compute the cohort's
# h-index as of every calendar year since publication, and store the result
# under 'h_index_by_years_from_publication_year' in the profile's JSON file in
# ./processed_data. Relies on h_index() being defined earlier in the notebook.
dir = './raw_data'
google_scholar_pub_files = [os.path.join(dir, f) for f in os.listdir(dir) if os.path.isfile(os.path.join(dir, f)) and 'google_scholar' in f and 'publication' in f]


for file in google_scholar_pub_files:
    with open(file, 'r') as f:
        pubs = json.load(f)
    # publication year -> {calendar year -> cumulative totals, one per publication}
    years = {}
    # min_year stays at +inf until a publication with both fields is seen.
    min_year, max_year = float('inf'), datetime.datetime.now().year
    for v in pubs.values():
        if 'publication_date' in v and 'total_citations' in v:
            publication_year = int(v['publication_date'].split('/')[0])
            min_year = min(min_year, publication_year)

            if publication_year > max_year:
                # Publication year lies in the future: skip the record.
                continue

            cur_year = years.setdefault(publication_year, {})
            pub_citations = v['total_citations']
            total_citations = 0
            for year in range(publication_year, max_year + 1):
                # Running total of this publication's citations up to `year`;
                # the per-year counts are keyed by the year as a string.
                total_citations += pub_citations.get(str(year), 0)
                cur_year.setdefault(year, []).append(total_citations)

    # One h-index series per publication year. Each series is left-padded with
    # None so that it lines up with final['Year'], which starts at min_year.
    final = {'Publication Year': [], 'Year': [], 'h-index': []}
    for publication_year, totals_by_year in years.items():
        final['Publication Year'].append(publication_year)
        leading_gap = [None] * (publication_year - min_year)
        # totals_by_year was filled in ascending year order, so its values
        # come out in that order too.
        final['h-index'].append(
            leading_gap + [h_index(totals) for totals in totals_by_year.values()]
        )

    # The guard leaves 'Year' empty when no publication qualified: min_year is
    # then still float('inf'), which range() rejects with a TypeError.
    if min_year <= max_year:
        final['Year'] = list(range(min_year, max_year + 1))

    # os.path.basename copes with whichever separator os.path.join produced,
    # unlike splitting on '/'. The [15:-18] slice presumably strips a
    # 'google_scholar_' prefix and a '_publications.json' suffix (the lengths
    # match) -- TODO confirm against the raw file naming.
    profile_path = f'./processed_data/{os.path.basename(file)[15:-18]}.json'

    with open(profile_path, 'r') as f:
        profile = json.load(f)
    profile['h_index_by_years_from_publication_year'] = final

    with open(profile_path, 'w') as f:
        json.dump(profile, f)