In [14]:
import matplotlib
import pandas as pd
from habanero import Crossref 
from tqdm import tqdm
from collections import Counter
import pickle
from scholarly import scholarly

In [15]:
# Read the Bik dataset (tab-separated, ISO-8859-1 encoded) into a DataFrame,
# replace NaNs with 0 and keep only the first 214 rows (all other rows are empty).
# The trailing .copy() makes `table` an independent frame instead of a slice of
# the filled frame, so adding or assigning columns later does not trigger
# pandas' SettingWithCopyWarning.
table = (
    pd.read_csv('./bik_dataset.tsv', sep='\t', encoding="ISO-8859-1")
    .fillna(0)
    .iloc[0:214]
    .copy()
)

In [16]:
# Add empty placeholder columns for the affiliation categories and the publisher.
# The columns are later filled with strings, so they are initialised with None
# (object dtype). Initialising with pd.NaT would create datetime64 columns, and
# writing strings into those is deprecated in recent pandas releases.
affiliation_categories = ['department', 'university', 'city', 'state', 'country']
for cat in affiliation_categories:
    table[cat] = None
table['publisher'] = None

In [17]:
# Load the cached Crossref results if present; otherwise query the Crossref API
# for every DOI in the table and cache both the results and the failed DOIs.
# NOTE(review): pickle.load can execute arbitrary code from the file -- only
# load pickle caches that were produced by this notebook.
try:
    with open('res_list.pickle', 'rb') as handle:
        res_list = pickle.load(handle)
except OSError:  # cache not readable: rebuild it from the Crossref API
    cr = Crossref()
    res_list = []
    failed_dois = []
    for doi in tqdm(table['DOI']):
        try:
            res = cr.works(ids=doi)
            res_list.append((doi, res['message']))
        except Exception as err:
            # Best effort: report the error, remember the DOI, and keep going.
            print(err)
            failed_dois.append(doi)
    with open('res_list.pickle', 'wb') as handle:
        pickle.dump(res_list, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open('failed_dois.pickle', 'wb') as handle:
        pickle.dump(failed_dois, handle, protocol=pickle.HIGHEST_PROTOCOL)
else:
    # Cache hit: restore failed_dois as well so the name is defined on either
    # path (a fresh kernel with an existing cache previously left it undefined).
    try:
        with open('failed_dois.pickle', 'rb') as handle:
            failed_dois = pickle.load(handle)
    except OSError:
        failed_dois = []

In [18]:
# Use the output from the Crossref API to fill in the affiliation and publisher
# columns of the table.
#
# For each record the last author's first listed affiliation is preferred,
# falling back to the first author's. The affiliation string is split on commas
# and the pieces are assigned, in order, to `affiliation_categories`. When
# neither author has an affiliation, the last author's name is collected in
# `no_affiliation_authors` for a later lookup.
no_affiliation_authors = []
for doi, data in res_list:
    row_mask = table['DOI'] == doi  # rows of the table belonging to this DOI
    authors = data.get('author') or []  # tolerate records without an author list

    # Candidates in priority order: last author, then first author.
    # .get() guards against author entries that omit the 'affiliation' key.
    target_author = next(
        (person for person in authors[-1:] + authors[:1] if person.get('affiliation')),
        None,
    )

    if target_author is not None:
        affiliation_breakdown = target_author['affiliation'][0]['name'].split(',')
        # zip stops at the shorter sequence, so extra categories stay unset and
        # extra comma-separated pieces are ignored.
        for cat, val in zip(affiliation_categories, affiliation_breakdown):
            table.loc[row_mask, cat] = val.strip()
    elif authors:
        last_author = authors[-1]
        # 'given' / 'family' may be absent on an author entry; presumably
        # organisational authors carry 'name' instead -- TODO confirm against
        # the Crossref metadata format.
        name_parts = [last_author.get('given'), last_author.get('family')]
        author_name = ' '.join(p for p in name_parts if p) or last_author.get('name')
        if author_name:
            no_affiliation_authors.append(author_name)

    table.loc[row_mask, 'publisher'] = data['publisher']

In [19]:
# Use the Google Scholar API (scholarly) to look up authors whose affiliation
# Crossref did not provide. Only the first 10 authors are queried here.
# Caveats: the first search result may not be the right author, and some Asian
# names are written family name first rather than given name first.
author_dict = {}
for author in tqdm(no_affiliation_authors[:10]):
    try:
        search_query = scholarly.search_author(author)
        first_author_result = next(search_query)
        author_dict[author] = scholarly.fill(first_author_result)
        print(f'FOUND SOME RESULTS FOR {author}')
    except StopIteration:
        # The search returned no results for this name.
        print(f'FAILED TO FIND {author}')
    except Exception as err:
        # Any other lookup error: report it instead of silently treating it as
        # "not found". KeyboardInterrupt / SystemExit are deliberately not
        # caught so the loop can still be interrupted.
        print(f'FAILED TO FIND {author} ({type(err).__name__}: {err})')

 10%|█         | 1/10 [00:01<00:14,  1.60s/it]

FAILED TO FIND Hermann Ammer


 20%|██        | 2/10 [00:11<00:50,  6.27s/it]

FOUND SOME RESULTS FOR Susan K. Dutcher


 30%|███       | 3/10 [00:20<00:53,  7.65s/it]

FOUND SOME RESULTS FOR Lucille London


 40%|████      | 4/10 [00:21<00:30,  5.10s/it]

FAILED TO FIND Min-Jean Yin


 50%|█████     | 5/10 [00:27<00:27,  5.49s/it]

FOUND SOME RESULTS FOR Huangui Xiong


 60%|██████    | 6/10 [01:25<01:32, 23.09s/it]

FOUND SOME RESULTS FOR Yue Yang


 70%|███████   | 7/10 [01:32<00:53, 17.85s/it]

FOUND SOME RESULTS FOR Renato Morona


 80%|████████  | 8/10 [01:53<00:37, 18.96s/it]

FOUND SOME RESULTS FOR Francesco P. Schena


 90%|█████████ | 9/10 [02:03<00:16, 16.26s/it]

FOUND SOME RESULTS FOR Zhiwei Wang


100%|██████████| 10/10 [02:04<00:00, 12.50s/it]

FAILED TO FIND Ruth Gabizon



