In [1]:
# enable IPython's autoreload extension; mode 2 reloads all imported modules before
# each cell is executed, so edits to library source are picked up without a kernel restart
%load_ext autoreload
%autoreload 2

In [2]:

import pyscisci.all as pyscisci

import os
import pandas as pd
import numpy as np
import matplotlib.pylab as plt

%matplotlib inline

# set this path to where the MAG database is stored
# (it can be overridden without editing the notebook by setting the MAG_PATH
#  environment variable; otherwise the default location below is used)
path2mag = os.environ.get('MAG_PATH', '/users/hgt6rn/Documents/DataSets/MAG')

# handle on the MAG database used by every cell below
# NOTE(review): keep_in_memory=False presumably makes pyscisci re-read from disk
# instead of caching loaded tables -- confirm against the pyscisci documentation
mymag = pyscisci.MAG(path2mag, database_extension='hdf', keep_in_memory=False)

In [3]:
# Analyses are often restricted to a pre-defined set of publications.
# Here we read a small example subset from a CSV file.
example_csv = '../example_data/focus_publications_example.csv'
publication_list = pd.read_csv(example_csv)
publication_list

Unnamed: 0,PublicationId,Year,Doi,Title
0,2607159947,2017.0,10.1016/J.JOI.2017.04.002,the time dimension of science connecting the p...
1,3106261317,2017.0,10.1016/J.JOI.2017.10.001,the research production of nations and departm...
2,3106392307,2017.0,10.1016/J.JOI.2017.04.004,do mathematicians economists and biomedical sc...
3,2593672369,2017.0,10.1016/J.JOI.2017.02.007,the impact of collaboration and knowledge netw...
4,2722577421,2017.0,10.1016/J.JOI.2017.05.021,when social scientists disagree comments on th...
...,...,...,...,...
102,2745510841,2017.0,10.1016/J.JOI.2017.05.008,counting citations generalizing the perry reny...
103,2560973443,2017.0,10.1016/J.JOI.2016.11.006,partial orders for zero sum arrays with applic...
104,2563792373,2017.0,10.1016/J.JOI.2016.11.008,can we use google scholar to identify highly c...
105,2550521721,2017.0,10.1016/J.JOI.2016.10.008,the brazilian scientific output published in j...


# Filtering when you already have the list of Publication Ids

In [4]:
# When the publication ids are already known, the full publication records can be
# fetched directly: hand the ids to load_publications through a filter_dict.

# NOTE: the ids must be passed in SORTED order!
sorted_publication_ids = np.sort(publication_list['PublicationId'].values)
full_pub_info = mymag.load_publications(filter_dict={'PublicationId': sorted_publication_ids})
full_pub_info

Loading Publications:   0%|          | 0/132 [00:00<?, ?it/s]

Unnamed: 0,PublicationId,Year,JournalId,FamilyId,Doi,Title,Date,Volume,Issue,FirstPage,LastPage,DocSubTypes,DocType
133899,2607159947,2017.0,205292342.0,2.607160e+09,10.1016/J.JOI.2017.04.002,the time dimension of science connecting the p...,2017-05-01,11,2,608,621,,j
290226,3106261317,2017.0,205292342.0,3.106261e+09,10.1016/J.JOI.2017.10.001,the research production of nations and departm...,2017-11-01,11,4,1142,1157,,j
394705,3106392307,2017.0,205292342.0,3.106392e+09,10.1016/J.JOI.2017.04.004,do mathematicians economists and biomedical sc...,2017-05-01,11,2,598,607,,j
286538,2593672369,2017.0,205292342.0,,10.1016/J.JOI.2017.02.007,the impact of collaboration and knowledge netw...,2017-05-01,11,2,407,422,,j
1828962,2722577421,2017.0,205292342.0,,10.1016/J.JOI.2017.05.021,when social scientists disagree comments on th...,2017-08-01,11,3,937,940,,j
...,...,...,...,...,...,...,...,...,...,...,...,...,...
531419,2745510841,2017.0,205292342.0,,10.1016/J.JOI.2017.05.008,counting citations generalizing the perry reny...,2017-08-01,11,3,685,688,,j
1657627,2560973443,2017.0,205292342.0,,10.1016/J.JOI.2016.11.006,partial orders for zero sum arrays with applic...,2017-02-01,11,1,257,274,,j
1754755,2563792373,2017.0,205292342.0,2.563792e+09,10.1016/J.JOI.2016.11.008,can we use google scholar to identify highly c...,2017-02-01,11,1,152,163,,j
1756548,2550521721,2017.0,205292342.0,2.724689e+09,10.1016/J.JOI.2016.10.008,the brazilian scientific output published in j...,2017-02-01,11,1,18,31,,j


# Filtering when you have DOIs

In [5]:
# getting the full publication information when we have the publication DOIs is similarly easy:
# just pass the DOI list into the load function using a filter_dict

# rows without a DOI are dropped first: a missing value (NaN) mixed in with the DOI strings
# cannot be ordered by np.sort and would raise a TypeError
known_dois = publication_list['Doi'].dropna().values

# but make sure the list of DOIs is SORTED!!!
full_pub_info = mymag.load_publications(filter_dict={'Doi': np.sort(known_dois)})
full_pub_info

Loading Publications:   0%|          | 0/132 [00:00<?, ?it/s]

Unnamed: 0,PublicationId,Year,JournalId,FamilyId,Doi,Title,Date,Volume,Issue,FirstPage,LastPage,DocSubTypes,DocType
133898,3106485173,2017.0,2.595727e+09,2.607160e+09,10.1016/J.JOI.2017.04.002,the time dimension of science connecting the p...,2017-04-15,,,,,,r
133899,2607159947,2017.0,2.052923e+08,2.607160e+09,10.1016/J.JOI.2017.04.002,the time dimension of science connecting the p...,2017-05-01,11,2,608,621,,j
290225,2766136890,2017.0,2.595119e+09,3.106261e+09,10.1016/J.JOI.2017.10.001,the research production of nations and departm...,2017-11-23,,,,,,r
290226,3106261317,2017.0,2.052923e+08,3.106261e+09,10.1016/J.JOI.2017.10.001,the research production of nations and departm...,2017-11-01,11,4,1142,1157,,j
394705,3106392307,2017.0,2.052923e+08,3.106392e+09,10.1016/J.JOI.2017.04.004,do mathematicians economists and biomedical sc...,2017-05-01,11,2,598,607,,j
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1657627,2560973443,2017.0,2.052923e+08,,10.1016/J.JOI.2016.11.006,partial orders for zero sum arrays with applic...,2017-02-01,11,1,257,274,,j
1754755,2563792373,2017.0,2.052923e+08,2.563792e+09,10.1016/J.JOI.2016.11.008,can we use google scholar to identify highly c...,2017-02-01,11,1,152,163,,j
1754756,3105977400,2018.0,2.595119e+09,2.563792e+09,10.1016/J.JOI.2016.11.008,can we use google scholar to identify highly c...,2018-04-27,,,,,,r
1756548,2550521721,2017.0,2.052923e+08,2.724689e+09,10.1016/J.JOI.2016.10.008,the brazilian scientific output published in j...,2017-02-01,11,1,18,31,,j


Notice that we now get 131 publications! This is because a preprint and the journal article may share a DOI.
You will have to decide how to handle this duplication depending on your application.

# Filtering when all you have are the Publication Titles and Years

In [None]:
# Title matching is the most computationally expensive option, and it benefits greatly
# from extra information -- here we will additionally use the publication year.

# Start by loading the publication information from the database,
# keeping only the journal articles (DocType 'j').
journal_doc_types = np.sort(['j'])
pub = mymag.load_publications(filter_dict={'DocType': journal_doc_types})
pub

In [7]:
# pyscisci performs this matching through its align_publications function
matched_publication_idx = pyscisci.align_publications(
    df1=publication_list,           # the information for your own subset of publications
    df2=pub,                        # the publication information from the database
    columns2match_exact=['Year'],   # columns that have to agree exactly
    column2match_approx='Title',    # column that is matched approximately (fuzzy)
    ntop=1,                         # 1 keeps only the best match; ntop>1 also includes near matches
    cosine_lower_bound=0.75,        # threshold (0-1) for the first, rough matching pass
    use_threads=True,               # parallelize?
    n_jobs=10,                      # number of workers
    lev_lower_bound=0.9,            # threshold (0-1) for the finer matching pass
    show_progress=False,
)

# the returned array holds the indices of the matches
matched_publication_idx

array([  719149.,  1436395.,  2150638.,  4160284.,  4670599.,  6395332.,
        6786035., 10108126., 10719680., 11737951., 12524311., 13113938.,
       13179952., 14583062., 15343815., 17422892., 18157598., 18207365.,
       18921116., 20349831., 21032010., 21111275., 21518535., 21776916.,
       22918351., 23203324., 23205803., 23251257., 23560332., 23882133.,
       54793508., 26566541., 26650188., 27486281., 28249189., 28917739.,
       28959166., 29186556., 29765315., 29854507., 30347157., 31058195.,
       31552769., 31927753., 32012664., 33600012., 33870115., 35352421.,
       35413714., 35696570., 35831995., 39264690., 40566544., 41479708.,
       43594349., 43906973., 44261803., 44893363., 45158315., 46402512.,
       48093997., 48109252., 48363807., 48504715., 50114030., 50436226.,
       51860207., 52468527., 53182512., 55472308., 56442946., 56603799.,
       57097898., 58930017., 59252407., 60085395., 60657401., 60679605.,
       61395154., 61960282., 62248856., 64105716., 

In [8]:
# to recover the full information, we need to index into the publication dataframe

# align_publications handed the positions back as a float array (see the output above),
# while .iloc expects integer positions, so convert explicitly instead of relying on an
# implicit cast.
# NOTE(review): presumably NaN marks a publication without a match -- confirm against the
# pyscisci documentation. Such entries are skipped here, so when any are present the rows
# of matched_publications no longer line up one-to-one with publication_list.
match_positions = np.asarray(matched_publication_idx, dtype=float)
has_match = np.isfinite(match_positions)

matched_publications = pub.iloc[match_positions[has_match].astype(np.int64)]
matched_publications

Unnamed: 0,PublicationId,Year,JournalId,FamilyId,Doi,Title,Date,Volume,Issue,FirstPage,LastPage,DocSubTypes,DocType
719149,2607159947,2017.0,205292342.0,2.607160e+09,10.1016/J.JOI.2017.04.002,the time dimension of science connecting the p...,2017-05-01,11,2,608,621,,j
1436395,3106261317,2017.0,205292342.0,3.106261e+09,10.1016/J.JOI.2017.10.001,the research production of nations and departm...,2017-11-01,11,4,1142,1157,,j
2150638,3106392307,2017.0,205292342.0,3.106392e+09,10.1016/J.JOI.2017.04.004,do mathematicians economists and biomedical sc...,2017-05-01,11,2,598,607,,j
4160284,2593672369,2017.0,205292342.0,,10.1016/J.JOI.2017.02.007,the impact of collaboration and knowledge netw...,2017-05-01,11,2,407,422,,j
4670599,2722577421,2017.0,205292342.0,,10.1016/J.JOI.2017.05.021,when social scientists disagree comments on th...,2017-08-01,11,3,937,940,,j
...,...,...,...,...,...,...,...,...,...,...,...,...,...
82238075,2745510841,2017.0,205292342.0,,10.1016/J.JOI.2017.05.008,counting citations generalizing the perry reny...,2017-08-01,11,3,685,688,,j
85325307,2560973443,2017.0,205292342.0,,10.1016/J.JOI.2016.11.006,partial orders for zero sum arrays with applic...,2017-02-01,11,1,257,274,,j
86730593,2563792373,2017.0,205292342.0,2.563792e+09,10.1016/J.JOI.2016.11.008,can we use google scholar to identify highly c...,2017-02-01,11,1,152,163,,j
86730856,2550521721,2017.0,205292342.0,2.724689e+09,10.1016/J.JOI.2016.10.008,the brazilian scientific output published in j...,2017-02-01,11,1,18,31,,j
