In [1]:
import pandas as pd
import pickle 
import numpy as np

from scihub_upgraded import SciHub
from crossref.restful import Works, Etiquette

import warnings
warnings.filterwarnings("ignore")

## Part 1: 
Get CR metadata & Second_CR_retr results for second_run

In [2]:
# Restore previously persisted variables from IPython's %store database
# (-r = restore) so this notebook can continue from earlier results.
%store -r retr_complete_one
%store -r date_df
%store -r second_run_df

In [4]:
# Rebind second_run_df from its pickle file in the working directory.
# NOTE: pickle executes arbitrary code on load -- only open files you produced yourself.
with open("second_run_df", mode="rb") as pickled_frame:
    second_run_df = pickle.load(pickled_frame)

In [130]:
# Describe this client to the CrossRef API and build the Works endpoint
# that every CrossRef query in this notebook goes through.
my_etiquette = Etiquette(
    'Analysing Publishing Delay in Academic Journals',
    'v2.0',
    'https://github.com/Spidey0023/THEsis-Codes',
    'oguzkokes@gmail.com',
)
ogi_works = Works(etiquette=my_etiquette)

In [59]:
# The full loop code:
# for every (Issn, Year) row of second_run_df, ask CrossRef for a sample of
# 2 journal articles and record whether anything was returned.

issn_rry = second_run_df.Issn.values
year_rry = second_run_df.Year.values

#second_run_df.reset_index(drop=True, inplace=True)

useful_cols = ['DOI', 'references-count', 'publisher', 'published-print', 'is-referenced-by-count', 'title', 'author', 'published-online', 'reference', 'container-title', 'issued', 'ISSN', 'subject']
full_meta_list = []

for row_label, issn, year in zip(second_run_df.index, issn_rry, year_rry):
    journal_year_query = ogi_works.filter(issn=issn, from_pub_date=year, until_pub_date=year, type="journal-article")
    loop_meta = list(journal_year_query.sample(2).select(useful_cols))

    # An empty sample means CrossRef returned nothing for this journal-year.
    second_run_df.loc[row_label, "Second_CR_retr"] = bool(loop_meta)

    full_meta_list.extend(loop_meta)


The code above will be modified for SumCetCet conditions.

In [3]:
# Split second_run_df into three positional chunks that are processed in
# separate runs (the cet/sum chunks are loaded back from pickles later on).
# .copy() makes each chunk an independent frame: the retrieval loop adds a
# "Second_CR_retr" column to its chunk with .loc, and doing that on a bare
# slice is the chained-assignment case pandas reports as
# SettingWithCopyWarning (silenced here by warnings.filterwarnings("ignore")).
cet_second_run = second_run_df[:15000].copy()
sum_second_run = second_run_df[15000:30000].copy()
ogi_second_run = second_run_df[30000:].copy()

In [6]:
# The Ogi loop code:
# same CrossRef sampling as the full loop, restricted to the Ogi chunk.

issn_rry = ogi_second_run.Issn.values
year_rry = ogi_second_run.Year.values

useful_cols = ['DOI', 'references-count', 'publisher', 'published-print', 'is-referenced-by-count', 'title', 'author', 'published-online', 'reference', 'container-title', 'issued', 'ISSN', 'subject']
ogi_meta_list = []

for position, (issn, year) in enumerate(zip(issn_rry, year_rry)):
    sampled_works = ogi_works.filter(issn=issn, from_pub_date=year, until_pub_date=year, type="journal-article").sample(2).select(useful_cols)
    loop_meta = list(sampled_works)

    # Flag whether CrossRef returned at least one article for this journal-year.
    retrieved_any = len(loop_meta) != 0
    ogi_second_run.loc[ogi_second_run.index[position], "Second_CR_retr"] = retrieved_any

    ogi_meta_list.extend(loop_meta)


## PART 2:
CR meta retr SumCetCet runs are complete, the analysis can continue below for SciHub get_dates() for retrieved articles.

In [12]:
%store -r ogi_meta_list
%store -r ogi_second_run

with open("cet_meta_list", "rb") as fp:
    cet_meta_list = pickle.load(fp)

with open("cet_second_run", "rb") as fp:
    cet_second_run = pickle.load(fp)

with open("sum_meta_list", "rb") as fp:
    sum_meta_list = pickle.load(fp)

with open("sum_second_run", "rb") as fp:
    sum_second_run = pickle.load(fp)

In [14]:
# to get "Second_CR_retr" column:
# stack the three processed chunks back into one frame; verify_integrity=True
# makes pandas raise if any index label occurs more than once.
second_run_df = pd.concat(
    [cet_second_run, sum_second_run, ogi_second_run],
    verify_integrity=True,
)

In [13]:
# Combine the article metadata of all three chunks (Cet, Sum, Ogi order)
# and turn the records into a DataFrame.
full_meta_list = [*cet_meta_list, *sum_meta_list, *ogi_meta_list]

second_run_meta = pd.DataFrame(data=full_meta_list)

* Create a YEAR column based on pub-print
* .groupby(ISSN & YEAR)["DOI"].apply(list)
* Make a function where if first has date skip second
* Else try second one
* If can get a date from one save DOI & date info
 

In [14]:
# Year = first entry of the first "date-parts" element of the "issued" field.
second_run_meta["Year"] = second_run_meta["issued"].map(
    lambda issued: issued["date-parts"][0][0]
)
# Tuples are hashable, so the ISSN column can serve as a groupby key.
second_run_meta["ISSN"] = second_run_meta["ISSN"].map(tuple)

# One list of DOIs per (ISSN, Year) combination.
doi_by_journal_year = second_run_meta.groupby(["ISSN", "Year"])["DOI"]
second_run_doi_group = doi_by_journal_year.apply(list)

second_run_doi_rry = second_run_doi_group.values

In [17]:
# Distribution of how many DOIs each (ISSN, Year) group contains.
second_run_doi_group.map(len).value_counts()

2    32204
1     1687
4        3
3        1
Name: DOI, dtype: int64

Here we have 3 important variables in this notebook:

1- second_run_df -> this is the now updated version as it also has "Second_CR_retr" column -> can be SAVEd

2- second_run_meta -> this df contains NEW article metadata retrieved for second_run tests

3- second_run_doi_rry -> this np.array contains .groupby(Issn,year)["DOI"] to be used in a SciHub get_dates() loop

As a result of the sh.get_dates() run, second_run_dates_dict will be created -> this will contain DOI Scihub_results, which then can be used to match with second_run_meta & second_run_df.

This will finalize the available journals for the FINAL_100K run!

In [87]:
# Persist the grouped DOI array so the Sci-Hub runs can load it.
with open("second_run_doi_rry", mode="wb") as doi_array_file:
    pickle.dump(second_run_doi_rry, doi_array_file)

In [None]:
# Full Run Version:
# for each journal-year group of DOIs, query Sci-Hub until one DOI yields a
# complete result (a list of length 3); the remaining DOIs of that group are
# then marked "skipped" instead of being queried.

sh = SciHub()
second_run_dates_dict = {}

for rry in second_run_doi_rry:
    for ind, doi in enumerate(rry):
        date = sh.get_dates(doi)
        second_run_dates_dict[doi] = date
        if isinstance(date, list) and len(date) == 3:
            # setdefault (not plain assignment): a DOI that already has a
            # recorded result -- e.g. the same DOI repeated inside one group --
            # must never be overwritten with "skipped".
            for later_doi in rry[ind + 1:]:
                second_run_dates_dict.setdefault(later_doi, "skipped")
            break



In [18]:
# Cut the grouped DOI array at positions 11000 and 22000 into the
# Cet / Sum / Ogi work packages.
cet_doi_rry, sum_doi_rry, ogi_doi_rry = np.split(second_run_doi_rry, [11000, 22000])

In [12]:
# SumCetCet - Ogi Version
# Same Sci-Hub loop as the full run, restricted to the Ogi work package:
# stop at the first DOI of a group with a complete result (list of length 3)
# and mark the group's remaining DOIs as "skipped".

sh = SciHub()
ogi_dates_dict = {}

for rry in ogi_doi_rry:
    for ind, doi in enumerate(rry):
        date = sh.get_dates(doi)
        ogi_dates_dict[doi] = date
        if isinstance(date, list) and len(date) == 3:
            # setdefault keeps any result already stored for a DOI (for
            # instance a DOI that appears twice in the same group) instead of
            # replacing it with the "skipped" marker.
            for later_doi in rry[ind + 1:]:
                ogi_dates_dict.setdefault(later_doi, "skipped")
            break




Discovered CAPTCHA!




SumCetCet is also ready, the loop above is executed!

SumCetCet is completed, second_run_dates_dict can be created:

In [16]:
# Load the three per-package Sci-Hub result dictionaries (DOI -> result).
with open("ogi_dates_dict", mode="rb") as ogi_file:
    ogi_dates_dict = pickle.load(ogi_file)

with open("cet_dates_dict", mode="rb") as cet_file:
    cet_dates_dict = pickle.load(cet_file)

with open("sum_dates_dict", mode="rb") as sum_file:
    sum_dates_dict = pickle.load(sum_file)



In [25]:
# Fold the Cet and Sum results into ogi_dates_dict in place. On a shared DOI
# the Sum entry wins over Cet, and both win over the existing Ogi entry.
ogi_dates_dict.update({**cet_dates_dict, **sum_dates_dict})

In [33]:
# second_run_dates_dict is another name for the merged dict (no copy is made).
second_run_dates_dict = ogi_dates_dict

# Persist the merged DOI -> result mapping.
with open("second_run_dates_dict", mode="wb") as merged_file:
    pickle.dump(second_run_dates_dict, merged_file)

In [37]:
# Attach each article's Sci-Hub result by looking its DOI up in the merged dict.
second_run_meta["Results"] = second_run_meta["DOI"].map(second_run_dates_dict)

In [39]:
# Count the Python types stored in the Results column.
second_run_meta.Results.map(type).value_counts()

<class 'list'>    55778
<class 'str'>     10332
Name: Results, dtype: int64

In [32]:
# Number of article-metadata rows.
len(second_run_meta)

66110

In [41]:
# Work on an independent copy so later edits do not touch second_run_meta.
second_run_date_df = second_run_meta.copy()

In [42]:
# Persist the article-level frame in IPython's %store database.
%store second_run_date_df

Stored 'second_run_date_df' (DataFrame)


second_run_date_df is created and saved. This df will be merged with date_df in the next steps to create a singular df for all articles analysed so far.

The only caveat is that this df will include "skipped" articles as well, which have metadata but did NOT go through sh.get_dates(), and thus did NOT complete the pipeline.

These should be dropped!

---

As the final step of this notebook, the jrnl_df(s) should also be made ready for merging. To filter journals, second_run_df must therefore also have a Results column.

This will be conducted below:

In [171]:
# Preview the first rows of the journal-level frame.
second_run_df.head()

Unnamed: 0,Issn,Year,Total_Docs,Sample_Count,CrossRef_retr,DOI,Results,Second_CR_retr,2nd_Results
0,[00011541],2010,294,1,True,10.1002/aic.12400,[https://sci.bban.top/pdf/10.1002/aic.12400.pd...,True,[10.1002/aic.12388]
1,[00011541],2011,315,1,True,10.1002/aic.12671,[https://sci.bban.top/pdf/10.1002/aic.12671.pd...,True,[]
2,[00011541],2012,347,1,True,10.1002/aic.13810,[https://sci.bban.top/pdf/10.1002/aic.13810.pd...,True,[]
21,[00013765],2020,363,1,True,10.1590/0001-3675202020180560,article_not_in Scihub,True,[]
22,[00014273],2010,70,1,True,10.5465/amj.2010.52814362,[https://sci.bban.top/pdf/10.5465/amj.2010.528...,True,[]


During planning it became clear that, because there are multiple DOIs from a single journal, the process would require an unnecessarily long effort and run time. As a result, only successful (type==list & len==3) articles will be merged into second_run_df.

The second run Journal dataset will therefore lack the information why the failed journals failed, BUT this is in no way important for the overall analysis.

Article df (date_df + second_run_date_df) will have ALL the information, but the resulting Journal df will not.

In [77]:
def is_useful(resu):
    """Return True when ``resu`` is a complete Sci-Hub result.

    A result counts as useful only if it is a list holding exactly three
    elements. Strings (e.g. "skipped") and lists of any other length are
    not useful.

    Parameters
    ----------
    resu : object
        A value taken from the ``Results`` column.

    Returns
    -------
    bool
        True for a list of length 3, False for everything else.
    """
    return isinstance(resu, list) and len(resu) == 3

In [80]:
# Strip the hyphen from every ISSN (each row ends up with a list of strings)
# and flag the rows whose Sci-Hub result is complete.
second_run_date_df["ISSN"] = second_run_date_df["ISSN"].map(
    lambda issns: [one_issn.replace("-", "") for one_issn in issns]
)
second_run_date_df["is_useful"] = second_run_date_df["Results"].map(is_useful)

In [88]:
# Keep only the flagged rows and the four columns needed for matching,
# as an independent copy.
useful_2nd_date_df = second_run_date_df.loc[
    second_run_date_df["is_useful"] == True,
    ["ISSN", "Year", "DOI", "Results"],
].copy()

In [99]:
# (rows, columns) of the useful-articles frame.
useful_2nd_date_df.shape

(5474, 4)

In [92]:
def doi_retr(retrrow, artcldf):
    """Collect the DOIs in ``artcldf`` that belong to one journal-year row.

    An article matches when its ``Year`` equals ``retrrow.Year`` and at least
    one element of ``retrrow.Issn`` is contained in the article's ``ISSN``
    value.

    Parameters
    ----------
    retrrow : row-like object exposing ``Issn`` (iterable of ISSNs) and ``Year``.
    artcldf : DataFrame with ``ISSN``, ``Year`` and ``DOI`` columns.

    Returns
    -------
    list
        DOIs of all matching articles, in frame order (empty if none match).
    """
    wanted_issns = retrrow.Issn
    shares_issn = artcldf.ISSN.map(
        lambda article_issns: any(iss in article_issns for iss in wanted_issns)
    )
    same_year = artcldf.Year == retrrow.Year
    return artcldf[shares_issn & same_year]["DOI"].tolist()


In [93]:
# For every journal-year row, list the useful article DOIs that match it;
# result_type="reduce" keeps the returned lists as single cells of a Series.
doi_retr_trial3 = second_run_df.apply(
    doi_retr,
    axis=1,
    result_type="reduce",
    args=(useful_2nd_date_df,),
)

In [96]:
# Store the matched DOI lists as a new column on the journal-level frame.
second_run_df["2nd_Results"] = doi_retr_trial3

In [98]:
# How many matched DOIs each journal-year row received.
second_run_df["2nd_Results"].map(len).value_counts()

0    37955
1     5316
2       79
Name: 2nd_Results, dtype: int64

79 journals had different ISSNs returned, causing multiple article matches. These 79 duplicates will be dropped from second_run_date_df.

In [100]:
# Inspect the journal-year rows that matched exactly two DOIs.
second_run_df[second_run_df["2nd_Results"].map(len)== 2]

Unnamed: 0,Issn,Year,Total_Docs,Sample_Count,CrossRef_retr,DOI,Results,Second_CR_retr,2nd_Results
1132,[00063525],2010,203,1,True,10.1002/bip.21349,[https://sci.bban.top/pdf/10.1002/bip.21349.pd...,True,"[10.1002/bip.21312, 10.1002/bip.21384]"
1137,[00063525],2015,153,1,True,10.1002/bip.22749,[https://sci.bban.top/pdf/10.1002/bip.22749.pd...,True,"[10.1002/bip.22740, 10.1002/bip.22581]"
1339,[00072745],2017,38,1,True,10.1639/0007-2745-120.2.261,[https://sci.bban.top/pdf/10.1639/0007-2745-12...,True,"[10.1639/0007-2745-120.3.311, 10.1639/0007-274..."
1476,[0008543X],2018,606,1,True,10.1002/cncr.31630,[https://sci.bban.top/pdf/10.1002/cncr.31630.p...,True,"[10.1002/cncr.31570, 10.1002/cncr.31357]"
2250,[00130427],2018,34,1,True,10.1111/ecca.12289,[https://sci.bban.top/pdf/10.1111/ecca.12289.p...,True,"[10.1111/ecca.12271, 10.1111/ecca.12268]"
...,...,...,...,...,...,...,...,...,...
68269,[20457758],2019,1118,1,True,10.1002/ece3.4324,[https://sci.bban.top/pdf/10.1002/ece3.4324.pd...,True,"[10.1002/ece3.5209, 10.1002/ece3.4775]"
68517,[20483694],2015,15,1,True,10.1002/fes3.53,[https://sci.bban.top/pdf/10.1002/fes3.53.pdf#...,True,"[10.1002/fes3.65, 10.1002/fes3.64]"
69074,[20556225],2016,41,1,True,10.1108/jstp-01-2015-0013,[https://sci.bban.top/pdf/10.1108/JSTP-01-2015...,True,"[10.1108/jstp-09-2014-0190, 10.1108/jstp-12-20..."
71047,[21911363],2016,80,1,True,10.1002/open.201600032,[https://sci.bban.top/pdf/10.1002/open.2016000...,True,"[10.1002/open.201600048, 10.1002/open.201600066]"


In [154]:
# Sanity check: every useful DOI should appear in some row's "2nd_Results".
useful_dois = useful_2nd_date_df["DOI"].tolist()

# Flatten the per-row DOI lists into one list.
second_result_dois = [
    doi for matches in second_run_df["2nd_Results"].tolist() for doi in matches
]

unmacthed_dois = set(useful_dois) - set(second_result_dois)


In [155]:
# Useful DOIs missing from every "2nd_Results" list (an empty set means none are missing).
unmacthed_dois

set()

In [169]:
# For rows that matched two DOIs, collect the second DOI of each pair;
# these are the surplus articles to be removed.
extra_dois = (
    second_run_df.loc[second_run_df["2nd_Results"].map(len) == 2, "2nd_Results"]
    .map(lambda doi_pair: doi_pair[1])
    .tolist()
)

In [175]:
# Index the article frame by DOI and rename the reference-count column.
second_run_date_df = (
    second_run_date_df
    .set_index("DOI", drop=True)
    .rename(columns={"references-count": "reference-count"})
)

In [178]:
# Remove the helper flag column, then the surplus articles (rows labelled by DOI).
second_run_date_df = (
    second_run_date_df
    .drop(columns="is_useful")
    .drop(index=extra_dois)
)

In [183]:
# Same clean-up for the useful-articles frame: index by DOI, drop the surplus DOIs.
useful_2nd_date_df = (
    useful_2nd_date_df
    .set_index("DOI", drop=True)
    .drop(index=extra_dois)
)

In [180]:
# Re-check the match counts. NOTE: the surplus DOIs were dropped from
# second_run_date_df and useful_2nd_date_df, not from this column, so the
# two-element lists are still present here.
second_run_df["2nd_Results"].map(len).value_counts()

0    37955
1     5316
2       79
Name: 2nd_Results, dtype: int64

In [None]:
# Persist the final article-level and journal-level frames in the %store database.
%store second_run_date_df
%store second_run_df