In [6]:
import requests
import io
import pandas as pd
import regex as re

from bs4 import BeautifulSoup
from pdfminer.high_level import extract_text

from unpy_get_dates import *
from scihub_upgraded import SciHub

from datetime import datetime

import warnings
warnings.filterwarnings("ignore")

In [2]:
sh = SciHub()

In [3]:
#Successful DOI
sh.get_dates("10.1016/j.joi.2013.09.001")

['https://moscow.sci-hub.st/2341/219df39bc7707c05bbd4f1755fd9d3d6/10.1016@j.joi.2013.09.001.pdf#navpanes=0&view=FitH',
 '20  July  2013',
 '4  September  2013']

In [4]:
#No Date DOI
sh.get_dates("10.1111/aman.13399")

['http://sci-hub.st/downloads/2020-06-28//b0/10.1111@aman.13399.pdf#navpanes=0&view=FitH',
 'no_date_found']

In [5]:
# Successful but 10MB DOI
sh.get_dates("10.2138/am-2017-6003")

['https://twin.sci-hub.st/6374/664526fc806ec4a5039f11e7f00dbe16/jin2017.pdf#navpanes=0&view=FitH',
 'noveMber 2, 2016',
 'February 14, 2017']

In [14]:
%store -r Scihub_df


In [8]:
Scihub_df.Scihub_results.map(type).value_counts()

<class 'float'>    26691
<class 'list'>     19052
<class 'str'>        948
Name: Scihub_results, dtype: int64

As the whole dataset is taking too long, will seperate it to 10k chunks

## First Loop (for 10K):

In [4]:
for k in range(10):

    sh = SciHub()

    start = k*1000
    end = (k+1)*1000 
    doi_slice = Scihub_df[start:end].index.tolist()

    now = datetime.now()
    print(f"Starting loop {k+1}, start time: {now}")
    Scihub_df.loc[doi_slice,"Scihub_results"] = Scihub_df.loc[doi_slice,"doi"].map(sh.get_dates)
    now = datetime.now()
    print(f"{k+1} loop ended, endtime: {now}")
   

Starting loop 1, start time: 2022-01-05 13:01:48.165611




1 loop ended, endtime: 2022-01-05 14:32:21.898530
Starting loop 2, start time: 2022-01-05 14:32:21.899030




2 loop ended, endtime: 2022-01-05 15:54:50.545649
Starting loop 3, start time: 2022-01-05 15:54:50.546154




3 loop ended, endtime: 2022-01-05 17:11:31.362682
Starting loop 4, start time: 2022-01-05 17:11:31.363182




4 loop ended, endtime: 2022-01-05 18:32:27.902407
Starting loop 5, start time: 2022-01-05 18:32:27.902407




5 loop ended, endtime: 2022-01-05 19:51:44.526113
Starting loop 6, start time: 2022-01-05 19:51:44.526113




6 loop ended, endtime: 2022-01-05 21:08:40.420269
Starting loop 7, start time: 2022-01-05 21:08:40.420845




7 loop ended, endtime: 2022-01-05 22:26:20.042316
Starting loop 8, start time: 2022-01-05 22:26:20.042316




8 loop ended, endtime: 2022-01-05 23:48:06.437624
Starting loop 9, start time: 2022-01-05 23:48:06.438124




9 loop ended, endtime: 2022-01-06 01:11:46.376742
Starting loop 10, start time: 2022-01-06 01:11:46.376742




10 loop ended, endtime: 2022-01-06 02:27:40.314846


### Analysis of First Loop:

In [45]:
Scihub_df.Scihub_results.map(type).value_counts()

<class 'float'>    36691
<class 'str'>       6108
<class 'list'>      3892
Name: Scihub_results, dtype: int64

In [47]:
Scihub_df[Scihub_df.Scihub_results.map(type) == str]["Scihub_results"].value_counts()

cant_open_pdf            5584
article_not_in Scihub     524
Name: Scihub_results, dtype: int64

The first 10k loop is finished, however 5584 of the results are "cant_open_pdf


In [48]:
Scihub_df[Scihub_df.Scihub_results.map(type) == list]["Scihub_results"].map(len).value_counts()

2    2038
3    1854
Name: Scihub_results, dtype: int64

In [5]:
Scihub_df[Scihub_df.Scihub_results.map(type) == list].loc[0,"Scihub_results"]

['https://zero.sci-hub.ru/29/c838ad3b574e083a56c278b8f786cb53/yan2010.pdf#navpanes=0&view=FitH',
 'no_date_found']

In [42]:
Scihub_df[Scihub_df.Scihub_results == "cant_open_pdf"].head()

Unnamed: 0,doi,doi_url,title,genre,published_date,year,journal_name,journal_issns,journal_issn_l,journal_is_oa,journal_is_in_doaj,publisher,is_oa,oa_status,oa_locations,updated,z_authors,Unpy_filter,Scihub_results
38,10.2308/accr-51296,https://doi.org/10.2308/accr-51296,Mandatory Financial Reporting and Voluntary Di...,journal-article,2015-09-01,2015,The Accounting Review,"1558-7967,0001-4826",0001-4826,False,False,American Accounting Association,True,green,"[{'updated': '2021-08-22T11:52:53.217319', 'ur...",2021-01-20T18:25:56.939118,"[{'given': 'Xi', 'family': 'Li', 'sequence': '...",Scihub,cant_open_pdf
185,10.1080/00020184.2020.1755828,https://doi.org/10.1080/00020184.2020.1755828,The proliferation of overlapping sub-regional ...,journal-article,2020-01-02,2020,African Studies,"0002-0184,1469-2872",0002-0184,False,False,Informa UK Limited,False,closed,[],2021-01-18T07:32:01.616343,[{'ORCID': 'http://orcid.org/0000-0003-4404-47...,Scihub,cant_open_pdf
233,10.1108/afr-11-2019-0120,https://doi.org/10.1108/afr-11-2019-0120,Design of the rainfall index annual forage pro...,journal-article,2020-07-10,2020,Agricultural Finance Review,0002-1466,0002-1466,False,False,Emerald,False,closed,[],2021-02-14T01:36:33.643281,"[{'given': 'Abby', 'family': 'ShalekBriski', '...",Scihub,cant_open_pdf
260,10.1111/aman.13399,https://doi.org/10.1111/aman.13399,The Predatory Present,journal-article,2020-06-01,2020,American Anthropologist,"0002-7294,1548-1433",0002-7294,False,False,Wiley,False,closed,[],2021-01-19T11:03:17.758547,"[{'given': 'Alex W.', 'family': 'Barker', 'seq...",Scihub,cant_open_pdf
370,10.1093/aje/kwz107,https://doi.org/10.1093/aje/kwz107,The Impact of 3 Years of Targeted Indoor Resid...,journal-article,2019-05-07,2019,American Journal of Epidemiology,"0002-9262,1476-6256",0002-9262,False,False,Oxford University Press (OUP),True,green,"[{'updated': '2021-12-15T13:25:14.509602', 'ur...",2021-03-28T21:15:50.062251,[{'ORCID': 'http://orcid.org/0000-0003-2471-67...,Scihub,cant_open_pdf


However, when checked again, the function is actually able to extract the date information correctly as can be seen below. So, for Scihub, we have a problem, possibly CAPTCHA, that escapes all the checks and returns cant_open_pdf incorrectly. This must be solved somehow :(

In [11]:
sh = SciHub()

sh.get_dates("10.2308/accr-51296")

['http://sci-hub.ru/downloads/2019-08-24/b5/li2016.pdf#navpanes=0&view=FitH',
 'February 2014',
 'July 2015']

Storing the dataset so far, but not starting the second loop

In [49]:
%store Scihub_df

Stored 'Scihub_df' (DataFrame)


In [None]:

for k in range(10,20):
    
    start = k*1000
    end = (k+1)*1000 
    doi_slice = Scihub_df[start:end].index.tolist()

    now = datetime.now()
    print(f"Starting loop {k+1}, start time: {now}")
    Scihub_df.loc[doi_slice,"Scihub_results"] = Scihub_df.loc[doi_slice,"doi"].map(sh.get_dates)
    now = datetime.now()
    print(f"{k+1} loop ended, endtime: {now}")

    

In [23]:
pd.cut(Scihub_df[Scihub_df.Scihub_results == "cant_open_pdf"].index, bins=[0,1000,2000,3000,4000,5000,6000,7000,8000,9000,10000]).value_counts()

(0, 1000]         10
(1000, 2000]     143
(2000, 3000]     627
(3000, 4000]     561
(4000, 5000]     644
(5000, 6000]     591
(6000, 7000]     684
(7000, 8000]     634
(8000, 9000]     621
(9000, 10000]    119
dtype: int64

## Second Loop (10K - 15K )

In [5]:
for k in range(5):

    sh = SciHub()
    
    start = (k+10)*1000
    end = (k+11)*1000 
    doi_slice = Scihub_df[start:end].index.tolist()

    now = datetime.now()
    print(f"Starting loop {k+1}, start time: {now}")
    Scihub_df.loc[doi_slice,"Scihub_results"] = Scihub_df.loc[doi_slice,"doi"].map(sh.get_dates)
    now = datetime.now()
    print(f"{k+1} loop ended, endtime: {now}")
   

Starting loop 1, start time: 2022-01-26 15:34:46.320698
1 loop ended, endtime: 2022-01-26 16:20:04.340549
Starting loop 2, start time: 2022-01-26 16:20:04.342550
2 loop ended, endtime: 2022-01-26 17:04:08.846540
Starting loop 3, start time: 2022-01-26 17:04:08.848540
3 loop ended, endtime: 2022-01-26 17:54:07.392152
Starting loop 4, start time: 2022-01-26 17:54:07.394654
4 loop ended, endtime: 2022-01-26 18:35:31.117726
Starting loop 5, start time: 2022-01-26 18:35:31.119727
5 loop ended, endtime: 2022-01-26 19:16:41.131252


In [28]:
%store Scihub_df

Stored 'Scihub_df' (DataFrame)


### Analysis of Second Loop:

In [7]:
Scihub_df[10000:15000]["Scihub_results"].map(type).value_counts()

<class 'list'>    4731
<class 'str'>      269
Name: Scihub_results, dtype: int64

At a first glance, the second loop looks very promising and got much better results than the first loop with very little type str returns. 

In [26]:
Scihub_df[10000:15000][Scihub_df[10000:15000]["Scihub_results"].map(type)==str]["Scihub_results"].value_counts()

article_not_in Scihub    267
cant_open_pdf              2
Name: Scihub_results, dtype: int64

When the errors are analysed, it is seen that all except 2 were actually unavailable articles. However, 2 error articles are actually available on Scihub. So ?

In [27]:
Scihub_df[10000:15000][Scihub_df[10000:15000]["Scihub_results"]=="cant_open_pdf"]

Unnamed: 0,doi,doi_url,title,genre,published_date,year,journal_name,journal_issns,journal_issn_l,journal_is_oa,journal_is_in_doaj,publisher,is_oa,oa_status,oa_locations,updated,z_authors,Unpy_filter,Scihub_results
12863,10.2307/25750704,https://doi.org/10.2307/25750704,Improving Employees' Compliance Through Inform...,journal-article,2010-01-01,2010,MIS Quarterly,0276-7783,0276-7783,False,False,JSTOR,False,closed,[],2021-04-26T09:42:09.304629,"[{'family': 'Puhakainen', 'sequence': 'first'}...",Scihub,cant_open_pdf
12880,10.1002/pam.21919,https://doi.org/10.1002/pam.21919,Energy-Based Economic Development: How Clean E...,journal-article,2016-06-01,2016,Journal of Policy Analysis and Management,0276-8739,0276-8739,False,False,Wiley,False,closed,[],2021-01-13T15:02:12.369641,"[{'given': 'Karnamadakala Rahul', 'family': 'S...",Scihub,cant_open_pdf


In [11]:
Scihub_df[10000:15000][Scihub_df[10000:15000]["Scihub_results"].map(type)==list]["Scihub_results"].map(len).value_counts()

3    2519
2    2212
Name: Scihub_results, dtype: int64

When the successful (list) returns are analysed, it's seen that more than 50% have at least 1 date value, which is simply amazing

In [25]:
Scihub_df[10000:15000][(Scihub_df[10000:15000]["Scihub_results"].map(type)==list) & (Scihub_df[10000:15000]["Scihub_results"].map(len)==3)]["Scihub_results"].map(lambda x: x[0]).map(type).value_counts()

<class 'str'>    2519
Name: Scihub_results, dtype: int64

In [24]:
Scihub_df[10000:15000][(Scihub_df[10000:15000]["Scihub_results"].map(type)==list) & (Scihub_df[10000:15000]["Scihub_results"].map(len)==3)]["Scihub_results"].map(lambda x: x[1]).map(type).value_counts()

<class 'str'>         2319
<class 'NoneType'>     200
Name: Scihub_results, dtype: int64

Out of 2519, only 200 does not have "Received Date" information

In [23]:
Scihub_df[10000:15000][(Scihub_df[10000:15000]["Scihub_results"].map(type)==list) & (Scihub_df[10000:15000]["Scihub_results"].map(len)==3)]["Scihub_results"].map(lambda x: x[2]).map(type).value_counts()

<class 'str'>         2230
<class 'NoneType'>     289
Name: Scihub_results, dtype: int64

Out of 2519, only 289 does not have "Accepted Date" information

## Re-Run First Loop

As the upgrades & improvements had quite an impact on the process, it would be beneficial to re-run the first loop and compare the two results

In [29]:
first_run_df = Scihub_df[0:10000].copy()

In [30]:
%store first_run_df

Stored 'first_run_df' (DataFrame)


In [12]:
for k in range(10):

    sh = SciHub()

    start = k*1000
    end = (k+1)*1000 
    doi_slice = Scihub_df[start:end].index.tolist()

    now = datetime.now()
    print(f"Starting loop {k+1}, start time: {now}")
    Scihub_df.loc[doi_slice,"Scihub_results"] = Scihub_df.loc[doi_slice,"doi"].map(sh.get_dates)
    now = datetime.now()
    print(f"{k+1} loop ended, endtime: {now}")
   

Starting loop 1, start time: 2022-01-27 19:42:27.831609




ConnectionError: ('Connection aborted.', TimeoutError(10060, 'A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond', None, 10060, None))

While re-running the first loop, we have encountered a new type of error. After some debugging, it is seen that the main reason for the error is that in rare cases, Sci-Hub does not use an "iframe" attribute but rather "embed". So, there is no ["src"] field in the soup, causing an error, which is only dealt with during the last "Unknown Error" check. 

However, as there was another issue with change_base_url, @retry causes the base_url to be None, which stopped the entire loop in loop 3. The results for the 1-2 loops are successfully saved.

As a result, the scihub_upgraded was updated to fix these 2 issues.

But, 2000-10000 loops should be run again!

In [140]:
%store Scihub_df

Stored 'Scihub_df' (DataFrame)


## Re-Run First Loop (2K - 3K)

All the issues in "re-run first loop" is (hopefully) fixed:

    1- for loop with ["iframe" ,"embed"]

    2- _change_base_url works succesfully

    3- zero.sci-hub -> moscow.sci-hub

Will run the problematic third loop (2K - 3K) , if executes succesfully, will run the rest.

In [14]:
sh = SciHub()

doi_slice = Scihub_df[2000:3000].index.tolist()

now = datetime.now()
print(f"Starting loop, start time: {now}")
Scihub_df.loc[doi_slice,"Scihub_results"] = Scihub_df.loc[doi_slice,"doi"].map(sh.get_dates)
now = datetime.now()
print(f"loop ended, endtime: {now}")
   

Starting loop, start time: 2022-01-28 10:31:51.083441




loop ended, endtime: 2022-01-28 12:37:47.713046


In [8]:
Scihub_df[2000:3000]["Scihub_results"].map(type).value_counts()

<class 'list'>     938
<class 'str'>       51
<class 'tuple'>     11
Name: Scihub_results, dtype: int64

In [9]:
Scihub_df[2000:3000][Scihub_df[2000:3000]["Scihub_results"].map(type)==list]["Scihub_results"].map(len).value_counts()

3    497
2    441
Name: Scihub_results, dtype: int64

In [10]:
Scihub_df[2000:3000][Scihub_df[2000:3000]["Scihub_results"].map(type)==str]["Scihub_results"].value_counts()

article_not_in Scihub    51
Name: Scihub_results, dtype: int64

In [11]:
Scihub_df[2000:3000][Scihub_df[2000:3000]["Scihub_results"].map(type)==tuple]["Scihub_results"]

2576    (https://moscow.sci-hub.st/3241/51d11ee361f5a5...
2657    (https://moscow.sci-hub.se/3316/351eab13f521e4...
2659    (https://moscow.sci-hub.se/5461/41b5ac3322b902...
2687    (https://moscow.sci-hub.se/3790/454c9651e91c34...
2766    (https://moscow.sci-hub.se/1532/d218032da8973b...
2791    (https://moscow.sci-hub.se/2097/fb20c00f76352e...
2802    (https://moscow.sci-hub.se/2055/e5b198b4efec35...
2824    (https://moscow.sci-hub.se/355/0c2bf696b294848...
2828    (https://moscow.sci-hub.se/2575/aa39429cbd16d0...
3283    (http://index.ggws.net/downloads/2021-05-26/21...
3647    (http://index.ggws.net/downloads/2020-02-04/d5...
Name: Scihub_results, dtype: object

FINALLY, it seems that loop is executed succesfully. 

Although it took longer than expected (125 mins), everything seems to be in order. 

Only, small problem is that, as we are replacing zero. domains with moscow. , this caused 11 tuple results with "cant open pdf" returns. This is fixed now in scihub_upgraded.

NOTE & UPDATE: It seems that the Zero-Moscow error is not due to an error in the code but rather due to the SciHub servers. So this fix is reverted and zero.sci-ub domains are not altered in any way. 

The cell below aims to fix these 11 tuple results, if this also works, the re-run for 3K - 10K can be run too.



In [13]:
Scihub_df.loc[Scihub_df.Scihub_results.map(type)==tuple, "Scihub_results"] = Scihub_df.loc[Scihub_df.Scihub_results.map(type)==tuple, "doi"].map(sh.get_dates)



In [11]:
Scihub_df[2000:3000]["Scihub_results"].map(type).value_counts()

<class 'list'>    948
<class 'str'>      52
Name: Scihub_results, dtype: int64

In [19]:
Scihub_df.loc[2828,"Scihub_results"]

['https://sci.bban.top/pdf/10.1016/j.indmarman.2014.05.012.pdf#view=FitH',
 '15 February 2014',
 '30 March 2014']

In [29]:
Scihub_df.to_csv("0-3and10-15done_Scihub_df.csv")

In [13]:
Scihub_df[:3000]["Scihub_results"].map(type).value_counts()

<class 'list'>    2818
<class 'str'>      182
Name: Scihub_results, dtype: int64

In [12]:
Scihub_df[:3000][Scihub_df[:3000]["Scihub_results"].map(type)==list]["Scihub_results"].map(len).value_counts()

2    1422
3    1396
Name: Scihub_results, dtype: int64

In [15]:
Scihub_df[10000:15000][Scihub_df[10000:15000]["Scihub_results"].map(type)==list]["Scihub_results"].map(len).value_counts()

3    2519
2    2212
Name: Scihub_results, dtype: int64

## Re-Run First Loop (Take Three) (3K - 10K)

In [13]:
for k in range(3,10):


    start = k*1000
    end = (k+1)*1000 
    doi_slice = Scihub_df[start:end].index.tolist()

    now = datetime.now()
    print(f"Starting loop {k+1}, start time: {now}")
    Scihub_df.loc[doi_slice,"Scihub_results"] = Scihub_df.loc[doi_slice,"doi"].map(sh.get_dates)
    now = datetime.now()
    print(f"{k+1} loop ended, endtime: {now}")

    sh = SciHub()
    
   

Starting loop 4, start time: 2022-01-28 15:00:32.482995




4 loop ended, endtime: 2022-01-28 16:03:47.718905
Starting loop 5, start time: 2022-01-28 16:03:47.721405




5 loop ended, endtime: 2022-01-28 17:19:00.455010
Starting loop 6, start time: 2022-01-28 17:19:00.460505




6 loop ended, endtime: 2022-01-28 18:19:27.182961
Starting loop 7, start time: 2022-01-28 18:19:27.187960




7 loop ended, endtime: 2022-01-28 19:34:40.265538
Starting loop 8, start time: 2022-01-28 19:34:40.270039




8 loop ended, endtime: 2022-01-28 20:33:13.027141
Starting loop 9, start time: 2022-01-28 20:33:13.029641




9 loop ended, endtime: 2022-01-28 21:42:14.315251
Starting loop 10, start time: 2022-01-28 21:42:14.317252




10 loop ended, endtime: 2022-01-28 22:42:54.297158


In [15]:
Scihub_df[3000:10000]["Scihub_results"].map(type).value_counts()

<class 'list'>    6592
<class 'str'>      408
Name: Scihub_results, dtype: int64

In [17]:
Scihub_df[3000:10000][Scihub_df[3000:10000]["Scihub_results"].map(type)==str]["Scihub_results"].value_counts()

article_not_in Scihub          350
cant_read_pdf (Zero-Moscow)     58
Name: Scihub_results, dtype: int64

In [23]:
Scihub_df[3000:10000][Scihub_df[3000:10000]["Scihub_results"].map(type)==list]["Scihub_results"].map(len).value_counts()

2    3320
3    3272
Name: Scihub_results, dtype: int64

In [22]:
Scihub_df[3000:10000][Scihub_df[3000:10000]["Scihub_results"]=="article_not_in Scihub"]

Unnamed: 0,doi,doi_url,title,genre,published_date,year,journal_name,journal_issns,journal_issn_l,journal_is_oa,journal_is_in_doaj,publisher,is_oa,oa_status,oa_locations,updated,z_authors,Unpy_filter,Scihub_results
3869,10.1162/jinh_r_01221,https://doi.org/10.1162/jinh_r_01221,Jihād in West Africa during the Age of Revolut...,journal-article,2018-02-01,2018,The Journal of Interdisciplinary History,"0022-1953,1530-9169",0022-1953,False,False,MIT Press - Journals,False,closed,[],2021-03-31T23:29:40.225107,"[{'given': 'John H.', 'family': 'Hanson', 'seq...",Scihub,article_not_in Scihub
3902,10.18647/2988/jjs-2010,https://doi.org/10.18647/2988/jjs-2010,Il tempio di Leontopoli in Egitto: Identità po...,journal-article,2010-10-01,2010,Journal of Jewish Studies,"0022-2097,2056-6689",0022-2097,False,False,Journal of Jewish Studies,False,closed,[],2021-01-17T12:33:58.528116,"[{'given': 'Corrado', 'family': 'Martone', 'se...",Scihub,article_not_in Scihub
3903,10.18647/3022/jjs-2011,https://doi.org/10.18647/3022/jjs-2011,Conceiving Israel: The Fetus in Rabbinic Narra...,journal-article,2011-04-01,2011,Journal of Jewish Studies,"0022-2097,2056-6689",0022-2097,False,False,Journal of Jewish Studies,False,closed,[],2021-01-16T22:39:08.750544,"[{'given': 'Ishay', 'family': 'Rosen-Zvi', 'se...",Scihub,article_not_in Scihub
3904,10.18647/3089/jjs-2012,https://doi.org/10.18647/3089/jjs-2012,Mémoires des Juifs de Roumanie,journal-article,2012-04-01,2012,Journal of Jewish Studies,"0022-2097,2056-6689",0022-2097,False,False,Journal of Jewish Studies,False,closed,[],2021-01-16T11:46:57.254121,"[{'given': 'Dana', 'family': 'Mihăilescu', 'se...",Scihub,article_not_in Scihub
3905,10.18647/3185/jjs-2014,https://doi.org/10.18647/3185/jjs-2014,And I shall dwell in their midst: God’s presen...,journal-article,2014-10-01,2014,Journal of Jewish Studies,"0022-2097,2056-6689",0022-2097,False,False,Journal of Jewish Studies,False,closed,[],2021-01-16T04:13:24.094656,"[{'given': 'Jeffrey R.', 'family': 'Woolf', 's...",Scihub,article_not_in Scihub
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11872,10.3366/para.2015.0158,https://doi.org/10.3366/para.2015.0158,A Vocabulary and Its Vicissitudes: Notes towar...,journal-article,2015-07-01,2015,Paragraph,"0264-8334,1750-0176",0264-8334,False,False,Edinburgh University Press,False,closed,[],2021-01-18T14:29:29.084337,"[{'given': 'Jeffrey', 'family': 'Mehlman', 'se...",Scihub,article_not_in Scihub
11873,10.3366/para.2016.0203,https://doi.org/10.3366/para.2016.0203,The Uses of Maurice Blanchot in Bernard Stiegl...,journal-article,2016-11-01,2016,Paragraph,"0264-8334,1750-0176",0264-8334,False,False,Edinburgh University Press,False,closed,[],2021-03-26T12:57:11.226693,"[{'given': 'Calum', 'family': 'Watt', 'sequenc...",Scihub,article_not_in Scihub
11897,10.1097/eja.0000000000001320,https://doi.org/10.1097/eja.0000000000001320,The ‘ephemeral’ intensive care units that save...,journal-article,2020-08-26,2020,European Journal of Anaesthesiology,"0265-0215,1365-2346",0265-0215,False,False,Ovid Technologies (Wolters Kluwer Health),False,closed,[],2021-03-03T21:20:59.201295,"[{'given': 'Franck', 'family': 'Verdonk', 'seq...",Scihub,article_not_in Scihub
11925,10.1108/imr.2010.03627aaa.001,https://doi.org/10.1108/imr.2010.03627aaa.001,Editorial,journal-article,2010-02-23,2010,International Marketing Review,0265-1335,0265-1335,False,False,Emerald,False,closed,[],2021-01-14T13:54:51.062465,"[{'given': 'Jeryl', 'family': 'Whitelock', 'se...",Scihub,article_not_in Scihub


In [29]:
Scihub_df[3000:10000][Scihub_df[3000:10000]["Scihub_results"]=="cant_read_pdf (Zero-Moscow)"]

Unnamed: 0,doi,doi_url,title,genre,published_date,year,journal_name,journal_issns,journal_issn_l,journal_is_oa,journal_is_in_doaj,publisher,is_oa,oa_status,oa_locations,updated,z_authors,Unpy_filter,Scihub_results
5040,10.1080/00288306.2010.500321,https://doi.org/10.1080/00288306.2010.500321,Current New Zealand mine drainage research,journal-article,2010-06-01,2010,New Zealand Journal of Geology and Geophysics,"0028-8306,1175-8791",0028-8306,False,False,Informa UK Limited,False,closed,[],2021-01-17T18:12:09.199948,"[{'given': 'J', 'family': 'Pope', 'sequence': ...",Scihub,cant_read_pdf (Zero-Moscow)
5041,10.1080/00288306.2013.772906,https://doi.org/10.1080/00288306.2013.772906,Quaternary shelf structures SE of the South Is...,journal-article,2013-06-01,2013,New Zealand Journal of Geology and Geophysics,"0028-8306,1175-8791",0028-8306,False,False,Informa UK Limited,False,closed,[],2021-01-17T07:22:13.184694,"[{'given': 'AR', 'family': 'Gorman', 'sequence...",Scihub,cant_read_pdf (Zero-Moscow)
5043,10.1080/00288306.2015.1077872,https://doi.org/10.1080/00288306.2015.1077872,Subsidence-driven environmental change in thre...,journal-article,2015-10-02,2015,New Zealand Journal of Geology and Geophysics,"0028-8306,1175-8791",0028-8306,False,False,Informa UK Limited,False,closed,[],2021-01-18T14:58:21.503475,"[{'given': 'Bruce W', 'family': 'Hayward', 'se...",Scihub,cant_read_pdf (Zero-Moscow)
5047,10.1080/00293652.2015.1039569,https://doi.org/10.1080/00293652.2015.1039569,Mikkel Sørensen:Technology and Tradition in th...,journal-article,2015-01-02,2015,Norwegian Archaeological Review,"0029-3652,1502-7678",0029-3652,False,False,Informa UK Limited,False,closed,[],2021-01-16T13:32:00.194747,"[{'given': 'Astrid J.', 'family': 'Nyland', 's...",Scihub,cant_read_pdf (Zero-Moscow)
5052,10.1093/notesj/gjq125,https://doi.org/10.1093/notesj/gjq125,"ANDREW MURPHY, Shakespeare for the People: Wor...",journal-article,2010-06-24,2010,Notes and Queries,"0029-3970,1471-6941",0029-3970,False,False,Oxford University Press (OUP),False,closed,[],2021-01-19T03:45:42.043515,"[{'given': 'L.', 'family': 'Howsam', 'sequence...",Scihub,cant_read_pdf (Zero-Moscow)
5698,10.1097/psy.0000000000000377,https://doi.org/10.1097/psy.0000000000000377,Low-Grade Inflammation and Ambulatory Cortisol...,journal-article,2017-02-01,2017,Psychosomatic Medicine,"1534-7796,0033-3174",0033-3174,False,False,Ovid Technologies (Wolters Kluwer Health),True,green,"[{'updated': None, 'url': 'https://europepmc.o...",2021-11-27T10:44:18.234423,"[{'given': 'Hannah M.C.', 'family': 'Schreier'...",Scihub,cant_read_pdf (Zero-Moscow)
5995,10.1111/rssc.12417,https://doi.org/10.1111/rssc.12417,A hybrid approach for the stratified mark‐spec...,journal-article,2020-05-22,2020,Journal of the Royal Statistical Society: Seri...,"0035-9254,1467-9876",0035-9254,False,False,Wiley,True,green,"[{'updated': '2021-12-16T12:06:07.709312', 'ur...",2021-08-02T13:34:54.316472,"[{'given': 'Yanqing', 'family': 'Sun', 'sequen...",Scihub,cant_read_pdf (Zero-Moscow)
6140,10.1053/j.seminhematol.2020.05.001,https://doi.org/10.1053/j.seminhematol.2020.05...,The virome in hematology—Stem cell transplanta...,journal-article,2020-01-01,2020,Seminars in Hematology,0037-1963,0037-1963,False,False,Elsevier BV,False,closed,[],2021-04-01T18:23:58.859952,[{'ORCID': 'http://orcid.org/0000-0001-7111-18...,Scihub,cant_read_pdf (Zero-Moscow)
6154,10.1093/sq/quz014,https://doi.org/10.1093/sq/quz014,The Dynamics of Inheritance on the Shakespeare...,journal-article,2019-06-01,2019,Shakespeare Quarterly,"0037-3222,1538-3555",0037-3222,False,False,Oxford University Press (OUP),False,closed,[],2021-06-21T14:32:52.432891,"[{'given': 'Margo', 'family': 'Kolenda-Mason',...",Scihub,cant_read_pdf (Zero-Moscow)
6287,10.1016/s0038-1101(13)00240-2,https://doi.org/10.1016/s0038-1101(13)00240-2,Editorial Board,journal-article,2013-09-01,2013,Solid-State Electronics,0038-1101,0038-1101,False,False,Elsevier BV,False,closed,[],2021-01-14T13:39:34.850822,,Scihub,cant_read_pdf (Zero-Moscow)


Initial analysis shows that the 3-10K loop was a success despite the long warning/error messages during execution.

Only 58 got Zero-Moscow error, which will be fixed below.

The date success rate seems to be again around 50%


In [24]:
Scihub_df.to_csv("15K_done_Scihub_df.csv")

In [55]:
%store Scihub_df

Stored 'Scihub_df' (DataFrame)


In [13]:
sh = SciHub()

In [9]:
Scihub_df.loc[Scihub_df.Scihub_results=="cant_read_pdf (Zero-Moscow)", "Scihub_results"] = Scihub_df.loc[Scihub_df.Scihub_results=="cant_read_pdf (Zero-Moscow)", "doi"].map(sh.get_dates)



In [11]:
Scihub_df.loc[Scihub_df.Scihub_results=="cant_read_pdf (Zero-Moscow)", :]

Unnamed: 0,doi,doi_url,title,genre,published_date,year,journal_name,journal_issns,journal_issn_l,journal_is_oa,journal_is_in_doaj,publisher,is_oa,oa_status,oa_locations,updated,z_authors,Unpy_filter,Scihub_results
5698,10.1097/psy.0000000000000377,https://doi.org/10.1097/psy.0000000000000377,Low-Grade Inflammation and Ambulatory Cortisol...,journal-article,2017-02-01,2017,Psychosomatic Medicine,"1534-7796,0033-3174",0033-3174,False,False,Ovid Technologies (Wolters Kluwer Health),True,green,"[{'updated': None, 'url': 'https://europepmc.o...",2021-11-27T10:44:18.234423,"[{'given': 'Hannah M.C.', 'family': 'Schreier'...",Scihub,cant_read_pdf (Zero-Moscow)
7037,10.1353/sub.2012.0010,https://doi.org/10.1353/sub.2012.0010,How To (Un)Globe the Earth in Four Easy Lessons,journal-article,2012-01-01,2012,SubStance,"0049-2426,1527-2095",0049-2426,False,False,Project Muse,False,closed,[],2021-01-18T22:49:18.155121,"[{'given': 'R.', 'family': 'Ghosh', 'sequence'...",Scihub,cant_read_pdf (Zero-Moscow)
7038,10.1353/sub.2014.0026,https://doi.org/10.1353/sub.2014.0026,Sovereignty's Ontological Indecision: Derrida ...,journal-article,2014-01-01,2014,SubStance,"0049-2426,1527-2095",0049-2426,False,False,Project Muse,False,closed,[],2021-01-15T02:54:35.260580,"[{'given': 'M.', 'family': 'Chrulew', 'sequenc...",Scihub,cant_read_pdf (Zero-Moscow)
9105,10.1097/sap.0000000000000959,https://doi.org/10.1097/sap.0000000000000959,Pediatric Hand Surgery in Global Health,journal-article,2017-02-01,2017,Annals of Plastic Surgery,0148-7043,0148-7043,False,False,Ovid Technologies (Wolters Kluwer Health),False,closed,[],2021-01-19T10:47:17.809953,"[{'given': 'Karen Y.', 'family': 'Chung', 'seq...",Scihub,cant_read_pdf (Zero-Moscow)


All but 4 Zero-Moscow's are fixed. However, it is seen that these 4 PDFs have a different font which is not-readable by both pdfminer and PyPdf2. These will not be fixed and will be left as an error. However, this will require an update on SciHub class.

Likewise, during this run it is seen that the errors raised during the mapping process are not exactly working as intended. Although the "retry"s fix the problem for the next DOI, the article at hand is a "lost cause" as all the variables for that specific loop stay the same. So, the error & retry processes need to be changed entirely, so the problematic DOI at hand is also fixed, and no retried unnecessarily.

scihub_upgraded

## Third Loop (15K - 20K)

In [9]:
for k in range(5):

    sh = SciHub()
    
    start = (k+15)*1000
    end = (k+16)*1000 
    doi_slice = Scihub_df[start:end].index.tolist()

    now = datetime.now()
    print(f"Starting loop {k+1}, start time: {now}")
    Scihub_df.loc[doi_slice,"Scihub_results"] = Scihub_df.loc[doi_slice,"doi"].map(sh.get_dates)
    now = datetime.now()
    print(f"{k+1} loop ended, endtime: {now}")
   

Starting loop 1, start time: 2022-01-31 16:03:37.656035




1 loop ended, endtime: 2022-01-31 16:53:23.746616
Starting loop 2, start time: 2022-01-31 16:53:23.750616




2 loop ended, endtime: 2022-01-31 17:43:56.142533
Starting loop 3, start time: 2022-01-31 17:43:56.147035




3 loop ended, endtime: 2022-01-31 18:36:04.841939
Starting loop 4, start time: 2022-01-31 18:36:04.845436




4 loop ended, endtime: 2022-01-31 19:27:08.726726
Starting loop 5, start time: 2022-01-31 19:27:08.730726




5 loop ended, endtime: 2022-01-31 20:16:40.794154


In [11]:
Scihub_df[15000:20000]["Scihub_results"].map(type).value_counts()

<class 'list'>    4856
<class 'str'>      144
Name: Scihub_results, dtype: int64

In [12]:
Scihub_df[15000:20000][Scihub_df[15000:20000]["Scihub_results"].map(type)==list]["Scihub_results"].map(len).value_counts()

3    2634
2    2222
Name: Scihub_results, dtype: int64

In [13]:
Scihub_df.to_csv("20K_Scihub_df.csv")

In [14]:
%store Scihub_df

Stored 'Scihub_df' (DataFrame)


First 20K of complete_one Scihub articles are completed. The rest 25K will be gathered from Çeto & Summan. In the meantime, run a loop for the failed direct url articles.

In [9]:
%store -r Unpy_one_fin

In [10]:
Unpy_suc_df = Unpy_one_fin.loc[Unpy_one_fin.Unpy_results.notna(),:].copy()

In [11]:
Unpy_suc_df.shape

(5966, 22)

In [13]:
Unpy_suc_df.to_csv("Unpy_suc_run_v2.csv")

In [3]:
Unpy_one_fin.Unpy_results.map(type).value_counts()

<class 'NoneType'>    11461
<class 'list'>         5966
Name: Unpy_results, dtype: int64

In [23]:
Unpy_fail_df = Unpy_one_fin.loc[Unpy_one_fin.Unpy_results.isna(),:].copy()

In [25]:
for k in range(12):

    sh = SciHub()
    
    start = (k)*1000
    end = (k+1)*1000

    if k == 11:
        doi_slice = Unpy_fail_df[start:].index.tolist()
    else:
        doi_slice = Unpy_fail_df[start:end].index.tolist()

    now = datetime.now()
    print(f"Starting loop {k+1}, start time: {now}")
    Unpy_fail_df.loc[doi_slice,"Scihub_results"] = Unpy_fail_df.loc[doi_slice,"doi"].map(sh.get_dates)
    now = datetime.now()
    print(f"{k+1} loop ended, endtime: {now}")
   

Starting loop 1, start time: 2022-01-31 23:46:03.984852




1 loop ended, endtime: 2022-02-01 00:46:52.189736
Starting loop 2, start time: 2022-02-01 00:46:52.195237




2 loop ended, endtime: 2022-02-01 01:53:49.334478
Starting loop 3, start time: 2022-02-01 01:53:49.339978




KeyboardInterrupt: 

In [27]:
Unpy_fail_df[:2000].Scihub_results.map(type).value_counts()

<class 'list'>    1705
<class 'str'>      295
Name: Scihub_results, dtype: int64

In [33]:
Unpy_fail_df[:2000][Unpy_fail_df[:2000].Scihub_results.map(type)== str]["Scihub_results"].value_counts()

article_not_in Scihub    288
cant_read_pdf              7
Name: Scihub_results, dtype: int64

Due to an error in domain list (sci-hub.ru), need to restart the loop from [2000:]


In [6]:
%store -r Unpy_fail_df

In [8]:
for k in range(2,12):

    sh = SciHub()
    
    start = (k)*1000
    end = (k+1)*1000

    if k == 11:
        doi_slice = Unpy_fail_df[start:].index.tolist()
    else:
        doi_slice = Unpy_fail_df[start:end].index.tolist()

    now = datetime.now()
    print(f"Starting loop {k+1}, start time: {now}")
    Unpy_fail_df.loc[doi_slice,"Scihub_results"] = Unpy_fail_df.loc[doi_slice,"doi"].map(sh.get_dates)
    now = datetime.now()
    print(f"{k+1} loop ended, endtime: {now}")
   

Starting loop 3, start time: 2022-02-01 09:24:35.486427




3 loop ended, endtime: 2022-02-01 10:19:10.040015
Starting loop 4, start time: 2022-02-01 10:19:10.044514




4 loop ended, endtime: 2022-02-01 11:28:23.920809
Starting loop 5, start time: 2022-02-01 11:28:23.923310




5 loop ended, endtime: 2022-02-01 12:14:37.506195
Starting loop 6, start time: 2022-02-01 12:14:37.510196




6 loop ended, endtime: 2022-02-01 13:06:13.020522
Starting loop 7, start time: 2022-02-01 13:06:13.026022




7 loop ended, endtime: 2022-02-01 13:58:19.051415
Starting loop 8, start time: 2022-02-01 13:58:19.055916




8 loop ended, endtime: 2022-02-01 14:57:14.495951
Starting loop 9, start time: 2022-02-01 14:57:14.500451




9 loop ended, endtime: 2022-02-01 16:09:30.152721
Starting loop 10, start time: 2022-02-01 16:09:30.154222




10 loop ended, endtime: 2022-02-01 17:37:36.663456
Starting loop 11, start time: 2022-02-01 17:37:36.667455




11 loop ended, endtime: 2022-02-01 18:44:29.498909
Starting loop 12, start time: 2022-02-01 18:44:29.500909




12 loop ended, endtime: 2022-02-01 19:30:32.901064


In [10]:
Unpy_fail_df.Scihub_results.map(type).value_counts()

<class 'list'>    10205
<class 'str'>      1256
Name: Scihub_results, dtype: int64

In [11]:
Unpy_fail_df[Unpy_fail_df.Scihub_results.map(type)==str]["Scihub_results"].value_counts()

article_not_in Scihub    1211
cant_read_pdf              41
direct_url_error            4
Name: Scihub_results, dtype: int64

In [14]:
Unpy_fail_df[Unpy_fail_df.Scihub_results.map(type)==list]["Scihub_results"].map(len).value_counts(normalize=True)

2    0.64831
3    0.35169
Name: Scihub_results, dtype: float64

In [17]:
Unpy_fail_df[Unpy_fail_df.Scihub_results=="article_not_in Scihub"]

Unnamed: 0,doi,doi_url,title,genre,published_date,year,journal_name,journal_issns,journal_issn_l,journal_is_oa,...,oa_status,oa_locations,updated,z_authors,Unpy_filter,Unpy_results,OK_link,Rec_date,Acc_date,Scihub_results
21,10.1590/0001-3675202020180560,https://doi.org/10.1590/0001-3675202020180560,Size at onset of sexual maturity in Macrobrach...,journal-article,2020-01-01,2020,Anais da Academia Brasileira de Ciências,"1678-2690,0001-3765",0001-3765,True,...,gold,"[{'updated': '2021-12-15T13:16:50.729105', 'ur...",2021-07-19T13:49:59.142924,[{'ORCID': 'http://orcid.org/0000-0003-2461-46...,[http://www.scielo.br/j/aabc/a/dzJK5RVbSRJCQPN...,,,,,article_not_in Scihub
33,10.2308/accr.2010.85.2.749,https://doi.org/10.2308/accr.2010.85.2.749,Editorial Policy and Style Information,journal-article,2010-03-01,2010,The Accounting Review,"0001-4826,1558-7967",0001-4826,False,...,bronze,"[{'updated': '2021-12-05T16:44:22.487022', 'ur...",2021-12-05T16:44:43.153000,,[https://meridian.allenpress.com/accounting-re...,,,,,article_not_in Scihub
69,10.4310/acta.2017.v218.n1.a1,https://doi.org/10.4310/acta.2017.v218.n1.a1,Tits geometry and positive curvature,journal-article,2017-01-01,2017,Acta Mathematica,"0001-5962,1871-2509",0001-5962,True,...,gold,"[{'updated': '2019-11-21T08:13:18.325427', 'ur...",2021-09-15T23:20:42.591771,"[{'given': 'Fuquan', 'family': 'Fang', 'sequen...",[https://www.intlpress.com/site/pub/files/_ful...,,,,,article_not_in Scihub
70,10.4310/acta.2018.v221.n1.a5,https://doi.org/10.4310/acta.2018.v221.n1.a5,Isoperimetric characterization of upper curvat...,journal-article,2018-01-01,2018,Acta Mathematica,"0001-5962,1871-2509",0001-5962,True,...,gold,"[{'updated': '2019-10-16T03:57:34.414624', 'ur...",2021-02-18T20:19:49.581593,"[{'given': 'Alexander', 'family': 'Lytchak', '...",[https://www.intlpress.com/site/pub/files/_ful...,,,,,article_not_in Scihub
71,10.4310/acta.2019.v222.n2.a1,https://doi.org/10.4310/acta.2019.v222.n2.a1,Bogoliubov theory in the Gross–Pitaevskii limit,journal-article,2019-01-01,2019,Acta Mathematica,"0001-5962,1871-2509",0001-5962,True,...,gold,"[{'updated': '2021-01-04T02:24:23.113166', 'ur...",2021-09-07T04:56:39.296485,"[{'given': 'Chiara', 'family': 'Boccato', 'seq...",[https://www.intlpress.com/site/pub/files/_ful...,,,,,article_not_in Scihub
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64001,10.28991/esj-2020-01246,https://doi.org/10.28991/esj-2020-01246,Analysis of Host Resources Utilization by Open...,journal-article,2020-12-01,2020,Emerging Science Journal,2610-9182,2610-9182,True,...,gold,"[{'updated': '2021-12-29T15:36:11.375406', 'ur...",2021-02-04T19:24:03.926555,[{'ORCID': 'http://orcid.org/0000-0001-8271-66...,[https://ijournalse.org/index.php/ESJ/article/...,,,,,article_not_in Scihub
64002,10.32609/j.ruje.6.50824,https://doi.org/10.32609/j.ruje.6.50824,Prospects of the Chinese market for Russian ag...,journal-article,2020-03-25,2020,Russian Journal of Economics,"2405-4739,2618-7213",2405-4739,True,...,gold,"[{'updated': '2021-12-29T15:36:12.924745', 'ur...",2021-02-04T01:07:20.367706,"[{'given': 'Natalia', 'family': 'Karlova', 'se...",[https://zenodo.org/record/3737361/files/RUJEC...,,,,,article_not_in Scihub
64003,10.22162/2619-0990-2019-45-5-855-871,https://doi.org/10.22162/2619-0990-2019-45-5-8...,Импортная поливная керамика Маджара как источн...,journal-article,2019-12-11,2019,Oriental Studies,"2619-0990,2619-1008",2619-0990,True,...,gold,"[{'updated': '2021-12-29T15:36:14.611364', 'ur...",2021-02-04T11:31:51.203349,[{'ORCID': 'http://orcid.org/0000-0001-9776-36...,[https://kigiran.elpub.ru/jour/article/downloa...,,,,,article_not_in Scihub
64004,10.22162/2619-0990-2020-49-3-779-789,https://doi.org/10.22162/2619-0990-2020-49-3-7...,Мотив «богатырский поединок (сражение)» и его ...,journal-article,2020-12-24,2020,Oriental Studies,"2619-0990,2619-1008",2619-0990,True,...,gold,"[{'updated': '2021-12-29T15:36:16.164657', 'ur...",2021-03-15T19:59:58.607082,[{'ORCID': 'http://orcid.org/0000-0002-2113-68...,[https://kigiran.elpub.ru/jour/article/downloa...,,,,,article_not_in Scihub


In [15]:
Unpy_fail_df.to_csv("Unpy_scihub_run.csv")

In [16]:
%store Unpy_fail_df

Stored 'Unpy_fail_df' (DataFrame)


MANUAL GET_DATES TESTING:

In [37]:
base_url = "https://sci-hub.se/"
doi = '10.1097/psy.0000000000000377'
doi2 = "10.1353/sub.2012.0010"
doi4 = "10.1097/sap.0000000000000959"

total_url = base_url + doi4

sh_req = requests.get(total_url)
sh_soup = BeautifulSoup(sh_req.content, "html.parser")
iframe = sh_soup.find("embed", {"id": "pdf"})["src"]


In [38]:
iframe

'https://twin.sci-hub.se/6624/192e4f171d0db2136e32b26a6dc80550/chung2017.pdf#navpanes=0&view=FitH'

In [39]:
pdf_req = requests.get(iframe)


In [40]:
pdf_io = io.BytesIO(pdf_req.content)


In [41]:
pdf_read = extract_text(pdf_io,page_numbers=[]).replace("\n", "")


RecursionError: maximum recursion depth exceeded

In [34]:
pdf2_read = pdf2.PdfFileReader(pdf_io)

In [36]:
pdf2_read.getPage(1)

{'/TrimBox': [0, 0, 432, 648],
 '/Resources': {'/Font': {'/C2_0': {'/Encoding': '/Identity-H',
    '/ToUnicode': {'/Filter': '/FlateDecode'},
    '/Subtype': '/Type0',
    '/Type': '/Font',
    '/DescendantFonts': [IndirectObject(57, 0)],
    '/BaseFont': '/FGDIQC+Palatino-Roman'},
   '/TT4': {'/Encoding': '/WinAnsiEncoding',
    '/ToUnicode': {'/Filter': '/FlateDecode'},
    '/Widths': [250,
     0,
     0,
     0,
     0,
     0,
     0,
     0,
     0,
     0,
     0,
     0,
     250,
     333,
     250,
     0,
     0,
     0,
     0,
     0,
     0,
     0,
     0,
     0,
     0,
     0,
     0,
     0,
     0,
     0,
     0,
     0,
     0,
     722,
     611,
     667,
     778,
     611,
     556,
     722,
     0,
     333,
     0,
     0,
     556,
     944,
     778,
     0,
     611,
     0,
     667,
     556,
     611,
     778,
     0,
     944,
     0,
     0,
     0,
     0,
     0,
     0,
     0,
     0,
     0,
     444,
     0,
     407,
     500,
     389,
    

In [19]:
sh.get_dates("10.1016/j.jinsphys.2011.12.018")

['https://sci.bban.top/pdf/10.1016/j.jinsphys.2011.12.018.pdf#view=FitH',
 'no_date_found']

TESTING AREA:

In [14]:
from retry import retry

In [15]:
@retry((TypeError), tries=2, delay=10)
@retry((ValueError), tries=2, delay=15)
def bokbok(inpo):
    if inpo == 1:
        raise TypeError("1")
    elif inpo == 2:
        raise ValueError("2")

In [17]:
bokbok(1)




TypeError: 1

In [18]:
sh.get_dates("askjdhsakd")

ReadTimeout: HTTPSConnectionPool(host='sci-hub.wf', port=443): Read timed out. (read timeout=15)

In [37]:
Scihub_df[:2000]["Scihub_results"].map(type).value_counts()

<class 'list'>    1870
<class 'str'>      130
Name: Scihub_results, dtype: int64

In [40]:
Scihub_df[:2000][Scihub_df[:2000]["Scihub_results"].map(type)==str]["Scihub_results"].value_counts()

article_not_in Scihub    130
Name: Scihub_results, dtype: int64

In [38]:
first_run_df[:2000]["Scihub_results"].map(type).value_counts()

<class 'list'>    1393
<class 'str'>      607
Name: Scihub_results, dtype: int64

In [67]:
first_run_df[:2000][first_run_df[:2000]["Scihub_results"].map(type)==str]["Scihub_results"].value_counts()

cant_open_pdf            493
article_not_in Scihub    114
Name: Scihub_results, dtype: int64

In [76]:
run2_niS = Scihub_df[:2000][Scihub_df[:2000]["Scihub_results"] == "article_not_in Scihub"]

In [77]:
run1_niS = first_run_df[:2000][first_run_df[:2000]["Scihub_results"] == "article_not_in Scihub"]

In [84]:
run2_niS.drop(run1_niS.index.tolist())

Unnamed: 0,doi,doi_url,title,genre,published_date,year,journal_name,journal_issns,journal_issn_l,journal_is_oa,journal_is_in_doaj,publisher,is_oa,oa_status,oa_locations,updated,z_authors,Unpy_filter,Scihub_results
98,10.1080/00016357.2020.1859132,https://doi.org/10.1080/00016357.2020.1859132,Association between periodontal disease and in...,journal-article,2020-12-28,2020,Acta Odontologica Scandinavica,"0001-6357,1502-3850",0001-6357,False,False,Informa UK Limited,False,closed,[],2021-01-14T04:41:27.117245,[{'ORCID': 'http://orcid.org/0000-0002-9180-47...,Scihub,article_not_in Scihub
170,10.1017/s0001972020000790,https://doi.org/10.1017/s0001972020000790,"Devaka Premawardhana, Faith in Flux: Pentecost...",journal-article,2020-11-01,2020,Africa,"0001-9720,1750-0184",0001-9720,False,False,Cambridge University Press (CUP),False,closed,[],2021-01-15T18:07:51.993677,"[{'given': 'Karen', 'family': 'Lauterbach', 's...",Scihub,article_not_in Scihub
360,10.5344/ajev.2020.19050,https://doi.org/10.5344/ajev.2020.19050,Meteorological-Based Modeling of δ18O Values f...,journal-article,2020-03-13,2020,American Journal of Enology and Viticulture,0002-9254,0002-9254,False,False,American Society for Enology and Viticulture,True,green,"[{'updated': '2021-11-04T16:17:24.522123', 'ur...",2021-04-03T19:37:59.421062,"[{'given': 'Monica', 'family': 'Bononi', 'sequ...",Scihub,article_not_in Scihub
460,10.1086/710566,https://doi.org/10.1086/710566,Syndicate Women: Gender and Networks in Chicag...,journal-article,2020-11-01,2020,American Journal of Sociology,"0002-9602,1537-5390",0002-9602,False,False,University of Chicago Press,False,closed,[],2021-02-17T01:05:24.865823,"[{'given': 'Anya', 'family': 'Degenshein', 'se...",Scihub,article_not_in Scihub
487,10.2138/am-2017-6003,https://doi.org/10.2138/am-2017-6003,Study on structure variations of incommensurat...,journal-article,2017-06-01,2017,American Mineralogist,0003-004X,0003-004X,False,False,Mineralogical Society of America,False,closed,[],2021-01-18T10:24:07.169653,"[{'given': 'Shiyun', 'family': 'Jin', 'sequenc...",Scihub,article_not_in Scihub
798,10.1145/3366018,https://doi.org/10.1145/3366018,On the Complexity of Cache Analysis for Differ...,journal-article,2019-12-06,2019,Journal of the ACM,"0004-5411,1557-735X",0004-5411,False,False,Association for Computing Machinery (ACM),True,green,"[{'updated': '2021-11-15T12:26:08.679996', 'ur...",2021-12-07T07:22:25.123843,"[{'given': 'David', 'family': 'Monniaux', 'seq...",Scihub,article_not_in Scihub
799,10.1145/3398745,https://doi.org/10.1145/3398745,A Simple and Approximately Optimal Mechanism f...,journal-article,2020-08-13,2020,Journal of the ACM,"0004-5411,1557-735X",0004-5411,False,False,Association for Computing Machinery (ACM),True,green,"[{'updated': '2020-07-31T04:08:45.953478', 'ur...",2021-05-17T07:08:09.853340,"[{'given': 'Moshe', 'family': 'Babaioff', 'seq...",Scihub,article_not_in Scihub
1059,10.1093/aesthj/ayaa043,https://doi.org/10.1093/aesthj/ayaa043,White Negroes: When Cornrows Were in Vogue … a...,journal-article,2020-12-17,2020,The British Journal of Aesthetics,"0007-0904,1468-2842",0007-0904,False,False,Oxford University Press (OUP),False,closed,[],2021-08-15T10:25:10.954326,"[{'given': 'Nadia', 'family': 'Mehdi', 'sequen...",Scihub,article_not_in Scihub
1155,10.1017/s000712342000006x,https://doi.org/10.1017/s000712342000006x,An ‘Institution-First’ Conception of Public In...,journal-article,2020-11-17,2020,British Journal of Political Science,"0007-1234,1469-2112",0007-1234,False,False,Cambridge University Press (CUP),False,closed,[],2021-01-16T15:00:42.884252,[{'ORCID': 'http://orcid.org/0000-0001-5759-91...,Scihub,article_not_in Scihub
1177,10.1111/1468-4446.12739,https://doi.org/10.1111/1468-4446.12739,"What do stress tests test? Experimentation, de...",journal-article,2020-02-12,2020,The British Journal of Sociology,"0007-1315,1468-4446",0007-1315,False,False,Wiley,False,closed,[],2021-01-19T07:15:54.012699,"[{'given': 'Nathan', 'family': 'Coombs', 'sequ...",Scihub,article_not_in Scihub


In [118]:
sh = SciHub()

sh.get_dates("10.1093/ehr/ceaa223")



TypeError: unsupported operand type(s) for +: 'NoneType' and 'str'

In [128]:
base_url = 'https://sci-hub.ru/'
doi = "10.1093/ehr/ceaa223"

total_url = base_url + doi

sh_req = requests.get(total_url)
sh_soup = BeautifulSoup(sh_req.content, "html.parser")
iframe = sh_soup.find("iframe", {"id": "pdf"})["src"]



TypeError: 'NoneType' object is not callable

In [131]:
iframe = sh_soup.find("iframe", {"id": "pdf"})["src"]

TypeError: 'NoneType' object is not subscriptable

In [133]:

iframe = sh_soup.find("embed", {"id": "pdf"})["src"]

In [134]:
iframe

'//sci-hub.ru/downloads/2021-05-26/3553/oup-accepted-manuscript-2020.pdf#navpanes=0&view=FitH'

In [None]:
if iframe.startswith("//"):
    iframe = 'http:' + iframe

pdf_req = requests.get(iframe)
pdf_io = io.BytesIO(pdf_req.content)
pdf_read = extract_text(pdf_io,page_numbers=[]).replace("\n", "")

print(pdf_read[:10])