In [23]:
import requests
import time
from bs4 import BeautifulSoup
import concurrent.futures
import pandas as pd
import functools

In [24]:
def create_url(page):
    """Return the nature.com/nmat article-listing URL for ``page``.

    Args:
        page: Page number (anything ``str()`` can render) appended as the
            value of the ``page`` query parameter.

    Returns:
        The full listing URL as a string.
    """
    listing_prefix = "https://www.nature.com/nmat/articles?searchType=journalSearch&sort=PubDate&type=article&page="
    return f"{listing_prefix}{page!s}"

In [25]:
def retrieve_page(url, timeout=30):
    """Scrape one article-listing page and every article it links to.

    Downloads the listing at ``url``, finds each anchor carrying
    ``data-track-action="view article"``, then downloads the linked article
    page and extracts its abstract, publication line and first author.

    Args:
        url: Address of the listing page to scrape.
        timeout: Seconds ``requests`` waits on each HTTP request before
            giving up. Without it a stalled connection would block the
            calling worker thread forever.

    Returns:
        A 5-tuple of parallel lists ``(links, titles, abstracts, biblios,
        authors)`` with one entry per article anchor that has an ``href``.

    Raises:
        Any exception raised by ``requests.get`` (including timeouts)
        propagates to the caller.
    """
    links = []
    titles = []
    abstracts = []
    biblios = []
    authors = []

    listing_html = requests.get(url, timeout=timeout).text
    listing_soup = BeautifulSoup(listing_html, "lxml")

    for link in listing_soup.findAll('a', {'data-track-action': 'view article'}):
        relative_href = link.get('href')
        if not relative_href:
            # An anchor without an href cannot be followed; concatenating
            # None would raise TypeError, so skip it and keep lists aligned.
            continue

        href = 'https://www.nature.com' + relative_href
        title = link.text.replace('\n', '').lstrip()

        # Parse the article page once and share the soup between extractors.
        article_html = requests.get(href, timeout=timeout).text
        soup_link = BeautifulSoup(article_html, "lxml")

        links.append(href)
        titles.append(title)
        abstracts.append(get_single_article_abstract(soup_link, href))
        biblios.append(get_single_article_datepublished(soup_link, href))
        authors.append(get_single_article_firstauthor(soup_link, href))

    return links, titles, abstracts, biblios, authors

In [26]:
def get_single_article_abstract(soup, article_url):
    """Return the abstract text found in an already-parsed article page.

    Args:
        soup: Parsed article page; must provide ``findAll``.
        article_url: URL of the article. Not used by this function.

    Returns:
        The ``.text`` of the last ``div`` whose id is ``Abs1-content`` or
        ``abstract-content``, or ``""`` when no such element exists.
    """
    matches = soup.findAll('div', {'id': ['Abs1-content', 'abstract-content']})
    if not matches:
        return ""
    # When several elements match, the last one wins.
    return matches[-1].text

In [27]:
def get_single_article_datepublished(soup, article_url):
    """Return the publication line found in an already-parsed article page.

    Args:
        soup: Parsed article page; must provide ``findAll``.
        article_url: URL of the article. Not used by this function.

    Returns:
        The ``.text`` of the last ``a`` element whose href is
        ``#article-info``, or ``""`` when no such element exists.
    """
    matches = soup.findAll('a', {'href': "#article-info"})
    if not matches:
        return ""
    # When several anchors match, the last one wins.
    return matches[-1].text

In [28]:
def get_single_article_firstauthor(soup, article_url):
    """Return the first-author text found in an already-parsed article page.

    Args:
        soup: Parsed article page; must provide ``findAll``.
        article_url: URL of the article. Not used by this function.

    Returns:
        The ``.text`` of the last ``a`` element whose href is ``#auth-1``,
        or ``""`` when no such element exists.
    """
    matches = soup.findAll('a', {'href': "#auth-1"})
    if not matches:
        return ""
    # When several anchors match, the last one wins.
    return matches[-1].text

In [29]:
# Listing URLs for pages 1 through 25 (the range end is exclusive).
urls = list(map(create_url, range(1, 26)))

In [30]:
def concat(x, y):
    """Combine two equally structured sequences position by position.

    Args:
        x: Sequence whose length determines how many positions are combined.
        y: Sequence indexed at the same positions as ``x``.

    Returns:
        A list whose ``i``-th item is ``x[i] + y[i]``.

    Raises:
        IndexError: If ``y`` is shorter than ``x``.
    """
    return [left + y[position] for position, left in enumerate(x)]

In [31]:
%%time
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
    result = list(executor.map(retrieve_page, urls))

CPU times: user 5min 14s, sys: 13.9 s, total: 5min 28s
Wall time: 7min 56s


In [18]:
# Merge the per-page 5-tuples into five long lists, one per field.
result_concat = functools.reduce(concat, result)
# Transpose the field lists so that each article becomes one row.
d = [list(row) for row in zip(*result_concat)]
result_df = pd.DataFrame(d)
result_df.head()

Unnamed: 0,0,1,2,3,4
0,https://www.nature.com/articles/s41563-019-0423-3,Quantum jamming transition to a correlated ele...,Distinct many-body states may be created under...,Published: 15 July 2019,Yaroslav A. Gerasimenko
1,https://www.nature.com/articles/s41563-019-0421-5,Spatially dispersive circular photogalvanic ef...,Weyl semimetals (WSMs) are gapless topological...,Published: 15 July 2019,Zhurun Ji
2,https://www.nature.com/articles/s41563-019-0422-4,Predictive model of hydrogen trapping and bubb...,"The interplay between hydrogen and nanovoids, ...",Published: 15 July 2019,Jie Hou
3,https://www.nature.com/articles/s41563-019-0427-z,Intermediate-sized molecular sieving of styren...,Molecular sieving can lead to ultrahigh select...,Published: 15 July 2019,Dong-Dong Zhou
4,https://www.nature.com/articles/s41563-019-0415-3,Formation of two-dimensional transition metal ...,Two-dimensional (2D) materials have attracted ...,Published: 08 July 2019,Juan Yang


In [19]:
# Display the assembled frame as this cell's output.
result_df

Unnamed: 0,0,1,2,3,4
0,https://www.nature.com/articles/s41563-019-0423-3,Quantum jamming transition to a correlated ele...,Distinct many-body states may be created under...,Published: 15 July 2019,Yaroslav A. Gerasimenko
1,https://www.nature.com/articles/s41563-019-0421-5,Spatially dispersive circular photogalvanic ef...,Weyl semimetals (WSMs) are gapless topological...,Published: 15 July 2019,Zhurun Ji
2,https://www.nature.com/articles/s41563-019-0422-4,Predictive model of hydrogen trapping and bubb...,"The interplay between hydrogen and nanovoids, ...",Published: 15 July 2019,Jie Hou
3,https://www.nature.com/articles/s41563-019-0427-z,Intermediate-sized molecular sieving of styren...,Molecular sieving can lead to ultrahigh select...,Published: 15 July 2019,Dong-Dong Zhou
4,https://www.nature.com/articles/s41563-019-0415-3,Formation of two-dimensional transition metal ...,Two-dimensional (2D) materials have attracted ...,Published: 08 July 2019,Juan Yang
5,https://www.nature.com/articles/s41563-019-0418-0,Scalable in operando strain tuning in nanophot...,The quest for an integrated quantum optics pla...,Published: 08 July 2019,Joel Q. Grim
6,https://www.nature.com/articles/s41563-019-0412-6,Regioselective generation and reactivity contr...,Subnanometric metal species (single atoms and ...,Published: 01 July 2019,Lichen Liu
7,https://www.nature.com/articles/s41563-019-0398-0,Resorcinol–formaldehyde resins as metal-free s...,Artificial photosynthesis is a critical challe...,Published: 01 July 2019,Yasuhiro Shiraishi
8,https://www.nature.com/articles/s41563-019-0416-2,High-resolution remote thermometry and thermog...,Although metal-halide perovskites have recentl...,Published: 01 July 2019,Sergii Yakunin
9,https://www.nature.com/articles/s41563-019-0407-3,Shape-encoded dynamic assembly of mobile micro...,Field-directed and self-propelled colloidal as...,Published: 24 June 2019,Yunus Alapan


In [21]:
# Replace the default integer column labels with descriptive names.
result_df.columns = [
    'links',
    'titles',
    'abstracts',
    'date_published',
    'author',
]

In [22]:
# Preview the first rows to confirm the new column labels.
result_df.head()

Unnamed: 0,links,titles,abstracts,date_published,author
0,https://www.nature.com/articles/s41563-019-0423-3,Quantum jamming transition to a correlated ele...,Distinct many-body states may be created under...,Published: 15 July 2019,Yaroslav A. Gerasimenko
1,https://www.nature.com/articles/s41563-019-0421-5,Spatially dispersive circular photogalvanic ef...,Weyl semimetals (WSMs) are gapless topological...,Published: 15 July 2019,Zhurun Ji
2,https://www.nature.com/articles/s41563-019-0422-4,Predictive model of hydrogen trapping and bubb...,"The interplay between hydrogen and nanovoids, ...",Published: 15 July 2019,Jie Hou
3,https://www.nature.com/articles/s41563-019-0427-z,Intermediate-sized molecular sieving of styren...,Molecular sieving can lead to ultrahigh select...,Published: 15 July 2019,Dong-Dong Zhou
4,https://www.nature.com/articles/s41563-019-0415-3,Formation of two-dimensional transition metal ...,Two-dimensional (2D) materials have attracted ...,Published: 08 July 2019,Juan Yang


In [45]:
# Write the scraped table as CSV text to a file named "complete_result"
# (no extension) in the current working directory.
result_df.to_csv(path_or_buf="complete_result")