In [112]:
import requests
from bs4 import BeautifulSoup
import re
import json
import pandas

# Resolve DOI metadata

In [122]:
# Ask the DOI resolver for the article's metadata as CSL-JSON; the Accept
# header selects that representation.
doi = '10.1029/2020JD033923'
doi_url = 'https://doi.org/{doi}'.format(doi=doi)
headers = {'Accept': 'application/vnd.citationstyles.csl+json'}
ret = requests.get(doi_url, headers=headers, timeout=60)
# Fail loudly on an HTTP error (unknown DOI, throttling, ...) rather than
# letting .json() trip over an error page further down.
ret.raise_for_status()
doi_json = ret.json()

## Pulling out the title

In [116]:
# Read the article title from the 'title' key of the metadata payload and
# display it as the cell output.
title = doi_json['title']
title

'Modeling the Radiative Effect on Microphysics in Cirrus Clouds Against Satellite Observations'

## Pulling out authors ... dataframes for the win

In [121]:
# Build the author table directly from the 'author' records of the metadata:
# one row per record, one column per key.
author = pandas.DataFrame(doi_json['author'])
author

Unnamed: 0,ORCID,authenticated-orcid,given,family,sequence,affiliation
0,http://orcid.org/0000-0002-2947-4790,False,Xiping,Zeng,first,[{'name': 'Army Research Laboratory Adelphi M...
1,http://orcid.org/0000-0001-5897-7023,False,Jie,Gong,additional,[{'name': 'NASA Goddard Space Flight Center G...
2,http://orcid.org/0000-0001-8660-3387,False,Xiaowen,Li,additional,[{'name': 'NASA Goddard Space Flight Center G...
3,http://orcid.org/0000-0002-3490-9437,False,Dong L.,Wu,additional,[{'name': 'NASA Goddard Space Flight Center G...


## Getting the URL of where the doc lives

In [None]:
# CSL-JSON spells the landing-page field in upper case ('URL'), so a plain
# doi_json['url'] lookup raises KeyError. The lower-case key is still tried
# first so a payload that does provide it behaves exactly as before.
url = doi_json['url'] if 'url' in doi_json else doi_json['URL']

# Retrieving the HTML

In [96]:
# We need to forge the header to pretend to be a normal browser rather than a script.
session = requests.Session()
user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0'
session.headers['User-Agent'] = user_agent
# Bound the request so a stalled server cannot hang the kernel, and stop on
# an HTTP error instead of parsing an error page as if it were the article.
ret = session.get(url, timeout=60)
ret.raise_for_status()

# Parsing the HTML to extract section titles

In [97]:
# Name the parser explicitly: without it bs4 picks whichever parser happens to
# be installed (and warns about it), so the parse tree can differ between
# machines. 'html.parser' ships with Python, so no extra dependency is needed.
soup = BeautifulSoup(ret.text, 'html.parser')

In [98]:
# Collect the <h2> headings carrying the top-level section class string.
# NOTE(review): the class names presumably mirror the publisher's current page
# markup, and a multi-class string is (per the bs4 docs) compared against the
# whole class attribute, so class order matters — confirm if results are empty.
section_class = "article-section__title section__title section1"
sections = soup.find_all('h2', class_=section_class)

# Same lookup one level down: <h3> headings carrying the sub-section classes.
subsection_class = 'article-section__sub-title section2'
subsections = soup.find_all('h3', class_=subsection_class)

In [99]:
# Turn each section heading into a clean title without its section number.
section_titles = []
for section in sections:
    # get_text() flattens all text inside the heading; for a heading that
    # holds a single text node this is the same as section.contents[0].
    section_title = section.get_text()
    # Raw string: a bare "\d" in a normal literal is an invalid escape sequence.
    # The pattern is anchored at the start so only the leading section number
    # ("1 ", "2.3 ", "4. ") is removed and digits inside the title survive.
    section_title = re.sub(r"^\s*\d+(?:\.\d+)*\.?\s*", "", section_title)
    section_titles.append(section_title.strip())
section_titles

['Introduction',
 'Case Study',
 'REM Modeling',
 'Modeling Against Observational Statistics',
 'Discussions',
 'Conclusions',
 'Acknowledgments',
 'Two Radiative Ratios']

# We can also get the text that follows a section heading

In [80]:
# Concatenate, with no separator, the text of every <p> found under the
# parent element of the first section heading.
text = ''.join(
    paragraph.text for paragraph in sections[0].parent.find_all('p')
)
text

"Optically thin and thick ice clouds in the upper troposphere affect the Earth's radiation oppositely: the former warms the surface with their inefficient blocking of incoming solar radiation, whereas the latter cools the surface with their strong reflection of solar radiation back to space (Wall & Hartmann,\xa02018; Zeng et\xa0al.,\xa02009). However, these ice clouds are not represented well in the current weather and climate models (Jiang et\xa0al.,\xa02012, 2015; Klein et\xa0al., 2009; Zhang et\xa0al., 2005). For example, the Weather Research and Forecasting model with several sophisticated microphysics parameterization schemes, compared to the W-band radar observations, tends to generate excessive thick anvil clouds and insufficient thin cirrus clouds (Franklin et\xa0al.,\xa02016; Powell et\xa0al.,\xa02012; Zeng et\xa0al.,\xa02013). The global models have a more serious bias in the same direction (Barahona et\xa0al.,\xa02013; Eidhammer et\xa0al.,\xa02017; Pincus et\xa0al.,\xa02012)