# Carbon Brief Scraper Development

<br>

### Imports

In [1]:
import json
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt 
import seaborn as sns

import requests
from bs4 import BeautifulSoup

import datetime
import re

from ipypb import track
from IPython.core.display import display, HTML, JSON

<br>

### Categories

Retrieving a webpage

In [2]:
def request_CB_category_page(category='science'):
    CB_url = f'https://www.carbonbrief.org/category/{category}'

    headers = {
        'user-agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36',
    }

    r = requests.get(CB_url, headers=headers)
    
    return r

r = request_CB_category_page()

r

<Response [200]>

<br>

We'll then extract the top-level articles

In [3]:
def extract_topcat(soup):
    topcats = soup.findAll('div', {'class':'ePostC topCat'})
    assert len(topcats) == 1, 'More than one topcat was found'
    topcat = topcats[0]

    article = dict()

    article['date'] = pd.to_datetime(re.sub('\s\s+' , ' ', topcat.find('div', {'class':'dateCat'}).text).strip().replace('.', '')).strftime('%Y-%m-%d')
    article['category'] = topcat.find('div', {'class':'catDate'}).text.split('|')[0].replace('\n', '').strip()
    article['title'] = topcat.find('h3').text.replace('\n', '')
    article['article_url'] = topcat.find('h3').find('a')['href']
    article['image_url'] = topcat.find('img')['src']

    return article

soup = BeautifulSoup(r.content)

articles = list()
topcat_article = extract_topcat(soup)
articles += [topcat_article]

articles

[{'date': '2020-10-30',
  'category': 'Guest posts',
  'title': 'Guest post: How declining ice in clouds makes high ‘climate sensitivity’ plausible',
  'article_url': 'https://www.carbonbrief.org/guest-post-how-declining-ice-in-clouds-makes-high-climate-sensitivity-plausible',
  'image_url': 'https://www.carbonbrief.org/wp-content/uploads/2020/10/Cumulus-clouds-seen-from-above-over-the-English-Channel-583x372.jpg'}]

<br>

Then extract the mid-level articles

In [4]:
def extract_midcat_data(midcat):
    cat_date = midcat.find('div', {'class':'catDate'})
    category, date = cat_date.text.split('|')

    article = dict()
    
    article['date'] = pd.to_datetime(date.replace('.', '').strip()).strftime('%Y-%m-%d')
    article['category'] = category
    article['title'] = midcat.find('h3').text.replace('\n', '')
    article['article_url'] = midcat.find('h3').find('a')['href']
    article['image_url'] = midcat.find('img')['src']

    return article

midcats = soup.findAll('div', {'class':'ePost3'})
print(f'There were {len(midcats)} articles found in the middle section')

for midcat in midcats:
    midcat_article = extract_midcat_data(midcat)
    articles += [midcat_article]

midcat_article

There were 6 articles found in the middle section


{'date': '2020-09-18',
 'category': 'Webinars ',
 'title': 'Webinar: Do we need to stop eating meat and dairy to tackle climate change?',
 'article_url': 'https://www.carbonbrief.org/webinar-do-we-need-to-stop-eating-meat-and-dairy-to-tackle-climate-change',
 'image_url': 'https://www.carbonbrief.org/wp-content/uploads/2020/09/Food-webinar-583x372.png'}

<br>

Then extract the bottom-level articles

In [5]:
bottomcats = soup.find('div', {'class':'ePostC catSmll'}).findAll('div', {'class':'col-md-6'})
print(f'There were {len(bottomcats)} articles found in the bottom section')

for bottomcat in bottomcats:
    bottomcat_article = extract_midcat_data(bottomcat)
    articles += [bottomcat_article]
    
bottomcat_article

There were 6 articles found in the bottom section


{'date': '2020-09-23',
 'category': 'Extreme weather ',
 'title': 'US sees ‘alarming’ increase in combined heatwaves and droughts',
 'article_url': 'https://www.carbonbrief.org/us-sees-alarming-increase-in-combined-heatwaves-and-droughts',
 'image_url': 'https://www.carbonbrief.org/wp-content/uploads/2020/09/Firefighters-watch-the-intense-flames-at-the-Pine-Gulch-Fire-Colorado-107x71.jpg'}

<br>

We'll then put these all together in a a single function

In [6]:
def response_to_articles(r):
    articles = list()
    soup = BeautifulSoup(r.content)

    ## Topcat
    topcat_article = extract_topcat(soup)
    articles += [topcat_article]

    ## Midcats
    midcats = soup.findAll('div', {'class':'ePost3'})

    for midcat in midcats:
        midcat_article = extract_midcat_data(midcat)
        articles += [midcat_article]

    ## Bottomcats
    bottomcats = soup.find('div', {'class':'ePostC catSmll'}).findAll('div', {'class':'col-md-6'})

    for bottomcat in bottomcats:
        bottomcat_article = extract_midcat_data(bottomcat)
        articles += [bottomcat_article]
        
    return articles

articles = response_to_articles(r)

JSON(articles)

<IPython.core.display.JSON object>

<br>

We can iterate over all of the available categories as well convert the resulting dictionary into a dataframe

In [7]:
def retrieve_all_current_articles():
    articles = list()
    sections = ['science', 'energy', 'policy']

    for section in sections:
        r = request_CB_category_page(section)

        section_articles = response_to_articles(r)
        [article.update({'section':section}) for article in section_articles]
        articles += section_articles
        
    return articles

articles = retrieve_all_current_articles()
    
df_articles = (pd
               .DataFrame(articles)
               .drop_duplicates())

df_articles.head()

Unnamed: 0,date,category,title,article_url,image_url,section
0,2020-10-30,Guest posts,Guest post: How declining ice in clouds makes ...,https://www.carbonbrief.org/guest-post-how-dec...,https://www.carbonbrief.org/wp-content/uploads...,science
1,2020-10-29,Features,Mapped: How climate change disproportionately ...,https://www.carbonbrief.org/mapped-how-climate...,https://www.carbonbrief.org/wp-content/uploads...,science
2,2020-10-23,State of the climate,State of the climate: 2020 on course to be war...,https://www.carbonbrief.org/state-of-the-clima...,https://www.carbonbrief.org/wp-content/uploads...,science
3,2020-10-19,Guest posts,Guest post: Demand for cooling is blind spot f...,https://www.carbonbrief.org/guest-post-demand-...,https://www.carbonbrief.org/wp-content/uploads...,science
4,2020-09-21,Sea ice,Arctic sea ice shrinks to second-lowest summer...,https://www.carbonbrief.org/arctic-sea-ice-shr...,https://www.carbonbrief.org/wp-content/uploads...,science


<br>

We can also work back from the article dictionaries to the original HTML

In [8]:
article_to_html_str = lambda article: f'<br><div class="ePost3"><a href="{article["article_url"]}"><img src="{article["image_url"]}" alt=""></a><h3 class="deTitle"><a href="{article["article_url"]}">{article["title"]}</a></h3><div class="catDate"><a href="https://www.carbonbrief.org/category/{article["category"].replace(" ", "-")}">{article["category"]}</a> | {article["date"]}</div></div><br>'

article = df_articles.iloc[0]
article_str = article_to_html_str(article)

display(HTML(article_str))

<br>

### Daily Brief

Carbon Brief also provides a daily briefing that combines headlines from a number of news sources. Before we can retrieve it we need to identify the relevant url

In [10]:
def get_daily_briefing_url(dt=pd.Timestamp.now()):
    daily_briefing_request_url = 'https://www.carbonbrief.org/wp-admin/admin-ajax.php'

    days_to_go_back = max(dt.dayofweek-4, 0)
    briefing_dt = (dt-pd.Timedelta(days=days_to_go_back)).strftime('%Y-%m-%d')

    data = {
        'action': 'getbriefurl',
        'nonce': '9e4988efb4',
        'date': briefing_dt
    }

    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
    }

    r = requests.post(daily_briefing_request_url, data=data, headers=headers)
    daily_briefing_url = r.text
    
    if daily_briefing_url == '':
        daily_briefing_url = get_daily_briefing_url(pd.Timestamp.now()-pd.Timedelta(days=1))
    
    return daily_briefing_url

get_daily_briefing_url()

'https://www.carbonbrief.org/daily-brief/macron-hails-chance-to-make-our-planet-great-again-after-biden-win'

<br>

We can then make the request to the relevant page

In [10]:
def request_CB_daily_brief_page():
    CB_url = get_daily_briefing_url()

    headers = {
        'user-agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36',
    }

    r = requests.get(CB_url, headers=headers)
    
    return r

r = request_CB_daily_brief_page()

r

<Response [200]>

<br>

Then extract the relevant information

In [12]:
def extract_daily_briefing():
    r = request_CB_daily_brief_page()
    
    content = BeautifulSoup(r.content).find('div', {'class':'innerArt'})
    daily_briefing = dict()
    
    daily_briefing['title'] = content.find('div', {'class': 'miscTitle'}).text.title()
    daily_briefing['headlines'] = [elem.text.replace('\n', ' ') for elem in content.find('div', {'class': 'dailyheadlinesbox'}).findAll('li')]

    return daily_briefing

daily_briefing = extract_daily_briefing()

daily_briefing

{'title': 'Macron Hails Chance To ‘Make Our Planet Great Again’ After Biden Win',
 'headlines': ["Macron hails chance to 'make our planet great again' after Biden win",
  'Government targets two million new UK green jobs by 2030',
  'Unhaltable global warming claim withdrawn by Scientific Reports journal',
  'Samsung financial units pledge to halt coal investments',
  'Rising levels of CO2 increasing extreme weather events in Australia, report finds',
  'When will electricity companies finally quite natural gas?',
  'Adapting to the challenges of warming',
  'Hitting net zero is not enough – we must restore the climate',
  "Ma Jun: China has started to 'walk the walk' on climate crisis",
  'How aerosols and greenhouse gases influence the diurnal temperature range']}

<br>

We'll wrap this in a function that returns a raw string to us

In [13]:
def daily_briefing_url_to_text():
    daily_briefing = extract_daily_briefing()
    daily_briefing_text = f"{daily_briefing['title']}\n\n* {(chr(10)+'* ').join(daily_briefing['headlines'])}"
    
    return daily_briefing_text

print(daily_briefing_url_to_text())

Macron Hails Chance To ‘Make Our Planet Great Again’ After Biden Win

* Macron hails chance to 'make our planet great again' after Biden win
* Government targets two million new UK green jobs by 2030
* Unhaltable global warming claim withdrawn by Scientific Reports journal
* Samsung financial units pledge to halt coal investments
* Rising levels of CO2 increasing extreme weather events in Australia, report finds
* When will electricity companies finally quite natural gas?
* Adapting to the challenges of warming
* Hitting net zero is not enough – we must restore the climate
* Ma Jun: China has started to 'walk the walk' on climate crisis
* How aerosols and greenhouse gases influence the diurnal temperature range


<br>

### All Articles

We can use the website's search functionality to identify all historical articles

In [13]:
def search_CB_articles(page=0, items=100):
    search_url = 'https://www.carbonbrief.org/wp-admin/admin-ajax.php'

    form_data = {
        'currpage' : page,
        'offset' : items,
        'items' : items,
        'layout' : 'large',
        'action' : 'loadmore',
        'allowsorting' : 'true',

    }

    headers = {
        'user-agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36',
    }

    r = requests.get(search_url, params=form_data, headers=headers)
    
    return r

r = search_CB_articles()

r

<Response [200]>

<br>

The response is a JSON that contains the HTML content for each article

In [14]:
r_page_html = r.json()['content']
r_page_soup = BeautifulSoup(r_page_html)

page_articles = r_page_soup.findAll('div', {'class':'esubDtls'})
article_soup = page_articles[0]

print(article_soup.prettify())

<div class="esubDtls carbonfullwidth">
 <h3>
  <a href="https://www.carbonbrief.org/daily-brief/macron-hails-chance-to-make-our-planet-great-again-after-biden-win">
   Macron hails chance to ‘make our planet great again’ after Biden win
  </a>
 </h3>
 <p>
  French president Emmanuel Macron has welcomed US president-elect Joe Biden’s pledge to rejoin the Paris climate deal on taking office by saying countries now have the chance to…
  <a class="normalLink" href="https://www.carbonbrief.org/daily-brief/macron-hails-chance-to-make-our-planet-great-again-after-biden-win">
   Read More
  </a>
 </p>
 <div class="catDate">
  <a href="https://www.carbonbrief.org/daily-brief/macron-hails-chance-to-make-our-planet-great-again-after-biden-win">
   Daily Briefing
  </a>
  | 13.11.20
 </div>
</div>


<br>

We'll define a function that extracts the relevant high-level information from an article into a dictionary

In [15]:
def search_article_to_data(article_soup):
    article = dict()

    cat_date = article_soup.find('div', {'class':'catDate'})
    cat_dates = cat_date.text.split('|')
    
    if len(cat_dates) >= 2:
        category, date = cat_dates[-2:]
    else:
        category, date = np.nan, cat_dates[0]

    article['date'] = pd.to_datetime(date.replace('.', '-').strip(), format='%d-%m-%y').strftime('%Y-%m-%d')
    article['category'] = category
    article['title'] = article_soup.find('h3').text.replace('\n', '')
    article['article_url'] = article_soup.find('h3').find('a')['href']

    return article

article = search_article_to_data(article_soup)

article

{'date': '2020-11-13',
 'category': 'Daily Briefing ',
 'title': 'Macron hails chance to ‘make our planet great again’ after Biden win',
 'article_url': 'https://www.carbonbrief.org/daily-brief/macron-hails-chance-to-make-our-planet-great-again-after-biden-win'}

<br>

We'll extract this metadata for the first 25 pages of search results

In [24]:
all_articles = list()

for page in track(range(25)):
    try:
        r = search_CB_articles(page)

        r_page_html = r.json()['content']
        r_page_soup = BeautifulSoup(r_page_html)

        page_articles = r_page_soup.findAll('div', {'class':'esubDtls'})
        all_articles += page_articles
        
    except:
        pass
    
print(f'{len(all_articles)} articles were returned')

2500 articles were returned


<br>

Then convert it into a dataframe

In [25]:
%%time

all_article_data = [search_article_to_data(article) for article in all_articles]
df_all_article_data = pd.DataFrame(all_article_data)

df_all_article_data.head()

Wall time: 635 ms


Unnamed: 0,date,category,title,article_url
0,2020-11-13,Daily Briefing,Macron hails chance to ‘make our planet great ...,https://www.carbonbrief.org/daily-brief/macron...
1,2020-11-12,Daily Briefing,"Biden launches transition teams of Obama alum,...",https://www.carbonbrief.org/daily-brief/biden-...
2,2020-11-11,Daily Briefing,Biden talks climate in calls with foreign leaders,https://www.carbonbrief.org/daily-brief/biden-...
3,2020-11-10,Media analysis,Media reaction: What Joe Biden’s US election v...,https://www.carbonbrief.org/media-reaction-wha...
4,2020-11-10,Daily Briefing,World’s largest coal producer warns of bankrup...,https://www.carbonbrief.org/daily-brief/worlds...


<br>

### Scraping Article Content

We'll subset just the science articles for this example

In [26]:
s_science_article_urls = df_all_article_data.loc[df_all_article_data['category']=='Science ', 'article_url']

s_science_article_urls

96      https://www.carbonbrief.org/vacancy-science-jo...
1591    https://www.carbonbrief.org/donald-trumps-budg...
Name: article_url, dtype: object

<br>

Just as we did for the daily briefing we'll create a wrapper that converts article data into plain text

In [29]:
def article_url_to_text(article_url):
    headers = {
        'user-agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36',
    }
    
    r = requests.get(article_url, headers=headers)
    article_content = BeautifulSoup(r.content).find('div', {'class':'innerArt'}).findAll('p', recursive=False)

    article_text = ''.join([article_content_item.text for article_content_item in article_content if hasattr(re.search('[a-zA-Z0-9_]', article_content_item.text), 'start')])
    article_text = article_text.encode('ascii', 'replace').decode().replace('?', ' ')

    return article_text

article_url = s_science_article_urls.iloc[0]
article_text = article_url_to_text(article_url)

num_words_to_print = 500
print(' '.join(article_text.split(' ')[:num_words_to_print]))

This vacancy is now closed.This is an exciting opportunity to become Carbon Brief s new science journalist, helping us to analyse and report climate change and society s response to it.Working closely with the Science Editor and the rest of the team, you ll report on the latest scientific developments relevant to climate change. This could range from disentangling claims in the media about heatwaves and climate change, to summarising the latest study on Arctic sea ice loss, or working with our multimedia journalist on an infographic about wildfires.Carbon Brief s award-winning science coverage, data journalism and analysis is respected by scientists, journalists and policymakers around the world. You ll be comfortable conveying the point of a story while representing the subtleties of scientific information in a clear and engaging way. With a strong grounding in science, you ll be meticulous about accuracy and fact-checking. EssentialDesirableLocation: Carbon Brief s office is in centr

<br>

We can then iterate through them and save all of the raw text in a dictionary

In [34]:
articles = {}

for idx, article_url in track(s_science_article_urls.items(), total=s_science_article_urls.shape[0]):
    article_text = article_url_to_text(article_url)
    articles[idx] = article_text
    
JSON([articles])

<IPython.core.display.JSON object>