# Energy Live News

<br>

### Imports

In [1]:
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from IPython.core.display import JSON

In [2]:
def topic_to_bbc_url(topic):
    """Return the BBC News topic page URL for a topic path.

    `topic` is inserted verbatim after 'https://www.bbc.co.uk/news/topics/',
    e.g. 'cdl8n2edl43t/energy-industry'.
    """
    return f'https://www.bbc.co.uk/news/topics/{topic}'

url = topic_to_bbc_url('cdl8n2edl43t/energy-industry')
# requests applies no timeout unless one is given, so bound the wait
# to stop a stalled connection from hanging the cell indefinitely.
r = requests.get(url, timeout=10)

r

<Response [200]>

In [3]:
# Name the parser explicitly so the parse does not depend on which
# optional parsers happen to be installed alongside bs4.
soup = BeautifulSoup(r.content, 'html.parser')

# Collect every <article> element found in the page.
articles_soup = soup.find_all('article')

len(articles_soup)

20

In [4]:
def extract_article_data(soup):
    """Extract the display fields of one article element, best-effort.

    Parameters
    ----------
    soup : object
        An element supporting ``find`` / ``find_all`` whose returned nodes
        expose ``.text`` and ``[...]`` attribute access (in this notebook,
        one ``<article>`` element parsed by BeautifulSoup).

    Returns
    -------
    dict
        Whichever of the keys 'date', 'title', 'lead', 'article_url' and
        'image_url' could be extracted. A field whose extractor raises
        (for example because the expected tag is absent) is left out
        instead of failing the whole article.
    """
    base_url = 'https://www.bbc.co.uk'

    attr_to_extractor_func = {
        # Text of the last <span> inside <time>, formatted as 'YYYY-MM-DD HH:MM'
        'date': lambda tag: pd.to_datetime(tag.find('time').find_all('span')[-1].text).strftime('%Y-%m-%d %H:%M'),
        # Text of the first <a> in the element
        'title': lambda tag: tag.find('a').text,
        # Text of the last <p> in the element
        'lead': lambda tag: tag.find_all('p')[-1].text,
        # urljoin resolves site-relative hrefs against the BBC host and leaves
        # an already-absolute href intact instead of prefixing the host twice
        'article_url': lambda tag: urljoin(base_url, tag.find('a')['href']),
        'image_url': lambda tag: tag.find('img')['src'],
    }

    article = {}

    for attr, extractor_func in attr_to_extractor_func.items():
        try:
            article[attr] = extractor_func(soup)
        except Exception:
            # Skip only the field that failed. Catching Exception rather than
            # using a bare except lets KeyboardInterrupt/SystemExit propagate.
            continue

    return article

# Try the extractor on the first <article> element collected above
article = extract_article_data(articles_soup[0])

article

{'date': '2020-11-11 17:21',
 'title': "Offshore projects restart 'could take three years'",
 'article_url': 'https://www.bbc.co.uk/news/uk-scotland-scotland-business-54905615',
 'image_url': 'https://ichef.bbci.co.uk/live-experience/cps/320/cpsprodpb/CAC3/production/_114670915_capture.png'}

In [5]:
def response_to_articles(r):
    """Parse a response body into a list of article dicts.

    Parameters
    ----------
    r : object
        Response whose ``content`` attribute holds the page HTML (in this
        notebook, the result of ``requests.get``).

    Returns
    -------
    list of dict
        One ``extract_article_data`` result per ``<article>`` element, in the
        order ``find_all`` returns them; empty when none are found.
    """
    # Name the parser explicitly so the parse does not depend on which
    # optional parsers happen to be installed alongside bs4.
    soup = BeautifulSoup(r.content, 'html.parser')

    return [extract_article_data(article_soup) for article_soup in soup.find_all('article')]

# Run the helper on the response `r` fetched earlier in the notebook
articles = response_to_articles(r)
    
# Show the result through IPython's JSON display object (list wrapped in an outer list)
JSON([articles])

<IPython.core.display.JSON object>

In [6]:
def retrieve_all_current_articles(topics=('cdl8n2edl43t/energy-industry', 'cx1m7zg0gpet/renewable-energy')):
    """Fetch each topic page and return all of its articles, tagged by section.

    Parameters
    ----------
    topics : iterable of str
        Topic paths as accepted by ``topic_to_bbc_url``, e.g.
        'cdl8n2edl43t/energy-industry'. The default is an immutable tuple so
        it cannot be altered between calls.

    Returns
    -------
    list of dict
        The article dicts from ``response_to_articles`` for every topic, in
        topic order, each with an added 'section' key. 'section' is the second
        '/'-separated part of the topic path, or the whole path when it
        contains no '/'.
    """
    articles = []

    for topic in topics:
        # '<id>/<slug>' -> '<slug>'; a path with no '/' is used as-is rather
        # than raising IndexError.
        topic_parts = topic.split('/')
        topic_readable = topic_parts[1] if len(topic_parts) > 1 else topic

        # requests applies no timeout unless one is given; bound the wait so
        # one stalled topic page cannot hang the whole run.
        topic_r = requests.get(topic_to_bbc_url(topic), timeout=10)

        for article in response_to_articles(topic_r):
            article['section'] = topic_readable
            articles.append(article)

    return articles

# Fetch the default topics; this rebinds the notebook-level `articles` name
articles = retrieve_all_current_articles()
    
# Show the result through IPython's JSON display object (list wrapped in an outer list)
JSON([articles])

<IPython.core.display.JSON object>