# Apollo Energy Data Scraping

<br>

### Imports

In [75]:
import json
import pandas as pd

import os
import re
import requests
import FEAutils as hlp
from warnings import warn
from dotenv import load_dotenv

from IPython.display import JSON

In [83]:
load_dotenv('.env')

webhook_url = os.getenv('SLACK_WEBHOOK_URL')

In [77]:
def get_price_data():
    """Scrape the current UK energy price tables into a nested dict.

    Every HTML table found on the prices page is re-indexed by its first
    column, transposed and converted with ``DataFrame.to_dict()``, giving
    ``{first_column_value: {other_column_name: value}}`` per table. The
    converted tables are then paired, in page order, with the market labels
    'Power', 'Gas' and 'Brent & Coal'.

    Returns:
        dict: market label -> converted price table.
    """
    prices_url = 'https://www.apolloenergy.co.uk/news/current-uk-energy-prices'
    markets = ['Power', 'Gas', 'Brent & Coal']

    converted_tables = []
    for table in pd.read_html(prices_url):
        label_column = table.columns[0]
        converted_tables.append(table.set_index(label_column).T.to_dict())

    # zip stops at the shorter sequence, so surplus tables or labels are dropped
    return {market: prices for market, prices in zip(markets, converted_tables)}

# Fetch the latest prices; on failure fall back to an error payload so the
# cell still yields something JSON-renderable.
# NOTE: handle_error_message is defined in a later cell of this notebook, so
# that cell must already have been executed for the fallback to work.
try:
    price_data = get_price_data()
except Exception:
    # `except Exception` (rather than a bare `except:`) lets KeyboardInterrupt
    # and SystemExit propagate instead of being reported as scraping failures.
    message = 'The latest price data could not be retrieved'
    price_data = handle_error_message(message, webhook_url)

JSON(price_data)

<IPython.core.display.JSON object>

In [79]:
def create_analysis_url(date):
    """Build the market analysis page URL for ``date``.

    Args:
        date: Anything ``pd.to_datetime`` can parse (e.g. a 'YYYY-MM-DD'
            string or a Timestamp).

    Returns:
        str: The analysis URL, with the date rendered as DD-MM-YYYY.
    """
    formatted_date = pd.to_datetime(date).strftime('%d-%m-%Y')
    return f"https://www.apolloenergy.co.uk/news/energy-market-analysis-{formatted_date}"

def get_analysis_date(now=None):
    """Return the date of the most recent weekday as a 'YYYY-MM-DD' string.

    On Saturday and Sunday the date is rolled back to the preceding Friday;
    Monday to Friday return the current date unchanged.

    Args:
        now: Optional reference moment (anything ``pd.Timestamp`` accepts).
            Defaults to ``pd.Timestamp.now()``.

    Returns:
        str: The analysis date formatted as '%Y-%m-%d'.
    """
    # Sample the clock once so the weekday and the date cannot disagree
    now = pd.Timestamp.now() if now is None else pd.Timestamp(now)
    current_weekday = now.weekday()  # Monday == 0 ... Sunday == 6

    # Saturday (5) -> 1 day back, Sunday (6) -> 2 days back, weekdays -> 0.
    # Clamping at zero gives weekdays a defined result instead of leaving
    # the return value unassigned.
    days_to_offset = max(current_weekday - 4, 0)

    return (now - pd.Timedelta(days=days_to_offset)).strftime('%Y-%m-%d')

def handle_error_message(message, webhook_url=None):
    """Report an error and return it wrapped in a JSON-style dict.

    A Python warning is always issued for ``message``. When ``webhook_url``
    is given, the message is also passed to ``hlp.send_slack_msg``.

    Args:
        message: Text describing what went wrong.
        webhook_url: Optional webhook URL; ``None`` skips the Slack call.

    Returns:
        dict: ``{'message': message}``.
    """
    warn(message)

    slack_enabled = webhook_url is not None
    if slack_enabled:
        hlp.send_slack_msg(message, webhook_url)

    return {'message': message}

def extract_market_analysis(analysis_url):
    """Scrape a market analysis page into a nested dict of commentary.

    The first HTML table on the page supplies two columns (read here as power
    and gas). Each column is consumed as: entry 0 = market name, entries 1 and
    3 = section headings, entries 2 and 4 = the matching section text.
    The second table's top-left cell holds the Brent commentary as one string,
    which is split on its section headings.

    Args:
        analysis_url: URL (or other input accepted by ``pd.read_html``) of the
            analysis page.

    Returns:
        dict: ``{market_name: {section_heading: section_text}}`` for the two
        columns of the first table, plus a ``'Brent'`` entry.

    Raises:
        IndexError: If the page has fewer than two tables, or a column of the
            first table has fewer than five entries.
    """
    # Download and parse the page once and reuse the result, rather than
    # issuing a separate request for every piece of data.
    tables = pd.read_html(analysis_url)
    commentary_table, brent_table = tables[0], tables[1]

    power_data = commentary_table.iloc[:, 0].to_list()
    gas_data = commentary_table.iloc[:, 1].to_list()

    brent_analysis_txt = brent_table.iloc[0, 0]
    brent_sections = ['Brent Summary', '1-year forward prices']
    # Joining with ' |' gives the pattern 'Brent Summary |1-year forward prices':
    # split on either heading and drop the empty pieces the split leaves behind.
    brent_pattern = ' |'.join(brent_sections)
    brent_content = [
        elem.strip()
        for elem in re.split(brent_pattern, brent_analysis_txt)
        if elem != ''
    ]
    # Pieces are paired with the headings in order of appearance
    brent_data = dict(zip(brent_sections, brent_content))

    market_analysis = {
        power_data[0]: {
            power_data[1]: power_data[2],
            power_data[3]: power_data[4],
        },
        gas_data[0]: {
            gas_data[1]: gas_data[2],
            gas_data[3]: gas_data[4],
        },
        'Brent': brent_data,
    }

    return market_analysis

def clean_market_analysis(market_analysis):
    """Strip mis-encoded characters and boilerplate from the analysis text.

    Every section string in the nested ``{market: {section: text}}`` dict has
    the substitutions below applied in order. The dict is modified in place
    and the same object is returned.

    Args:
        market_analysis: Nested dict of market -> section -> text.

    Returns:
        dict: ``market_analysis`` itself, with cleaned section text.
    """
    # (target, replacement) pairs, applied in this order
    substitutions = (
        ('â\x80\x98', '\''),
        ('â\x80\x99', '\''),
        ('Â', ''),
        ("Today's prices can also be found in an easy to read table on our 'current UK energy price' page.", ''),
    )

    for sections in market_analysis.values():
        for heading in sections:
            text = sections[heading]
            for target, replacement in substitutions:
                text = text.replace(target, replacement)
            # Only existing keys are reassigned, so iterating the dict stays safe
            sections[heading] = text

    return market_analysis

def retrieve_cleaned_market_analysis(webhook_url=None):
    """Fetch, parse and clean the market analysis for the current analysis date.

    The page URL is built from ``get_analysis_date()``. If the page cannot be
    requested successfully, the failure is reported through
    ``handle_error_message`` and its payload is returned instead.

    Args:
        webhook_url: Optional webhook URL forwarded to
            ``handle_error_message`` when the page is unavailable.

    Returns:
        dict: The cleaned ``{market: {section: text}}`` analysis, or
        ``{'message': ...}`` when the page could not be retrieved.
    """
    date = get_analysis_date()
    analysis_url = create_analysis_url(date)

    try:
        # Checks the page can be retrieved. The timeout stops a stalled
        # connection from hanging the scrape indefinitely.
        requests.get(analysis_url, timeout=30).raise_for_status()
    except Exception:
        # `except Exception` keeps the best-effort behaviour for request
        # failures while letting KeyboardInterrupt/SystemExit propagate.
        message = f'A market analysis page could not be found for {date}'
        return handle_error_message(message, webhook_url)

    market_analysis = extract_market_analysis(analysis_url)
    market_analysis = clean_market_analysis(market_analysis)

    return market_analysis

# Retrieve the cleaned analysis; any failure while fetching or parsing is
# turned into an error payload so the cell still yields something
# JSON-renderable.
try:
    market_analysis = retrieve_cleaned_market_analysis(webhook_url)
except Exception:
    # The analysis date only exists inside retrieve_cleaned_market_analysis,
    # so the message must not reference it here (doing so raised NameError
    # from within this handler).
    message = 'The latest market analysis page could not be retrieved/processed'
    market_analysis = handle_error_message(message, webhook_url)

JSON(market_analysis)

<IPython.core.display.JSON object>