# Market Data
> Updated NEM data

- toc: true 
- badges: false
- comments: true
- categories: [jupyter]
- image: images/chart-preview.png

In [12]:
#hide_input
import os
import re
from collections import namedtuple
from datetime import datetime
from datetime import date
from datetime import timedelta
from io import BytesIO
from zipfile import ZipFile

import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import requests
from IPython.display import display
from plotly.subplots import make_subplots
from tqdm import tqdm

In [13]:
#hide_input
## GLOBAL VARIABLES ##
# Reference dates (local time) used to pick out today's and yesterday's records.
current_date = datetime.now().date()
yesterday = current_date - timedelta(days=1)

# Australian states/territories and their capital cities, listed in matching order.
states = "QLD NSW VIC SA TAS ACT WA NT".split()
capital_cities = [
    "Brisbane",
    "Sydney",
    "Melbourne",
    "Adelaide",
    "Hobart",
    "Canberra",
    "Perth",
    "Darwin",
]

In [14]:
#hide_input
# Stamp the rendered page with the local time of this notebook run.
print('Last ran: ', f"{datetime.now():%Y-%m-%d %H:%M:%S}")

Last ran:  2021-02-03 13:03:29


# NEM Price Data

In [15]:
#hide_input
# NEMWeb location: reports are listed under <base_url>/<section>/<dataset_name>/.
base_url = "http://www.nemweb.com.au"
section = "Reports/CURRENT"

# Time-window bounds: local midnight today, and a far-future sentinel date.
start_date = datetime.combine(datetime.today().date(), datetime.min.time())
end_date = datetime(3000, 12, 25)

# Description of one NEMWeb "current" dataset: where it lives, how its files
# are named/timestamped, and which tables are read from it.
CurrentDataset = namedtuple(
    "NemwebCurrentFile",
    "dataset_name nemfile_pattern datetime_format datetime_column tables",
)

DATASETS = dict(
    pd7day_gpg=CurrentDataset(
        "PD7DAY",                                        # dataset_name
        "PUBLIC_PD7DAY_GPG_([0-9]{14})_[0-9]{16}.zip",   # nemfile_pattern
        "%Y%m%d%H%M%S",                                  # datetime_format
        "INTERVAL_DATETIME",                             # datetime_column
        ['GPG_PRICESOLUTION'],                           # tables
    ),
)

# The dataset this notebook works with.
dataset = DATASETS['pd7day_gpg']

In [16]:
#hide_input
class ZipFileStreamer(ZipFile):
    """A ZipFile that can hand back an archive member as an in-memory byte stream."""

    def __init__(self, filename):
        """Open the archive and record how many members it holds in `member_count`."""
        super().__init__(filename)
        self.member_count = len(self.infolist())

    def extract_stream(self, member):
        """Return the content of `member` wrapped in a BytesIO object.

        `member` may be the member's full name or a ZipInfo object.
        """
        payload = self.read(member)
        return BytesIO(payload)






# Seconds to wait for NEMWeb before failing, so a stalled server cannot hang the notebook.
REQUEST_TIMEOUT = 60

# --- 1. Find the newest PD7DAY GPG file in the NEMWeb directory listing ------
page = requests.get("{0}/{1}/{2}/".format(base_url, section, dataset.dataset_name),
                    timeout=REQUEST_TIMEOUT)
page.raise_for_status()

regex = re.compile("/{0}/{1}/{2}".format(section,
                                         dataset.dataset_name,
                                         dataset.nemfile_pattern))

# Only the last link in the listing is kept (taken to be the most recent file).
# TODO: re-attach the per-file loop so the daily reports show how the PD7DAY
#       forecast has changed throughout the day.
final_match = None
for final_match in tqdm(regex.finditer(page.text)):
    pass

if final_match is None:
    raise RuntimeError("No PD7DAY GPG files found in the listing at {0}".format(page.url))

# The first regex group is the 14-digit timestamp embedded in the file name.
file_datetime = datetime.strptime(final_match.group(1), dataset.datetime_format)
if not (start_date < file_datetime < end_date):
    raise RuntimeError(
        "Latest PD7DAY file is stamped {0}, outside the expected window ({1} to {2})".format(
            file_datetime, start_date, end_date))

# --- 2. Download the zip and read its single CSV member into memory ----------
response = requests.get("{0}{1}".format(base_url, final_match.group(0)),
                        timeout=REQUEST_TIMEOUT)
response.raise_for_status()

with ZipFileStreamer(BytesIO(response.content)) as zipfile:
    if zipfile.member_count != 1:
        raise RuntimeError(
            "Expected exactly one file in the archive, found {0}".format(zipfile.member_count))
    filename = zipfile.namelist()[0]    # name of the csv file we want
    nemfile_lines = zipfile.extract_stream(filename).readlines()   # list of bytes, one per row

# --- 3. Collect the PRICESOLUTION rows, grouped by table ---------------------
price_lines = [line for line in nemfile_lines if b'PRICESOLUTION' in line]

# Each row starts with a record type, followed by two fields that together name
# the table: an "I" row is that table's header and "D" rows are its data.
# Header and data rows are concatenated into one CSV byte string per table.
table_dict = {}
for line in price_lines:
    rows = line.decode().split(',')
    table = "{0}_{1}".format(rows[1], rows[2])

    if rows[0] == "I":        # new table
        table_dict[table] = line
    elif rows[0] == "D":      # append data to its table
        table_dict[table] += line

# --- 4. Parse every collected table and prepare the price frame --------------
price_dict = {name: pd.read_csv(BytesIO(csv_bytes)) for name, csv_bytes in table_dict.items()}

price_table = dataset.tables[0]
if price_table not in price_dict:
    raise RuntimeError("Table {0} not found in {1}".format(price_table, filename))

price_frame = price_dict[price_table]
price_frame['INTERVAL_DATETIME'] = pd.to_datetime(price_frame['INTERVAL_DATETIME'], format='%Y/%m/%d %H:%M:%S')
price_frame['INTERVAL_DATE'] = price_frame['INTERVAL_DATETIME'].dt.date     # extract date only for use in pivot

180it [00:00, 17878.11it/s]


## Averaged Daily Prices
Note that the first (current) day and the last day shown are partial days; do not use these figures for pricing / modelling.  
Click on individual states in chart to add / remove them for closer inspection

In [17]:
#hide_input
# Average RRP per delivery date and region, laid out with one column per region.
# IMPORTANT: the first and last dates are incomplete days, so their averages
# should probably be dropped before the figures are used.
price_pivot = (
    pd.pivot_table(price_frame,
                   values='RRP',
                   index=['INTERVAL_DATE', 'REGIONID', 'RUN_DATETIME'],
                   aggfunc='mean')   # string alias; passing np.mean is deprecated in recent pandas
    .reset_index()
    .pivot(index='INTERVAL_DATE', columns='REGIONID', values='RRP')
    .round(2)
)

# Tag every row with the forecast run time, taken positionally from the first
# record so the lookup does not depend on the frame's index labels.
price_pivot['RUN_DATETIME'] = price_frame['RUN_DATETIME'].iloc[0]
print(price_pivot)

REGIONID        NSW1   QLD1     SA1    TAS1    VIC1         RUN_DATETIME
INTERVAL_DATE                                                           
2021-02-03     29.01  29.81   36.53   32.45   18.23  2021/02/03 07:30:00
2021-02-04     87.53  84.64   95.62   62.98   65.93  2021/02/03 07:30:00
2021-02-05     79.04  75.97 -157.07   52.05   48.23  2021/02/03 07:30:00
2021-02-06     45.13  55.54 -276.83  216.99 -219.95  2021/02/03 07:30:00
2021-02-07     30.04  76.11 -162.74   14.43   -4.58  2021/02/03 07:30:00
2021-02-08     48.09  63.36   -7.82  285.57   11.02  2021/02/03 07:30:00
2021-02-09     44.26  44.79   67.70  383.62   27.73  2021/02/03 07:30:00
2021-02-10     33.84  33.11   39.84   38.47   31.79  2021/02/03 07:30:00


In [41]:
#hide_input
## Plotting using altair ##
import altair as alt

# Regions offered in the clickable legend on the left of the chart.
regions = pd.DataFrame({'REGIONID': ['NSW1', 'QLD1', 'SA1', 'VIC1', 'TAS1']})

# Subset of the price data used for plotting: timestamp, region and price only.
plot_frame = (
    price_frame
    .set_index('INTERVAL_DATETIME')
    .copy()[['REGIONID', 'RRP']]
    .reset_index()
)

# Clicking a region in the legend selects it; unselected regions are greyed out.
selection = alt.selection_multi(fields=['REGIONID'])
color = alt.condition(selection, alt.Color('REGIONID:N'), alt.value('lightgray'))

make_selector = (
    alt.Chart(regions)
    .mark_rect()
    .encode(y='REGIONID', color=color)
    .add_selection(selection)
)

# Line chart of RRP over time, filtered to the regions currently selected.
nem_price_chart = (
    alt.Chart(plot_frame)
    .mark_line()
    .encode(x='INTERVAL_DATETIME', y=alt.Y('RRP'), color='REGIONID')
    .transform_filter(selection)
    .properties(width=650, height=400, title='30 Min NEM Prices - Interactive Chart')
    .interactive()
)

make_selector | nem_price_chart

# Weather Data

In [19]:
#hide_input
import psycopg2

# Connection settings are read from the standard libpq environment variables so
# that no credentials are stored in the notebook. The non-secret settings fall
# back to the values this notebook has always used.
db = os.environ.get("PGDATABASE", "BoM_Data")
userid = os.environ.get("PGUSER", "postgres")
passwd = os.environ.get("PGPASSWORD")   # never hard-code; may be None (e.g. when ~/.pgpass is used)
myHost = os.environ.get("PGHOST", "localhost")

# Create a connection to the database
conn = None
try:
    conn = psycopg2.connect(database=db,
                            user=userid,
                            password=passwd,
                            host=myHost)
except psycopg2.Error as sqle:
    # pgerror is None for connection failures, so fall back to the exception text
    # rather than concatenating None onto a string.
    print("psycopg2.Error : " + str(sqle.pgerror or sqle))

In [20]:
#hide_input
## WRAPPER FUNCTION FOR QUERYING ##
import psycopg2.extras

def pgquery(conn, sqlcmd, args=None, silent=False, returntype='tuple'):
    """Execute a SQL statement and return the complete result set.

    Parameters:
        conn: open database connection; used as a context manager so the
            transaction is handled by the 'with' clause.
        sqlcmd: SQL text, optionally containing placeholders.
        args: optional values for the placeholders (e.g. a dictionary);
            None executes `sqlcmd` as-is.
        silent: when True, database errors are not printed.
        returntype: 'dict' fetches rows through psycopg2's RealDictCursor;
            any other value uses the connection's default cursor.

    Returns:
        The list of rows from fetchall(), or None if a database error occurred.
    """
    retval = None
    with conn:
        cursortype = psycopg2.extras.RealDictCursor if returntype == 'dict' else None
        with conn.cursor(cursor_factory=cursortype) as cur:
            try:
                if args is None:
                    cur.execute(sqlcmd)
                else:
                    cur.execute(sqlcmd, args)
                retval = cur.fetchall()  # we use fetchall() as we expect only _small_ query results
            except psycopg2.Error as e:
                # Only database errors are handled here; anything else propagates
                # unchanged instead of being masked by a failed `e.pgcode` lookup.
                # Errors without a pgcode stay quiet, as before.
                if e.pgcode is not None and not silent:
                    print("db read error: ")
                    print(e)
    return retval

In [47]:
#hide_input
## Load the stored capital-city forecasts used to show the daily change in temp forecasts
query_stmt = "select * from capital_city_forecasts"

# Column names applied, in order, to the rows returned by `select *`.
forecast_columns = ['town_index','index','product_id','state','town','aac','lat','lon','elev','start_time_local','end_time_local','utc_offset','start_time_utc','end_time_utc','min_temp','max_temp','lower_precip_limit','upper_precip_limit','precis','prob_precip','Date','Date_Added']

forecast_rows = pgquery(conn, query_stmt, None)
if forecast_rows is None:
    # pgquery returns None when the query failed; stop here with a clear message
    # rather than failing later on a column-count mismatch.
    raise RuntimeError("Query failed: " + query_stmt)

# Naming the columns at construction also yields a well-formed (empty) frame
# when the table has no rows.
raw_df = pd.DataFrame(forecast_rows, columns=forecast_columns)
raw_df = raw_df[['town', 'min_temp','max_temp','lower_precip_limit','upper_precip_limit','Date','Date_Added']]

## BoM Temperature Forecasts
> The dotted line shows yesterday's forecast; the solid line is the most recent BoM forecast.

In [48]:
#hide
# One subplot per town, stacked vertically; the row count follows the data.
towns = raw_df.town.unique()
fig = make_subplots(rows=max(len(towns), 1), cols=1, subplot_titles=list(towns))

# (trace name, forecast issue date, temperature column, colour, line style).
# Solid lines are the current forecast; dotted lines are yesterday's.
trace_specs = [
    ('Current Min Forecast',   current_date, 'min_temp', 'royalblue', 'solid'),
    ('Yesterday Min Forecast', yesterday,    'min_temp', 'royalblue', 'dot'),
    ('Current Max Forecast',   current_date, 'max_temp', 'firebrick', 'solid'),
    ('Yesterday Max Forecast', yesterday,    'max_temp', 'firebrick', 'dot'),
]

for row, town in enumerate(towns, start=1):
    town_df = raw_df[raw_df.town == town]
    for trace_name, issued_on, temp_column, colour, dash in trace_specs:
        # x and y come from the same rows of this town (sorted by forecast date),
        # so the points stay aligned even if towns have different dates.
        issued = town_df[town_df.Date_Added == issued_on].sort_values('Date')
        fig.add_trace(
            go.Scatter(x=issued.Date, y=issued[temp_column], name=trace_name,
                       line=dict(color=colour, width=4, dash=dash)),
            row=row, col=1)

fig.update_layout(height=1400, width=800, title_text='BoM Temperature Forecasts', showlegend=False)
fig.show()

# For next cell
#from IPython.display import HTML
#hide_input
#HTML(fig.to_html())

In [None]:
#hide_input
from IPython.display import HTML