In [1]:
# = = = = = = = = = = = = = 
# Created:       15:09:2020
# Last Updated:  05:11:2020
#
# Description:  Webscraper targeted at reading table format of fuel prices provided by AIP and preserve them in CSV format
#               Focused on using Requests library
# = = = = = = = = = = = = = 

In [2]:
# What I need to do
#
# Retrieve values from web page
# - Convert values to object
# - Store values in Array
#
# Retrieve values from CSV
# - Convert values to object
# - Store values in Array
#
# Compare Web values to CSV Values
# - Append Web to CSV if new values
# - Update CSV values if web values are different from same time
# 
# Update CSV if new or updated values present
# - Create new file for old values if too many
# - File name: fuel-data-YYYYMMDD-YYYYMMDD.csv

In [3]:
import requests
import datetime
import os
import csv

In [4]:
def isfloat(value):
  try:
    float(value)
    return True
  except ValueError:
    return False

In [33]:
class Fuel_Price:
    def __init__(self, time, date, loc, avg, chng, var, low, high):
        self.time = float(time)
        self.date = date
        self.loc = loc
        self.avg = float(avg)
        self.low = float(low)
        self.high = float(high)
        
        self.chng = float(chng) if isfloat(chng) else 0.0
        self.var = float(var) if isfloat(var) else 0.0
       
    # Convert object to a useable array to be written to file
    def toArray(self):
        return [str(self.time),
               self.date,
               self.loc,
               str(self.avg),
               str(self.chng),
               str(self.var),
               str(self.low),
               str(self.high)]
    
    def compare(self, price):
        if (self.time == price.time) &&
           (self.loc == price.loc):
                return True
        return False


SyntaxError: invalid syntax (<ipython-input-33-3e733175f3e1>, line 25)

In [6]:
def json_to_fuelPrice(fdata, ftime):
    # Object Data created from input
    prices = []
    
    for item in fdata:
        if fdata[item]['location'].lower() in targets:
            prices.append(Fuel_Price(
                ftime.timestamp(),
                ftime.strftime("%Y-%m-%d"),
                fdata[item]['location'],
                fdata[item]['weeklyAverage'],
                fdata[item]['weeklyChange'],
                fdata[item]['diff'],
                fdata[item]['weeklyLow'],
                fdata[item]['weeklyHigh']
            ))
            
    return prices

In [7]:
def get_Sunday():
    today = datetime.date.today()

    idx = (today.weekday() +1) % 7
    
    lstSun = today - datetime.timedelta(idx)
    
    return datetime.datetime(lstSun.year,lstSun.month,lstSun.day)

In [12]:
def get_csv_prices(fName):
    data = []
    if os.path.exists(fName):
        with open(fName) as fd:
            reader = csv.reader(fd, delimiter=',')
            next(reader)
            for e in reader:
                data.append(Fuel_Price(e[0],e[1],e[2],e[3],e[4],e[5],e[6],e[7]))
    return data

In [9]:
# Fuel data CSV
fileName = "fuel_data.csv"

# Target Cities
targets = ['canberra', 'sydney', 'batemans bay', 'cooma', 'goulburn', 'coffs harbour', 'wollongong', 'yass']

# Link to the JSON that contains week values
url = requests.get('https://aip.com.au/aip-api-request?api-path=public/api&call=nswUlpTable&fuelType=undefined')

In [10]:
# JSON of data retrieved from 
fuelJSon = url.json()
# Convert to Fuel_Price Array
urlPrices = json_to_fuelPrice(fuelJSon, get_Sunday())

In [19]:
# Get prices from CSV
csvPrices = get_csv_prices(fileName)

In [28]:
def list_Combine(list1, list2):
    time1 = get_Latest_Time(list1)
    time2 = list2[0].time
    
    ## Append if new values to list
    if (time1 != time2):
        return list1 + list2
    else: # Update list
        for val1 in list1:
            if val1.time == time1:
                for val2 in list2:
                    if val1.compare(val2):
                        val1 = val2
    return list1

In [29]:
def get_Latest_Time(fList):
    timestamp = 0.0
    
    for val in fList:
        if val.time > timestamp:
            timestamp = val.time
    return timestamp

In [30]:
prices = list_Combine(csvPrices, urlPrices)

for val in prices:
    print(val.time)

1599919200.0
1599919200.0
1599919200.0
1599919200.0
1599919200.0
1599919200.0
1599919200.0
1599919200.0
1600524000.0
1600524000.0
1600524000.0
1600524000.0
1600524000.0
1600524000.0
1600524000.0
1600524000.0
1601128800.0
1601128800.0
1601128800.0
1601128800.0
1601128800.0
1601128800.0
1601128800.0
1601128800.0
1601733600.0
1601733600.0
1601733600.0
1601733600.0
1601733600.0
1601733600.0
1601733600.0
1601733600.0
1602334800.0
1602334800.0
1602334800.0
1602334800.0
1602334800.0
1602334800.0
1602334800.0
1602334800.0
1602939600.0
1602939600.0
1602939600.0
1602939600.0
1602939600.0
1602939600.0
1602939600.0
1602939600.0
1603544400.0
1603544400.0
1603544400.0
1603544400.0
1603544400.0
1603544400.0
1603544400.0
1603544400.0
1604149200.0
1604149200.0
1604149200.0
1604149200.0
1604149200.0
1604149200.0
1604149200.0
1604149200.0
1605963600.0
1605963600.0
1605963600.0
1605963600.0
1605963600.0
1605963600.0
1605963600.0
1605963600.0


In [252]:
def latest_timestamp(fdata):
    timestamp = 0.0
    
    for val in fdata:
        if val.time > timestamp:
            timestamp = val.time
    return timestamp

In [253]:
def get_latest(fdata):
    timestamp = latest_timestamp(fdata)
    
    latest = []
    
    for val in fdata:
        if val.time == timestamp:
            latest.append(val)
            
    return latest

In [254]:
stored_data = retrieve_data(fileName)

# Objects are pass by reference
latest_objects = get_latest(stored_data)

In [260]:
def convert_weekly(fdata, fcurr_time):
    # Object Data created from input
    obj_data = []
    
    for item in fdata:
        if fdata[item]['location'].lower() in targets:
            # time, date, loc, avg, chng, var, low, high, flag
            obj_data.append(Weekly_Price(
                fcurr_time.timestamp(),
                fcurr_time.strftime("%Y-%m-%d"),
                fdata[item]['location'],
                fdata[item]['weeklyAverage'],
                fdata[item]['weeklyChange'],
                fdata[item]['diff'],
                fdata[item]['weeklyLow'],
                fdata[item]['weeklyHigh'],
                'n'
            ))
            
    return obj_data

In [256]:
# JSON of data retrieved from 
data = convert_weekly(url.json(), curr_time)

In [None]:
def compare_Update(new_obj, exis_obj):
    updated = False
    
    for val1 in exis_obj:
        for val2 in new_obj:
            if val1.compare_locTime(val2):
                if not all(val1.compare_values(val2)):
                    val1.update_obj(val2.avg, val2.chng, val2.var, val2.low, val2.high)
                    updated = true
            elif val1.compare_loc(val2):
                
        
    
    return updated

In [257]:
# Flag whether we've updated values
updated = False

# Compare current week to most recent data
for val1 in data:
    for val2 in latest_objects:
        if val2.compare_locTime(val1):
            # If values from the same week aren't the same, update them with the more
            # Recent ones.
            if not all(val2.compare_values(val1)):
                val2.update_obj(val1.avg, val1.chng, val1.var, val1.low, val1.high)
                updated = True
        elif val1.compare_loc(val2):
            # If values from two different weeks are identical, flag the value
            if all(val2.compare_values(val1)):
                val2.update_flag('y')

In [258]:
if latest_objects[0].time != data[0].time:
    print("here")
elif updated:
    with open(fileName, 'w', newline='') as fd:
        writer = csv.writer(fd)
        # Write Headers
        writer.writerow(["Timestamp (Unix)","Date (YYYY-MM-DD)","Location", "Weekly Average", "Weekly Change", "Variation", "Weekly Low", "Weekly High", "flagged (y/n/d)"])
        
        for value in stored_data:
            writer.writerow(value.toArray())

In [259]:
# Gets skipped if we already have this weeks valeus
if (not check_week(fileName, curr_time)):
    ## The request made to get the raw infromation used for populating the table
    ## Found in the Networks tab of inspect element
    

    ## The URL is a JSON file, thus we will be just reading it as a JSON
    data = url.json()
        
    with open(fileName, 'a', newline='') as fd:
        writer = csv.writer(fd)
        ## Iterate through JSON objects
        for item in data:
            if data[item]['location'].lower() in targets:
                # Following syntax
                element = [
                    curr_time.timestamp(),
                    curr_time.strftime("%Y-%m-%d"),
                    data[item]['location'],
                    data[item]['weeklyAverage'],
                    data[item]['weeklyChange'],
                    data[item]['diff'],
                    data[item]['weeklyLow'],
                    data[item]['weeklyHigh']
                ]
                
                writer.writerow(element)