Commit f7f1b28: initial
dcalde committed Feb 15, 2024 (parent e53fa76)

scraper.py: 112 additions, 24 deletions
# This is a template for a Python scraper on morph.io (https://morph.io)
# including some code snippets below that you should find helpful

# import scraperwiki
# import lxml.html
#
# # Read in a page
# html = scraperwiki.scrape("http://foo.com")
#
# # Find something on the page using css selectors
# root = lxml.html.fromstring(html)
# root.cssselect("div[align='left']")
#
# # Write out to the sqlite database using scraperwiki library
# scraperwiki.sqlite.save(unique_keys=['name'], data={"name": "susan", "occupation": "software developer"})
#
# # An arbitrary query against the database
# scraperwiki.sql.select("* from data where name='peter'")

# You don't have to do things with the ScraperWiki and lxml libraries.
# You can use whatever libraries you want: https://morph.io/documentation/python
# All that matters is that your final data is written to an SQLite database
# called "data.sqlite" in the current working directory, containing at least
# a table called "data".
import datetime
import os
import time

import requests
import scraperwiki

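# Runtime configuration comes from environment variables: how many days back
# the scrape window starts and ends, the application progress filter, and
# which councils to import.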
days_offset_from = int(os.getenv('DAYS_OFFSET_FROM', 1))
days_offset_to = int(os.getenv('DAYS_OFFSET_TO', 0))
progress = os.getenv('PROGRESS', 'all') # In Progress|Decided|Past|all
councils = os.getenv('COUNCILS', 'ipswich').split(",")
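# e.g. DAYS_OFFSET_FROM=7 COUNCILS=brisbane,ipswich python scraper.py
# imports the last seven days of applications for both councils.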


today = datetime.datetime.now().strftime("%m-%d-%Y")

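# Per-council endpoints; both councils run the same development.i platform,
# so the URLs differ only by hostname.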
urls = dict(
    ipswich=dict(
        url='https://developmenti.ipswich.qld.gov.au/Geo/GetApplicationFilterResults',
        info_url='https://developmenti.ipswich.qld.gov.au/Home/FilterDirect?filters=DANumber=',
        property_details_url='https://developmenti.ipswich.qld.gov.au/Geo/GetPropertyDetailsByLandNumber?landNumber=',
    ),
    brisbane=dict(
        url='https://developmenti.brisbane.qld.gov.au/Geo/GetApplicationFilterResults',
        info_url='https://developmenti.brisbane.qld.gov.au/Home/FilterDirect?filters=DANumber=',
        property_details_url='https://developmenti.brisbane.qld.gov.au/Geo/GetPropertyDetailsByLandNumber?landNumber=',
    ),
)

# the API seems to return a lot of duplicate features, so track which
# references we have already saved this run
council_references = set()


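# Upsert a single GeoJSON feature: start from the existing row (if any) so
# fields filled in later, such as address and lot_plan, survive a re-scrape.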
def extract_feature(feature, council):
    lng, lat = feature['geometry']['coordinates']
    properties = feature['properties']
    council_reference = properties['application_number']
    try:
        record = scraperwiki.sql.select(
            "* from data where authority_label=? and council_reference=?",
            [council, council_reference])[0]
    except Exception:
        # first run (no "data" table yet) or no existing row for this DA
        record = dict()
    record.update(
        council_reference=council_reference,
        authority_label=council,
        description=properties['description'],
        category_desc=properties['category_desc'],
        info_url=urls[council]['info_url'] + properties['application_number'],
        date_received=properties['date_received'],
        progress=properties['progress'],
        date_scraped=today,
        lat=lat,
        lng=lng,
        land_id=properties.get('land_no'),
    )
    if council_reference not in council_references:
        council_references.add(council_reference)
        print("Saving %s, %s..." % (council_reference, record['description']))
        scraperwiki.sqlite.save(['authority_label', 'council_reference'], record)


for council in councils:
    print("Importing development.i records for %s" % council)
    has_more_pages = True
    total_number_returned = 0

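    # The filter endpoint pages its results: request up to 200 records at a
    # time and advance the offset until every matching feature has been seen.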
    while has_more_pages:
        print("Downloading Applications with offset %d" % total_number_returned)
        resp = requests.post(urls[council]['url'], json={
"Progress": "all",
"StartDateUnixEpochNumber": int(str(int(time.mktime((datetime.date.today() - datetime.timedelta(days=days_offset_from)).timetuple()))) + "000"),
"EndDateUnixEpochNumber": int(str(int(time.mktime((datetime.date.today() - datetime.timedelta(days=days_offset_to) + datetime.timedelta(days=1)).timetuple()))) + "999"),
"DateRangeField": "submitted",
"SortField": "submitted",
"SortAscending": False,
"PagingStartIndex": total_number_returned,
"MaxRecords": 200,
"ShowCode": True, "ShowImpact": True, "ShowOther": True, "ShowIAGA": True, "ShowIAGI": True,
"ShowRequest": True,
"ShowNotifiableCode": True,
"ShowReferralResponse": True,
"IncludeAroundMe": False,
"PixelWidth": 800, "PixelHeight": 800
})

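        # The response holds top-level GeoJSON features plus extra ones
        # nested under "multiSpot" (multiple applications sharing one map point).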
        raw = resp.json()

        for feature in raw['features']:
            extract_feature(feature, council)

        for multiSpot in raw['multiSpot'].values():
            for feature in multiSpot:
                extract_feature(feature, council)

        number_returned = raw['numberReturned']
        total_number_returned += number_returned
        total_features = raw['totalFeatures']
        has_more_pages = total_number_returned < total_features

    # populate each DA's address and lot/plan if missing
    missing_address_query = "* from data where authority_label=? and land_id is not null"
    # the address/lot_plan columns only exist once at least one row has been backfilled
    if 'address' in list(scraperwiki.sql.dt.column_names('data')):
        missing_address_query += " and (address is null or lot_plan is null)"
    das = scraperwiki.sql.select(missing_address_query, [council])

    print("Populating %d DAs with address and lot/plan" % len(das))
    for da in das:
        resp = requests.get(urls[council]['property_details_url'] + da['land_id'])
        if resp.ok:
            properties = resp.json()['features'][0]['properties']
            da['address'] = properties['address_format']
            da['lot_plan'] = properties['lot_plan']
            print("Updating %s -> %s" % (da['council_reference'], da['address']))
            scraperwiki.sqlite.save(['authority_label', 'council_reference'], da)
