Skip to content

Commit

Permalink
fixed for new page
Browse files Browse the repository at this point in the history
  • Loading branch information
AndreM101 committed Apr 8, 2021
1 parent d9215e2 commit 58d99a9
Showing 1 changed file with 18 additions and 81 deletions.
99 changes: 18 additions & 81 deletions scraper.rb
@@ -1,87 +1,24 @@
# Adapted from planningalerts.org.au to return data
# back to Jan 01, 2007

require 'date'
require 'faraday'
require 'json' # JSON.parse of the Development.i response
require 'mechanize'
require 'scraperwiki'

# Extracts development-application rows from one PD Online results page and
# stores each new record via ScraperWiki.
#
# page - a Mechanize page whose first <table> lists one application per <tr>;
#        cells are [?, "<ref> - <description>", address, "dd/mm/yyyy"].
#        Rows that fail to parse (headers, pager rows) are skipped silently,
#        matching the original behaviour.
def scrape_page(page)
  rows = page.at("//table").search("tr")
  rows.each do |row|
    begin
      cells = row.search('td')
      texts = cells.map { |cell| cell.inner_text.gsub("\r\n", "").strip }
      day, month, year = texts[3].split("/").map(&:to_i)
      record = {
        "info_url" => (page.uri + cells.at('a')["href"]).to_s,
        "council_reference" => texts[1].split(" - ")[0].squeeze(" ").strip,
        "date_received" => Date.new(year, month, day).to_s,
        "description" => texts[1].split(" - ")[1..-1].join(" - ").squeeze(" ").strip,
        "address" => texts[2].squeeze(" ").strip,
        "date_scraped" => Date.today.to_s
      }
      # A failing SELECT (e.g. the table does not exist yet) counts as
      # "not previously saved", so the record is still written.
      already_saved =
        begin
          !ScraperWiki.select("* from data where `council_reference`='#{record['council_reference']}'").empty?
        rescue
          false
        end
      ScraperWiki.save_sqlite(['council_reference'], record) unless already_saved
    rescue
      next # malformed row — skip it, as before
    end
  end
end

url = 'https://developmenti.brisbane.qld.gov.au/Geo/GetApplicationFilterResults'

# Follows an ASP.NET pseudo-link that routes through __doPostBack.
#
# page - the Mechanize page containing the "aspnetForm" form.
# doc  - the link node whose href (or onclick) holds the __doPostBack call.
#
# Returns the page produced by submitting the postback form, nil for a
# deliberate no-op link ("return false;__doPostBack(...)"), and nil when the
# link is not a postback or anything else goes wrong (best-effort, as before).
def click(page, doc)
  begin
    js = doc["href"] || doc["onclick"]
    if js =~ /javascript:__doPostBack\('(.*)','(.*)'\)/
      event_target = $1
      event_argument = $2
      form = page.form_with(id: "aspnetForm")
      form["__EVENTTARGET"] = event_target
      form["__EVENTARGUMENT"] = event_argument
      form.submit
    elsif js =~ /return false;__doPostBack\('(.*)','(.*)'\)/
      nil
    else
      # TODO: follow the link like a normal <a href>.
      raise
    end
  rescue
    nil
  end
end # FIX: this closing `end` was dropped in the rendered diff, breaking the file
# Query the Development.i endpoint for all applications submitted from the
# epoch up to today (the trailing '000' converts seconds to milliseconds)
# and store each returned GeoJSON feature as an application record.
resp = Faraday.post(url) do |req|
  req.headers['Content-Type'] = 'application/json'
  req.body = '{"Progress":"all","StartDateUnixEpochNumber":0,"EndDateUnixEpochNumber":' + Date.today.to_time.to_i.to_s + '000,"DateRangeField":"submitted","DateRangeDescriptor":"Last 7 Days","MaxRecords":1000,"SortField":"submitted","PixelWidth":800,"PixelHeight":800}'
end
raw = JSON.parse(resp.body)

# FIX: raw['features'][1..-1] raised NoMethodError on nil when the response
# had no features ([][1..-1] is nil) — guard both cases.
# NOTE(review): the first feature is deliberately skipped by the original
# code — presumably a summary/metadata entry; confirm against a live response.
features = raw['features'] || []
(features[1..-1] || []).each do |feature|
  props = feature['properties']
  address, *description_parts = props['description'].split(" - ")
  record = {
    'council_reference' => props['application_number'],
    'address' => address,
    'description' => description_parts.join(" - "),
    'date_received' => props['date_received'],
    'date_scraped' => Date.today.to_s
  }
  puts "Saving #{record['council_reference']}, #{record['address']}"
  ScraperWiki.save_sqlite(['council_reference'], record)
end

# Back-fill from the legacy PD Online search, one calendar month at a time,
# oldest first, from January 2020 up to and including the current month.
backfill_years = [2025, 2024, 2023, 2022, 2021, 2020]
current_month = Date.today.strftime("%Y-%m")
periodstrs = backfill_years
             .sort
             .flat_map { |y| (1..12).map { |m| format("%d-%02d", y, m) } }
             .select { |label| label <= current_month }

periodstrs.each do |periodstr|
  # Build the "&1=<first day>&2=<last day>" query fragment for this month.
  year, month = periodstr.split("-").map(&:to_i)
  month_start = Date.new(year, month, 1).strftime("%d/%m/%Y")
  month_end = Date.new(year, month, -1).strftime("%d/%m/%Y")
  period = "&1=#{month_start}&2=#{month_end}"

  puts "Getting data in `#{periodstr}`."

  search_url = "https://pdonline.brisbane.qld.gov.au/MasterViewUI/Modules/ApplicationMaster/default.aspx?page=found" + period

  agent = Mechanize.new
  page = agent.get(search_url)

  # Walk the paginated results; click() returns nil once paging fails,
  # which ends the loop (same termination logic as before).
  page_no = 1
  pager_link = true
  while pager_link
    puts "Scraping page #{page_no}..." if (page_no % 5).zero?
    scrape_page(page)

    page_no += 1
    pager_link = page.at(".rgPageNext")
    page = click(page, pager_link)
    pager_link = nil if page.nil?
  end
end

0 comments on commit 58d99a9

Please sign in to comment.