Skip to content

Commit

Permalink
Update scraper.py
Browse files Browse the repository at this point in the history
  • Loading branch information
TimLundSE26 committed Dec 14, 2017
1 parent 613938c commit 08d19b9
Showing 1 changed file with 31 additions and 119 deletions.
150 changes: 31 additions & 119 deletions scraper.py
Expand Up @@ -4,127 +4,39 @@
from lxml.html.soupparser import fromstring
from time import sleep

# NOTE(review): this span reads like a commit diff rendered without +/- markers
# and without indentation. Two definitions of `search` appear: `search()` here
# and `search(mth)` further down, whose lines are interleaved with the remaining
# statements of this one. Nesting cannot be recovered from column position, so
# the comments below describe individual statements only.
def search():
urlBase = "http://mycouncil.oxford.gov.uk/"

# print "GET 'http://mycouncil.oxford.gov.uk/mgMemberIndex.aspx?FN=ALPHA&VW=TABLE&PIC=1'"
# Pause two seconds, then fetch the member index page relative to urlBase.
sleep(2)
result = requests.get(urlBase + 'mgMemberIndex.aspx?FN=ALPHA&VW=TABLE&PIC=1')

result_dom = fromstring(result.content)
# Every <tr> under the table whose id is 'mgTable1'.
councillors = result_dom.xpath("//table[@id='mgTable1']//tr")

if len(councillors) == 0:
return
else:
for index, councillor in enumerate(councillors):

# Per-row defaults; each is later written into the saved `data` dict.
roles = ""
eHome = ""
eWork = ""
homePhone = ""
workPhone = ""
homeMobile = ""
workMobile = ""
surgery = ""

# data = {"name": name, "link": link, "address": address, "roles": roles, "eWork": eWork, "eHome": eHome, "homePhone": homePhone,
# "workhone": workhone, "homeMobile": homeMobile, "workMobile": workMobile, "party": party, "ward": ward}

cols = councillor.xpath("td")
# Only rows with exactly four <td> cells are processed.
if len(cols) == 4:

paras = cols[1].xpath('p')

for i, para in enumerate(paras):
if i == 0:
# First <p> of the second cell: anchor text becomes `name`, anchor href becomes `link`.
name = "".join(para.xpath('./a/text()')).strip()
link = "".join(para.xpath('./a/@href')).strip()
else:
pText = "".join(para.xpath('text()')).strip()
# print i, pText
# NOTE(review): `month` is assigned here, but the only reader in view is the
# final `search(month)` call -- confirm where this line sits in the real file.
month = "Oct 17"

if len(para.xpath('a')) ==1:
link1 = "".join(para.xpath('./a/@href')).strip()
# An href containing '@' is treated as an e-mail link.
matchObj = re.search( r'@', link1)
if matchObj:
# NOTE(review): matchObj1 is assigned but never read in the visible code.
matchObj1 = re.search( r'work', pText, re.I)
if re.search( r'work', pText, re.I):
eWork = link1
elif re.search( r'home', pText, re.I):
eHome = link1
else:
print i, pText, link1
else:
print "non email address link"
else:
# Text containing 'OX<digit> <digit><two capitals>' is stored as the address.
matchObj = re.search( r'OX\d \d[A-Z]{2}', pText)
if matchObj:
address = pText
else:
# Otherwise look for '<label>: <digits/spaces starting with 0>';
# group 1 is the label, group 2 the number.
matchObj = re.search( r'^(.+)?\:\s+(0[0-9 ]+)$', pText)
if matchObj:
number = matchObj.group(2)
numberType = matchObj.group(1)
# NOTE(review): a second definition, `search(mth)`, starts here while statements
# of the zero-argument `search` continue further down -- presumably the added
# side of the diff; confirm against the repository.
def search(mth):
# Form fields sent with the POST request below.
request_data = { "month": mth, "dateType": "DC_Validated" }

# The label decides which phone field receives the number; the mobile
# patterns are tested before the plain home/work patterns.
if re.search( r'home\s+mob', numberType, re.I):
homeMobile = number
elif re.search( r'work\s+mob', numberType, re.I):
workMobile = number
elif re.search( r'home', numberType, re.I):
homePhone = number
elif re.search( r'work', numberType, re.I):
workPhone = number
else:
# print i, pText
# NOTE(review): str.join uses `roles` as the separator between the characters
# of pText; it does not append pText to the existing roles text.
roles = roles.join(pText)


party = "".join(cols[2].xpath('text()')).strip()
ward = "".join(cols[3].xpath('text()')).strip()

sleep(2)

# print "GET " + urlBase + link

# Fetch the page at urlBase + link; curly single quotes are replaced with a
# plain apostrophe before parsing.
result1 = requests.get(urlBase + link)
result_dom1 = fromstring(re.sub(u"(\u2018|\u2019)", "'", result1.content))
mgUserBody = result_dom1.xpath("//div[@class='mgUserBody']")[0]
# NOTE(review): an XPath starting with '//' is evaluated from the document root,
# not from mgUserBody -- confirm whether './/' was intended.
mgUserBodySectionTitles = mgUserBody.xpath("//h2[@class='mgSectionTitle']")

print len(mgUserBodySectionTitles)

for mgUserBodySectionTitle in mgUserBodySectionTitles:

# First following sibling of the section heading.
# NOTE(review): indexing [0] raises IndexError when there is no sibling, so the
# "No next sibling" branch is not reached that way; also, truth-testing an lxml
# element reportedly reflects whether it has children -- confirm both.
mgUserBodySection = mgUserBodySectionTitle.xpath('following-sibling::*')[0]

if mgUserBodySection:
mgUserBodySectionName = "".join(mgUserBodySectionTitle.xpath('text()')).strip()

# NOTE(review): matchObj is assigned, then the same search is repeated in the condition.
matchObj = re.search( r'Surgery details', mgUserBodySectionName, re.I)
if re.search( r'Surgery details', mgUserBodySectionName, re.I):
surgery = re.sub(u"(\u2018|\u2019)", "'", "".join(mgUserBodySection.xpath('text()')).strip())
elif re.search( r'terms of office', mgUserBodySectionName, re.I):
print mgUserBodySectionName, len(mgUserBodySection.xpath('li'))
elif re.search( r'More information about this councillor', mgUserBodySectionName, re.I):
print mgUserBodySectionName, len(mgUserBodySection.xpath('li'))
elif re.search( r'committee appointments', mgUserBodySectionName, re.I):
print mgUserBodySectionName, len(mgUserBodySection.xpath('li'))
elif re.search( r'Appointments to outside bodies', mgUserBodySectionName, re.I):
print mgUserBodySectionName, len(mgUserBodySection.xpath('li'))
elif re.search( r'Additional Information', mgUserBodySectionName, re.I):
print mgUserBodySection.tag

else:
print "No next sibling"


data = { "index": index, "surgery": surgery, "name": name, "link": link, "address": address, "roles": roles, "eWork": eWork, "eHome": eHome, "homePhone": homePhone, "workPhone": workPhone, "homeMobile": homeMobile, "workMobile": workMobile, "party": party, "ward": ward}

# Saves the row with 'index' and 'link' as the unique keys.
scraperwiki.sqlite.save(unique_keys=['index', 'link'], data=data)
# <input type="radio" name="dateType" value="DC_Validated" checked="checked" id="dateValidated">
# is this the way to pass values when posting to a form? A dictionary object with the keys being the names of the controls?

# NOTE(review): the URL printed here (advancedSearchResults.do?action=firstPage)
# is not the URL passed to requests.post below (search.do?action=monthlyList).
print "POST 'http://public.oxford.gov.uk/online-applications/advancedSearchResults.do?action=firstPage'"
sleep(2)
result = requests.post('http://public.oxford.gov.uk/online-applications/search.do?action=monthlyList', request_data)

# print data

result_dom = fromstring(result.content)

# Every <li> whose class attribute is exactly 'searchresult'.
applications = result_dom.xpath("//li[@class='searchresult']")

search()
if len(applications) == 0:
return
else:
for index, application in enumerate(applications):
# NOTE(review): lxml's xpath() returns a list for an attribute query, so passing
# application_link to re.search and concatenating it to a string below would
# likely raise TypeError -- confirm and take the first item if so.
application_link = application.xpath("a/@href")
matchObj = re.search( r'keyVal=(.*$)', application_link)
key = matchObj.group(1)

tabletype = "summary"
# NOTE(review): the first application_url value is replaced by the very next assignment.
application_url = "http://public.oxford.gov.uk/online-applications/applicationDetails.do?activeTab=" + tabletype + "&keyVal=" + key
application_url = "http://public.oxford.gov.uk" + application_link
print "GET " + application_url

sleep(2)
application_page = requests.get(application_url)
application_dom = fromstring(application_page.content)

# Prints how many <table> elements the application page contains.
print len(application_dom.xpath("//table"))

search(month)

0 comments on commit 08d19b9

Please sign in to comment.