Commit

Update scraper.py
TimLundSE26 committed Dec 13, 2017
1 parent 7a5a44c commit cf16b08
Showing 1 changed file with 46 additions and 57 deletions.
103 changes: 46 additions & 57 deletions scraper.py
@@ -28,74 +28,63 @@ def search():
# "workhone": workhone, "homeMobile": homeMobile, "workMobile": workMobile, "party": party, "ward": ward}

cols = councillor.xpath("td")
# print len(cols)
if len(cols) == 4:

paras = cols[1].xpath('p')
paras = cols[1].xpath('p')

for i, para in enumerate(paras):
if i == 0:
name = "".join(para.xpath('./a/text()')).strip()
link = "".join(para.xpath('./a/@href')).strip()
else:
pText = "".join(para.xpath('text()')).strip()
# print i, pText
for i, para in enumerate(paras):
if i == 0:
name = "".join(para.xpath('./a/text()')).strip()
link = "".join(para.xpath('./a/@href')).strip()
else:
pText = "".join(para.xpath('text()')).strip()
# print i, pText

if len(para.xpath('a')) ==1:
link1 = "".join(para.xpath('./a/@href')).strip()
matchObj = re.search( r'@', link1)
if matchObj:
matchObj1 = re.search( r'work', pText, re.I)
if re.search( r'work', pText, re.I):
eWork = link1
elif re.search( r'home', pText, re.I):
eHome = link1
if len(para.xpath('a')) ==1:
link1 = "".join(para.xpath('./a/@href')).strip()
matchObj = re.search( r'@', link1)
if matchObj:
matchObj1 = re.search( r'work', pText, re.I)
if re.search( r'work', pText, re.I):
eWork = link1
elif re.search( r'home', pText, re.I):
eHome = link1
else:
print i, pText, link1
else:
print i, pText, link1
print "non email address link"
else:
print "non email address link"
else:
matchObj = re.search( r'OX\d \d[A-Z]{2}', pText)
if matchObj:
address = pText
else:
matchObj = re.search( r'^(.+)?\:\s+(0[0-9 ]+)$', pText)
matchObj = re.search( r'OX\d \d[A-Z]{2}', pText)
if matchObj:
number = matchObj.group(2)
numberType = matchObj.group(1)

if re.search( r'home\s+mob', numberType, re.I):
homeMobile = number
elif re.search( r'work\s+mob', numberType, re.I):
workMobile = number
elif re.search( r'home', numberType, re.I):
homePhone = number
elif re.search( r'work', numberType, re.I):
workPhone = number
address = pText
else:
print i, pText
roles = roles.join(pText)
matchObj = re.search( r'^(.+)?\:\s+(0[0-9 ]+)$', pText)
if matchObj:
number = matchObj.group(2)
numberType = matchObj.group(1)

if re.search( r'home\s+mob', numberType, re.I):
homeMobile = number
elif re.search( r'work\s+mob', numberType, re.I):
workMobile = number
elif re.search( r'home', numberType, re.I):
homePhone = number
elif re.search( r'work', numberType, re.I):
workPhone = number
else:
print i, pText
roles = roles.join(pText)

party = "".join(cols[2].xpath('text()')).strip()
ward = "".join(cols[3].xpath('text()')).strip()

data = { "index": index, "name": name, "link": link, "address": address, "roles": roles, "eWork": eWork, "eHome": eHome, "homePhone": homePhone, "workPhone": workPhone, "homeMobile": homeMobile, "workMobile": workMobile, "party": party, "ward": ward}

scraperwiki.sqlite.save(unique_keys=['index', 'link'], data=data)

print data

party = "".join(cols[2].xpath('text()')).strip()
ward = "".join(cols[3].xpath('text()')).strip()

data = { "index": index, "name": name, "link": link, "address": address, "roles": roles, "eWork": eWork, "eHome": eHome, "homePhone": homePhone, "workPhone": workPhone, "homeMobile": homeMobile, "workMobile": workMobile, "party": party, "ward": ward}

scraperwiki.sqlite.save(unique_keys=['index', 'link'], data=data)

print data
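
For context, the row layout the hunk assumes (an index cell, a cell of contact paragraphs, then party and ward cells) can be reproduced with a small standalone snippet. The HTML fragment, names and URLs below are invented and are not part of scraper.py; they only mimic the four-column structure the code checks for.

    # Standalone sketch with an invented row fragment; not part of scraper.py.
    import lxml.html

    page_html = """
    <table>
      <tr>
        <td>1</td>
        <td>
          <p><a href="http://example.org/cllr/jane-smith">Cllr Jane Smith</a></p>
          <p>Work: <a href="mailto:jane.smith@example.org">jane.smith@example.org</a></p>
          <p>Home: 01865 123456</p>
        </td>
        <td>Independent</td>
        <td>Example Ward</td>
      </tr>
    </table>
    """

    root = lxml.html.fromstring(page_html)
    councillor = root.xpath("//tr")[0]

    cols = councillor.xpath("td")        # the four cells the real code checks for
    paras = cols[1].xpath('p')           # contact paragraphs in the second cell

    name = "".join(paras[0].xpath('./a/text()')).strip()   # "Cllr Jane Smith"
    link = "".join(paras[0].xpath('./a/@href')).strip()    # profile URL
    party = "".join(cols[2].xpath('text()')).strip()       # "Independent"
    ward = "".join(cols[3].xpath('text()')).strip()        # "Example Ward"

    print(name + " | " + link + " | " + party + " | " + ward)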

# if len(councillors) == 0:
# return
# else:
# for index, councillor in enumerate(councillors):
# col1 = councillor.xpath(".//td)[1]").strip()
# data = {"col1": col1, "index": index}

# print data
# scraperwiki.sqlite.save(unique_keys=['index'], data=data)



search()
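
The paragraph classification in the hunk is driven by three regexes: an Oxford-style postcode marks the address, a "label: 0xxx" pattern marks a phone number whose label picks the field, and everything else falls through to role text. Below is a minimal standalone sketch of that decision chain; the helper name classify_contact_line and the sample strings are invented for illustration and do not appear in scraper.py.

    # Minimal sketch of the address/phone/role decision chain; not part of scraper.py.
    import re

    def classify_contact_line(pText):
        # Oxford-style postcode => this paragraph is the postal address
        if re.search(r'OX\d \d[A-Z]{2}', pText):
            return ('address', pText)
        # "<label>: <number starting with 0>" => a phone number whose label
        # (home / work / home mob / work mob) decides which field it fills
        matchObj = re.search(r'^(.+)?\:\s+(0[0-9 ]+)$', pText)
        if matchObj:
            numberType, number = matchObj.group(1), matchObj.group(2)
            if re.search(r'home\s+mob', numberType, re.I):
                return ('homeMobile', number)
            elif re.search(r'work\s+mob', numberType, re.I):
                return ('workMobile', number)
            elif re.search(r'home', numberType, re.I):
                return ('homePhone', number)
            elif re.search(r'work', numberType, re.I):
                return ('workPhone', number)
        # anything else is treated as role text by the scraper
        return ('roles', pText)

    print(classify_contact_line('Home: 01865 123456'))              # ('homePhone', '01865 123456')
    print(classify_contact_line('Work mob: 07700 900123'))          # ('workMobile', '07700 900123')
    print(classify_contact_line('12 Some Street, Oxford OX1 2AB'))  # ('address', ...)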
