Bugfix for RIS Server Bug in Duisburg
the-infinity committed Jul 16, 2013
1 parent 2060c5f commit 0078b15
Showing 1 changed file with 151 additions and 137 deletions.
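The diff below works around an intermittent fault in the Duisburg RIS server: it sometimes answers with a generic 'Fehler' page that carries no real error message, so the submission page is now fetched inside a bounded retry loop instead of once. A minimal sketch of that pattern, with hypothetical names (fetch_page, wait_time) standing in for the scraper's user agent and configuration:

# Sketch of the retry pattern introduced by this commit, not the scraper's
# actual API: fetch_page and wait_time are hypothetical stand-ins.
import time
from StringIO import StringIO  # the scraper targets Python 2
from lxml import etree

def fetch_with_retry(fetch_page, url, wait_time, max_tries=3):
    """Fetch url, retrying while the RIS server serves its spurious error page."""
    tries = 0
    while tries < max_tries:
        tries += 1
        time.sleep(wait_time)
        html = fetch_page(url).replace('&nbsp;', ' ')
        dom = etree.parse(StringIO(html), etree.HTMLParser())
        headings = dom.xpath('//h1')
        # The intermittent bug shows up as a bare 'Fehler' heading: try again.
        if headings and headings[0].text and 'Fehler' in headings[0].text:
            continue
        return dom
    return None  # every attempt hit the error page

In the committed change the same idea is inlined into get_submission() via the try_until / try_counter / try_found counters visible in the diff.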
288 changes: 151 additions & 137 deletions risscraper/scraper.py
@@ -404,149 +404,163 @@ def get_submission(self, submission_url=None, submission_id=None):

logging.info("Getting submission %d from %s", submission_id, submission_url)
submission = Submission(numeric_id=submission_id)

time.sleep(self.config.WAIT_TIME)
try:
response = self.user_agent.open(submission_url)
except urllib2.HTTPError, e:
if e.code == 404:
sys.stderr.write("URL not found (HTTP 404) error caught: %s\n" % submission_url)
sys.stderr.write("Please check BASE_URL in your configuration.\n")
sys.exit(1)
mechanize_forms = mechanize.ParseResponse(response, backwards_compat=False)
response.seek(0)
html = response.read()
html = html.replace('&nbsp;', ' ')
parser = etree.HTMLParser()
dom = etree.parse(StringIO(html), parser)

# check for page errors
try:
page_title = dom.xpath('//h1')[0].text
if 'Fehlermeldung' in page_title:
logging.info("Page %s cannot be accessed due to server error", submission_url)
if self.options.verbose:
print "Page %s cannot be accessed due to server error" % submission_url
return
if 'Berechtigungsfehler' in page_title:
logging.info("Page %s cannot be accessed due to permissions", submission_url)
if self.options.verbose:
print "Page %s cannot be accessed due to permissions" % submission_url
return
except:
pass

submission.original_url = submission_url

# Session title
try:
stitle = dom.xpath(self.xpath['SUBMISSION_DETAIL_TITLE'])
submission.title = stitle[0].text
except:
logging.critical('Cannot find submission title element using XPath SUBMISSION_DETAIL_TITLE')
raise TemplateError('Cannot find submission title element using XPath SUBMISSION_DETAIL_TITLE')

# Submission identifier, date, type etc
tds = dom.xpath(self.xpath['SUBMISSION_DETAIL_IDENTIFIER_TD'])
if len(tds) == 0:
logging.critical('Cannot find table fields using XPath SUBMISSION_DETAIL_IDENTIFIER_TD')
logging.critical('HTML Dump:' + html)
raise TemplateError('Cannot find table fields using XPath SUBMISSION_DETAIL_IDENTIFIER_TD')
else:
current_category = None
for n in range(0, len(tds)):
try_until = 1
try_counter = 0
try_found = False
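# Re-fetch loop: request the page up to try_until times; try_until is raised to 3 below when the RIS server serves its spurious error page (the Duisburg bug this commit works around).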

while (try_counter < try_until):
try_counter += 1
try_found = False
time.sleep(self.config.WAIT_TIME)
try:
response = self.user_agent.open(submission_url)
except urllib2.HTTPError, e:
if e.code == 404:
sys.stderr.write("URL not found (HTTP 404) error caught: %s\n" % submission_url)
sys.stderr.write("Please check BASE_URL in your configuration.\n")
sys.exit(1)
mechanize_forms = mechanize.ParseResponse(response, backwards_compat=False)
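# keep the parsed forms; they are submitted further down to download the submission attachments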
response.seek(0)
html = response.read()
html = html.replace('&nbsp;', ' ')
parser = etree.HTMLParser()
dom = etree.parse(StringIO(html), parser)
# Fetch the page again if an unknown, randomly occurring error is returned without an error message (found in Duisburg, presumably a broken server config)
try:
page_title = dom.xpath('//h1')[0].text
if 'Fehler' in page_title:
try_until = 3
try_found = True
logging.info("Original RIS Server Bug, restart scraping submission %s", submission_url)
except:
pass
if (try_found == False):
# check for page errors
try:
tdcontent = tds[n].text.strip()
if 'Fehlermeldung' in page_title:
logging.info("Page %s cannot be accessed due to server error", submission_url)
if self.options.verbose:
print "Page %s cannot be accessed due to server error" % submission_url
return
if 'Berechtigungsfehler' in page_title:
logging.info("Page %s cannot be accessed due to permissions", submission_url)
if self.options.verbose:
print "Page %s cannot be accessed due to permissions" % submission_url
return
except:
continue
if tdcontent == 'Name:':
submission.identifier = tds[n + 1].text.strip()
elif tdcontent == 'Art:':
submission.type = tds[n + 1].text.strip()
elif tdcontent == 'Datum:':
submission.date = tds[n + 1].text.strip()
elif tdcontent == 'Name:':
submission.identifier = tds[n + 1].text.strip()
elif tdcontent == 'Betreff:':
submission.subject = '; '.join(tds[n + 1].xpath('./text()'))
elif tdcontent == 'Referenzvorlage:':
link = tds[n + 1].xpath('a')[0]
href = link.get('href')
parsed = parse.search(self.urls['SUBMISSION_DETAIL_PARSE_PATTERN'], href)
submission.superordinate = {
'identifier': link.text.strip(),
'numeric_id': parsed['submission_id']
}
# add superordinate submission to queue
if hasattr(self, 'submission_queue'):
self.submission_queue.add(parsed['submission_id'])
# subordinate submissions are added to the queue
elif tdcontent == 'Untergeordnete Vorlage(n):':
current_category = 'subordinates'
for link in tds[n + 1].xpath('a'):
href = link.get('href')
parsed = parse.search(self.urls['SUBMISSION_DETAIL_PARSE_PATTERN'], href)
if hasattr(self, 'submission_queue') and parsed is not None:
#add subordinate submission to queue
self.submission_queue.add(parsed['submission_id'])
pass

submission.original_url = submission_url

# Session title
try:
stitle = dom.xpath(self.xpath['SUBMISSION_DETAIL_TITLE'])
submission.title = stitle[0].text
except:
logging.critical('Cannot find submission title element using XPath SUBMISSION_DETAIL_TITLE')
raise TemplateError('Cannot find submission title element using XPath SUBMISSION_DETAIL_TITLE')

# Submission identifier, date, type etc
tds = dom.xpath(self.xpath['SUBMISSION_DETAIL_IDENTIFIER_TD'])
if len(tds) == 0:
logging.critical('Cannot find table fields using XPath SUBMISSION_DETAIL_IDENTIFIER_TD')
logging.critical('HTML Dump:' + html)
raise TemplateError('Cannot find table fields using XPath SUBMISSION_DETAIL_IDENTIFIER_TD')
else:
if current_category == 'subordinates':
for link in tds[n + 1].xpath('a'):
current_category = None
for n in range(0, len(tds)):
try:
tdcontent = tds[n].text.strip()
except:
continue
if tdcontent == 'Name:':
submission.identifier = tds[n + 1].text.strip()
elif tdcontent == 'Art:':
submission.type = tds[n + 1].text.strip()
elif tdcontent == 'Datum:':
submission.date = tds[n + 1].text.strip()
elif tdcontent == 'Name:':
submission.identifier = tds[n + 1].text.strip()
elif tdcontent == 'Betreff:':
submission.subject = '; '.join(tds[n + 1].xpath('./text()'))
elif tdcontent == 'Referenzvorlage:':
link = tds[n + 1].xpath('a')[0]
href = link.get('href')
parsed = parse.search(self.urls['SUBMISSION_DETAIL_PARSE_PATTERN'], href)
if hasattr(self, 'submission_queue') and parsed is not None:
submission.superordinate = {
'identifier': link.text.strip(),
'numeric_id': parsed['submission_id']
}
# add superordinate submission to queue
if hasattr(self, 'submission_queue'):
self.submission_queue.add(parsed['submission_id'])

if not hasattr(submission, 'identifier'):
logging.critical('Cannot find session identifier using SESSION_DETAIL_IDENTIFIER_TD_XPATH')
raise TemplateError('Cannot find session identifier using SESSION_DETAIL_IDENTIFIER_TD_XPATH')

# "Beratungsfolge" (list of sessions for this submission)
# This is currently not parsed for scraping, but only for
# gathering session-attachment ids for later exclusion
found_attachments = []
rows = dom.xpath(self.xpath['SUBMISSION_DETAIL_AGENDA_ROWS'])
for row in rows:
formfields = row.xpath('.//input[@type="hidden"][@name="DT"]')
if len(formfields):
attachment_id = formfields[0].get('value')
if attachment_id is not None:
found_attachments.append(attachment_id)

# submission-related attachments
submission.attachments = []
containers = dom.xpath(self.xpath['SUBMISSION_DETAIL_ATTACHMENTS'])
for container in containers:
try:
classes = container.get('class').split(' ')
except:
continue
if self.xpath['SUBMISSION_DETAIL_ATTACHMENTS_CONTAINER_CLASSNAME'] not in classes:
continue
rows = container.xpath('.//tr')
for row in rows:
forms = row.xpath('.//form')
for form in forms:
name = " ".join(row.xpath('./td/text()')).strip()
for hidden_field in form.xpath('input[@name="DT"]'):
attachment_id = hidden_field.get('value')
if attachment_id in found_attachments:
continue
attachment = Attachment(
identifier=attachment_id,
name=name)
#print attachment_id
# Traversing the whole mechanize response to submit this form
#print mechanize_forms
for mform in mechanize_forms:
#print "Form found: '%s'" % mform
for control in mform.controls:
if control.name == 'DT' and control.value == attachment_id:
attachment = self.get_attachment_file(attachment, mform)
submission.attachments.append(attachment)

# forcing overwrite=True here
oid = self.db.save_submission(submission)
# subordinate submissions are added to the queue
elif tdcontent == 'Untergeordnete Vorlage(n):':
current_category = 'subordinates'
for link in tds[n + 1].xpath('a'):
href = link.get('href')
parsed = parse.search(self.urls['SUBMISSION_DETAIL_PARSE_PATTERN'], href)
if hasattr(self, 'submission_queue') and parsed is not None:
#add subordinate submission to queue
self.submission_queue.add(parsed['submission_id'])
else:
if current_category == 'subordinates':
for link in tds[n + 1].xpath('a'):
href = link.get('href')
parsed = parse.search(self.urls['SUBMISSION_DETAIL_PARSE_PATTERN'], href)
if hasattr(self, 'submission_queue') and parsed is not None:
self.submission_queue.add(parsed['submission_id'])

if not hasattr(submission, 'identifier'):
logging.critical('Cannot find session identifier using SESSION_DETAIL_IDENTIFIER_TD_XPATH')
raise TemplateError('Cannot find session identifier using SESSION_DETAIL_IDENTIFIER_TD_XPATH')

# "Beratungsfolge" (list of sessions for this submission)
# This is currently not parsed for scraping, but only for
# gathering session-attachment ids for later exclusion
found_attachments = []
rows = dom.xpath(self.xpath['SUBMISSION_DETAIL_AGENDA_ROWS'])
for row in rows:
formfields = row.xpath('.//input[@type="hidden"][@name="DT"]')
if len(formfields):
attachment_id = formfields[0].get('value')
if attachment_id is not None:
found_attachments.append(attachment_id)

# submission-related attachments
submission.attachments = []
containers = dom.xpath(self.xpath['SUBMISSION_DETAIL_ATTACHMENTS'])
for container in containers:
try:
classes = container.get('class').split(' ')
except:
continue
if self.xpath['SUBMISSION_DETAIL_ATTACHMENTS_CONTAINER_CLASSNAME'] not in classes:
continue
rows = container.xpath('.//tr')
for row in rows:
forms = row.xpath('.//form')
for form in forms:
name = " ".join(row.xpath('./td/text()')).strip()
for hidden_field in form.xpath('input[@name="DT"]'):
attachment_id = hidden_field.get('value')
if attachment_id in found_attachments:
continue
attachment = Attachment(
identifier=attachment_id,
name=name)
#print attachment_id
# Traversing the whole mechanize response to submit this form
#print mechanize_forms
for mform in mechanize_forms:
#print "Form found: '%s'" % mform
for control in mform.controls:
if control.name == 'DT' and control.value == attachment_id:
attachment = self.get_attachment_file(attachment, mform)
submission.attachments.append(attachment)

# forcing overwrite=True here
oid = self.db.save_submission(submission)

def get_attachment_file(self, attachment, form):
"""