Merge pull request #104 from Lukas0907/dev/next
Don't drop feeds with empty title and adjust help.gv.at spider to new frontend
Lukas0907 committed Sep 26, 2017
2 parents 9dc01c1 + f3d78a5 commit 9ce7d57
Showing 4 changed files with 28 additions and 19 deletions.
feeds/pipelines.py: 10 additions & 6 deletions
@@ -8,19 +8,23 @@
 from feeds.items import FeedEntryItem
 
 
-class AtomAutogenerateIdPipeline(object):
-    """Autogenerate the id field in case it is missing."""
+class AtomAutogenerateFieldsPipeline(object):
+    """Autogenerate fields in case they are missing."""
 
     def process_item(self, item, spider):
-        if 'id' in item:
-            return item
-        else:
+        if 'id' not in item:
             if 'link' in item:
                 item['id'] = uuid.uuid5(uuid.NAMESPACE_DNS, item['link']).urn
-                return item
             else:
                 raise DropItem('A link is required to autogenerate the feed '
                                'id for: {}'.format(item))
+
+        if 'title' not in item:
+            # Having a title is mandatory, so we use an empty string if none
+            # is set.
+            item['title'] = ''
+
+        return item
 
 
 class AtomCheckRequiredFieldsPipeline(object):
     """Check presence of required fields."""
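
The behavioural change: an item without a title is no longer rejected downstream, because the pipeline now fills in an empty string, while a missing id is still derived from the link. A minimal standalone sketch of the new logic, using a plain dict in place of a real FeedEntryItem (the sample link is invented):

import uuid

# Hypothetical item with a link but neither 'id' nor 'title'.
item = {'link': 'https://example.com/articles/1'}

# Same derivation as the pipeline: UUIDv5 of the link, rendered as a URN.
if 'id' not in item:
    item['id'] = uuid.uuid5(uuid.NAMESPACE_DNS, item['link']).urn

# New in this commit: default the mandatory title instead of dropping the item.
if 'title' not in item:
    item['title'] = ''

print(item['id'])           # urn:uuid:...
print(repr(item['title']))  # ''
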
feeds/settings.py: 1 addition & 1 deletion
@@ -15,7 +15,7 @@
 
 # Custom item pipeline
 ITEM_PIPELINES = {
-    'feeds.pipelines.AtomAutogenerateIdPipeline': 100,
+    'feeds.pipelines.AtomAutogenerateFieldsPipeline': 100,
     'feeds.pipelines.AtomCheckRequiredFieldsPipeline': 110,
     'feeds.pipelines.AtomExportPipeline': 400
 }
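
Only the pipeline's name changes; its priority stays at 100. Scrapy runs item pipelines in ascending order of these values, so field autogeneration (100) still happens before the required-fields check (110) and the Atom export (400). A quick way to see the effective order (a sketch, not part of the commit):

ITEM_PIPELINES = {
    'feeds.pipelines.AtomAutogenerateFieldsPipeline': 100,
    'feeds.pipelines.AtomCheckRequiredFieldsPipeline': 110,
    'feeds.pipelines.AtomExportPipeline': 400,
}

# Prints: autogenerate, then check, then export.
for name in sorted(ITEM_PIPELINES, key=ITEM_PIPELINES.get):
    print(name)
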
feeds/spiders/facebook_com.py: 1 addition & 1 deletion
@@ -25,7 +25,7 @@ def start_requests(self):
             app_secret=app_secret)
 
         for page_id in self.spider_settings.get('pages').split():
-            url = 'https://graph.{name}/v2.7/{page_id}'.format(
+            url = 'https://graph.{name}/v2.10/{page_id}'.format(
                 name=self.name, page_id=page_id)
             url = w3lib.url.add_or_replace_parameter(url,
                                                      'access_token',
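
The only change here is the Graph API version bump from v2.7 to v2.10. For context, w3lib.url.add_or_replace_parameter sets or overwrites a single query parameter and percent-encodes its value; a small sketch with made-up values:

import w3lib.url

url = 'https://graph.facebook.com/v2.10/12345'  # hypothetical page id
url = w3lib.url.add_or_replace_parameter(url, 'access_token', 'APPID|APPSECRET')
print(url)
# https://graph.facebook.com/v2.10/12345?access_token=APPID%7CAPPSECRET
# (the '|' in the app token is percent-encoded)
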
feeds/spiders/help_gv_at.py: 16 additions & 11 deletions
@@ -23,25 +23,30 @@ class HelpGvAtSpider(FeedsSpider):
     _timezone = 'Europe/Vienna'
 
     def parse(self, response):
-        yield scrapy.Request(
-            'https://www.{}/Portal.Node/hlpd/public/content/171/'
-            'Seite.1710000.html'.format(self.name), self._parse_lists,
-            meta={'dont_cache': True})
+        paths = [
+            '171/Seite.1710000.html',
+            '194/Seite.1940000.html',
+        ]
+        for path in paths:
+            yield scrapy.Request(
+                'https://www.{}/Portal.Node/hlpd/public/content/{}'.format(
+                    self.name, path), self._parse_lists,
+                meta={'dont_cache': True})
 
         yield scrapy.Request(
-            'https://www.{}/Portal.Node/hlpd/public/content/194/'
-            'Seite.1940000.html'.format(self.name), self._parse_lists,
+            ('https://www.{}/Portal.Node/hlpd/public/content/340/' +
+             'weiterenews.html').format(self.name), self._parse_news,
             meta={'dont_cache': True})
 
-        for link in response.css('.Aktuelles a::attr(href)').extract():
-            yield scrapy.Request(response.urljoin(link), self._parse_item,
-                                 meta={'dont_cache': True})
-
     def _parse_lists(self, response):
-        for link in response.css('.Content ul a::attr(href)').extract():
+        for link in response.css('.Content > ul a::attr(href)').extract():
             yield scrapy.Request(response.urljoin(link), self._parse_item,
                                  meta={'dont_cache': True})
 
+    def _parse_news(self, response):
+        for link in response.css('.Content article a::attr(href)').extract():
+            yield scrapy.Request(response.urljoin(link), self._parse_item)
+
     def _parse_item(self, response):
         remove_elems = [
             'h1', '.nono', '.acceptance_org', '.state', 'script',
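
Two selector changes track the redesigned frontend: '.Content ul' becomes '.Content > ul' (the child combinator matches only lists that are direct children of .Content, skipping lists nested deeper, e.g. inside teaser or navigation blocks), and news items now come from a dedicated overview page handled by the new _parse_news callback. A sketch of the combinator difference using parsel, the selector library behind Scrapy (the HTML snippet is invented):

from parsel import Selector

html = '''
<div class="Content">
  <ul><li><a href="/topic.html">Topic</a></li></ul>
  <div class="teaser">
    <ul><li><a href="/teaser.html">Teaser</a></li></ul>
  </div>
</div>
'''
sel = Selector(text=html)

# Descendant selector picks up both links; the child combinator only the first.
print(sel.css('.Content ul a::attr(href)').extract())    # ['/topic.html', '/teaser.html']
print(sel.css('.Content > ul a::attr(href)').extract())  # ['/topic.html']
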
