Skip to content

Commit

Permalink
Merge pull request #162 from Lukas0907/fixes
Browse files Browse the repository at this point in the history
Fixes
  • Loading branch information
Lukas0907 committed Sep 4, 2018
2 parents eb2edcb + 5d2e5e3 commit 891bee3
Show file tree
Hide file tree
Showing 6 changed files with 41 additions and 7 deletions.
8 changes: 8 additions & 0 deletions feeds/loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,14 @@ def cleanup_html(tree, loader_context):
for elem in tree.xpath(elem_sel):
elem.drop_tree()

# Change attrib names.
for elem_sel, attribs in loader_context.get("change_attribs", {}).items():
selector = CSSSelector(elem_sel)
for elem in selector(tree):
for attrib in elem.attrib.keys():
if attrib in attribs:
elem.attrib[attribs[attrib]] = elem.attrib.pop(attrib)

# Change tag names.
for elem_sel, elem_tag in loader_context.get("change_tags", {}).items():
selector = CSSSelector(elem_sel)
Expand Down
21 changes: 18 additions & 3 deletions feeds/spiders/derstandard_at.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,11 @@
class DerStandardAtSpider(FeedsXMLFeedSpider):
name = "derstandard.at"
allowed_domains = [name]
custom_settings = {"COOKIES_ENABLED": False}
custom_settings = {
"COOKIES_ENABLED": False,
        # Don't filter duplicates. Filtering would introduce a race condition.
"DUPEFILTER_CLASS": "scrapy.dupefilters.BaseDupeFilter",
}

_title = "derStandard.at"
_subtitle = "Nachrichten in Echtzeit"
Expand Down Expand Up @@ -81,7 +85,13 @@ def _parse_article(self, response):
".continue",
".sequence-number",
]
change_tags = {"#media-list li": "div", "#media-list": "div"}
change_tags = {
"#media-list li .description": "figcaption",
"#media-list li": "figure",
"#media-list": "div",
".photo": "figure",
".caption": "figcaption",
}
replace_regex = {
# data-zoom-src is only valid if it starts with //images.derstandard.at.
r'<img[^>]+data-zoom-src="(//images.derstandard.at/[^"]+)"': (
Expand All @@ -104,7 +114,12 @@ def _parse_article(self, response):
)
il.add_value("link", response.url)
il.add_css("title", 'meta[property="og:title"]::attr(content)')
il.add_css("author_name", "span.author::text")
for author in response.css("span.author::text").extract():
# Sometimes the author name is messed up and written in upper case.
# This happens usually for articles written by Günter Traxler.
if author.upper() == author:
author = author.title()
il.add_value("author_name", author)
il.add_value("path", response.meta["ressort"])
il.add_value("updated", response.meta["updated"])
il.add_css("category", "#breadcrumb .item a::text")
Expand Down
2 changes: 1 addition & 1 deletion feeds/spiders/falter_at.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def start_requests(self):
},
meta={
"dont_redirect": True,
"cache_expires": timedelta(days=1),
"cache_expires": timedelta(hours=3),
"handle_httpstatus_list": [302],
},
callback=self.request_archive,
Expand Down
1 change: 1 addition & 0 deletions feeds/spiders/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

# Readability's output is not interesting enough to justify log level "INFO".
import readability.readability

readability.readability.log.info = readability.readability.log.debug


Expand Down
14 changes: 12 additions & 2 deletions feeds/spiders/orf_at.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,8 @@ def _parse_article(self, response):
try:
# Heuristic for news.ORF.at to detect teaser articles.
more = response.css(
".shortnews p > strong:contains('Mehr') + a::attr(href)"
".story-story p > strong:contains('Mehr') + a::attr(href), "
+ ".story-story p > a:contains('Lesen Sie mehr')::attr(href)"
).extract_first()
if more:
self.logger.debug(
Expand All @@ -132,6 +133,11 @@ def _parse_article(self, response):
".slideshow",
"script",
".oon-youtube-logo",
# redesign
"#more-to-read-anchor",
".social-buttons",
".story-horizontal-ad",
".linkcard",
]
pullup_elems = {
".remote .instagram": 1,
Expand All @@ -144,6 +150,7 @@ def _parse_article(self, response):
".remote": "<p><em>Hinweis: Der eingebettete Inhalt ist nur im Artikel "
+ "verfügbar.</em></p>"
}
change_attribs = {"img": {"data-src": "src"}}
author, author_selector = self._extract_author(response)
if author:
self.logger.debug("Extracted possible author '{}'".format(author))
Expand All @@ -157,6 +164,7 @@ def _parse_article(self, response):
remove_elems=remove_elems,
pullup_elems=pullup_elems,
replace_elems=replace_elems,
change_attribs=change_attribs,
)
# news.ORF.at
data = response.css('script[type="application/ld+json"]::text').extract_first()
Expand All @@ -169,8 +177,10 @@ def _parse_article(self, response):
il.add_value("updated", updated)
il.add_css("title", "title::text", re="(.*) - .*")
il.add_value("link", response.url)
il.add_css("content_html", ".opener img") # fm4.ORF.at
il.add_css("content_html", ".opener img") # FM4, news
il.add_css("content_html", ".story-lead-text") # news
il.add_css("content_html", "#ss-storyText")
il.add_css("content_html", "#ss-storyContent") # news
il.add_value("author_name", author)
il.add_value("path", response.meta["path"])
il.add_value("category", response.meta["categories"])
Expand Down
2 changes: 1 addition & 1 deletion feeds/spiders/wienerlinien_at.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def parse(self, response):
response=response,
timezone=self._timezone,
ignoretz=True,
base_url="https://{}".format(self.name),
base_url="https://www.{}".format(self.name),
)
link = response.urljoin(item.css("a::attr(href)").extract_first())
il.add_value("link", link)
Expand Down

0 comments on commit 891bee3

Please sign in to comment.