Skip to content

Commit

Permalink
Merge pull request #165 from Lukas0907/fixes
Browse files Browse the repository at this point in the history
Improvements for derStandard.at, ORF.at, iframes
  • Loading branch information
Lukas0907 committed Sep 7, 2018
2 parents 0e5c09b + 7d0b939 commit 27a20f1
Show file tree
Hide file tree
Showing 6 changed files with 37 additions and 6 deletions.
23 changes: 23 additions & 0 deletions feeds/loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from copy import deepcopy
from datetime import datetime
from textwrap import TextWrapper
from urllib.parse import urljoin

import dateparser
import lxml
Expand Down Expand Up @@ -174,6 +175,27 @@ def convert_footnotes(tree, loader_context):
return [tree]


def convert_iframes(tree, loader_context):
    """Replace each <iframe> in *tree* with a <div> linking to the iframe's src.

    convert_iframes() is called after cleanup_html() so that unwanted iframes
    can be eliminated first.

    Args:
        tree: lxml HTML element (sub)tree; mutated in place.
        loader_context: optional mapping; its "base_url" entry (if any) is
            used to resolve relative iframe src URLs.

    Returns:
        The mutated tree wrapped in a one-element list (the convention used
        by the other tree-processing functions in this module).
    """
    from html import escape  # stdlib; local import leaves top-of-file imports untouched

    base_url = loader_context.get("base_url", None) if loader_context else None
    selector = CSSSelector("iframe")
    for elem in selector(tree):
        # Iframes without a src have nothing to link to; leave them alone.
        if "src" not in elem.attrib:
            continue
        url = urljoin(base_url, elem.attrib.pop("src"))
        # Escape the URL before interpolating it into markup: scraped URLs
        # may contain '"' or '&', which would otherwise break out of the
        # href attribute or corrupt the fragment.
        elem_new = lxml.html.fragment_fromstring(
            '<div><a href="{url}">{url}</a></div>'.format(url=escape(url, quote=True))
        )
        elem_new.tail = elem.tail
        parent = elem.getparent()
        # A root-level iframe has no parent to perform the replacement in;
        # skip it rather than crash on None.getparent().
        if parent is not None:
            parent.replace(elem, elem_new)

    return [tree]


def skip_empty_tree(tree):
if tree.text:
# Has a text.
Expand Down Expand Up @@ -293,6 +315,7 @@ class FeedEntryItemLoader(BaseItemLoader):
build_tree,
convert_footnotes,
cleanup_html,
convert_iframes,
lxml_cleaner,
skip_empty_tree,
make_links_absolute,
Expand Down
13 changes: 9 additions & 4 deletions feeds/spiders/derstandard_at.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import html
from datetime import timedelta

import scrapy
Expand Down Expand Up @@ -84,6 +85,8 @@ def _parse_article(self, response):
".image-zoom",
".continue",
".sequence-number",
".js-embed-output",
"#mycountrytalks-embed",
]
change_tags = {
"#media-list li .description": "figcaption",
Expand All @@ -93,16 +96,18 @@ def _parse_article(self, response):
".caption": "figcaption",
}
replace_regex = {
# Replace every special script container with its unescaped content.
r'<script class="js-embed-template" type="text/html">([^<]+)</script>': (
lambda match: html.unescape(match.group(1))
),
# data-zoom-src is only valid if it starts with //images.derstandard.at.
r'<img[^>]+data-zoom-src="(//images.derstandard.at/[^"]+)"': (
r'<img src="\1"'
)
),
}
replace_elems = {
".embedded-posting": "<p><em>Hinweis: Das eingebettete Posting ist nur "
+ "im Artikel verfügbar.</em></p>",
".js-embed-output": "<p><em>Hinweis: Der eingebettete Inhalt ist nur "
+ "im Artikel verfügbar.</em></p>",
+ "im Artikel verfügbar.</em></p>"
}
il = FeedEntryItemLoader(
response=response,
Expand Down
2 changes: 1 addition & 1 deletion feeds/spiders/facebook_com.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def start_requests(self):
)

for page_id in self.settings.get("FEEDS_SPIDER_FACEBOOK_COM_PAGES").split():
url = "https://graph.{name}/v2.10/{page_id}".format(
url = "https://graph.{name}/v3.1/{page_id}".format(
name=self.name, page_id=page_id
)
url = w3lib.url.add_or_replace_parameter(url, "access_token", access_token)
Expand Down
2 changes: 2 additions & 0 deletions feeds/spiders/orf_at.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,7 @@ def _parse_article(self, response):
+ "verfügbar.</em></p>"
}
change_attribs = {"img": {"data-src": "src"}}
change_tags = {".image": "figure", ".caption": "figcaption"}
author, author_selector = self._extract_author(response)
if author:
self.logger.debug("Extracted possible author '{}'".format(author))
Expand All @@ -164,6 +165,7 @@ def _parse_article(self, response):
pullup_elems=pullup_elems,
replace_elems=replace_elems,
change_attribs=change_attribs,
change_tags=change_tags,
)
# The field is part of a JSON that is sometimes not valid, so don't bother with
# parsing it properly.
Expand Down
1 change: 1 addition & 0 deletions feeds/spiders/profil_at.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ def parse_item(self, response):
"aside",
"script",
"h1",
"source",
".breadcrumbs",
".author-date",
".artikel-social-kommentar",
Expand Down
2 changes: 1 addition & 1 deletion feeds/spiders/uebermedien_de.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def parse_node(self, response, node):
)

def _parse_article(self, response):
remove_elems = ["iframe", "script"]
remove_elems = ["script"]
convert_footnotes = [".footnoteContent"]
pullup_elems = {".footnoteContent": 1}
il = FeedEntryItemLoader(
Expand Down

0 comments on commit 27a20f1

Please sign in to comment.