Skip to content

Commit

Permalink
Merge pull request #163 from Lukas0907/fixes
Browse files (browse the repository at this point in the history)
Fixes
  • Loading branch information
Lukas0907 committed Sep 4, 2018
2 parents 891bee3 + 386557f commit 0e5c09b
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 26 deletions.
24 changes: 6 additions & 18 deletions feeds/loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
import lxml
from dateutil.parser import parse as dateutil_parse
from dateutil.tz import gettz
from lxml import etree
from lxml.cssselect import CSSSelector
from lxml.html.clean import Cleaner
from scrapy.loader import ItemLoader
Expand Down Expand Up @@ -92,6 +91,7 @@ def cleanup_html(tree, loader_context):
for _ in range(parent_dist):
parent = parent.getparent()
if parent is not None and parent.getparent() is not None:
elem.tail = parent.tail
parent.getparent().replace(parent, elem)
else:
logger.error(
Expand All @@ -106,7 +106,9 @@ def cleanup_html(tree, loader_context):
for elem in selector(tree):
# The replacement element may be inserted more than once, but every node in
# the tree must be a distinct element, so each match gets its own copy.
elem.getparent().replace(elem, deepcopy(elem_new))
elem_new_copy = deepcopy(elem_new)
elem_new_copy.tail = elem.tail
elem.getparent().replace(elem, elem_new_copy)

remove_elems = []

Expand Down Expand Up @@ -162,26 +164,12 @@ def lxml_cleaner(tree):


def convert_footnotes(tree, loader_context):
footnotes = []

# Convert footnotes.
for elem_sel in loader_context.get("convert_footnotes", []):
selector = CSSSelector(elem_sel)
for elem in selector(tree):
footnotes.append(elem.text_content())
ref = etree.Element("span")
ref.text = " [{}]".format(len(footnotes))
elem.getparent().replace(elem, ref)

# Add new <div> with all the footnotes, one per <p>
if footnotes:
footnotes_elem = etree.Element("div")
tree.append(footnotes_elem)

for i, footnote in enumerate(footnotes):
footnote_elem = etree.Element("p")
footnote_elem.text = "[{}] {}".format(i + 1, footnote)
footnotes_elem.append(footnote_elem)
elem.tag = "small"
elem.text = " ({})".format(elem.text)

return [tree]

Expand Down
14 changes: 7 additions & 7 deletions feeds/spiders/orf_at.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import json
import re
from urllib.parse import urlparse

Expand Down Expand Up @@ -111,7 +110,7 @@ def _parse_article(self, response):
".story-story p > strong:contains('Mehr') + a::attr(href), "
+ ".story-story p > a:contains('Lesen Sie mehr')::attr(href)"
).extract_first()
if more:
if more and more != response.url:
self.logger.debug(
"Detected teaser article, redirecting to {}".format(more)
)
Expand Down Expand Up @@ -166,11 +165,12 @@ def _parse_article(self, response):
replace_elems=replace_elems,
change_attribs=change_attribs,
)
# news.ORF.at
data = response.css('script[type="application/ld+json"]::text').extract_first()
if data:
data = json.loads(data)
updated = data["datePublished"]
# The field is part of a JSON document that is sometimes invalid, so extract it
# with a regular expression instead of parsing the JSON.
match = re.search(r'"datePublished": "([^"]+)"', response.text)
if match:
# news.ORF.at
updated = match.group(1)
else:
# other
updated = response.meta["updated"]
Expand Down
4 changes: 4 additions & 0 deletions feeds/spiders/uebermedien_de.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,11 +99,15 @@ def parse_node(self, response, node):

def _parse_article(self, response):
# Build the feed entry for one article page: configure the item loader's HTML
# clean-up options, take the content from ".entry-content" and return the item.
# Selectors handed to the loader as remove_elems (presumably stripped from the
# content; confirm against the loader).
remove_elems = ["iframe", "script"]
# Selectors of the footnote elements handed to the loader's "convert_footnotes"
# option.
convert_footnotes = [".footnoteContent"]
# Maps a selector to a number of ancestor levels (here 1).
# NOTE(review): presumably the loader replaces that ancestor with the matched
# element itself; confirm against cleanup_html.
pullup_elems = {".footnoteContent": 1}
il = FeedEntryItemLoader(
response=response,
# Continue from the loader that was passed along in the request meta.
parent=response.meta["il"],
remove_elems=remove_elems,
# Base URL built from the spider name.
base_url="https://{}".format(self.name),
convert_footnotes=convert_footnotes,
pullup_elems=pullup_elems,
)
il.add_css("content_html", ".entry-content")
return il.load_item()
2 changes: 1 addition & 1 deletion feeds/spiders/wienerlinien_at.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def parse_item(self, response):
parent=response.meta["il"],
remove_elems=remove_elems,
change_tags=change_tags,
base_url="https://{}".format(self.name),
base_url="https://www.{}".format(self.name),
)
il.add_xpath("content_html", '//div[@id="main-inner"]')
yield il.load_item()

0 comments on commit 0e5c09b

Please sign in to comment.