Skip to content

Commit

Permalink
Merge pull request #123 from Lukas0907/master
Browse files Browse the repository at this point in the history
Various improvements
  • Loading branch information
Lukas0907 committed Jul 4, 2018
2 parents 3bb2e3f + 6997595 commit f032b19
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 32 deletions.
42 changes: 29 additions & 13 deletions feeds/loaders.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import html
import logging
import os
import re
from copy import deepcopy
Expand All @@ -10,13 +11,15 @@
from dateutil.tz import gettz
from lxml import etree
from lxml.cssselect import CSSSelector
from lxml.html import HtmlComment
from lxml.html.clean import Cleaner
from scrapy.loader import ItemLoader
from scrapy.loader.processors import Compose, Identity, Join, MapCompose, TakeFirst
from w3lib.html import remove_tags

from feeds.items import FeedEntryItem, FeedItem

logger = logging.getLogger(__name__)


def parse_datetime(date_time, loader_context):
if isinstance(date_time, datetime):
Expand Down Expand Up @@ -79,13 +82,20 @@ def make_links_absolute(tree):


def cleanup_html(tree, loader_context):
for elem_child, elem_parent in loader_context.get("child_to_parent", {}).items():
sel_child = CSSSelector(elem_child)
sel_parent = CSSSelector(elem_parent)
for e_parent in sel_parent(tree):
e_children = sel_child(e_parent)
if e_children:
e_parent.getparent().replace(e_parent, e_children[0])
for elem_child, parent_dist in loader_context.get("pullup_elems", {}).items():
selector = CSSSelector(elem_child)
for elem in selector(tree):
parent = elem
for _ in range(parent_dist):
parent = parent.getparent()
if parent is not None and parent.getparent() is not None:
parent.getparent().replace(parent, elem)
else:
logger.error(
'Could not find parent with distance {} for selector "{}".'.format(
parent_dist, elem_child
)
)

for elem_sel, elem_new in loader_context.get("replace_elems", {}).items():
elem_new = lxml.html.fragment_fromstring(elem_new)
Expand All @@ -99,11 +109,11 @@ def cleanup_html(tree, loader_context):
for elem_sel in loader_context.get("remove_elems", []):
selector = CSSSelector(elem_sel)
for elem in selector(tree):
elem.getparent().remove(elem)
elem.drop_tree()

for elem_sel in loader_context.get("remove_elems_xpath", []):
for elem in tree.xpath(elem_sel):
elem.getparent().remove(elem)
elem.drop_tree()

# Change tag names.
for elem_sel, elem_tag in loader_context.get("change_tags", {}).items():
Expand All @@ -113,9 +123,6 @@ def cleanup_html(tree, loader_context):

# tree.iter() iterates over the tree including the root node.
for elem in tree.iter():
# Remove HTML comments.
if isinstance(elem, HtmlComment):
elem.getparent().remove(elem)
# Remove class and id attribute from all elements which are not needed
# in the feed.
elem.attrib.pop("class", None)
Expand All @@ -128,6 +135,14 @@ def cleanup_html(tree, loader_context):
return [tree]


def lxml_cleaner(tree):
    """Sanitize *tree* with lxml's Cleaner and return it wrapped in a list.

    The Cleaner strips <script> elements, JavaScript attributes/links,
    HTML comments, <style> elements and inline ``style`` attributes.
    Cleaning happens in place on the given tree; the single-element list
    return value matches the processor-chain convention used by the
    other tree-processing helpers in this module.
    """
    sanitize = Cleaner(
        scripts=True,
        javascript=True,
        comments=True,
        style=True,
        inline_style=True,
    )
    sanitize(tree)
    return [tree]


def convert_footnotes(tree, loader_context):
footnotes = []

Expand Down Expand Up @@ -215,6 +230,7 @@ class FeedEntryItemLoader(BaseItemLoader):
build_tree,
convert_footnotes,
cleanup_html,
lxml_cleaner,
skip_empty_tree,
make_links_absolute,
serialize_tree,
Expand Down
18 changes: 12 additions & 6 deletions feeds/spiders/addendum_org.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@ def _parse_article(self, response):
".addCommunity",
".download",
".BCaudioPlayer",
"style",
".icon-date",
".callToAction__button",
]
change_tags = {
"div.heroStage__introText": "strong",
Expand All @@ -62,21 +65,24 @@ def _parse_article(self, response):
replace_regex = {
r'<span data-src="([^"]+)"></span>.*?<span data-src="([^"]+)" '
+ r'data-min-width="1000">': r'<a href="\2"><img src="\1"></a>',
r'<div style=".*?"><video.*?></video>.*?</div></div>': (
"<em>Das eingebettete Video ist nur im Artikel verfügbar.</em>"
),
r'<video.*?data-placeholder="([^"]+)".*?</video>': r'<img src="\1">',
}
replace_elems = {
"video": "<p><em>Hinweis: Das eingebettete Video ist nur im Artikel "
+ "verfügbar.</em></p>"
}
il = FeedEntryItemLoader(
response=response,
timezone=self._timezone,
base_url="https://www.{}".format(self.name),
base_url=response.url,
remove_elems=remove_elems,
change_tags=change_tags,
replace_regex=replace_regex,
replace_elems=replace_elems,
)
il.add_value("link", response.url)
il.add_value("author_name", "Addendum")
il.add_css("title", 'meta[property="og:title"]::attr(content)')
il.add_css("author_name", ".sidebar .authors__name::text")
il.add_css("title", "title::text", re="(.*) - Addendum")
il.add_css("updated", 'meta[property="article:modified_time"]::attr(content)')
# If not yet modified:
il.add_css("updated", 'meta[property="article:published_time"]::attr(content)')
Expand Down
35 changes: 22 additions & 13 deletions feeds/spiders/orf_at.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,32 +131,32 @@ def _parse_article(self, response):
".storyMeta",
".slideshow",
"script",
".oon-youtube-logo",
]
child_to_parent = {
".remote .instagram": ".remote",
".remote .facebook": ".remote",
".remote .twitter": ".remote",
".remote .youtube": ".remote",
".remote table": ".remote",
pullup_elems = {
".remote .instagram": 1,
".remote .facebook": 1,
".remote .twitter": 1,
".remote .youtube": 1,
".remote table": 1,
}
replace_elems = {
".remote": "<p><em>Hinweis: Der eingebettete Inhalt ist nur im Artikel "
+ "verfügbar.</em></p>"
}
author = self._extract_author(response)
author, author_selector = self._extract_author(response)
if author:
self.logger.debug("Extracted possible author '{}'".format(author))
# Remove the paragraph that contains the author.
remove_elems.append("p:contains('{}')".format(author))
remove_elems.insert(0, author_selector)
else:
self.logger.debug("Could not extract author name")
author = "{}.ORF.at".format(response.meta["path"])
il = FeedEntryItemLoader(
response=response,
remove_elems=remove_elems,
child_to_parent=child_to_parent,
pullup_elems=pullup_elems,
replace_elems=replace_elems,
timezone=None, # timezone is part of date string
)
# news.ORF.at
data = response.css('script[type="application/ld+json"]::text').extract_first()
Expand Down Expand Up @@ -193,12 +193,13 @@ def _extract_author(response):
)
.extract_first()
)
author_selector = "#ss-storyText > .socialButtons + p"
if author:
return author
return (author, author_selector)
elif domain == "orf.at":
author = response.css(".byline ::text").extract_first()
if author:
return re.split(r"[/,]", author)[0]
return (re.split(r"[/,]", author)[0], ".byline")
elif domain in ["science.orf.at", "help.orf.at", "religion.orf.at"]:
try:
author = (
Expand All @@ -210,10 +211,18 @@ def _extract_author(response):
if 2 <= len(author) <= 50:
# Only take the author name before ",".
author = re.split(r"[/,]", author)[0]
return author
return (
author,
(
"#ss-storyText > p:not(.date):not(.toplink):"
+ "contains('{}')"
).format(author),
)
except IndexError:
pass

return (None, None)

@staticmethod
def _get_logo(channel):
images = {
Expand Down

0 comments on commit f032b19

Please sign in to comment.