Skip to content

Commit

Permalink
Merge pull request #186 from Lukas0907/next
Browse files Browse the repository at this point in the history
Fixes for addendum, flatten resulting HTML tree
  • Loading branch information
Lukas0907 committed Nov 2, 2018
2 parents 037ce53 + 80eac23 commit c7a5da0
Show file tree
Hide file tree
Showing 13 changed files with 112 additions and 32 deletions.
30 changes: 22 additions & 8 deletions feeds/exporters.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,13 @@ def insert_updated(self):
child.text = self._feed_updated
self._xml.insert(0, child)

def sort(self, fields=("updated", "id"), default=0, reverse=True):
    """Append the collected feed items to the XML tree in sorted order.

    Items are ordered by a tuple of the given child-element texts
    (first by "updated", ties broken by "id" with the defaults), so the
    output order is deterministic even when several entries share the
    same timestamp.

    :param fields: tag names whose text forms the sort key, in priority order.
    :param default: value used by findtext() when an item lacks a field.
        NOTE(review): the int default 0 mixed with found string values in
        one key tuple would raise TypeError on Python 3 — this assumes
        every item carries all of *fields*; confirm.
    :param reverse: newest-first when True (the feed convention).
    """
    for item in sorted(
        self._feed_items,
        reverse=reverse,
        key=lambda k: tuple(
            k.findtext(field, default=default) for field in fields
        ),
    ):
        self._xml.append(item)

Expand Down Expand Up @@ -188,12 +190,24 @@ def finish_exporting(self):
else:
feed.insert_updated()
feed.sort()
with open(path, "wb") as f:
f.write(
feed.tostring(
encoding=self.encoding,
pretty_print=self._pretty_print,
xml_declaration=True,
feed = feed.tostring(
encoding=self.encoding,
pretty_print=self._pretty_print,
xml_declaration=True,
)
try:
with open(path, "rb") as f:
logger.debug("Found existing feed at '{}'".format(path))
old_feed = f.read()
except FileNotFoundError:
old_feed = None
if feed != old_feed:
with open(path, "wb") as f:
f.write(feed)
else:
logger.debug(
"Feed content not changed, not overwriting feed '{}'".format(
path
)
)

Expand Down
51 changes: 51 additions & 0 deletions feeds/loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,26 @@
scripts=True, javascript=True, comments=True, style=True, inline_style=True
)

# So-called empty elements ("void elements") in HTML: tags that can never
# have content, so an instance without text or children is normal and must
# not be pruned by the tree-cleanup code.
# Source: https://developer.mozilla.org/en-US/docs/Glossary/Empty_element
# A frozenset because it is only used for membership tests and should be
# immutable.
EMPTY_ELEMENTS = frozenset(
    [
        "area",
        "base",
        "br",
        "col",
        "embed",
        "hr",
        "img",
        "input",
        "keygen",
        "link",
        "meta",
        "param",
        "source",
        "track",
        "wbr",
    ]
)


def parse_datetime(date_time, loader_context):
if isinstance(date_time, datetime):
Expand Down Expand Up @@ -236,6 +256,35 @@ def convert_iframes(tree, loader_context):
return [tree]


def flatten_tree(tree):
    """Simplify an lxml HTML tree in place (item-loader processor).

    Walks the tree bottom-up (post-order) and
      * removes elements that have neither text nor children, unless the
        tag is a void element (e.g. <br>, <img>) that is empty by design;
      * collapses an element into its only child when both share the same
        tag (e.g. <div><div>x</div></div> becomes <div>x</div>).

    Returns [tree] (or None when *tree* itself was dropped) so it can be
    chained with the other tree processors.
    """
    # Post-order traversal.  Snapshot the children first: the recursive
    # call may drop_tree() the child, and removing the current element
    # while iterating tree.iterchildren() ends the iteration early,
    # leaving later siblings unflattened.
    for child in list(tree):
        flatten_tree(child)

    # The first child if it is the only child, None otherwise.
    only_child = tree[0] if len(tree) == 1 else None

    if (
        tree.tag not in EMPTY_ELEMENTS
        and (tree.text is None or tree.text.strip() == "")
        and len(tree) == 0
        and tree.getparent() is not None
    ):
        # Remove elements which don't have a text and are not supposed to
        # be empty.  drop_tree() preserves the element's tail text.
        tree.drop_tree()
        return None
    elif (
        only_child is not None
        and only_child.tag == tree.tag
        and tree.getparent() is not None
    ):
        # Replace tree with its child if there is only one child and it
        # has the same tag.
        # NOTE(review): any tree.text preceding the child is discarded by
        # this replacement — confirm that is acceptable for cleaned HTML.
        only_child.tail = tree.tail
        tree.getparent().replace(tree, only_child)

    return [tree]


def skip_empty_tree(tree):
if tree.text:
# Has a text.
Expand Down Expand Up @@ -363,13 +412,15 @@ class FeedEntryItemLoader(BaseItemLoader):
replace_regex,
build_tree,
convert_footnotes,
pullup_elems,
replace_elems,
remove_elems,
change_attribs,
change_tags,
cleanup_html,
convert_iframes,
lxml_cleaner,
flatten_tree,
skip_empty_tree,
make_links_absolute,
serialize_tree,
Expand Down
30 changes: 22 additions & 8 deletions feeds/spiders/addendum_org.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,43 +108,57 @@ def _inline_picture(elem):
)[-1]["src"]

remove_elems = [
".projectNav",
"h1",
"script",
"style",
".projectNav",
".socialShare",
".socialShare__headline",
".socialShare__icon",
".socialMedia",
".socialMedia__headline",
".whyRead",
".overlayCTA",
".authors",
".socialMedia",
".sidebar",
".sectionBackground--colorTheme1",
".heroStage__copyright",
".heroStage__downLink",
"script",
".image__zoom ",
".image__copyrightWrapper",
".callToAction",
".print-action",
".internalLink span",
".addCommunity",
".download",
".BCaudioPlayer",
"style",
".icon-date",
".callToAction__button",
'a[href^="http://partners.webmasterplan.com/click.asp"]',
".relatedSlider",
".imageLightbox",
".image__copyrightWrapper",
".image__zoom",
".image > .picture",
".imageHC",
]
change_tags = {"div.heroStage__introText": "strong"}
change_tags = {
"div.heroStage__introText": "strong",
".quote": "blockquote",
".quote__label": "footer",
".supernumber": "blockquote",
".image": "figure",
".image__element": "div",
}
replace_elems = {
"video": partial(_inline_video, media),
".picture": _inline_picture,
}
pullup_elems = {".image__content figcaption": 3}
il = FeedEntryItemLoader(
response=response,
base_url=response.url,
remove_elems=remove_elems,
change_tags=change_tags,
replace_elems=replace_elems,
pullup_elems=pullup_elems,
)
il.add_value("link", response.url)
il.add_css("author_name", ".sidebar .authors__name::text")
Expand Down
4 changes: 2 additions & 2 deletions feeds/spiders/atv_at.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,10 @@ def parse_item(self, response):
yield scrapy.Request(url, self.parse_program)

def parse_program(self, response):
if not response.css(".jsb_video\/FlashPlayer"):
if not response.css(r".jsb_video\/FlashPlayer"):
return
data = json.loads(
response.css(".jsb_video\/FlashPlayer").xpath("@data-jsb").extract()[0]
response.css(r".jsb_video\/FlashPlayer").xpath("@data-jsb").extract()[0]
)
data = data["config"]["initial_video"]["parts"][0]["tracking"]["nurago"]
il = FeedEntryItemLoader(
Expand Down
2 changes: 1 addition & 1 deletion feeds/spiders/cbird_at.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ class CbirdAtSpider(FeedsCrawlSpider):
name = "cbird.at"
allowed_domains = ["cbird.at"]
start_urls = ["https://cbird.at/hilfe/neu/", "https://cbird.at/impressum"]
rules = (Rule(LinkExtractor(allow=("hilfe/neu/(\d+)",)), callback="parse_item"),)
rules = (Rule(LinkExtractor(allow=(r"hilfe/neu/(\d+)",)), callback="parse_item"),)

feed_title = "Neue cbird Versionen"
feed_subtitle = "Die neuesten Versionen von cbird."
Expand Down
6 changes: 4 additions & 2 deletions feeds/spiders/delinski_at.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def parse(self, response):
reverse=True,
)
for restaurant in restaurants[:20]:
il = FeedEntryItemLoader(timezone="Europe/Vienna", base_url=response.url)
il = FeedEntryItemLoader(timezone="UTC", base_url=response.url)
url = response.urljoin(restaurant["url"])
il.add_value("link", url)
il.add_value("title", restaurant["name"])
Expand All @@ -44,7 +44,9 @@ def parse(self, response):
</ul>
"""
il.add_value("content_html", content.format(**restaurant))
il.add_value("updated", datetime.fromtimestamp(int(restaurant["created"])))
il.add_value(
"updated", datetime.utcfromtimestamp(int(restaurant["created"]))
)
yield scrapy.Request(url, self._parse_restaurant, meta={"il": il})

def _parse_restaurant(self, response):
Expand Down
2 changes: 1 addition & 1 deletion feeds/spiders/derstandard_at.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ def _fix_img_src(elem):
return scrapy.Request(url, self._parse_blog_article, meta={"il": il})
elif response.css("#feature-content"):
cover_photo = response.css("#feature-cover-photo::attr(style)").re_first(
"\((.*)\)"
r"\((.*)\)"
)
il.add_value("content_html", '<img src="{}">'.format(cover_photo))
il.add_css("content_html", "#feature-cover-title h2")
Expand Down
1 change: 0 additions & 1 deletion feeds/spiders/diepresse_com.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,6 @@ def _clean_caption(elem):
".swiper-lazy-preloader",
],
change_tags={".article__lead": "strong"},
pullup_elems={".zoomable__image--zoomed": 2},
change_attribs={".zoomable__image--zoomed": {"data-src": "src"}},
replace_elems={".article__media-caption": _clean_caption},
base_url="https://www.{}".format(self.name),
Expand Down
2 changes: 1 addition & 1 deletion feeds/spiders/konsument_at.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def _parse_article_url(self, response):
il.add_value("link", response.url)
il.add_value("author_name", "VKI")
date = response.css(".issue").re_first(
"veröffentlicht:\s*([0-9]{2}\.[0-9]{2}\.[0-9]{4})"
r"veröffentlicht:\s*([0-9]{2}\.[0-9]{2}\.[0-9]{4})"
)
il.add_value("updated", date)
url = response.xpath('//a[text()="Druckversion"]/@onclick').re_first(
Expand Down
2 changes: 1 addition & 1 deletion feeds/spiders/nachrichten_at.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def _parse_article(self, response):
il.add_value("category", "paywalled")
il.add_css("link", 'link[rel="canonical"]::attr(href)')
il.add_css("title", 'meta[property="og:title"]::attr(content)')
il.add_css("author_name", ".druckheadline::text", re="·\s*(.*)\s*·")
il.add_css("author_name", ".druckheadline::text", re=r"·\s*(.*)\s*·")
# Mon, 01 Oct 18 13:42:45 +0200
il.add_css("updated", 'meta[http-equiv="last-modified"]::attr(content)')
il.add_css("content_html", ".druckcontent")
Expand Down
4 changes: 2 additions & 2 deletions feeds/spiders/puls4_com.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def parse(self, response):
def _parse_shows_list(self, response):
shows = json.loads(response.text)["formatOverviewItems"]
for show in shows:
time = re.findall("(\d{2}:\d{2})", show["announcement"]) or None
time = re.findall(r"(\d{2}:\d{2})", show["announcement"]) or None
if time:
time = time[0]
yield scrapy.Request(
Expand Down Expand Up @@ -61,7 +61,7 @@ def _parse_episode(self, response):
il.add_xpath(
"title",
'//meta[@name="title"]/@content',
re="(?s)(.*?)(?: vom .*)? - puls4\.com",
re=r"(?s)(.*?)(?: vom .*)? - puls4\.com",
)
il.add_value(
"updated",
Expand Down
4 changes: 3 additions & 1 deletion feeds/spiders/ubup_com.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,9 @@ def parse(self, response):
"link",
response.urljoin(item.css(".item-link::attr(href)").extract_first()),
)
image_url = item.css(".item-image::attr(style)").re_first("'([^']+)'")
image_url = item.css(".item-image::attr(data-bg)").re_first(
r"url\(([^)]+)\)"
)
il.add_value("content_html", '<img src="{}">'.format(image_url))
il.add_css("content_html", ".item-des-container")
il.add_value("path", response.meta["path"])
Expand Down
6 changes: 2 additions & 4 deletions feeds/spiders/vice_com.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,7 @@ def parse(self, response):
"p i:last-of-type:contains('Facebook'):contains('Twitter')",
]
for article in articles:
il = FeedEntryItemLoader(
timezone="Europe/Vienna", remove_elems=remove_elems
)
il = FeedEntryItemLoader(timezone="UTC", remove_elems=remove_elems)
il.add_value("title", article["title"])
il.add_value("link", article["url"])
if "thumbnail_url_1_1" in article:
Expand All @@ -61,7 +59,7 @@ def parse(self, response):
)
il.add_value("content_html", article["body"])
il.add_value(
"updated", datetime.fromtimestamp(article["publish_date"] / 1000)
"updated", datetime.utcfromtimestamp(article["publish_date"] / 1000)
)
il.add_value(
"author_name",
Expand Down

0 comments on commit c7a5da0

Please sign in to comment.