Skip to content

Commit

Permalink
Merge pull request #221 from Lukas0907/next
Browse files Browse the repository at this point in the history
Add trend.at; fix addendum.org, nachrichten.at
  • Loading branch information
Lukas0907 committed Jul 8, 2020
2 parents c264a3c + ce3afed commit 7abfb6c
Show file tree
Hide file tree
Showing 9 changed files with 175 additions and 171 deletions.
1 change: 0 additions & 1 deletion .isort.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,4 @@ multi_line_output = 3
include_trailing_comma = True
force_grid_wrap = 0
line_length = 88
not_skip = __init__.py
skip_glob = */.tox/*
3 changes: 2 additions & 1 deletion docs/development.rst
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,6 @@ Custom extraction
These spiders take an existing RSS feed and inline the article content while
cleaning up the content (removing share buttons, etc.):

* :ref:`spider_addendum.org`
* :ref:`spider_arstechnica.com`
* :ref:`spider_derstandard.at`
* :ref:`spider_dietiwag.org`
Expand All @@ -128,6 +127,7 @@ Utilizing an API
~~~~~~~~~~~~~~~~
Some use a REST API which we can use to fetch the content.

* :ref:`spider_addendum.org`
* :ref:`spider_falter.at`
* :ref:`spider_indiehackers.com`
* :ref:`spider_kurier.at`
Expand All @@ -140,6 +140,7 @@ Utilizing the sitemap
Others provide a sitemap_ which we can parse:

* :ref:`spider_profil.at`
* :ref:`spider_trend.at`

Custom extraction
~~~~~~~~~~~~~~~~~
Expand Down
16 changes: 16 additions & 0 deletions docs/spiders/trend.at.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
.. _spider_trend.at:

trend.at
---------
Newest articles from `trend <https://www.trend.at>`_.

Configuration
~~~~~~~~~~~~~
Add ``trend.at`` to the list of spiders:

.. code-block:: ini

   # List of spiders to run by default, one per line.
   spiders =
     trend.at
28 changes: 25 additions & 3 deletions feeds/loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,6 @@

logger = logging.getLogger(__name__)

_lxml_cleaner = Cleaner(style=True)

# List of so-called empty elements in HTML.
# Source: https://developer.mozilla.org/en-US/docs/Glossary/Empty_element
EMPTY_ELEMENTS = [
Expand Down Expand Up @@ -106,6 +104,23 @@ def make_links_absolute(tree):
return [tree]


def make_srcset_absolute(tree):
    """Make URLs in ``srcset`` attributes absolute.

    ``make_links_absolute()`` only rewrites plain link attributes such as
    ``src``/``href``; the comma-separated candidate list in ``srcset`` has to
    be handled separately.
    """
    if tree.base_url:
        # https://html.spec.whatwg.org/multipage/images.html#srcset-attributes
        # A candidate is a URL optionally followed by a width ("300w") or
        # pixel-density ("2x") descriptor. The descriptor is optional per the
        # spec, so the group must be optional too — requiring it would
        # silently drop candidates written without one (e.g. "a.jpg, b.jpg 2x"
        # would lose "a.jpg").
        srcset_regex = re.compile(
            r"\s*(?P<url>[^\s,]+)(?P<dimension>\s+\d+[wx])?\s*(?:,|$)"
        )
        selector = CSSSelector("img[srcset]")
        for elem in selector(tree):
            srcset = []
            for url, dimension in srcset_regex.findall(elem.attrib["srcset"]):
                # Keep the (possibly empty) descriptor verbatim after the
                # absolutized URL.
                srcset.append(urljoin(tree.base_url, url) + dimension)
            elem.attrib["srcset"] = ",".join(srcset)

    return [tree]


def pullup_elems(tree, loader_context):
for elem_child, parent_dist in loader_context.get("pullup_elems", {}).items():
selector = CSSSelector(elem_child)
Expand Down Expand Up @@ -217,7 +232,13 @@ def cleanup_html(tree, loader_context):


def lxml_cleaner(tree):
    """Strip unsafe markup from the tree with a freshly configured Cleaner."""
    cleaner = Cleaner(style=True)
    # Extend the default attribute whitelist with "srcset" and "sizes",
    # both standardized attributes for responsive <img> elements, so the
    # cleaner does not strip them.
    cleaner.safe_attrs = frozenset(cleaner.safe_attrs) | {"srcset", "sizes"}
    cleaner(tree)
    return [tree]


Expand Down Expand Up @@ -428,6 +449,7 @@ class FeedEntryItemLoader(BaseItemLoader):
flatten_tree,
skip_empty_tree,
make_links_absolute,
make_srcset_absolute,
serialize_tree,
)
content_html_out = Compose(Join(), truncate_text)
Expand Down
215 changes: 52 additions & 163 deletions feeds/spiders/addendum_org.py
Original file line number Diff line number Diff line change
@@ -1,185 +1,74 @@
import json
from copy import deepcopy
from functools import partial

import lxml
import scrapy
from inline_requests import inline_requests

from feeds.loaders import FeedEntryItemLoader
from feeds.spiders import FeedsXMLFeedSpider
from feeds.utils import generate_feed_header
from feeds.spiders import FeedsSpider


class AddendumOrgSpider(FeedsXMLFeedSpider):
class AddendumOrgSpider(FeedsSpider):
name = "addendum.org"
start_urls = ["https://www.addendum.org/feed/rss2-addendum"]
start_urls = ["https://www.addendum.org/api/wp/v2/posts"]

_max_articles = 10
_num_articles = 0
feed_title = "Addendum"
feed_subtitle = "das, was fehlt"
feed_link = f"https://www.{name}"
feed_logo = f"{feed_link}/content/themes/qvv/img/logoAdd.png"
feed_icon = f"{feed_link}/content/themes/qvv/img/favicons/favicon-16x16.png"

def feed_headers(self):
feeds = {"": "Addendum", "podcast": "Addendum Podcast"}
for path, title in feeds.items():
yield generate_feed_header(
title=title,
path=path,
subtitle="das, was fehlt",
link="https://www.{}".format(self.name),
icon=(
"https://www.{}/resources/dist/favicons/android-chrome-192x192.png"
).format(self.name),
def parse(self, response):
    """Parse the WordPress REST API post listing and request each article.

    For every post a partially filled item loader (title, link, updated) is
    created and handed to the article callback via the request meta dict.
    """
    for post in json.loads(response.text):
        loader = FeedEntryItemLoader()
        loader.add_value("title", post["title"]["rendered"])
        loader.add_value("link", post["link"])
        loader.add_value("updated", post["modified"])
        yield scrapy.Request(
            post["link"], callback=self._parse_article, meta={"il": loader}
        )

def parse_node(self, response, node):
url = node.xpath("link/text()").extract_first()
if not node.xpath("category"):
# Overview pages don't have a category.
return
if self._num_articles >= self._max_articles:
# Maximum number of articles reached.
return
self._num_articles += 1
return scrapy.Request(url, self._parse_article)

@staticmethod
def _build_api_request(video_id):
return scrapy.Request(
"https://edge.api.brightcove.com/playback/v1/accounts/5548093587001/"
+ "videos/{}".format(video_id),
headers={
"Accept": "application/json;pk="
+ "BCpkADawqM0FoC-KqFQHZMCJQrN5XC_gdWIYO4204LOOSFrv34GdVavc7TJP"
+ "tw5F612ztVUZmw47U2kWmyVU3kdtE6dga_P110le86FuaNCGPYlJ6ljW0Z3_"
+ "HYTlnFsPeFs8F61PGWYBsYnG"
},
)

@inline_requests
def _parse_article(self, response):
def _inline_video(videos, elem):
if "data-video-id" in elem.attrib:
source = lxml.etree.Element("source")
source.attrib["src"] = videos[elem.attrib["data-video-id"]]
source.attrib["type"] = "video/mp4"
elem.insert(0, source)
return elem
else:
# Header video, replace with placeholder image.
parent = elem.getparent()
parent.tag = "figure"
if "data-placeholderbig" in elem.attrib:
src = elem.attrib["data-placeholderbig"]
else:
src = elem.attrib["data-placeholder"]
image = lxml.etree.Element("img")
image.attrib["src"] = src
return image

def _inline_picture(elem):
elem.tag = "img"
src = elem.attrib.get("data-original")
data_min_width = 1000 if src else -1
for child in elem.getchildren():
if child.tag != "span":
continue
if int(child.attrib.get("data-min-width", 0)) > data_min_width:
src = child.attrib["data-src"]
data_min_width = int(child.attrib.get("data-min-width", 0))
child.drop_tree()
elem.attrib["src"] = src
return elem

audio_ids = response.css(
'#BCaudioPlayer_eindeutig::attr("data-video-id")'
).extract()
video_ids = response.css('.video-js::attr("data-video-id")').extract()
media = {}
for media_id in audio_ids + video_ids:
api_response = yield self._build_api_request(media_id)
api_response = json.loads(api_response.text)
media[media_id] = sorted(
(
video
for video in api_response["sources"]
if "src" in video and video.get("container") == "MP4"
),
key=lambda v: v["size"],
)[-1]["src"]

remove_elems = [
"h1",
"script",
"style",
".projectNav",
".socialShare",
".socialShare__headline",
".socialShare__icon",
".socialMedia",
".socialMedia__headline",
".whyRead",
".overlayCTA",
".authors",
".sectionBackground--colorTheme1",
".heroStage__copyright",
".heroStage__downLink",
".callToAction",
".print-action",
".internalLink span",
".addCommunity",
".download",
".BCaudioPlayer",
".icon-date",
".callToAction__button",
'a[href^="http://partners.webmasterplan.com/click.asp"]',
".relatedSlider",
".imageLightbox",
".image__copyrightWrapper",
".image__zoom",
".image > .picture",
".imageHC",
".article-header .title",
".article-header .date",
".paragraph-comments-sticky",
".qvv-spinner",
".article-image.mobile",
".image-copyright",
".article-image-copyright",
".vertical-slider-mobile-image",
".quote-box",
".icon-end-of-article",
".article-teaser-card",
".content-row:contains('Lesen Sie auch:')",
".article-footer ~ div",
".article-footer",
".cta-box",
".slide-image .mobile",
".slide-text .credit",
".article-play",
".top-tag",
]
change_tags = {
"div.heroStage__introText": "strong",
".quote": "blockquote",
".quote__label": "footer",
".supernumber": "blockquote",
".image": "figure",
".image__element": "div",
".summary": "strong",
".article-captioned-image-2": "figure",
".keyplayer-card": "figure",
".keyplayer-card .text-wrapper": "figcaption",
".article-captioned-image": "figure",
".article-captioned-image .image-caption": "figcaption",
".slide": "figure",
".slide-text .caption": "figcaption",
".bg-white": "blockquote",
".collapse-box": "blockquote",
}
replace_elems = {
"video": partial(_inline_video, media),
".picture": _inline_picture,
}
pullup_elems = {".image__content figcaption": 3}
il = FeedEntryItemLoader(
response=response,
base_url=response.url,
base_url=self.feed_link,
remove_elems=remove_elems,
change_tags=change_tags,
replace_elems=replace_elems,
pullup_elems=pullup_elems,
parent=response.meta["il"],
)
il.add_value("link", response.url)
il.add_css("author_name", ".sidebar .authors__name::text")
il.add_css("title", "title::text", re="(.*) - Addendum")
il.add_css("updated", 'meta[property="article:modified_time"]::attr(content)')
# If not yet modified:
il.add_css("updated", 'meta[property="article:published_time"]::attr(content)')
il.add_css("content_html", ".content")
for medium_id, medium_url in media.items():
if medium_id not in audio_ids:
il.add_value("enclosure", {"iri": medium_url, "type": "video/mp4"})
item = il.load_item()
# Save a copy before yielding it.
item_podcast = deepcopy(item)
yield item

if audio_ids:
# Export to podcast feed.
il = FeedEntryItemLoader(item=item_podcast)
il.add_value("path", "podcast")
for medium_id, medium_url in media.items():
if medium_id in audio_ids:
il.add_value("enclosure", {"iri": medium_url, "type": "audio/mp4"})
yield il.load_item()
il.add_css("content_html", ".article-wrapper")
il.add_css("author_name", ".article-author-link ::text")
il.add_css("category", ".top-tag ::text")
yield il.load_item()
3 changes: 2 additions & 1 deletion feeds/spiders/nachrichten_at.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,12 +116,13 @@ def _fix_img_src(elem):
)
if response.css(".mainLogin__linkToggle"):
il.add_value("category", "paywalled")
il.add_css("link", 'link[rel="canonical"]::attr(href)')
il.add_value("link", response.url.replace("#ref=rss", ""))
il.add_css("title", 'meta[property="og:title"]::attr(content)')
il.add_css("author_name", ".artDetailAutor__headline::text")
# Mon, 01 Oct 18 13:42:45 +0200
il.add_css("updated", 'meta[name="date"]::attr(content)')
il.add_css("content_html", "article.artDetail")
il.add_css("category", ".artDetailOrt__linkText::text")
il.add_css("category", ".artDetail__topline ::text")
il.add_value("path", response.meta["ressort"])
return il.load_item()

0 comments on commit 7abfb6c

Please sign in to comment.