-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #221 from Lukas0907/next
Add trend.at; fix addendum.org, nachrichten.at
- Loading branch information
Showing 9 changed files with 175 additions and 171 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
.. _spider_trend.at:

trend.at
---------
Newest articles from `trend <https://www.trend.at>`_.

Configuration
~~~~~~~~~~~~~
Add ``trend.at`` to the list of spiders:

.. code-block:: ini

   # List of spiders to run by default, one per line.
   spiders =
     trend.at
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,185 +1,74 @@ | ||
import json | ||
from copy import deepcopy | ||
from functools import partial | ||
|
||
import lxml | ||
import scrapy | ||
from inline_requests import inline_requests | ||
|
||
from feeds.loaders import FeedEntryItemLoader | ||
from feeds.spiders import FeedsXMLFeedSpider | ||
from feeds.utils import generate_feed_header | ||
from feeds.spiders import FeedsSpider | ||
|
||
|
||
class AddendumOrgSpider(FeedsXMLFeedSpider): | ||
class AddendumOrgSpider(FeedsSpider): | ||
name = "addendum.org" | ||
start_urls = ["https://www.addendum.org/feed/rss2-addendum"] | ||
start_urls = ["https://www.addendum.org/api/wp/v2/posts"] | ||
|
||
_max_articles = 10 | ||
_num_articles = 0 | ||
feed_title = "Addendum" | ||
feed_subtitle = "das, was fehlt" | ||
feed_link = f"https://www.{name}" | ||
feed_logo = f"{feed_link}/content/themes/qvv/img/logoAdd.png" | ||
feed_icon = f"{feed_link}/content/themes/qvv/img/favicons/favicon-16x16.png" | ||
|
||
def feed_headers(self): | ||
feeds = {"": "Addendum", "podcast": "Addendum Podcast"} | ||
for path, title in feeds.items(): | ||
yield generate_feed_header( | ||
title=title, | ||
path=path, | ||
subtitle="das, was fehlt", | ||
link="https://www.{}".format(self.name), | ||
icon=( | ||
"https://www.{}/resources/dist/favicons/android-chrome-192x192.png" | ||
).format(self.name), | ||
def parse(self, response): | ||
posts = json.loads(response.text) | ||
for post in posts: | ||
il = FeedEntryItemLoader() | ||
il.add_value("title", post["title"]["rendered"]) | ||
il.add_value("link", post["link"]) | ||
il.add_value("updated", post["modified"]) | ||
yield scrapy.Request( | ||
post["link"], self._parse_article, meta={"il": il}, | ||
) | ||
|
||
def parse_node(self, response, node): | ||
url = node.xpath("link/text()").extract_first() | ||
if not node.xpath("category"): | ||
# Overview pages don't have a category. | ||
return | ||
if self._num_articles >= self._max_articles: | ||
# Maximum number of articles reached. | ||
return | ||
self._num_articles += 1 | ||
return scrapy.Request(url, self._parse_article) | ||
|
||
@staticmethod | ||
def _build_api_request(video_id): | ||
return scrapy.Request( | ||
"https://edge.api.brightcove.com/playback/v1/accounts/5548093587001/" | ||
+ "videos/{}".format(video_id), | ||
headers={ | ||
"Accept": "application/json;pk=" | ||
+ "BCpkADawqM0FoC-KqFQHZMCJQrN5XC_gdWIYO4204LOOSFrv34GdVavc7TJP" | ||
+ "tw5F612ztVUZmw47U2kWmyVU3kdtE6dga_P110le86FuaNCGPYlJ6ljW0Z3_" | ||
+ "HYTlnFsPeFs8F61PGWYBsYnG" | ||
}, | ||
) | ||
|
||
@inline_requests | ||
def _parse_article(self, response): | ||
def _inline_video(videos, elem): | ||
if "data-video-id" in elem.attrib: | ||
source = lxml.etree.Element("source") | ||
source.attrib["src"] = videos[elem.attrib["data-video-id"]] | ||
source.attrib["type"] = "video/mp4" | ||
elem.insert(0, source) | ||
return elem | ||
else: | ||
# Header video, replace with placeholder image. | ||
parent = elem.getparent() | ||
parent.tag = "figure" | ||
if "data-placeholderbig" in elem.attrib: | ||
src = elem.attrib["data-placeholderbig"] | ||
else: | ||
src = elem.attrib["data-placeholder"] | ||
image = lxml.etree.Element("img") | ||
image.attrib["src"] = src | ||
return image | ||
|
||
def _inline_picture(elem): | ||
elem.tag = "img" | ||
src = elem.attrib.get("data-original") | ||
data_min_width = 1000 if src else -1 | ||
for child in elem.getchildren(): | ||
if child.tag != "span": | ||
continue | ||
if int(child.attrib.get("data-min-width", 0)) > data_min_width: | ||
src = child.attrib["data-src"] | ||
data_min_width = int(child.attrib.get("data-min-width", 0)) | ||
child.drop_tree() | ||
elem.attrib["src"] = src | ||
return elem | ||
|
||
audio_ids = response.css( | ||
'#BCaudioPlayer_eindeutig::attr("data-video-id")' | ||
).extract() | ||
video_ids = response.css('.video-js::attr("data-video-id")').extract() | ||
media = {} | ||
for media_id in audio_ids + video_ids: | ||
api_response = yield self._build_api_request(media_id) | ||
api_response = json.loads(api_response.text) | ||
media[media_id] = sorted( | ||
( | ||
video | ||
for video in api_response["sources"] | ||
if "src" in video and video.get("container") == "MP4" | ||
), | ||
key=lambda v: v["size"], | ||
)[-1]["src"] | ||
|
||
remove_elems = [ | ||
"h1", | ||
"script", | ||
"style", | ||
".projectNav", | ||
".socialShare", | ||
".socialShare__headline", | ||
".socialShare__icon", | ||
".socialMedia", | ||
".socialMedia__headline", | ||
".whyRead", | ||
".overlayCTA", | ||
".authors", | ||
".sectionBackground--colorTheme1", | ||
".heroStage__copyright", | ||
".heroStage__downLink", | ||
".callToAction", | ||
".print-action", | ||
".internalLink span", | ||
".addCommunity", | ||
".download", | ||
".BCaudioPlayer", | ||
".icon-date", | ||
".callToAction__button", | ||
'a[href^="http://partners.webmasterplan.com/click.asp"]', | ||
".relatedSlider", | ||
".imageLightbox", | ||
".image__copyrightWrapper", | ||
".image__zoom", | ||
".image > .picture", | ||
".imageHC", | ||
".article-header .title", | ||
".article-header .date", | ||
".paragraph-comments-sticky", | ||
".qvv-spinner", | ||
".article-image.mobile", | ||
".image-copyright", | ||
".article-image-copyright", | ||
".vertical-slider-mobile-image", | ||
".quote-box", | ||
".icon-end-of-article", | ||
".article-teaser-card", | ||
".content-row:contains('Lesen Sie auch:')", | ||
".article-footer ~ div", | ||
".article-footer", | ||
".cta-box", | ||
".slide-image .mobile", | ||
".slide-text .credit", | ||
".article-play", | ||
".top-tag", | ||
] | ||
change_tags = { | ||
"div.heroStage__introText": "strong", | ||
".quote": "blockquote", | ||
".quote__label": "footer", | ||
".supernumber": "blockquote", | ||
".image": "figure", | ||
".image__element": "div", | ||
".summary": "strong", | ||
".article-captioned-image-2": "figure", | ||
".keyplayer-card": "figure", | ||
".keyplayer-card .text-wrapper": "figcaption", | ||
".article-captioned-image": "figure", | ||
".article-captioned-image .image-caption": "figcaption", | ||
".slide": "figure", | ||
".slide-text .caption": "figcaption", | ||
".bg-white": "blockquote", | ||
".collapse-box": "blockquote", | ||
} | ||
replace_elems = { | ||
"video": partial(_inline_video, media), | ||
".picture": _inline_picture, | ||
} | ||
pullup_elems = {".image__content figcaption": 3} | ||
il = FeedEntryItemLoader( | ||
response=response, | ||
base_url=response.url, | ||
base_url=self.feed_link, | ||
remove_elems=remove_elems, | ||
change_tags=change_tags, | ||
replace_elems=replace_elems, | ||
pullup_elems=pullup_elems, | ||
parent=response.meta["il"], | ||
) | ||
il.add_value("link", response.url) | ||
il.add_css("author_name", ".sidebar .authors__name::text") | ||
il.add_css("title", "title::text", re="(.*) - Addendum") | ||
il.add_css("updated", 'meta[property="article:modified_time"]::attr(content)') | ||
# If not yet modified: | ||
il.add_css("updated", 'meta[property="article:published_time"]::attr(content)') | ||
il.add_css("content_html", ".content") | ||
for medium_id, medium_url in media.items(): | ||
if medium_id not in audio_ids: | ||
il.add_value("enclosure", {"iri": medium_url, "type": "video/mp4"}) | ||
item = il.load_item() | ||
# Save a copy before yielding it. | ||
item_podcast = deepcopy(item) | ||
yield item | ||
|
||
if audio_ids: | ||
# Export to podcast feed. | ||
il = FeedEntryItemLoader(item=item_podcast) | ||
il.add_value("path", "podcast") | ||
for medium_id, medium_url in media.items(): | ||
if medium_id in audio_ids: | ||
il.add_value("enclosure", {"iri": medium_url, "type": "audio/mp4"}) | ||
yield il.load_item() | ||
il.add_css("content_html", ".article-wrapper") | ||
il.add_css("author_name", ".article-author-link ::text") | ||
il.add_css("category", ".top-tag ::text") | ||
yield il.load_item() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.