Merge pull request #182 from Lukas0907/next
Next
Lukas0907 committed Oct 17, 2018
2 parents 403a116 + fbb4019 commit 7c966e8
Showing 8 changed files with 73 additions and 38 deletions.
7 changes: 6 additions & 1 deletion docs/spiders/orf.at.rst
@@ -15,7 +15,10 @@ Add ``orf.at`` to the list of spiders:
orf.at
orf.at supports different channels via the ``channels`` parameter (one per
line). If no channel is given, ``news`` is used.
line). If no channel is given, ``news`` is used. It is also possible to give
a list of authors for which feeds will then be generated. Note that the
ressort in which the author writes still has to be included in the ``ressorts``
parameter.

.. code-block:: ini
@@ -38,3 +41,5 @@ line). If no channel is given, ``news`` is used.
tirol
vorarlberg
wien
authors =
Erich Moechel
2 changes: 2 additions & 0 deletions feeds.cfg.dist
@@ -112,6 +112,8 @@ useragent = feeds (+https://github.com/nblock/feeds)
# vorarlberg
# tirol
# religion
#authors =
# Erich Moechel

#[derstandard.at]
#ressorts =
20 changes: 14 additions & 6 deletions feeds/exporters.py
@@ -8,6 +8,8 @@

from feeds.items import FeedEntryItem, FeedItem

logger = logging.getLogger(__name__)


class AtomExporter(BaseItemExporter):
class AtomFeed(object):
@@ -19,6 +21,7 @@ def __init__(self, exporter, link_self=None):
self._xml = etree.Element(
"feed", nsmap={None: "http://www.w3.org/2005/Atom"}
)
self._ids = set()

def add_item(self, item):
if isinstance(item, FeedItem):
@@ -27,10 +30,16 @@ def add_item(self, item):
for child in self._convert_feed_item(item):
self._xml.insert(0, child)
elif isinstance(item, FeedEntryItem):
entry = etree.Element("entry")
for child in self._convert_feed_item(item):
entry.append(child)
self._feed_items.append(entry)
if item["id"] not in self._ids:
self._ids.add(item["id"])
entry = etree.Element("entry")
for child in self._convert_feed_item(item):
entry.append(child)
self._feed_items.append(entry)
else:
logger.debug(
"Feed entry with id '{}' already in feed.".format(item["id"])
)

def insert_updated(self):
child = etree.Element("updated")
@@ -165,14 +174,13 @@ def __init__(self, output_path, output_url, name, **kwargs):
self._name = name
self._feeds = {}
self._pretty_print = kwargs.pop("pretty_print", True)
self._logger = logging.getLogger(__name__)

def finish_exporting(self):
for path, feed in self._feeds.items():
path = os.path.join(self._output_path, path)
os.makedirs(os.path.dirname(path), exist_ok=True)
if len(feed) == 0:
self._logger.warning("Feed {} contains no items!".format(path))
logger.warning("Feed '{}' contains no items!".format(path))
try:
os.remove(path)
except OSError:
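
The add_item change above keeps a per-feed set of entry ids so that the same entry is serialized into an Atom feed only once. A standalone sketch of that dedup pattern (function name and sample data are illustrative, not from the commit):

import logging

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)


def unique_entries(items):
    """Yield each feed entry only once, keyed by its 'id' field."""
    seen_ids = set()
    for item in items:
        if item["id"] in seen_ids:
            logger.debug("Feed entry with id '{}' already in feed.".format(item["id"]))
            continue
        seen_ids.add(item["id"])
        yield item


entries = [{"id": "urn:1", "title": "First"}, {"id": "urn:1", "title": "Duplicate"}]
print(list(unique_entries(entries)))  # only the first entry with id 'urn:1' survives
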
10 changes: 9 additions & 1 deletion feeds/loaders.py
@@ -248,6 +248,14 @@ def skip_empty_tree(tree):
return None


def skip_none(value):
"""Skip values that are None immediately."""
if value is not None:
return value

return None


def skip_false(value):
"""
Skip values that evaluate to False.
@@ -323,7 +331,7 @@ class BaseItemLoader(ItemLoader):
# Defaults
# Unescape twice to get rid of &&xxx; encoding errors.
default_input_processor = MapCompose(
str.strip, skip_false, html.unescape, html.unescape
skip_none, str.strip, skip_false, html.unescape, html.unescape
)
default_output_processor = TakeFirst()

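
The new skip_none processor sits first in the default_input_processor chain because MapCompose drops a value as soon as a processor returns None, so a None value never reaches str.strip (which would raise a TypeError). A small sketch of that behavior, assuming Scrapy's MapCompose (the sample values are made up):

from scrapy.loader.processors import MapCompose


def skip_none(value):
    """Returning None from a MapCompose processor drops the value."""
    if value is not None:
        return value
    return None


with_skip = MapCompose(skip_none, str.strip)
print(with_skip(["  text  ", None]))  # ['text'] -- the None value is silently dropped

# MapCompose(str.strip) alone would raise a TypeError for the same input,
# because str.strip would be called with None.
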
3 changes: 3 additions & 0 deletions feeds/spiders/biblioweb_at.py
@@ -6,6 +6,7 @@

class BibliowebAtSpider(FeedsSpider):
name = "biblioweb.at"
custom_settings = {"DUPEFILTER_CLASS": "scrapy.dupefilters.RFPDupeFilter"}

_days = 60

@@ -44,6 +45,8 @@ def parse(self, response):

def parse_overview_page(self, response):
# Find other pages
# Note that the dupefilter has to be enabled, otherwise already
# parsed pages will be parsed again.
for href in response.xpath(
'//div[@id="p_main"][1]/div/a/div[@id!="p_aktuell"]/../@href'
):
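
The DUPEFILTER_CLASS entry in custom_settings overrides the project-wide value for this spider only and switches it to Scrapy's standard RFPDupeFilter; as the new comment in parse_overview_page notes, already parsed overview pages would otherwise be parsed again. A minimal sketch of the per-spider override pattern (spider name and URL are placeholders):

import scrapy


class ExampleSpider(scrapy.Spider):
    # Hypothetical spider; it only demonstrates overriding a project-wide
    # setting for a single spider via custom_settings.
    name = "example"
    start_urls = ["https://example.com/"]
    custom_settings = {"DUPEFILTER_CLASS": "scrapy.dupefilters.RFPDupeFilter"}

    def parse(self, response):
        # With the dupefilter enabled, requests to already-seen URLs are
        # silently discarded, so each linked page is parsed at most once.
        for href in response.css("a::attr(href)").extract():
            yield response.follow(href, callback=self.parse)
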
21 changes: 17 additions & 4 deletions feeds/spiders/orf_at.py
@@ -65,6 +65,14 @@ def start_requests(self):

self._channels = channels

self._authors = [
author
for author in (
self.settings.get("FEEDS_SPIDER_ORF_AT_AUTHORS", "").split("\n")
)
if author
]

def feed_headers(self):
for channel in self._channels:
channel_url = "{}.ORF.at".format(channel)
@@ -75,6 +83,9 @@ def feed_headers(self):
logo=self._get_logo(channel),
)

for author in self._authors:
yield generate_feed_header(title="ORF.at: {}".format(author), path=author)

def parse_node(self, response, node):
categories = [
node.xpath("orfon:storyType/@rdf:resource").re_first("urn:orfon:type:(.*)"),
@@ -197,13 +208,15 @@ def _parse_article(self, response):
# other
updated = response.meta["updated"]
il.add_value("updated", updated)
il.add_css("title", "title::text", re="(.*) - .*")
il.add_css("title", "title::text", re=re.compile(r"(.*) - .*", flags=re.S))
il.add_value("link", response.url)
il.add_css("content_html", ".opener img") # FM4, news
il.add_css("content_html", ".story-lead-text") # news
il.add_css("content_html", "#ss-storyText")
il.add_css("content_html", "#ss-storyContent") # news
il.add_value("author_name", author)
if author in self._authors:
il.add_value("path", author)
il.add_value("path", response.meta["path"])
il.add_value("category", response.meta["categories"])
yield il.load_item()
@@ -242,11 +255,11 @@ def _extract_author(response):
)
author_selector = "#ss-storyText > .socialButtons + p"
if author:
return (author, author_selector)
return (author.strip(), author_selector)
elif domain == "orf.at":
author = response.css(".byline ::text").extract_first()
if author:
return (re.split(r"[/,]", author)[0], ".byline")
return (re.split(r"[/,]", author)[0].strip(), ".byline")
elif domain in ["science.orf.at", "help.orf.at", "religion.orf.at"]:
try:
author = (
@@ -259,7 +272,7 @@
# Only take the author name before ",".
author = re.split(r"[/,]", author)[0]
return (
author,
author.strip(),
(
"#ss-storyText > p:not(.date):not(.toplink):"
+ "contains('{}')"
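
Two details in the change above are easy to miss: the title regex is now compiled with re.S so that a title spanning multiple lines is captured as a whole, and the per-author feeds are driven by a newline-separated setting with empty entries filtered out. A short sketch of both behaviors (sample strings are made up):

import re

# Without re.S, '.' does not match newlines, so only the part of the title
# after the line break is captured; with re.S the full title is kept.
title = "Some headline\nthat wraps - news.ORF.at"
print(re.search(r"(.*) - .*", title).group(1))              # 'that wraps'
print(re.search(r"(.*) - .*", title, flags=re.S).group(1))  # 'Some headline\nthat wraps'

# Parsing a newline-separated setting into a list, skipping blank lines.
raw_setting = "Erich Moechel\n\n"
authors = [author for author in raw_setting.split("\n") if author]
print(authors)  # ['Erich Moechel']
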
11 changes: 5 additions & 6 deletions feeds/spiders/tvthek_orf_at.py
@@ -42,6 +42,11 @@ def parse(self, response):
)

for item in json_response["_embedded"]["items"]:
# Skip incomplete items or items with active youth protection.
# We want to have working download links in the feed item.
if not item["segments_complete"] or item["has_active_youth_protection"]:
continue

# We scrape the episode itself so we can get the segments which are not
# embedded in the schedule response.
# Furthermore since this request will be cached, the download URL will also
@@ -78,12 +83,6 @@ def _parse_episode(self, response):
if s["quality_key"] == "Q8C"
)
il.add_value("enclosure", {"iri": video["src"], "type": "video/mp4"})
subtitle = item["_embedded"].get("subtitle")
if subtitle:
subtitle = subtitle["_embedded"]["srt_file"]["public_urls"]["reference"]
il.add_value("enclosure", {"iri": subtitle["url"], "type": "text/plain"})
else:
self.logger.debug("No subtitle file found for '{}'".format(item["url"]))
il.add_value(
"category",
self._categories_from_oewa_base_path(
37 changes: 17 additions & 20 deletions feeds/spiders/uebermedien_de.py
@@ -5,6 +5,7 @@

import scrapy
from scrapy.http import FormRequest
from inline_requests import inline_requests

from feeds.loaders import FeedEntryItemLoader
from feeds.spiders import FeedsXMLFeedSpider
@@ -23,31 +24,32 @@ def start_requests(self):
self._username = self.settings.get("FEEDS_SPIDER_UEBERMEDIEN_DE_USERNAME")
self._password = self.settings.get("FEEDS_SPIDER_UEBERMEDIEN_DE_PASSWORD")
if self._username and self._password:
yield scrapy.Request(
"https://steadyhq.com/en/oauth/authorize?"
+ "client_id=0c29f006-1a98-48f1-8a63-2c0652c59f28&"
+ "redirect_uri=https://uebermedien.de&scope=read&"
+ "response_type=code&refresh_only=false",
callback=self._steady_login,
meta={"cache_expires": timedelta(days=1)},
)
yield from self._steady_login(None)
else:
self.logger.info("Login failed: No username or password given")
# We can still try to scrape the free articles.
yield from super().start_requests()
self.logger.info("Login failed: No username or password given")

yield from super().start_requests()

@inline_requests
def _steady_login(self, response):
return FormRequest.from_response(
response = yield scrapy.Request(
"https://steadyhq.com/oauth/authorize?"
+ "client_id=0c29f006-1a98-48f1-8a63-2c0652c59f28&"
+ "redirect_uri=https://uebermedien.de&scope=read&"
+ "response_type=code&refresh_only=false",
meta={"cache_expires": timedelta(days=1)},
)

response = yield FormRequest.from_response(
response,
formdata=OrderedDict(
[("user[email]", self._username), ("user[password]", self._password)]
),
callback=self._request_steady_token,
dont_filter=True,
meta={"handle_httpstatus_list": [301], "cache_expires": timedelta(days=1)},
)

def _request_steady_token(self, response):
try:
code = parse_qs(urlparse(response.url).query)["code"][0]
except KeyError:
@@ -62,18 +64,14 @@ def _request_steady_token(self, response):
("redirect_uri", "https://uebermedien.de"),
]
)
return scrapy.Request(
response = yield scrapy.Request(
"https://steadyhq.com/api/v1/oauth/token",
method="POST",
body=json.dumps(body),
headers={"Accept": "application/json", "Content-Type": "application/json"},
callback=self._set_steady_token,
meta={"cache_expires": timedelta(days=1)},
)

def _set_steady_token(self, response):
self._steady_token = json.loads(response.text)["access_token"]
return super().start_requests()

def parse_node(self, response, node):
il = FeedEntryItemLoader(
Expand All @@ -84,8 +82,7 @@ def parse_node(self, response, node):
il.add_value("category", node.xpath("//category/text()").extract())
title = node.xpath("(//title)[2]/text()").extract()
if not title:
# Fallback to the first category if no title is provided
# (e.g. comic).
# Fallback to the first category if no title is provided (e.g. comic).
title = node.xpath("//category/text()").extract_first()
il.add_value("title", title)
link = node.xpath("(//link)[2]/text()").extract_first()
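
The refactoring above collapses the callback chain (_steady_login, _request_steady_token, _set_steady_token) into a single generator decorated with @inline_requests, where yielding a request hands its response straight back to the method. A minimal sketch of that pattern (hypothetical spider, URLs, and form fields; not part of this commit):

import scrapy
from inline_requests import inline_requests


class ExampleSpider(scrapy.Spider):
    # Hypothetical spider; it only illustrates the inline-requests style used
    # in the login refactoring above.
    name = "example"
    start_urls = ["https://example.com/login"]

    @inline_requests
    def parse(self, response):
        # Yielding a Request without a callback suspends this generator until
        # the response arrives, so the whole flow reads top to bottom instead
        # of being split across separate callback methods.
        login_page = yield scrapy.Request("https://example.com/oauth/authorize")
        token_response = yield scrapy.FormRequest.from_response(
            login_page, formdata={"user": "name", "password": "secret"}
        )
        # Plain items (or requests with an explicit callback) pass through to
        # Scrapy unchanged.
        yield {"token_url": token_response.url}
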
