Commit
Merge pull request #223 from Lukas0907/next
Kurier/Profil fixes
Lukas0907 committed Jul 25, 2020
2 parents 7abfb6c + e4b31e4 commit 0d1cec6
Showing 4 changed files with 97 additions and 139 deletions.
feeds/loaders.py (2 changes: 1 addition & 1 deletion)
@@ -11,10 +11,10 @@
 import lxml
 from dateutil.parser import parse as dateutil_parse
 from dateutil.tz import gettz
+from itemloaders.processors import Compose, Identity, Join, MapCompose, TakeFirst
 from lxml.cssselect import CSSSelector
 from lxml.html.clean import Cleaner
 from scrapy.loader import ItemLoader
-from scrapy.loader.processors import Compose, Identity, Join, MapCompose, TakeFirst
 from w3lib.html import remove_tags
 
 from feeds.items import FeedEntryItem, FeedItem
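
Note: Compose, Identity, Join, MapCompose and TakeFirst now live in the standalone itemloaders package (split out of Scrapy); the new import is a drop-in replacement. A minimal sketch of the unchanged behaviour, not part of this commit:

    from itemloaders.processors import MapCompose, TakeFirst

    strip = MapCompose(str.strip)  # applies str.strip to every extracted value
    first = TakeFirst()            # returns the first non-empty value
    assert first(strip(["  Kurier  ", "Profil"])) == "Kurier"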
feeds/spiders/kurier_at.py (168 changes: 87 additions & 81 deletions)
@@ -8,6 +8,82 @@
 from feeds.utils import generate_feed_header
 
 
+def parse_article(response):
+    article = json.loads(response.text)["layout"]["center"][0]
+    il = FeedEntryItemLoader()
+    il.add_value(
+        "link", urljoin("https://{}".format(article["portal"]), article["url"])
+    )
+    il.add_value("title", article["title"])
+    if "teaser_img" in article:
+        il.add_value(
+            "content_html",
+            _create_figure(
+                article["portal"],
+                article["teaser_img"]["url"],
+                article["teaser_img"].get("description"),
+            ),
+        )
+    il.add_value(
+        "content_html", "<p><strong>{}</strong></p>".format(article["teaser_text"])
+    )
+    for paragraph in article["paragraphs"]:
+        if paragraph["type"] == "text":
+            il.add_value("content_html", paragraph["data"]["html"])
+        elif paragraph["type"] == "youtube":
+            url = "https://www.youtube.com/watch?v={}".format(
+                paragraph["data"]["videoid"]
+            )
+            il.add_value(
+                "content_html", '<div><a href="{url}">{url}</a></div>'.format(url=url),
+            )
+        elif paragraph["type"] == "image":
+            il.add_value(
+                "content_html",
+                _create_figure(
+                    article["portal"],
+                    paragraph["data"]["url"].replace("large", "original"),
+                    paragraph["data"].get("description"),
+                ),
+            )
+        elif paragraph["type"] == "gallery":
+            # Only include 1 image (the latest) if the feed type is article.
+            # This is a special case for comic articles where a new image is
+            # added to the article once a day and it doesn't make sense to always
+            # include all the old ones in the feed.
+            max_images = 1 if response.meta["feed_type"] == "article" else None
+            for image in paragraph["data"]["images"][:max_images]:
+                il.add_value(
+                    "content_html",
+                    _create_figure(
+                        article["portal"],
+                        image["url"].replace("large", "original"),
+                        image.get("description"),
+                    ),
+                )
+    il.add_value("updated", article["updated_date"])
+    for author in article["authors"]:
+        il.add_value("author_name", "{firstname} {lastname}".format(**author))
+    if not article["authors"]:
+        il.add_value("author_name", article["agency"])
+    il.add_value("category", article["channel"]["name"])
+    il.add_value("category", article["portal"])
+    if "path" in response.meta:
+        il.add_value("path", response.meta["path"])
+    if article["sponsored"]:
+        il.add_value("category", "sponsored")
+    il.add_value("category", article.get("pretitle"))
+    return il.load_item()
+
+
+def _create_figure(name, src, caption=None):
+    src = urljoin("https://image.{}".format(name), src)
+    return (
+        '<figure><div><img src="{src}"></div>'
+        + "<figcaption>{caption}</figcaption></figure>"
+    ).format(src=src, caption=caption or "")
+
+
 class KurierAtSpider(FeedsSpider):
     name = "kurier.at"
 
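Note: parse_article and _create_figure are now module-level functions rather than spider methods, which lets other spiders reuse them (profil.at does, further down). A rough illustration of the markup _create_figure produces, using a made-up image path:

    _create_figure("kurier.at", "/images/comic.jpg", "A caption")
    # '<figure><div><img src="https://image.kurier.at/images/comic.jpg"></div>
    #  <figcaption>A caption</figcaption></figure>'
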
@@ -52,7 +128,7 @@ def start_requests(self):
"https://efs.kurier.at/api/v1/cfs/route?uri=/kurierat{}".format(
article
),
self._parse_article,
parse_article,
meta={"path": article, "dont_cache": True, "feed_type": "article"},
)

@@ -85,87 +161,17 @@ def _parse_channel(self, response):
     def _parse_collection(self, response):
         articles = json.loads(response.text)["items"]
         for article in articles:
-            yield scrapy.Request(
-                "https://efs.kurier.at/api/v1/cfs/route?uri=/{}{}".format(
-                    article["portal"].replace(".", ""), article["url"]
-                ),
-                self._parse_article,
-                meta={
-                    "path": response.meta["path"],
-                    "feed_type": response.meta["feed_type"],
-                },
-            )
-
-    def _create_figure(self, src, caption=None):
-        src = urljoin("https://image.{}".format(self.name), src)
-        return (
-            '<figure><div><img src="{src}"></div>'
-            + "<figcaption>{caption}</figcaption></figure>"
-        ).format(src=src, caption=caption or "")
-
-    def _parse_article(self, response):
-        article = json.loads(response.text)["layout"]["center"][0]
-        il = FeedEntryItemLoader()
-        il.add_value(
-            "link", urljoin("https://{}".format(article["portal"]), article["url"])
-        )
-        il.add_value("title", article["title"])
-        if "teaser_img" in article:
-            il.add_value(
-                "content_html",
-                self._create_figure(
-                    article["teaser_img"]["url"],
-                    article["teaser_img"].get("description"),
-                ),
-            )
-        il.add_value(
-            "content_html", "<p><strong>{}</strong></p>".format(article["teaser_text"])
-        )
-        for paragraph in article["paragraphs"]:
-            if paragraph["type"] == "text":
-                il.add_value("content_html", paragraph["data"]["html"])
-            elif paragraph["type"] == "youtube":
-                url = "https://www.youtube.com/watch?v={}".format(
-                    paragraph["data"]["videoid"]
-                )
-                il.add_value(
-                    "content_html",
-                    '<div><a href="{url}">{url}</a></div>'.format(url=url),
-                )
-            elif paragraph["type"] == "image":
-                il.add_value(
-                    "content_html",
-                    self._create_figure(
-                        paragraph["data"]["url"].replace("large", "original"),
-                        paragraph["data"].get("description"),
-                    ),
-                )
-            elif paragraph["type"] == "gallery":
-                # Only include 1 image (the latest) if the feed type is article.
-                # This is a special case for comic articles where a new image is
-                # added to the article once a day and it doesn't make sense to always
-                # include all the old ones in the feed.
-                max_images = 1 if response.meta["feed_type"] == "article" else None
-                for image in paragraph["data"]["images"][:max_images]:
-                    il.add_value(
-                        "content_html",
-                        self._create_figure(
-                            image["url"].replace("large", "original"),
-                            image.get("description"),
-                        ),
-                    )
-        il.add_value("updated", article["updated_date"])
-        for author in article["authors"]:
-            il.add_value("author_name", "{firstname} {lastname}".format(**author))
-        if not article["authors"]:
-            il.add_value("author_name", article["agency"])
-        il.add_value("category", article["channel"]["name"])
-        il.add_value("category", article["portal"])
-        il.add_value("path", response.meta["path"])
-        if article["sponsored"]:
-            il.add_value("category", "sponsored")
-        il.add_value("category", article.get("pretitle"))
-        return il.load_item()
+            if article["type"] != "empty":
+                yield scrapy.Request(
+                    "https://efs.kurier.at/api/v1/cfs/route?uri=/{}{}".format(
+                        article["portal"].replace(".", ""), article["url"]
+                    ),
+                    parse_article,
+                    meta={
+                        "path": response.meta["path"],
+                        "feed_type": response.meta["feed_type"],
+                    },
+                )
 
     def _parse_author(self, response):
         query = json.loads(response.text)["layout"]["center"][0]["query"]
@@ -188,7 +194,7 @@ def _parse_search(self, response):
"https://efs.kurier.at/api/v1/cfs/route?uri=/{}{}".format(
article["portal"].replace(".", ""), article["url"]
),
self._parse_article,
parse_article,
meta={
"path": response.meta["path"],
"feed_type": response.meta["feed_type"],
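For reference, parse_article consumes the JSON returned by the efs.kurier.at CFS route endpoint. A sketch of the expected shape, inferred from the parser above (field names taken from the code, values illustrative):

    response_json = {
        "layout": {
            "center": [{
                "portal": "kurier.at",
                "url": "/chronik/ein-artikel/400000000",  # hypothetical path
                "title": "...",
                "teaser_text": "...",
                "paragraphs": [
                    {"type": "text", "data": {"html": "<p>...</p>"}},
                    {"type": "image", "data": {"url": "/images/large/x.jpg"}},
                ],
                "updated_date": "2020-07-25T12:00:00",
                "authors": [],
                "agency": "APA",
                "channel": {"name": "Chronik"},
                "sponsored": False,
            }]
        }
    }
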
feeds/spiders/profil_at.py (63 changes: 7 additions & 56 deletions)
@@ -1,69 +1,20 @@
-from datetime import datetime, timedelta
-
 import scrapy
-from dateutil.tz import gettz
 
-from feeds.loaders import FeedEntryItemLoader
-from feeds.spiders import FeedsXMLFeedSpider
+from feeds.spiders import FeedsXMLFeedSpider, kurier_at
 
 
 class ProfilAtSpider(FeedsXMLFeedSpider):
     name = "profil.at"
-    namespaces = [
-        ("i", "http://www.google.com/schemas/sitemap-image/1.1"),
-        ("rss", "http://www.sitemaps.org/schemas/sitemap/0.9"),
-    ]
-    itertag = "rss:url"
+    itertag = "item/link/text()"
     iterator = "xml"
+    start_urls = ["https://www.profil.at/xml/rss"]
 
     feed_title = "PROFIL"
     feed_subtitle = "Österreichs unabhängiges Nachrichtenmagazin"
 
-    def start_requests(self):
-        # Scrape this and last month so that the feed is not empty on the first day of a
-        # new month.
-        this_month = datetime.now(gettz("Europe/Vienna")).date().replace(day=1)
-        last_month = (this_month - timedelta(days=1)).replace(day=1)
-        for month in [this_month, last_month]:
-            yield scrapy.Request(
-                "https://www.{}/sitemap-articles-{}.xml".format(
-                    self.name, month.strftime("%Y-%m")
-                ),
-                meta={"dont_cache": True, "handle_httpstatus_list": [404]},
-            )
-
     def parse_node(self, response, node):
-        url = node.xpath("rss:loc/text()").extract_first()
-        updated = node.xpath("rss:lastmod/text()").extract_first()
-        return scrapy.Request(url, self.parse_item, meta={"updated": updated})
-
-    def parse_item(self, response):
-        remove_elems = [
-            "aside",
-            "script",
-            "h1",
-            "source",
-            ".breadcrumbs",
-            ".author-date",
-            ".artikel-social-kommentar",
-            ".bild-copyright",
-            ".ressortTitleMobile",
-            ".article-number",
-            ".artikel-kommentarlink",
-            ".umfrage-wrapper",
-            ".articleIssueInfo",
-        ]
-        il = FeedEntryItemLoader(
-            response=response,
-            base_url="https://{}".format(self.name),
-            remove_elems=remove_elems,
-        )
-        il.add_value("link", response.url)
-        author_name = (
-            response.css(".author-date ::text").re(r"(?:Von)?\s*(\w+ \w+)") or "Red."
+        path = node.extract().replace("https://profil.at/", "")
+        url = "https://efs.profil.at/api/v1/cfs/route?uri=/profilat/" + path
+        return scrapy.Request(
+            url, kurier_at.parse_article, meta={"feed_type": "article"}
         )
-        il.add_value("author_name", author_name)
-        il.add_css("title", 'h1[itemprop="headline"]::text')
-        il.add_value("updated", response.meta["updated"])
-        il.add_css("content_html", "article")
-        return il.load_item()
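
Note: profil.at articles are now fetched through the same efs backend API as kurier.at, and parsing is delegated to the shared kurier_at.parse_article instead of scraping the HTML pages. The URL mapping in parse_node, traced with a hypothetical feed link:

    link = "https://profil.at/oesterreich/ein-artikel/400000001"
    path = link.replace("https://profil.at/", "")
    url = "https://efs.profil.at/api/v1/cfs/route?uri=/profilat/" + path
    # -> "https://efs.profil.at/api/v1/cfs/route?uri=/profilat/oesterreich/ein-artikel/400000001"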
setup.py (3 changes: 2 additions & 1 deletion)
@@ -16,7 +16,7 @@
     include_package_data=True,
     install_requires=[
         "Click>=6.6",
-        "Scrapy>=1.6",
+        "Scrapy>=2.2",
         "bleach>=1.4.3",
         "dateparser>=0.5.1",
         "feedparser",
@@ -25,6 +25,7 @@
"pyxdg>=0.26",
"readability-lxml>=0.7",
"scrapy-inline-requests",
"itemloaders", # explicit dependency of Scrapy > 2.2.1
],
extras_require={
"docs": ["sphinx", "sphinx_rtd_theme"],