Commit cd61665 (2 parents: 017d14f + eb551d2)

Merge pull request #179 from Lukas0907/next

Fix enclosure, improvements for tvthek

Lukas0907 committed Oct 5, 2018

Showing 9 changed files with 84 additions and 53 deletions.
35 changes: 17 additions & 18 deletions feeds/exporters.py

@@ -16,8 +16,9 @@ def __init__(self, exporter, link_self=None):
         self._link_self = link_self
         self._feed_updated = None
         self._feed_items = []
-        self._xml = etree.Element("feed")
-        self._xml.set("xmlns", "http://www.w3.org/2005/Atom")
+        self._xml = etree.Element(
+            "feed", nsmap={None: "http://www.w3.org/2005/Atom"}
+        )
 
     def add_item(self, item):
         if isinstance(item, FeedItem):
@@ -72,22 +73,19 @@ def _convert_feed_item(self, item):
             xml_items.append(self._convert_special_link(item, key, "self"))
             item.pop(key)
 
-        # Convert enclosure
-        key_iri = "enclosure_iri"
-        key_type = "enclosure_type"
-        if key_iri in item:
-            xml_items.append(
-                self._convert_special_enclosure(item, key_iri, key_type)
-            )
-            item.pop(key_iri)
-            item.pop(key_type, None)
-
         # Convert content
         for key in ("content_text", "content_html"):
             if key in item:
                 xml_items.append(self._convert_special_content(item, key))
                 item.pop(key)
 
+        # Convert enclosure
+        key = "enclosure"
+        if key in item:
+            for enclosure in self._convert_special_enclosure(item, key):
+                xml_items.append(enclosure)
+            item.pop(key)
+
         key = "category"
         if key in item:
             for category in self._convert_special_category(item, key):
@@ -139,12 +137,13 @@ def _convert_special_content(self, item, key):
         xml_item.text = item[key]
         return xml_item
 
-    def _convert_special_enclosure(self, item, key_iri, key_type):
-        xml_item = etree.Element("link")
-        xml_item.set("rel", "enclosure")
-        xml_item.set("href", item[key_iri])
-        xml_item.set("type", item[key_type])
-        return xml_item
+    def _convert_special_enclosure(self, item, key):
+        for enclosure in item[key]:
+            xml_item = etree.Element("link")
+            xml_item.set("rel", "enclosure")
+            xml_item.set("href", enclosure["iri"])
+            xml_item.set("type", enclosure["type"])
+            yield xml_item
 
     def _convert_special_category(self, item, key):
         for category in item[key]:
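Note: the exporter now takes a list of enclosure dicts per feed entry instead of a single enclosure_iri/enclosure_type pair, and _convert_special_enclosure became a generator that yields one Atom <link rel="enclosure"> element per dict. A minimal standalone sketch of that behaviour, assuming lxml.etree (the nsmap keyword in the constructor change is lxml-specific) and hypothetical example.com URLs:

    # Sketch only: mirrors the new generator-based enclosure conversion.
    from lxml import etree

    def convert_enclosures(enclosures):
        # One <link rel="enclosure"> element per enclosure dict.
        for enclosure in enclosures:
            xml_item = etree.Element("link")
            xml_item.set("rel", "enclosure")
            xml_item.set("href", enclosure["iri"])
            xml_item.set("type", enclosure["type"])
            yield xml_item

    entry = etree.Element("entry")
    for link in convert_enclosures(
        [
            {"iri": "https://example.com/episode.mp4", "type": "video/mp4"},
            {"iri": "https://example.com/episode.srt", "type": "text/plain"},
        ]
    ):
        entry.append(link)
    # entry now contains two <link rel="enclosure"> children.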
4 changes: 1 addition & 3 deletions feeds/items.py

@@ -56,6 +56,4 @@ class FeedEntryItem(BaseItem):
     content_html = scrapy.Field()
 
     # Optional
-    enclosure_iri = scrapy.Field()
-    # Optional
-    enclosure_type = scrapy.Field()
+    enclosure = scrapy.Field()
3 changes: 3 additions & 0 deletions feeds/loaders.py

@@ -370,3 +370,6 @@ class FeedEntryItemLoader(BaseItemLoader):
 
     # Use sorted to keep the output stable.
     category_out = Compose(set, sorted)
+
+    enclosure_in = Identity()
+    enclosure_out = Identity()
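Note: the new enclosure field in items.py pairs with Identity() input/output processors so that each dict passed to add_value() survives unmodified and repeated calls accumulate into a list; the project's BaseItemLoader presumably overrides the defaults (e.g. with a take-first output processor), which is why the explicit declarations are needed. A sketch of the intended semantics; at the time, Scrapy shipped these processors in scrapy.loader.processors (current Scrapy moved them to the itemloaders package):

    import scrapy
    from scrapy.loader import ItemLoader
    from scrapy.loader.processors import Identity

    class FeedEntryItem(scrapy.Item):
        enclosure = scrapy.Field()

    class FeedEntryItemLoader(ItemLoader):
        default_item_class = FeedEntryItem
        enclosure_in = Identity()   # keep each dict as-is on input
        enclosure_out = Identity()  # emit the accumulated list, not one value

    il = FeedEntryItemLoader()
    il.add_value("enclosure", {"iri": "https://example.com/a.mp4", "type": "video/mp4"})
    il.add_value("enclosure", {"iri": "https://example.com/a.srt", "type": "text/plain"})
    item = il.load_item()
    # item["enclosure"] is now a two-element list of dicts.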
8 changes: 7 additions & 1 deletion feeds/spidermiddlewares.py

@@ -19,7 +19,13 @@ def from_crawler(cls, crawler):
 
     def process_spider_exception(self, response, exception, spider):
         if isinstance(exception, HttpError):
-            logger.info(
+            if response.status >= 500:
+                # Transient errors usually caused by overloaded sites, updates, short
+                # downtimes, etc.
+                lgr = logger.info
+            else:
+                lgr = logger.warning
+            lgr(
                 "Ignoring response %(response)r: HTTP status code is not "
                 "handled or not allowed",
                 {"response": response},
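Note: ignored 5xx responses are now logged at INFO, since server errors are usually transient, while other unexpected statuses stay at WARNING. A condensed, standalone sketch of the level selection (response objects stubbed as strings):

    import logging

    logger = logging.getLogger(__name__)

    def log_ignored(status, response_repr):
        # Server errors are treated as transient; other codes deserve attention.
        lgr = logger.info if status >= 500 else logger.warning
        lgr(
            "Ignoring response %(response)r: HTTP status code is not "
            "handled or not allowed",
            {"response": response_repr},
        )

    log_ignored(503, "<503 https://example.com>")  # transient -> INFO
    log_ignored(404, "<404 https://example.com>")  # unexpected -> WARNING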
6 changes: 2 additions & 4 deletions feeds/spiders/addendum_org.py

@@ -155,8 +155,7 @@ def _inline_picture(elem):
         il.add_css("content_html", ".content")
         for medium_id, medium_url in media.items():
             if medium_id not in audio_ids:
-                il.add_value("enclosure_iri", medium_url)
-                il.add_value("enclosure_type", "video/mp4")
+                il.add_value("enclosure", {"iri": medium_url, "type": "video/mp4"})
         item = il.load_item()
         # Save a copy before yielding it.
         item_podcast = deepcopy(item)
@@ -168,6 +167,5 @@ def _inline_picture(elem):
         il.add_value("path", "podcast")
         for medium_id, medium_url in media.items():
             if medium_id in audio_ids:
-                il.add_value("enclosure_iri", medium_url)
-                il.add_value("enclosure_type", "audio/mp4")
+                il.add_value("enclosure", {"iri": medium_url, "type": "audio/mp4"})
         yield il.load_item()
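Note: this spider emits each article twice, once with its video enclosures and once (under the podcast path) with its audio enclosures; each enclosure is now a single dict rather than two parallel values. A toy illustration of the split, with hypothetical media ids and URLs:

    media = {
        "v1": "https://example.com/v1.mp4",
        "a1": "https://example.com/a1.m4a",
    }
    audio_ids = {"a1"}

    video_enclosures = [
        {"iri": url, "type": "video/mp4"}
        for medium_id, url in media.items()
        if medium_id not in audio_ids
    ]
    podcast_enclosures = [
        {"iri": url, "type": "audio/mp4"}
        for medium_id, url in media.items()
        if medium_id in audio_ids
    ]
    # video_enclosures -> article feed item; podcast_enclosures -> "podcast" item.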
8 changes: 7 additions & 1 deletion feeds/spiders/diepresse_com.py

@@ -45,9 +45,15 @@ def parse_node(self, response, node):
         if keywords:
             il.add_value("category", keywords.split(", "))
         il.add_xpath("updated", "news:news/news:publication_date/text()")
-        return scrapy.Request(url, self.parse_item, meta={"il": il})
+        return scrapy.Request(
+            url, self.parse_item, meta={"il": il, "handle_httpstatus_list": [404]}
+        )
 
     def parse_item(self, response):
+        if response.status == 404:
+            self.logger.info("Article '{}' not available anymore.".format(response.url))
+            return
+
         def _clean_caption(elem):
             if "–" in elem.text:
                 # Caption is of the format "text - credit".
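Note: handle_httpstatus_list is a standard Scrapy Request.meta key; listing 404 lets the response reach parse_item instead of being dropped by HttpErrorMiddleware, so deleted articles can be skipped quietly. A self-contained sketch of the pattern (spider name and URLs are hypothetical):

    import scrapy

    class ExampleSpider(scrapy.Spider):
        name = "example"
        start_urls = ["https://example.com/news.xml"]

        def parse(self, response):
            yield scrapy.Request(
                "https://example.com/article",
                self.parse_item,
                meta={"handle_httpstatus_list": [404]},
            )

        def parse_item(self, response):
            if response.status == 404:
                # Removed articles are logged at INFO and skipped instead of
                # surfacing as HttpError warnings.
                self.logger.info(
                    "Article '{}' not available anymore.".format(response.url)
                )
                return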
7 changes: 3 additions & 4 deletions feeds/spiders/oe1_orf_at.py

@@ -31,12 +31,11 @@ def _parse_broadcast(self, response):
         il.add_value("link", link)
         il.add_value("title", broadcast["programTitle"])
         il.add_value("title", broadcast["title"])
-        if broadcast.get("streams"):
+        for stream in broadcast["streams"]:
             stream = "https://loopstream01.apa.at/?channel=oe1&id={}".format(
-                broadcast["streams"][0]["loopStreamId"]
+                stream["loopStreamId"]
            )
-            il.add_value("enclosure_iri", stream)
-            il.add_value("enclosure_type", "audio/mpeg")
+            il.add_value("enclosure", {"iri": stream, "type": "audio/mpeg"})
         il.add_value("updated", broadcast["niceTimeISO"])
         if broadcast["subtitle"]:
             il.add_value(
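Note: the old code exported at most one stream per broadcast (streams[0]); the loop now produces one enclosure per stream, so broadcasts split into several parts are no longer truncated. A toy run with hypothetical loopStreamId values:

    broadcast = {
        "streams": [{"loopStreamId": "part1.mp3"}, {"loopStreamId": "part2.mp3"}]
    }

    enclosures = [
        {
            "iri": "https://loopstream01.apa.at/?channel=oe1&id={}".format(
                stream["loopStreamId"]
            ),
            "type": "audio/mpeg",
        }
        for stream in broadcast["streams"]
    ]
    # Two enclosures instead of only the first stream.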
1 change: 1 addition & 0 deletions feeds/spiders/orf_at.py

@@ -138,6 +138,7 @@ def _parse_article(self, response):
             ".storyMeta",
             "script",
             ".oon-youtube-logo",
+            ".vote",
             # redesign
             "#more-to-read-anchor",
             ".social-buttons",
65 changes: 43 additions & 22 deletions feeds/spiders/tvthek_orf_at.py

@@ -42,32 +42,53 @@ def parse(self, response):
         )
 
         for item in json_response["_embedded"]["items"]:
-            il = FeedEntryItemLoader(response=response)
-            il.add_value("title", item["title"])
-            il.add_value(
-                "content_html",
-                '<img src="{}">'.format(item["playlist"]["preview_image_url"]),
-            )
-            if item["description"]:
-                il.add_value(
-                    "content_html", item["description"].replace("\r\n", "<br>")
-                )
-            il.add_value("updated", item["date"])
-            il.add_value(
-                "link", item["url"].replace("api-tvthek.orf.at", "tvthek.orf.at")
-            )
+            # We scrape the episode itself so we can get the segments which are not
+            # embedded in the schedule response.
+            # Furthermore since this request will be cached, the download URL will also
+            # be cached which is convenient for youth protected content.
             yield Request(
-                item["_links"]["profile"]["href"],
-                self._parse_profile,
-                meta={"item": il},
-                dont_filter=True,
+                item["_links"]["self"]["href"],
+                self._parse_episode,
+                # Responses are > 100 KB and useless after 7 days.
+                # So don't keep them longer than necessary.
+                meta={"cache_expires": timedelta(days=7)},
             )
 
-    def _parse_profile(self, response):
-        il = response.meta["item"]
-        profile = json.loads(response.text)
+    def _parse_episode(self, response):
+        item = json.loads(response.text)
+        il = FeedEntryItemLoader()
+        il.add_value("title", item["title"])
+        il.add_value(
+            "content_html",
+            '<img src="{}">'.format(item["playlist"]["preview_image_url"]),
+        )
+        if item["description"]:
+            il.add_value("content_html", item["description"].replace("\r\n", "<br>"))
+        il.add_value("updated", item["date"])
+        il.add_value("link", item["url"].replace("api-tvthek.orf.at", "tvthek.orf.at"))
+        # Check how many segments are part of this episode.
+        if len(item["_embedded"]["segments"]) == 1:
+            # If only one segment, item["sources"] contains invalid links.
+            # We use the first embedded segment instead.
+            # This is also how mediathekviewweb.de works.
+            item["sources"] = item["_embedded"]["segments"][0]["sources"]
+        video = next(
+            s
+            for s in item["sources"]["progressive_download"]
+            if s["quality_key"] == "Q8C"
+        )
+        il.add_value("enclosure", {"iri": video["src"], "type": "video/mp4"})
+        subtitle = item["_embedded"].get("subtitle")
+        if subtitle:
+            subtitle = subtitle["_embedded"]["srt_file"]["public_urls"]["reference"]
+            il.add_value("enclosure", {"iri": subtitle["url"], "type": "text/plain"})
+        else:
+            self.logger.debug("No subtitle file found for '{}'".format(item["url"]))
         il.add_value(
-            "category", self._categories_from_oewa_base_path(profile["oewa_base_path"])
+            "category",
+            self._categories_from_oewa_base_path(
+                item["_embedded"]["profile"]["oewa_base_path"]
+            ),
        )
         return il.load_item()
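Note: _parse_episode builds the whole feed entry from the episode response: it falls back to the first embedded segment's sources when there is exactly one segment (the top-level sources are invalid in that case), picks the Q8C progressive-download variant, and attaches the SRT subtitle as a second enclosure when available. A minimal sketch of the quality selection with hypothetical sources; next() without a default raises StopIteration, so the code assumes every episode offers a Q8C variant:

    item = {
        "sources": {
            "progressive_download": [
                {"quality_key": "Q4A", "src": "https://example.com/low.mp4"},
                {"quality_key": "Q8C", "src": "https://example.com/high.mp4"},
            ]
        }
    }

    video = next(
        s
        for s in item["sources"]["progressive_download"]
        if s["quality_key"] == "Q8C"
    )
    assert video["src"] == "https://example.com/high.mp4"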
