Commit
Merge pull request #192 from Lukas0907/next
Fixes for site changes
Lukas0907 committed Mar 25, 2019
2 parents 76f91bd + 651d687 commit 942bd0e
Showing 4 changed files with 42 additions and 6 deletions.
feeds/loaders.py: 4 changes (1 addition & 3 deletions)
@@ -22,9 +22,7 @@

 logger = logging.getLogger(__name__)

-_lxml_cleaner = Cleaner(
-    scripts=True, javascript=True, comments=True, style=True, inline_style=True
-)
+_lxml_cleaner = Cleaner(style=True)

 # List of so-called empty elements in HTML.
 # Source: https://developer.mozilla.org/en-US/docs/Glossary/Empty_element
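For context on the change above: in lxml.html.clean, scripts, javascript and comments already default to True, style defaults to False, and inline_style defaults to whatever style is set to, so Cleaner(style=True) should behave the same as the longer spelling it replaces. A minimal sketch of that equivalence, assuming stock lxml defaults:

from lxml.html.clean import Cleaner

# Both cleaners strip scripts, javascript, comments, <style> elements and
# inline style attributes; the second relies on the library defaults.
verbose = Cleaner(
    scripts=True, javascript=True, comments=True, style=True, inline_style=True
)
concise = Cleaner(style=True)

html = '<div><style>p {color: red}</style><p style="color: red">Hi</p></div>'
assert verbose.clean_html(html) == concise.clean_html(html)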
feeds/spiders/arstechnica_com.py: 29 changes (27 additions & 2 deletions)
@@ -1,3 +1,5 @@
+import re
+
 import scrapy

 from feeds.loaders import FeedEntryItemLoader
@@ -53,10 +55,33 @@ def parse_node(self, response, node):
meta={"il": il, "path": response.meta["path"], "first_page": True},
)

@staticmethod
def _div_to_img(elem):
elem.tag = "img"
url = re.search(r"url\('([^']+)'\)", elem.attrib["style"]).group(1)
elem.attrib["src"] = url
elem.attrib["style"] = None
return elem

def _parse_article(self, response):
remove_elems = [".caption-credit", ".gallery-image-credit"]
remove_elems = [
".caption-credit",
".gallery-image-credit",
"#social-left",
"ul.toc",
"h3:contains('Table of Contents')",
"br",
".sidebar:contains('Further Reading')",
".credit",
]
change_tags = {".sidebar": "blockquote", "aside": "blockquote"}
replace_elems = {"div.image": self._div_to_img}
il = FeedEntryItemLoader(
response=response, parent=response.meta["il"], remove_elems=remove_elems
response=response,
parent=response.meta["il"],
remove_elems=remove_elems,
replace_elems=replace_elems,
change_tags=change_tags,
)
if response.meta.get("first_page", False):
il.add_value("link", response.url)
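For context, replace_elems appears to map a CSS selector to a callable that rewrites the matched lxml element in place, which is how the gallery divs carrying a background-image style become plain <img> tags. A minimal standalone sketch of the intended transformation on made-up markup (class name and URL are illustrative):

import re

import lxml.html

div = lxml.html.fragment_fromstring(
    "<div class=\"image\" style=\"background-image: url('https://example.invalid/pic.jpg')\"></div>"
)
div.tag = "img"
# Lift the URL out of the inline style and expose it as a normal src attribute.
div.attrib["src"] = re.search(r"url\('([^']+)'\)", div.attrib["style"]).group(1)
del div.attrib["style"]
print(lxml.html.tostring(div))
# b'<img class="image" src="https://example.invalid/pic.jpg">'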
feeds/spiders/tvthek_orf_at.py: 10 changes (9 additions & 1 deletion)
@@ -4,6 +4,7 @@
 from dateutil.tz import gettz
 from scrapy import Request

+from feeds.exceptions import DropResponse
 from feeds.loaders import FeedEntryItemLoader
 from feeds.spiders import FeedsSpider

@@ -85,7 +86,14 @@ def _parse_episode(self, response):
             )
             il.add_value("enclosure", {"iri": video["src"], "type": "video/mp4"})
         except StopIteration:
-            self.logger.error("Could not extract video for '{}'!".format(item["title"]))
+            self.logger.warning(
+                "Could not extract video for '{}'!".format(item["title"])
+            )
+            raise DropResponse(
+                "Skipping {} because not downloadable yet".format(response.url),
+                transient=True
+            )
+
         subtitle = item["_embedded"].get("subtitle")
         if subtitle:
             subtitle = subtitle["_embedded"]["srt_file"]["public_urls"]["reference"]
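For context, the except branch above fires when no downloadable source exists yet: next() over a filtered generator raises StopIteration when nothing matches. A minimal illustration with made-up source data (the is_downloadable key is hypothetical, not the site's API):

sources = [{"src": "rtsp://example.invalid/stream", "is_downloadable": False}]

try:
    video = next(s for s in sources if s["is_downloadable"])
except StopIteration:
    # Nothing usable yet; the spider now logs a warning and raises
    # DropResponse(..., transient=True), presumably so the episode can be
    # picked up on a later crawl instead of producing a broken entry.
    print("no downloadable video yet")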
feeds/spiders/ubup_com.py: 5 changes (5 additions & 0 deletions)
@@ -45,6 +45,11 @@ def parse(self, response):
             image_url = item.css(".item-image::attr(data-bg)").re_first(
                 r"url\(([^)]+)\)"
             )
+            # Fix broken images.
+            if image_url.startswith("https://markenankauf.momox.de/pics/https://"):
+                image_url = image_url.replace(
+                    "https://markenankauf.momox.de/pics/https://", "https://"
+                )
             il.add_value("content_html", '<img src="{}">'.format(image_url))
             il.add_css("content_html", ".item-des-container")
             il.add_value("path", response.meta["path"])
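For illustration, the site now prepends its image proxy prefix to URLs that are already absolute, so the fix simply strips the doubled scheme (the host and path below are made up):

broken = "https://markenankauf.momox.de/pics/https://images.example.invalid/item.jpg"
fixed = broken.replace("https://markenankauf.momox.de/pics/https://", "https://")
print(fixed)  # https://images.example.invalid/item.jpg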
