Skip to content

Commit

Permalink
Merge pull request #194 from Lukas0907/next
Browse files (browse the repository at this point in the history)
Extraction improvements
  • Loading branch information
Lukas0907 committed Apr 4, 2019
2 parents 942bd0e + cd51dff commit 5f996b6
Show file tree
Hide file tree
Showing 4 changed files with 36 additions and 22 deletions.
48 changes: 28 additions & 20 deletions feeds/spiders/derstandard_at.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -36,24 +36,27 @@ def start_requests(self):
meta={"dont_cache": True, "ressort": ressort},
)

self._users = self.settings.get("FEEDS_SPIDER_DERSTANDARD_AT_USERS")
if self._users:
self._users = {user_id: None for user_id in self._users.split()}
for user_id in self._users.keys():
for page in range(3):
yield scrapy.Request(
(
"https://{}/userprofil/postings/{}?"
+ "pageNumber={}&sortMode=1"
).format(self.name, user_id, page),
self._parse_user_profile,
meta={
# Older pages should be cached longer.
"cache_expires": timedelta(hours=page),
"path": "userprofil/postings/{}".format(user_id),
"user_id": user_id,
},
)
self._users = {
user_id: None
for user_id in self.settings.get(
"FEEDS_SPIDER_DERSTANDARD_AT_USERS", ""
).split()
}
for user_id in self._users.keys():
for page in range(3):
yield scrapy.Request(
(
"https://{}/userprofil/postings/{}?"
+ "pageNumber={}&sortMode=1"
).format(self.name, user_id, page),
self._parse_user_profile,
meta={
# Older pages should be cached longer.
"cache_expires": timedelta(hours=page),
"path": "userprofil/postings/{}".format(user_id),
"user_id": user_id,
},
)

def feed_headers(self):
for ressort in self._ressorts:
Expand Down Expand Up @@ -127,8 +130,13 @@ def _fix_img_src(elem):
".js-embed-output",
"#mycountrytalks-embed",
# Remove self-promotion for (other) ressorts.
'.js-embed-output-feeds > a[href^="/r"]',
'.js-embed-output-feeds > a[href^="https://derstandard.at/"]',
'.js-embed-output-feeds a[href^="/r"]',
'.js-embed-output-feeds a[href^="https://derstandard.at/"]',
(
".js-embed-output-feeds "
+ 'img[src="https://images.derstandard.at/2018/10/18/'
+ 'Immobiliensuche202x122.png"]'
),
]
change_tags = {
"#media-list li .description": "figcaption",
Expand Down
6 changes: 5 additions & 1 deletion feeds/spiders/orf_at.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -169,7 +169,11 @@ def _parse_article(self, response):
+ "verfügbar.</em></p>"
}
change_attribs = {"img": {"data-src": "src", "srcset": "src"}}
change_tags = {".image": "figure", ".caption": "figcaption"}
change_tags = {
".image": "figure",
".caption": "figcaption",
".fact": "blockquote", # FM4
}
author, author_selector = self._extract_author(response)
if author:
self.logger.debug("Extracted possible author '{}'".format(author))
Expand Down
2 changes: 1 addition & 1 deletion feeds/spiders/tvthek_orf_at.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -91,7 +91,7 @@ def _parse_episode(self, response):
)
raise DropResponse(
"Skipping {} because not downloadable yet".format(response.url),
transient=True
transient=True,
)

subtitle = item["_embedded"].get("subtitle")
Expand Down
2 changes: 2 additions & 0 deletions feeds/spiders/uebermedien_de.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -100,10 +100,12 @@ def _parse_article(self, response):
remove_elems = ["script"]
convert_footnotes = [".footnoteContent"]
pullup_elems = {".footnoteContent": 1}
change_tags = {".entry-content-info-box": "blockquote"}
il = FeedEntryItemLoader(
response=response,
parent=response.meta["il"],
remove_elems=remove_elems,
change_tags=change_tags,
base_url="https://{}".format(self.name),
convert_footnotes=convert_footnotes,
pullup_elems=pullup_elems,
Expand Down

0 comments on commit 5f996b6

Please sign in to comment.