Merge pull request #185 from Lukas0907/next
Update TU Wien, diepresse, derstandard, add delinski
Lukas0907 committed Oct 21, 2018
2 parents 7c966e8 + b28a127 commit 037ce53
Showing 7 changed files with 125 additions and 34 deletions.
1 change: 1 addition & 0 deletions docs/development.rst
@@ -132,6 +132,7 @@ scraping from there.
 * :ref:`spider_atv.at`
 * :ref:`spider_biblioweb.at`
 * :ref:`spider_cbird.at`
+* :ref:`spider_delinski.at`
 * :ref:`spider_help.gv.at`
 * :ref:`spider_indiehackers.com`
 * :ref:`spider_openwrt.org`
15 changes: 15 additions & 0 deletions docs/spiders/delinski.at.rst
@@ -0,0 +1,15 @@
+.. _spider_delinski.at:
+
+delinski.at
+-----------
+Newest restaurants in Wien bookable at `Delinski <https://delinski.at>`_.
+
+Configuration
+~~~~~~~~~~~~~
+Add ``delinski.at`` to the list of spiders:
+
+.. code-block:: ini
+
+   # List of spiders to run by default, one per line.
+   spiders =
+     delinski.at
60 changes: 60 additions & 0 deletions feeds/spiders/delinski_at.py
@@ -0,0 +1,60 @@
+import json
+import re
+from datetime import datetime, timedelta
+
+import scrapy
+
+from feeds.loaders import FeedEntryItemLoader
+from feeds.spiders import FeedsSpider
+
+
+class DelinskiAtSpider(FeedsSpider):
+    name = "delinski.at"
+
+    feed_title = "Delinski"
+    feed_link = "https://{}".format(name)
+    feed_logo = "https://{}/favicon.ico".format(name)
+
+    def start_requests(self):
+        yield scrapy.Request(
+            "https://www.delinski.at/wien/restaurants",
+            # The restaurants page is not cached and takes a few seconds to load.
+            # Don't query more than once a day.
+            meta={"cache_expires": timedelta(days=1)},
+        )
+
+    def parse(self, response):
+        m = re.search("window.DELINSKI, {listViewEntities: (.*)}", response.text)
+        restaurants = sorted(
+            json.loads(m.group(1))["restaurants"]["entities"].values(),
+            key=lambda r: int(r["created"]),
+            reverse=True,
+        )
+        for restaurant in restaurants[:20]:
+            il = FeedEntryItemLoader(timezone="Europe/Vienna", base_url=response.url)
+            url = response.urljoin(restaurant["url"])
+            il.add_value("link", url)
+            il.add_value("title", restaurant["name"])
+            content = """
+            <img src="{image}">
+            <ul>
+                <li>{address}</li>
+                <li>{price_range_human}</li>
+                <li>{cuisine_text}</li>
+            </ul>
+            """
+            il.add_value("content_html", content.format(**restaurant))
+            il.add_value("updated", datetime.fromtimestamp(int(restaurant["created"])))
+            yield scrapy.Request(url, self._parse_restaurant, meta={"il": il})
+
+    def _parse_restaurant(self, response):
+        il = FeedEntryItemLoader(
+            response=response,
+            base_url=response.url,
+            parent=response.meta["il"],
+            remove_elems=[".external"],
+        )
+        il.add_css("content_html", ".content .right p")
+        il.add_css("content_html", ".restaurant-link")
+        il.add_css("category", ".tags a ::text")
+        yield il.load_item()
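
The listing page embeds its data as a JavaScript object literal instead of serving JSON separately, which is why parse() pulls it out with a regular expression. A minimal, self-contained sketch of that extraction against a made-up single-line page excerpt (the surrounding Object.assign wrapper is an assumption; the real window.DELINSKI payload carries far more fields per restaurant):

import json
import re

# Hypothetical, single-line excerpt of the listing page. The spider's
# regex only works because the whole object sits on one line ("." does
# not match newlines without re.DOTALL).
html = (
    '<script>Object.assign(window.DELINSKI, {listViewEntities: '
    '{"restaurants": {"entities": '
    '{"1": {"name": "Alt Wien", "created": "1539500000", "url": "/wien/a"}, '
    '"2": {"name": "Neu Wien", "created": "1540000000", "url": "/wien/n"}'
    '}}}})</script>'
)

m = re.search("window.DELINSKI, {listViewEntities: (.*)}", html)
restaurants = sorted(
    json.loads(m.group(1))["restaurants"]["entities"].values(),
    key=lambda r: int(r["created"]),
    reverse=True,
)
print([r["name"] for r in restaurants])  # ['Neu Wien', 'Alt Wien'] -- newest first
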
5 changes: 3 additions & 2 deletions feeds/spiders/derstandard_at.py
@@ -96,8 +96,9 @@ def _fix_img_src(elem):
             ".sequence-number",
             ".js-embed-output",
             "#mycountrytalks-embed",
-            # Remove self-promotion for ressorts (links starting with "/r").
-            '.js-embed-output-feeds a[href^="/r"]',
+            # Remove self-promotion for (other) ressorts.
+            '.js-embed-output-feeds > a[href^="/r"]',
+            '.js-embed-output-feeds > a[href^="https://derstandard.at/"]',
         ]
         change_tags = {
             "#media-list li .description": "figcaption",
17 changes: 10 additions & 7 deletions feeds/spiders/diepresse_com.py
@@ -75,7 +75,12 @@ def _clean_caption(elem):
         il = FeedEntryItemLoader(
             response=response,
             parent=response.meta["il"],
-            remove_elems=[".ad", ".article-paid"],
+            remove_elems=[
+                ".ad",
+                ".article-paid",
+                ".js-overlay-close",
+                ".swiper-lazy-preloader",
+            ],
             change_tags={".article__lead": "strong"},
             pullup_elems={".zoomable__image--zoomed": 2},
             change_attribs={".zoomable__image--zoomed": {"data-src": "src"}},
@@ -84,14 +89,12 @@ def _clean_caption(elem):
         )
         il.add_css(
             "author_name",
-            ".article__main .article__author ::text",
+            "article .article__author ::text",
             re=re.compile(r"\s*(?:[Vv]on\s*)?(.+)", flags=re.DOTALL),
         )
-        il.add_css("content_html", ".article__main .article__media")
-        il.add_css(
-            "content_html", ".article__main .article__lead"
-        )  # change tags to strong
-        il.add_css("content_html", ".article__main .article__body")
+        il.add_css("content_html", "article .article__media .zoomable__inner")
+        il.add_css("content_html", "article .article__lead")  # change tags to strong
+        il.add_css("content_html", "article .article__body")
         if response.css(".article-paid"):
             il.add_value("category", "paywalled")
         il.add_value("category", section.split("/"))
59 changes: 35 additions & 24 deletions feeds/spiders/tuwien_ac_at.py
@@ -1,8 +1,8 @@
+import json
 import re

 import scrapy
-from scrapy.loader.processors import TakeFirst
-from scrapy.selector import Selector
+from inline_requests import inline_requests

 from feeds.loaders import FeedEntryItemLoader
 from feeds.spiders import FeedsSpider
@@ -21,31 +21,42 @@ def start_requests(self):
             meta={"dont_cache": True},
         )

+    @inline_requests
     def parse(self, response):
         mitteilungsblaetter = response.css(".mitteilungsblaetter")
-        updated = mitteilungsblaetter.css("::text").re_first("(\d{2}\.\d{2}\.\d{4})")
+        updated = mitteilungsblaetter.css("::text").re_first(r"(\d{2}\.\d{2}\.\d{4})")
         link = response.urljoin(
             mitteilungsblaetter.css('a::attr("href")').extract_first()
         )
-        return scrapy.Request(
-            response.urljoin(link),
-            self._parse_mitteilungsblatt,
-            meta={"updated": updated},
-        )
-
-    def _parse_mitteilungsblatt(self, response):
-        content = "".join(response.css("#contentInner > div").extract())
-        for entry in re.split('<a name="n\d*">', content)[1:]:
-            entry = Selector(text=entry)
-            il = FeedEntryItemLoader(
-                selector=entry,
-                base_url="https://tiss.{}".format(self.name),
-                timezone="Europe/Vienna",
-                dayfirst=True,
-            )
-            il.add_value("updated", response.meta["updated"])
-            anchor_name = entry.css('::attr("name")').extract_first()
-            il.add_value("link", response.url + "#{}".format(anchor_name))
-            il.add_css("title", "strong > u ::text", TakeFirst())
-            il.add_css("content_html", "p")
-            yield il.load_item()
+
+        response = yield scrapy.Request(link, method="HEAD")
+        mb_url = response.url
+        mb_id = re.search(
+            r"https://tiss.tuwien.ac.at/mbl/blatt_struktur/anzeigen/(\d+)", mb_url
+        ).group(1)
+
+        url = "https://tiss.{}/api/mbl/v22/id/{}".format(self.name, mb_id)
+        response = yield scrapy.Request(url)
+
+        last_entry = None
+        for entry in reversed(json.loads(response.text)["knoten"]):
+            (entry["main"], entry["sub"]) = re.match(
+                r"(\d+)\.?(\d*)", entry["counter"]
+            ).groups()
+            if last_entry is not None and last_entry["main"] == entry["main"]:
+                entry["inhalt"] += "<h2>{}</h2>".format(last_entry["titel"])
+                entry["inhalt"] += last_entry["inhalt"]
+            if entry["sub"] == "":
+                il = FeedEntryItemLoader(
+                    base_url="https://tiss.{}".format(self.name),
+                    timezone="Europe/Vienna",
+                    dayfirst=True,
+                )
+                il.add_value("updated", updated)
+                il.add_value("link", mb_url + "#{}".format(entry["counter"]))
+                il.add_value("title", entry["titel"])
+                il.add_value("content_html", entry["inhalt"])
+                yield il.load_item()
+                last_entry = None
+            else:
+                last_entry = entry
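
The rewritten parse() resolves the Mitteilungsblatt redirect with a HEAD request (made synchronous-looking by @inline_requests), fetches the issue from the TISS JSON API, and walks the entries in reverse so that sub-entries (counter "12.1") are folded into their main entry (counter "12") before that entry is emitted. A condensed sketch of just the grouping step, with made-up entries:

import re

# Made-up API entries in document order: a main entry and one sub-entry.
knoten = [
    {"counter": "12", "titel": "Verlautbarung", "inhalt": "<p>Haupttext</p>"},
    {"counter": "12.1", "titel": "Anhang", "inhalt": "<p>Details</p>"},
]

last_entry = None
for entry in reversed(knoten):
    entry["main"], entry["sub"] = re.match(
        r"(\d+)\.?(\d*)", entry["counter"]
    ).groups()
    if last_entry is not None and last_entry["main"] == entry["main"]:
        # Fold the pending sub-entry into its main entry.
        entry["inhalt"] += "<h2>{}</h2>".format(last_entry["titel"])
        entry["inhalt"] += last_entry["inhalt"]
    if entry["sub"] == "":  # a main entry: emit it, merged content and all
        print(entry["counter"], entry["inhalt"])
        last_entry = None
    else:
        last_entry = entry
# Prints: 12 <p>Haupttext</p><h2>Anhang</h2><p>Details</p>
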
2 changes: 1 addition & 1 deletion feeds/spiders/uebermedien_de.py
@@ -4,8 +4,8 @@
 from urllib.parse import parse_qs, urlparse

 import scrapy
-from scrapy.http import FormRequest
 from inline_requests import inline_requests
+from scrapy.http import FormRequest

 from feeds.loaders import FeedEntryItemLoader
 from feeds.spiders import FeedsXMLFeedSpider
