Skip to content

Commit

Permalink
Merge pull request #220 from PyFeeds/dev/nblock
Browse files Browse the repository at this point in the history
Spider for tinyletter.com and smaller fixes
  • Loading branch information
nblock committed May 18, 2020
2 parents ca077b7 + cb9abd6 commit bacb025
Show file tree
Hide file tree
Showing 20 changed files with 102 additions and 7 deletions.
6 changes: 3 additions & 3 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,8 @@ Feeds can be installed from PyPI using ``pip``:
$ pip install PyFeeds
You may also install the current development version:
You may also install the current development version. The master branch is
considered stable enough for daily use:

.. code-block:: bash
Expand Down Expand Up @@ -144,9 +145,8 @@ and `Lukas Anzinger <https://www.notinventedhere.org>`_.
License
-------

AGPL3, see `LICENSEFILE`_ for details.
AGPL3, see https://pyfeeds.readthedocs.io/en/latest/license.html for details.

.. _LICENSEFILE: LICENSE
.. _issue tracker: https://github.com/pyfeeds/pyfeeds/issues
.. _new issue: https://github.com/pyfeeds/pyfeeds/issues/new
.. _Scrapy: https://www.scrapy.org
Expand Down
1 change: 1 addition & 0 deletions docs/development.rst
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ scraping from there.
* :ref:`spider_puls4.com`
* :ref:`spider_python-patterns.guide`
* :ref:`spider_servustv.com`
* :ref:`spider_tinyletter.com`
* :ref:`spider_tuwien.ac.at`
* :ref:`spider_ubup.com`
* :ref:`spider_usenix.org`
Expand Down
3 changes: 2 additions & 1 deletion docs/get.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ Feeds can be installed from PyPI using ``pip``:
$ pip install PyFeeds
You may also install the current development version:
You may also install the current development version. The master branch is
considered stable enough for daily use:

.. code-block:: bash
Expand Down
25 changes: 25 additions & 0 deletions docs/spiders/tinyletter.com.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
.. _spider_tinyletter.com:

tinyletter.com
--------------
Latest articles from `tinyletter <https://tinyletter.com>`_ users.

Configuration
~~~~~~~~~~~~~
Add ``tinyletter.com`` to the list of spiders:

.. code-block:: ini
# List of spiders to run by default, one per line.
spiders =
tinyletter.com
At least one account is required. The account name is visible on the
subscription page, e.g. for https://tinyletter.com/dabeaz, the account name is
``dabeaz``.

.. code-block:: ini
[tinyletter.com]
accounts =
dabeaz
4 changes: 4 additions & 0 deletions feeds.cfg.dist
Original file line number Diff line number Diff line change
Expand Up @@ -175,3 +175,7 @@ useragent = feeds (+https://github.com/pyfeeds/pyfeeds)
# finance-and-economics
# special-report
# leaders

#[tinyletter.com]
#accounts =
# dabeaz
64 changes: 64 additions & 0 deletions feeds/spiders/tinyletter_com.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import scrapy

from feeds.loaders import FeedEntryItemLoader
from feeds.spiders import FeedsSpider
from feeds.utils import generate_feed_header


class TinyletterComSpider(FeedsSpider):
    """Scrape the latest newsletter issues of configured tinyletter.com accounts.

    One feed is generated per account name listed in the
    ``FEEDS_SPIDER_TINYLETTER_COM_ACCOUNTS`` setting (whitespace-separated).
    """

    name = "tinyletter.com"
    allowed_domains = ["tinyletter.com"]

    # ``name`` is resolvable here because a class body is an ordinary namespace.
    _base_url = f"https://{name}"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Per-account metadata filled in by parse(). These were previously
        # class-level dicts, which are shared between spider instances and
        # therefore leak state across instantiations; instance attributes
        # avoid that pitfall.
        self._titles = {}
        self._subtitles = {}
        self._links = {}
        # Overwritten in start_requests(); initialized here so that
        # feed_headers() cannot fail with AttributeError if it is called
        # before a crawl has started.
        self._accounts = set()

    def start_requests(self):
        """Request the archive page of every configured account."""
        self._accounts = self.settings.get("FEEDS_SPIDER_TINYLETTER_COM_ACCOUNTS", [])
        if self._accounts:
            # The setting is a whitespace-separated string of account names.
            self._accounts = set(self._accounts.split())
        else:
            self.logger.error("No accounts given!")
            return

        for account in self._accounts:
            yield scrapy.Request(
                f"{self._base_url}/{account}/archive?recs=1000",
                # The archive page is the feed index; never serve it stale.
                meta={"dont_cache": True, "account": account},
            )

    def feed_headers(self):
        """Yield one feed header per account, using metadata gathered by parse()."""
        for account in self._accounts:
            yield generate_feed_header(
                title=self._titles.get(account),
                subtitle=self._subtitles.get(account),
                link=self._links.get(account),
                icon=f"{self._base_url}/site/favicon.ico",
                logo=f"{self._base_url}/site/assets/images/brand-assets/TL_logo.svg",
                path=account,
            )

    def parse(self, response):
        """Extract feed metadata from an account's archive page and follow
        every linked newsletter issue."""
        account = response.meta["account"]
        self._titles[account] = response.css("title::text").get()
        self._subtitles[account] = response.css(
            "meta[property='og:description']::attr('content')"
        ).get()
        self._links[account] = f"{self._base_url}/{account}/archive"

        for u in response.css("ul.message-list a.message-link::attr('href')").getall():
            yield scrapy.Request(
                u, self.parse_letter, meta={"account": account},
            )

    def parse_letter(self, response):
        """Parse a single newsletter issue page into one feed entry."""
        account = response.meta["account"]
        il = FeedEntryItemLoader(response=response, base_url=self._links.get(account))
        il.add_value("path", account)
        il.add_value("link", response.url)
        il.add_css("title", "title::text")
        il.add_css("author_name", "div#message-heading div.by-line a::text")
        il.add_css("updated", "div#message-heading div.date::text")
        il.add_css("content_html", "div.message-body")
        yield il.load_item()
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
author="Florian Preinstorfer, Lukas Anzinger",
author_email="florian@nblock.org, lukas@lukasanzinger.at",
url="https://github.com/PyFeeds/PyFeeds",
packages=find_packages(),
packages=find_packages(exclude=["tests"]),
include_package_data=True,
install_requires=[
"Click>=6.6",
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
2 changes: 1 addition & 1 deletion feeds/tests/test_loaders.py → tests/test_loaders.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import pytest

from feeds.loaders import build_tree, flatten_tree, serialize_tree
from feeds.tests.utils import load_file, load_fixtures
from .utils import load_file, load_fixtures


@pytest.mark.parametrize("in_html_file,out_html_file", load_fixtures("flatten_tree"))
Expand Down
2 changes: 1 addition & 1 deletion feeds/tests/utils.py → tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ def load_file(filename):


def load_fixtures(test_name):
    """Return pairs of (input, expected-output) HTML fixture paths for *test_name*.

    Fixture files live under ``tests/fixtures/<test_name>/`` and are matched
    up by position after sorting, relying on identical base names for the
    ``.in.html`` and ``.out.html`` variants.
    """
    pattern = "tests/fixtures/{test_name}/*.{direction}.html"

    def matching(direction):
        # Sorting guarantees a deterministic, positionally aligned pairing.
        return sorted(glob(pattern.format(test_name=test_name, direction=direction)))

    return zip(matching("in"), matching("out"))

0 comments on commit bacb025

Please sign in to comment.