Skip to content

Commit

Permalink
Merge pull request #154 from Lukas0907/indiehackers
Browse files Browse the repository at this point in the history
Add spider for indiehackers.com.
  • Loading branch information
Lukas0907 committed Aug 20, 2018
2 parents 19b1f78 + 48930e1 commit 1e7cd92
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 0 deletions.
16 changes: 16 additions & 0 deletions docs/spiders/indiehackers.com.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
.. _spider_indiehackers.com:

indiehackers.com
----------------
Newest interviews on `Indie Hackers <https://indiehackers.com>`_.

Configuration
~~~~~~~~~~~~~
Add ``indiehackers.com`` to the list of spiders:

.. code-block:: ini

   # List of spiders to run by default, one per line.
   spiders =
     indiehackers.com
53 changes: 53 additions & 0 deletions feeds/spiders/indiehackers_com.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import scrapy

from feeds.loaders import FeedEntryItemLoader
from feeds.spiders import FeedsSpider


class IndieHackersComSpider(FeedsSpider):
    """Feed spider for the interviews published on indiehackers.com.

    Crawling starts at the first page of the interview listing; every
    interview linked from there is fetched and turned into one feed entry.
    """

    name = "indiehackers.com"
    allowed_domains = [name]
    start_urls = ["https://www.indiehackers.com/interviews/page/1"]

    _title = "Indie Hackers"

    def parse(self, response):
        """Handle the interview listing page.

        Stores the absolute URLs of the 192x192 and 16x16 ``rel="icon"``
        links on ``self._logo`` and ``self._icon``, then yields one
        ``scrapy.Request`` per interview. Each request is handled by
        ``_parse_interview`` and carries the stripped date text of the
        interview in ``meta["updated"]``.
        """
        # A single selector returns links and dates in one flat list.
        # NOTE(review): the pairing below assumes the page yields exactly one
        # link followed by one date per interview -- confirm against the site.
        link_and_date_values = response.css(
            ".interview__link::attr(href), .interview__date::text"
        ).extract()

        logo_href = response.css(
            'link[rel="icon"][sizes="192x192"]::attr(href)'
        ).extract_first()
        self._logo = response.urljoin(logo_href)

        icon_href = response.css(
            'link[rel="icon"][sizes="16x16"]::attr(href)'
        ).extract_first()
        self._icon = response.urljoin(icon_href)

        # Zipping one iterator with itself consumes the flat list two items
        # at a time: (link, date), (link, date), ... A trailing unpaired
        # value is dropped.
        values = iter(link_and_date_values)
        for href, published in zip(values, values):
            yield scrapy.Request(
                url=response.urljoin(href),
                callback=self._parse_interview,
                meta={"updated": published.strip()},
            )

    def _parse_interview(self, response):
        """Build the feed entry for a single interview page.

        Yields the item loaded by a ``FeedEntryItemLoader`` populated with
        the page URL, the ``h1`` text as title, the author name from the
        header, the ``.interview-body`` element as HTML content and the
        ``updated`` value passed along in ``response.meta``.
        """
        # Remove the last two h2s and all paragraphs below. The selector pair
        # only targets the *last* h2, hence it is listed twice (presumably the
        # loader applies the removals one after another -- TODO confirm).
        trailing_section = [
            ".interview-body > h2:last-of-type ~ p",
            ".interview-body > h2:last-of-type",
        ]
        unwanted = [".shareable-quote", ".share-bar"] + trailing_section * 2

        loader = FeedEntryItemLoader(
            response=response,
            base_url="https://" + self.name,
            remove_elems=unwanted,
        )
        loader.add_value("link", response.url)
        loader.add_css("title", "h1::text")
        loader.add_css("author_name", "header .user-link__name::text")
        loader.add_css("content_html", ".interview-body")
        loader.add_value("updated", response.meta["updated"])
        yield loader.load_item()

0 comments on commit 1e7cd92

Please sign in to comment.