Commit f6d0c99

Merge pull request #118 from Lukas0907/orf

ORF spider and more style checks

Lukas0907 committed Jun 15, 2018
2 parents 8cdec1f + 0f8256c
Showing 15 changed files with 318 additions and 15 deletions.
3 changes: 2 additions & 1 deletion .flake8
@@ -1,2 +1,3 @@
 [flake8]
-max-line-length = 89
+max-line-length = 88
+ignore = E203,W503
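
The two ignored codes are the usual black-compatibility exceptions. A small sketch (variable names invented for illustration) of code as black formats it, which would otherwise trip both checks:

    # E203: black puts spaces around ":" in slices with complex bounds.
    chunk = items[start : start + size]

    # W503: black breaks lines *before* binary operators.
    total = (
        first_value
        + second_value
    )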
7 changes: 4 additions & 3 deletions .isort.cfg
@@ -1,6 +1,7 @@
 [settings]
-line_length = 88
 known_first_party = feeds
 multi_line_output = 3
-combine_as_imports = true
-include_trailing_comma = true
+include_trailing_comma = True
+force_grid_wrap = 0
+line_length = 88
+not_skip = __init__.py
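
These settings match the isort profile commonly paired with black. For illustration, an import from this codebase wrapped the way multi_line_output = 3 (vertical hanging indent) and include_trailing_comma = True dictate:

    from feeds.loaders import (
        cleanup_html,
        make_links_absolute,
        parse_datetime,
    )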
2 changes: 2 additions & 0 deletions README.rst
@@ -40,6 +40,8 @@ Feeds is currently able to create Atom feeds for the following sites:
   of Weekly Editions
 * `Oberösterreichische Nachrichten <https://www.nachrichten.at>`_:
   Newest articles
+* `ORF ON <http://orf.at>`_: Newest articles of various ORF ON (news, FM4, science,
+  etc.) platforms.
 * `ORF Ö1 <http://oe1.orf.at>`_: Newest episodes of radio shows
 * `ORF TVthek <http://tvthek.orf.at>`_: Newest episodes of TV shows
 * `profil <http://www.profil.at>`_: Newest articles
40 changes: 40 additions & 0 deletions docs/spiders/orf.at.rst
@@ -0,0 +1,40 @@
+.. _spider_orf.at:
+
+orf.at
+------
+Newest articles from `ORF ON <http://www.orf.at>`_.
+
+Configuration
+~~~~~~~~~~~~~
+Add ``orf.at`` to the list of spiders:
+
+.. code-block:: ini
+
+   # List of spiders to run by default, one per line.
+   spiders =
+     orf.at
+
+orf.at supports different channels via the ``channels`` parameter (one per line). If
+no channel is given, ``news`` is used.
+
+.. code-block:: ini
+
+   [orf.at]
+   channels =
+     burgenland
+     fm4
+     help
+     kaernten
+     news
+     noe
+     oe3
+     oesterreich
+     ooe
+     religion
+     salzburg
+     science
+     sport
+     steiermark
+     tirol
+     vorarlberg
+     wien
20 changes: 20 additions & 0 deletions feeds.cfg.dist
@@ -79,3 +79,23 @@ useragent = feeds (+https://github.com/nblock/feeds)
 ## and password.
 # username =
 # password =
+
+#[orf.at]
+#channels =
+# news
+# fm4
+# science
+# help
+# sport
+# oe3
+# oesterreich
+# burgenland
+# wien
+# noe
+# ooe
+# salzburg
+# steiermark
+# kaernten
+# vorarlberg
+# tirol
+# religion
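
The channel names correspond to the various ORF ON platforms. A minimal sketch of how a spider might derive its start URLs from this option, assuming each channel maps to a subdomain of orf.at (e.g. fm4.orf.at); the actual spider code is not part of the excerpt shown here:

    # Hypothetical: build one start URL per configured channel.
    channels = ["news", "fm4", "science"]
    start_urls = ["http://{}.orf.at/".format(channel) for channel in channels]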
2 changes: 1 addition & 1 deletion feeds/cache.py
@@ -16,7 +16,7 @@ def cleanup_cache(cache_dir, max_age):
 
     logger.debug("Cleaning cache entries from {} ...".format(cache_dir))
 
-    for root, dirs, files in os.walk(cache_dir, topdown=False):
+    for root, _dirs, files in os.walk(cache_dir, topdown=False):
         if "pickled_meta" in files:
             meta = _read_meta(root)
             timestamp = datetime.fromtimestamp(meta["timestamp"])
2 changes: 0 additions & 2 deletions feeds/exporters.py
@@ -9,9 +9,7 @@
 
 
 class AtomExporter(BaseItemExporter):
-
     class AtomFeed(object):
-
         def __init__(self, exporter, link_self=None):
             self._exporter = exporter
             self._link_self = link_self
1 change: 0 additions & 1 deletion feeds/extensions.py
@@ -2,7 +2,6 @@
 
 
 class SpiderSettings:
-
     @classmethod
     def from_crawler(cls, crawler):
         ext = cls()
23 changes: 21 additions & 2 deletions feeds/loaders.py
@@ -1,6 +1,7 @@
 import html
 import os
 import re
+from copy import deepcopy
 from datetime import datetime
 
 import dateparser
@@ -18,7 +19,9 @@
 
 def parse_datetime(text, loader_context):
     if isinstance(text, datetime):
-        return delorean.Delorean(text, timezone=loader_context.get("timezone", "UTC"))
+        return (
+            delorean.Delorean(text, timezone=loader_context.get("timezone", "UTC"))
+        ).shift("UTC")
     elif isinstance(text, str):
         try:
             return delorean.parse(
@@ -30,7 +33,7 @@ def parse_datetime(text, loader_context):
         except ValueError:
             return delorean.Delorean(
                 dateparser.parse(text), timezone=loader_context.get("timezone", "UTC")
-            )
+            ).shift("UTC")
     else:
         return text
 
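The added .shift("UTC") normalizes every parsed timestamp to UTC, whatever the source timezone. A small sketch of the effect (the date is illustrative):

    from datetime import datetime

    import delorean

    d = delorean.Delorean(datetime(2018, 6, 15, 12, 0), timezone="Europe/Vienna")
    print(d.shift("UTC").datetime)  # 2018-06-15 10:00:00+00:00 (CEST is UTC+2)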
@@ -68,6 +71,22 @@ def make_links_absolute(tree):
 
 
 def cleanup_html(tree, loader_context):
+    for elem_child, elem_parent in loader_context.get("child_to_parent", {}).items():
+        sel_child = CSSSelector(elem_child)
+        sel_parent = CSSSelector(elem_parent)
+        for e_parent in sel_parent(tree):
+            e_children = sel_child(e_parent)
+            if e_children:
+                e_parent.getparent().replace(e_parent, e_children[0])
+
+    for elem_sel, elem_new in loader_context.get("replace_elems", {}).items():
+        elem_new = lxml.html.fragment_fromstring(elem_new)
+        selector = CSSSelector(elem_sel)
+        for elem in selector(tree):
+            # The new element may serve as a replacement more than once, but
+            # every node in the tree must be a distinct element, hence deepcopy.
+            elem.getparent().replace(elem, deepcopy(elem_new))
+
     # Remove tags.
     for elem_sel in loader_context.get("remove_elems", []):
         selector = CSSSelector(elem_sel)
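A minimal sketch of what the two new loader-context options do, using lxml directly; the HTML snippet and selector values are invented for illustration:

    import lxml.html
    from lxml.cssselect import CSSSelector

    tree = lxml.html.fragment_fromstring(
        '<div><figure class="wrap"><img src="a.png"></figure><hr></div>'
    )

    # child_to_parent {"img": "figure.wrap"}: replace the <figure> wrapper
    # with the <img> child it contains.
    for e_parent in CSSSelector("figure.wrap")(tree):
        e_children = CSSSelector("img")(e_parent)
        if e_children:
            e_parent.getparent().replace(e_parent, e_children[0])

    # replace_elems {"hr": "<p>* * *</p>"}: swap every <hr> for a new fragment.
    new_elem = lxml.html.fragment_fromstring("<p>* * *</p>")
    for elem in CSSSelector("hr")(tree):
        elem.getparent().replace(elem, new_elem)

    print(lxml.html.tostring(tree))
    # b'<div><img src="a.png"><p>* * *</p></div>'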
1 change: 0 additions & 1 deletion feeds/middlewares.py
@@ -6,7 +6,6 @@
 
 
 class FeedsHttpErrorMiddleware:
-
     @classmethod
     def from_crawler(cls, crawler):
         return cls()
1 change: 0 additions & 1 deletion feeds/pipelines.py
@@ -32,7 +32,6 @@ class AtomCheckRequiredFieldsPipeline(object):
     """Check presence of required fields."""
 
     def process_item(self, item, spider):
-
         def raise_if_missing(name, item):
             if name not in item:
                 raise DropItem(
1 change: 0 additions & 1 deletion feeds/spiders/__init__.py
@@ -5,7 +5,6 @@
 
 
 class FeedsSpider(Spider):
-
     def generate_feed_header(
         self,
         title=None,
