diff --git a/pkgs/development/python-modules/trafilatura/default.nix b/pkgs/development/python-modules/trafilatura/default.nix
new file mode 100644
index 000000000000000..5bdbfe280310d90
--- /dev/null
+++ b/pkgs/development/python-modules/trafilatura/default.nix
@@ -0,0 +1,65 @@
+{ lib
+, buildPythonPackage
+, fetchPypi
+, pytestCheckHook
+, pythonOlder
+, certifi
+, charset-normalizer
+, courlan
+, htmldate
+, justext
+, lxml
+, urllib3
+}:
+
+buildPythonPackage rec {
+  pname = "trafilatura";
+  version = "1.6.1";
+  format = "setuptools";
+
+  disabled = pythonOlder "3.6";
+
+  src = fetchPypi {
+    inherit pname version;
+    hash = "sha256-p3krA31iTQSrBfzOVWz+CLdx38jB2xSUx1CHlTDJoww=";
+  };
+
+  propagatedBuildInputs = [
+    certifi
+    charset-normalizer
+    courlan
+    htmldate
+    justext
+    lxml
+    urllib3
+  ];
+
+  nativeCheckInputs = [ pytestCheckHook ];
+
+  # Disable the tests that require an internet connection.
+  disabledTests = [
+    "test_download"
+    "test_fetch"
+    "test_redirection"
+    "test_meta_redirections"
+    "test_crawl_page"
+    "test_whole"
+  ];
+
+  # Drop the trafilatura_gui entry from setup.py: the GUI CLI is not supported in this packaging.
+  # Make tests/cli_tests.py invoke the installed binary at $out/bin/trafilatura instead of a bare `trafilatura`.
+  postPatch = ''
+    substituteInPlace setup.py --replace '"trafilatura_gui=trafilatura.gui:main",' ""
+    substituteInPlace tests/cli_tests.py --replace "trafilatura_bin = 'trafilatura'" "trafilatura_bin = '$out/bin/trafilatura'"
+  '';
+
+  pythonImportsCheck = [ "trafilatura" ];
+
+  meta = with lib; {
+    description = "Python package and command-line tool designed to gather text on the Web";
+    homepage = "https://trafilatura.readthedocs.io";
+    changelog = "https://github.com/adbar/trafilatura/blob/v${version}/HISTORY.md";
+    license = licenses.gpl3Plus;
+    maintainers = with maintainers; [ joopitz ];
+  };
+}
diff --git a/pkgs/top-level/python-packages.nix b/pkgs/top-level/python-packages.nix
index 2e77ec47478d4c9..3d260dfeb7db0e6 100644
--- a/pkgs/top-level/python-packages.nix
+++ b/pkgs/top-level/python-packages.nix
@@ -12695,6 +12695,8 @@ self: super: with self; {
 
   trackpy = callPackage ../development/python-modules/trackpy { };
 
+  trafilatura = callPackage ../development/python-modules/trafilatura { };
+
   trailrunner = callPackage ../development/python-modules/trailrunner {};
 
   trainer = callPackage ../development/python-modules/trainer {};