diff --git a/pkgs/development/python-modules/trafilatura/default.nix b/pkgs/development/python-modules/trafilatura/default.nix
new file mode 100644
index 000000000000000..57042627c968234
--- /dev/null
+++ b/pkgs/development/python-modules/trafilatura/default.nix
@@ -0,0 +1,67 @@
+{ lib
+, buildPythonPackage
+, fetchPypi
+, pytestCheckHook
+, pythonOlder
+, certifi
+, charset-normalizer
+, courlan
+, htmldate
+, justext
+, lxml
+, urllib3
+}:
+
+buildPythonPackage rec {
+  pname = "trafilatura";
+  version = "1.6.3";
+  format = "setuptools";
+
+  disabled = pythonOlder "3.6"; # package is disabled on Python interpreters older than 3.6
+
+  src = fetchPypi {
+    inherit pname version;
+    hash = "sha256-Zx3W4AAOEBxLzo1w9ECLy3n8vyJ17iVZHv4z4sihYA0=";
+  };
+
+  propagatedBuildInputs = [
+    certifi
+    charset-normalizer
+    courlan
+    htmldate
+    justext
+    lxml
+    urllib3
+  ];
+
+  nativeCheckInputs = [ pytestCheckHook ];
+
+  # Skip the tests that require an internet connection.
+  disabledTests = [
+    "test_download"
+    "test_fetch"
+    "test_redirection"
+    "test_meta_redirections"
+    "test_crawl_page"
+    "test_whole"
+    "test_probing"
+    "test_cli_pipeline"
+  ];
+
+  # Remove the trafilatura_gui entry point from setup.py: the GUI CLI is not supported in this packaging.
+  # Make tests/cli_tests.py call the installed binary at $out/bin/trafilatura instead of the bare 'trafilatura' name.
+  postPatch = ''
+    substituteInPlace setup.py --replace '"trafilatura_gui=trafilatura.gui:main",' ""
+    substituteInPlace tests/cli_tests.py --replace "trafilatura_bin = 'trafilatura'" "trafilatura_bin = '$out/bin/trafilatura'"
+  '';
+
+  pythonImportsCheck = [ "trafilatura" ]; # modules listed here are checked for importability
+
+  meta = with lib; {
+    description = "Python package and command-line tool designed to gather text on the Web";
+    homepage = "https://trafilatura.readthedocs.io";
+    changelog = "https://github.com/adbar/trafilatura/blob/v${version}/HISTORY.md";
+    license = licenses.gpl3Plus;
+    maintainers = with maintainers; [ jokatzke ];
+  };
+}
diff --git a/pkgs/top-level/python-packages.nix b/pkgs/top-level/python-packages.nix
index 64a49324c6b61d4..e8487e19f665f27 100644
--- a/pkgs/top-level/python-packages.nix
+++ b/pkgs/top-level/python-packages.nix
@@ -14429,6 +14429,8 @@ self: super: with self; {
 
   trackpy = callPackage ../development/python-modules/trackpy { };
 
+  trafilatura = callPackage ../development/python-modules/trafilatura { };
+
   trailrunner = callPackage ../development/python-modules/trailrunner {};
 
   trainer = callPackage ../development/python-modules/trainer {};