From 2b886865cfa1109b7f32b5758c78a319e4b2dcda Mon Sep 17 00:00:00 2001 From: aszlig Date: Fri, 29 Mar 2019 04:37:53 +0100 Subject: [PATCH] Merge pull request #57519 (systemd-confinement) Currently, if you want to properly chroot a systemd service, you can either use BindReadOnlyPaths=/nix/store or use a separate derivation which gathers the runtime closure of the service you want to chroot. The former is the easier method and there is also a method directly offered by systemd, called ProtectSystem, which still leaves the whole store accessible. The latter, however, is a bit more involved, because you need to bind-mount each store path of the runtime closure of the service you want to chroot. This can be achieved using pkgs.closureInfo and a small derivation that packs everything into a systemd unit, which can later be added to systemd.packages. However, this process is a bit tedious, so the changes here implement this in a more generic way. Now, if you want to chroot a systemd service, all you need to do is: { systemd.services.myservice = { description = "My Shiny Service"; wantedBy = [ "multi-user.target" ]; confinement.enable = true; serviceConfig.ExecStart = "${pkgs.myservice}/bin/myservice"; }; } If more than the dependencies of the ExecStart* and ExecStop* options (which, by the way, also cover script and {pre,post}Start) needs to be in the chroot, the additional paths can be specified using the confinement.packages option. By default (which uses the full-apivfs confinement mode), a user namespace is set up as well and /proc, /sys and /dev are mounted appropriately. In addition - and by default - a /bin/sh executable is provided, which is useful for most programs that use the system() C library call to execute commands via shell. Unfortunately, there are a few limitations at the moment. The first is that DynamicUser doesn't work in conjunction with tmpfs, because systemd seems to ignore the TemporaryFileSystem option if DynamicUser is enabled. 
I started implementing a workaround for this, but I decided not to include it as part of this pull request, because it needs a lot more testing to ensure it's consistent with the behaviour without DynamicUser. The second limitation/issue is that RootDirectoryStartOnly doesn't work right now, because it only affects the RootDirectory option and doesn't include/exclude the individual bind mounts or the tmpfs. A quirk we do have right now is that systemd tries to create a /usr directory within the chroot, which subsequently fails. Fortunately, this is just an ugly error and not a hard failure. The changes also come with a changelog entry for NixOS 19.03, which is why I asked the NixOS 19.03 stable maintainers to vote on whether to include it (I admit it's a bit late, just a few days before the official release; sorry for that): @samueldr: Via pull request comment[1]: +1 for backporting as this only enhances the feature set of nixos, and does not (at a glance) change existing behaviours. Via IRC: new feature: -1, tests +1, we're at zero, self-contained, with no global effects without actively using it, +1, I think it's good @lheckemann: Via pull request comment[2]: I'm neutral on backporting. On the one hand, as @samueldr says, this doesn't change any existing functionality. On the other hand, it's a new feature and we're well past the feature freeze, which AFAIU is intended so that new, potentially buggy features aren't introduced in the "stabilisation period". It is a cool feature though? :) A few other people on IRC had no objection to the late inclusion into NixOS 19.03 either: @edolstra: "I'm not against it" @Infinisil: "+1 from me as well" @grahamc: "IMO its up to the RMs" So that makes +1 from @samueldr, 0 from @lheckemann, 0 from @edolstra and +1 from @Infinisil (even though he's not a release manager) and no opposition from anyone, which is the reason why I'm merging this right now. 
I would also like to thank @Infinisil, @edolstra and @danbst for their reviews. [1]: https://github.com/NixOS/nixpkgs/pull/57519#issuecomment-477322127 [2]: https://github.com/NixOS/nixpkgs/pull/57519#issuecomment-477548395 (cherry picked from commit dcf40f7c24eec1160e6433b6644d3e2dd268e417) --- nixos/doc/manual/release-notes/rl-1903.xml | 11 + nixos/modules/module-list.nix | 1 + .../modules/security/systemd-confinement.nix | 199 ++++++++++++++++++ nixos/modules/system/boot/systemd-lib.nix | 9 +- nixos/tests/all-tests.nix | 1 + nixos/tests/systemd-confinement.nix | 168 +++++++++++++++ 6 files changed, 384 insertions(+), 5 deletions(-) create mode 100644 nixos/modules/security/systemd-confinement.nix create mode 100644 nixos/tests/systemd-confinement.nix diff --git a/nixos/doc/manual/release-notes/rl-1903.xml b/nixos/doc/manual/release-notes/rl-1903.xml index bbd3cf2e9db500..7c94f6e9473ead 100644 --- a/nixos/doc/manual/release-notes/rl-1903.xml +++ b/nixos/doc/manual/release-notes/rl-1903.xml @@ -68,6 +68,17 @@ for details. + <listitem> + <para> + There is now a set of <option>confinement</option> options for + <option>systemd.services</option>, which allows to restrict services + into a <citerefentry> + <refentrytitle>chroot</refentrytitle> + <manvolnum>2</manvolnum> + </citerefentry>ed environment that only contains the store paths from + the runtime closure of the service. + </para> + </listitem> diff --git a/nixos/modules/module-list.nix b/nixos/modules/module-list.nix index 0047530aa88524..40dd0cb524a742 100644 --- a/nixos/modules/module-list.nix +++ b/nixos/modules/module-list.nix @@ -170,6 +170,7 @@ ./security/rtkit.nix ./security/wrappers/default.nix ./security/sudo.nix + ./security/systemd-confinement.nix ./services/admin/oxidized.nix ./services/admin/salt/master.nix ./services/admin/salt/minion.nix diff --git a/nixos/modules/security/systemd-confinement.nix b/nixos/modules/security/systemd-confinement.nix new file mode 100644 index 00000000000000..cd4eb81dbe197e --- /dev/null +++ b/nixos/modules/security/systemd-confinement.nix @@ -0,0 +1,199 @@ +{ config, pkgs, lib, ... 
}: + +let + toplevelConfig = config; + inherit (lib) types; + inherit (import ../system/boot/systemd-lib.nix { + inherit config pkgs lib; + }) mkPathSafeName; +in { + options.systemd.services = lib.mkOption { + type = types.attrsOf (types.submodule ({ name, config, ... }: { + options.confinement.enable = lib.mkOption { + type = types.bool; + default = false; + description = '' + If set, all the required runtime store paths for this service are + bind-mounted into a tmpfs-based + chroot + 2 + . + ''; + }; + + options.confinement.fullUnit = lib.mkOption { + type = types.bool; + default = false; + description = '' + Whether to include the full closure of the systemd unit file into the + chroot, instead of just the dependencies for the executables. + + While it may be tempting to just enable this option to + make things work quickly, please be aware that this might add paths + to the closure of the chroot that you didn't anticipate. It's better + to use to explicitly add additional store paths to the + chroot. + ''; + }; + + options.confinement.packages = lib.mkOption { + type = types.listOf (types.either types.str types.package); + default = []; + description = let + mkScOption = optName: ""; + in '' + Additional packages or strings with context to add to the closure of + the chroot. By default, this includes all the packages from the + ${lib.concatMapStringsSep ", " mkScOption [ + "ExecReload" "ExecStartPost" "ExecStartPre" "ExecStop" + "ExecStopPost" + ]} and ${mkScOption "ExecStart"} options. If you want to have all the + dependencies of this systemd unit, you can use + . + + The store paths listed in are + not included in the closure as + well as paths from other options except those listed + above. 
+ ''; + }; + + options.confinement.binSh = lib.mkOption { + type = types.nullOr types.path; + default = toplevelConfig.environment.binsh; + defaultText = "config.environment.binsh"; + example = lib.literalExample "\${pkgs.dash}/bin/dash"; + description = '' + The program to make available as /bin/sh inside + the chroot. If this is set to null, no + /bin/sh is provided at all. + + This is useful for some applications, which for example use the + + system + 3 + library function to execute commands. + ''; + }; + + options.confinement.mode = lib.mkOption { + type = types.enum [ "full-apivfs" "chroot-only" ]; + default = "full-apivfs"; + description = '' + The value full-apivfs (the default) sets up + private /dev, /proc, /sys and /tmp file systems in a separate user + name space. + + If this is set to chroot-only, only the file + system name space is set up along with the call to + chroot + 2 + . + + This doesn't cover network namespaces and is solely for + file system level isolation. + ''; + }; + + config = let + rootName = "${mkPathSafeName name}-chroot"; + inherit (config.confinement) binSh fullUnit; + wantsAPIVFS = lib.mkDefault (config.confinement.mode == "full-apivfs"); + in lib.mkIf config.confinement.enable { + serviceConfig = { + RootDirectory = pkgs.runCommand rootName {} "mkdir \"$out\""; + TemporaryFileSystem = "/"; + PrivateMounts = lib.mkDefault true; + + # https://github.com/NixOS/nixpkgs/issues/14645 is a future attempt + # to change some of these to default to true. + # + # If we run in chroot-only mode, having something like PrivateDevices + # set to true by default will mount /dev within the chroot, whereas + # with "chroot-only" it's expected that there are no /dev, /proc and + # /sys file systems available. + # + # However, if this suddenly becomes true, the attack surface will + # increase, so let's explicitly set these options to true/false + # depending on the mode. 
+ MountAPIVFS = wantsAPIVFS; + PrivateDevices = wantsAPIVFS; + PrivateTmp = wantsAPIVFS; + PrivateUsers = wantsAPIVFS; + ProtectControlGroups = wantsAPIVFS; + ProtectKernelModules = wantsAPIVFS; + ProtectKernelTunables = wantsAPIVFS; + }; + confinement.packages = let + execOpts = [ + "ExecReload" "ExecStart" "ExecStartPost" "ExecStartPre" "ExecStop" + "ExecStopPost" + ]; + execPkgs = lib.concatMap (opt: let + isSet = config.serviceConfig ? ${opt}; + in lib.optional isSet config.serviceConfig.${opt}) execOpts; + unitAttrs = toplevelConfig.systemd.units."${name}.service"; + allPkgs = lib.singleton (builtins.toJSON unitAttrs); + unitPkgs = if fullUnit then allPkgs else execPkgs; + in unitPkgs ++ lib.optional (binSh != null) binSh; + }; + })); + }; + + config.assertions = lib.concatLists (lib.mapAttrsToList (name: cfg: let + whatOpt = optName: "The 'serviceConfig' option '${optName}' for" + + " service '${name}' is enabled in conjunction with" + + " 'confinement.enable'"; + in lib.optionals cfg.confinement.enable [ + { assertion = !cfg.serviceConfig.RootDirectoryStartOnly or false; + message = "${whatOpt "RootDirectoryStartOnly"}, but right now systemd" + + " doesn't support restricting bind-mounts to 'ExecStart'." + + " Please either define a separate service or find a way to run" + + " commands other than ExecStart within the chroot."; + } + { assertion = !cfg.serviceConfig.DynamicUser or false; + message = "${whatOpt "DynamicUser"}. 
Please create a dedicated user via" + + " the 'users.users' option instead as this combination is" + + " currently not supported."; + } + ]) config.systemd.services); + + config.systemd.packages = lib.concatLists (lib.mapAttrsToList (name: cfg: let + rootPaths = let + contents = lib.concatStringsSep "\n" cfg.confinement.packages; + in pkgs.writeText "${mkPathSafeName name}-string-contexts.txt" contents; + + chrootPaths = pkgs.runCommand "${mkPathSafeName name}-chroot-paths" { + closureInfo = pkgs.closureInfo { inherit rootPaths; }; + serviceName = "${name}.service"; + excludedPath = rootPaths; + } '' + mkdir -p "$out/lib/systemd/system" + serviceFile="$out/lib/systemd/system/$serviceName" + + echo '[Service]' > "$serviceFile" + + # /bin/sh is special here, because the option value could contain a + # symlink and we need to properly resolve it. + ${lib.optionalString (cfg.confinement.binSh != null) '' + binsh=${lib.escapeShellArg cfg.confinement.binSh} + realprog="$(readlink -e "$binsh")" + echo "BindReadOnlyPaths=$realprog:/bin/sh" >> "$serviceFile" + ''} + + while read storePath; do + if [ -L "$storePath" ]; then + # Currently, systemd can't cope with symlinks in Bind(ReadOnly)Paths, + # so let's just bind-mount the target to that location. 
+ echo "BindReadOnlyPaths=$(readlink -e "$storePath"):$storePath" + elif [ "$storePath" != "$excludedPath" ]; then + echo "BindReadOnlyPaths=$storePath" + fi + done < "$closureInfo/store-paths" >> "$serviceFile" + ''; + in lib.optional cfg.confinement.enable chrootPaths) config.systemd.services); +} diff --git a/nixos/modules/system/boot/systemd-lib.nix b/nixos/modules/system/boot/systemd-lib.nix index 68a40377ee1332..28ad4f121bbee3 100644 --- a/nixos/modules/system/boot/systemd-lib.nix +++ b/nixos/modules/system/boot/systemd-lib.nix @@ -9,12 +9,11 @@ in rec { shellEscape = s: (replaceChars [ "\\" ] [ "\\\\" ] s); + mkPathSafeName = lib.replaceChars ["@" ":" "\\" "[" "]"] ["-" "-" "-" "" ""]; + makeUnit = name: unit: - let - pathSafeName = lib.replaceChars ["@" ":" "\\" "[" "]"] ["-" "-" "-" "" ""] name; - in if unit.enable then - pkgs.runCommand "unit-${pathSafeName}" + pkgs.runCommand "unit-${mkPathSafeName name}" { preferLocalBuild = true; allowSubstitutes = false; inherit (unit) text; @@ -24,7 +23,7 @@ in rec { echo -n "$text" > $out/${shellEscape name} '' else - pkgs.runCommand "unit-${pathSafeName}-disabled" + pkgs.runCommand "unit-${mkPathSafeName name}-disabled" { preferLocalBuild = true; allowSubstitutes = false; } diff --git a/nixos/tests/all-tests.nix b/nixos/tests/all-tests.nix index fa1c0d44655686..f5f53a8f769fa7 100644 --- a/nixos/tests/all-tests.nix +++ b/nixos/tests/all-tests.nix @@ -217,6 +217,7 @@ in switchTest = handleTest ./switch-test.nix {}; syncthing-relay = handleTest ./syncthing-relay.nix {}; systemd = handleTest ./systemd.nix {}; + systemd-confinement = handleTest ./systemd-confinement.nix {}; taskserver = handleTest ./taskserver.nix {}; telegraf = handleTest ./telegraf.nix {}; tomcat = handleTest ./tomcat.nix {}; diff --git a/nixos/tests/systemd-confinement.nix b/nixos/tests/systemd-confinement.nix new file mode 100644 index 00000000000000..b7b10fb36aac44 --- /dev/null +++ b/nixos/tests/systemd-confinement.nix @@ -0,0 +1,168 @@ +import 
./make-test.nix { + name = "systemd-confinement"; + + machine = { pkgs, lib, ... }: let + testServer = pkgs.writeScript "testserver.sh" '' + #!${pkgs.stdenv.shell} + export PATH=${lib.escapeShellArg "${pkgs.coreutils}/bin"} + ${lib.escapeShellArg pkgs.stdenv.shell} 2>&1 + echo "exit-status:$?" + ''; + + testClient = pkgs.writeScriptBin "chroot-exec" '' + #!${pkgs.stdenv.shell} -e + output="$(echo "$@" | nc -NU "/run/test$(< /teststep).sock")" + ret="$(echo "$output" | sed -nre '$s/^exit-status:([0-9]+)$/\1/p')" + echo "$output" | head -n -1 + exit "''${ret:-1}" + ''; + + mkTestStep = num: { description, config ? {}, testScript }: { + systemd.sockets."test${toString num}" = { + description = "Socket for Test Service ${toString num}"; + wantedBy = [ "sockets.target" ]; + socketConfig.ListenStream = "/run/test${toString num}.sock"; + socketConfig.Accept = true; + }; + + systemd.services."test${toString num}@" = { + description = "Confined Test Service ${toString num}"; + confinement = (config.confinement or {}) // { enable = true; }; + serviceConfig = (config.serviceConfig or {}) // { + ExecStart = testServer; + StandardInput = "socket"; + }; + } // removeAttrs config [ "confinement" "serviceConfig" ]; + + __testSteps = lib.mkOrder num '' + subtest '${lib.escape ["\\" "'"] description}', sub { + $machine->succeed('echo ${toString num} > /teststep'); + ${testScript} + }; + ''; + }; + + in { + imports = lib.imap1 mkTestStep [ + { description = "chroot-only confinement"; + config.confinement.mode = "chroot-only"; + testScript = '' + $machine->succeed( + 'test "$(chroot-exec ls -1 / | paste -sd,)" = bin,nix', + 'test "$(chroot-exec id -u)" = 0', + 'chroot-exec chown 65534 /bin', + ); + ''; + } + { description = "full confinement with APIVFS"; + testScript = '' + $machine->fail( + 'chroot-exec ls -l /etc', + 'chroot-exec ls -l /run', + 'chroot-exec chown 65534 /bin', + ); + $machine->succeed( + 'test "$(chroot-exec id -u)" = 0', + 'chroot-exec chown 0 /bin', + ); + ''; + } 
+ { description = "check existence of bind-mounted /etc"; + config.serviceConfig.BindReadOnlyPaths = [ "/etc" ]; + testScript = '' + $machine->succeed('test -n "$(chroot-exec cat /etc/passwd)"'); + ''; + } + { description = "check if User/Group really runs as non-root"; + config.serviceConfig.User = "chroot-testuser"; + config.serviceConfig.Group = "chroot-testgroup"; + testScript = '' + $machine->succeed('chroot-exec ls -l /dev'); + $machine->succeed('test "$(chroot-exec id -u)" != 0'); + $machine->fail('chroot-exec touch /bin/test'); + ''; + } + (let + symlink = pkgs.runCommand "symlink" { + target = pkgs.writeText "symlink-target" "got me\n"; + } "ln -s \"$target\" \"$out\""; + in { + description = "check if symlinks are properly bind-mounted"; + config.confinement.packages = lib.singleton symlink; + testScript = '' + $machine->fail('chroot-exec test -e /etc'); + $machine->succeed('chroot-exec cat ${symlink} >&2'); + $machine->succeed('test "$(chroot-exec cat ${symlink})" = "got me"'); + ''; + }) + { description = "check if StateDirectory works"; + config.serviceConfig.User = "chroot-testuser"; + config.serviceConfig.Group = "chroot-testgroup"; + config.serviceConfig.StateDirectory = "testme"; + testScript = '' + $machine->succeed('chroot-exec touch /tmp/canary'); + $machine->succeed('chroot-exec "echo works > /var/lib/testme/foo"'); + $machine->succeed('test "$(< /var/lib/testme/foo)" = works'); + $machine->succeed('test ! -e /tmp/canary'); + ''; + } + { description = "check if /bin/sh works"; + testScript = '' + $machine->succeed( + 'chroot-exec test -e /bin/sh', + 'test "$(chroot-exec \'/bin/sh -c "echo bar"\')" = bar', + ); + ''; + } + { description = "check if suppressing /bin/sh works"; + config.confinement.binSh = null; + testScript = '' + $machine->succeed( + 'chroot-exec test ! 
-e /bin/sh', + 'test "$(chroot-exec \'/bin/sh -c "echo foo"\')" != foo', + ); + ''; + } + { description = "check if we can set /bin/sh to something different"; + config.confinement.binSh = "${pkgs.hello}/bin/hello"; + testScript = '' + $machine->succeed( + 'chroot-exec test -e /bin/sh', + 'test "$(chroot-exec /bin/sh -g foo)" = foo', + ); + ''; + } + { description = "check if only Exec* dependencies are included"; + config.environment.FOOBAR = pkgs.writeText "foobar" "eek\n"; + testScript = '' + $machine->succeed('test "$(chroot-exec \'cat "$FOOBAR"\')" != eek'); + ''; + } + { description = "check if all unit dependencies are included"; + config.environment.FOOBAR = pkgs.writeText "foobar" "eek\n"; + config.confinement.fullUnit = true; + testScript = '' + $machine->succeed('test "$(chroot-exec \'cat "$FOOBAR"\')" = eek'); + ''; + } + ]; + + options.__testSteps = lib.mkOption { + type = lib.types.lines; + description = "All of the test steps combined as a single script."; + }; + + config.environment.systemPackages = lib.singleton testClient; + + config.users.groups.chroot-testgroup = {}; + config.users.users.chroot-testuser = { + description = "Chroot Test User"; + group = "chroot-testgroup"; + }; + }; + + testScript = { nodes, ... }: '' + $machine->waitForUnit('multi-user.target'); + ${nodes.machine.config.__testSteps} + ''; +}