diff --git a/nixos/modules/virtualisation/containers.nix b/nixos/modules/virtualisation/containers.nix index f570813805cfa2..1410a14817040e 100644 --- a/nixos/modules/virtualisation/containers.nix +++ b/nixos/modules/virtualisation/containers.nix @@ -129,6 +129,9 @@ let --setenv HOST_ADDRESS6="$HOST_ADDRESS6" \ --setenv LOCAL_ADDRESS6="$LOCAL_ADDRESS6" \ --setenv PATH="$PATH" \ + ${if cfg.additionalCapabilities != null then + ''--capability="${concatStringsSep " " cfg.additionalCapabilities}"'' else "" + } \ ${containerInit cfg} "''${SYSTEM_PATH:-/nix/var/nix/profiles/system}/init" ''; @@ -205,6 +208,41 @@ let '' ); + serviceDirectives = cfg: { + ExecReload = pkgs.writeScript "reload-container" + '' + #! ${pkgs.stdenv.shell} -e + ${pkgs.nixos-container}/bin/nixos-container run "$INSTANCE" -- \ + bash --login -c "''${SYSTEM_PATH:-/nix/var/nix/profiles/system}/bin/switch-to-configuration test" + ''; + + SyslogIdentifier = "container %i"; + + EnvironmentFile = "-/etc/containers/%i.conf"; + + Type = "notify"; + + # Note that on reboot, systemd-nspawn returns 133, so this + # unit will be restarted. On poweroff, it returns 0, so the + # unit won't be restarted. + RestartForceExitStatus = "133"; + SuccessExitStatus = "133"; + + Restart = "on-failure"; + + # Hack: we don't want to kill systemd-nspawn, since we call + # "machinectl poweroff" in preStop to shut down the + # container cleanly. But systemd requires sending a signal + # (at least if we want remaining processes to be killed + # after the timeout). So send an ignored signal. + KillMode = "mixed"; + KillSignal = "WINCH"; + + DevicePolicy = "closed"; + DeviceAllow = map (d: "${d.node} ${d.modifier}") cfg.allowedDevices; + }; + + system = config.nixpkgs.system; bindMountOpts = { name, config, ... }: { @@ -235,6 +273,27 @@ let }; + allowedDeviceOpts = { name, config, ... }: { + options = { + node = mkOption { + example = "/dev/net/tun"; + type = types.str; + description = "Path to device node"; + }; + modifier = mkOption { + example = "rw"; + type = types.str; + description = '' + Device node access modifier. Takes a combination + r (read), w (write), and + m (mknod). See the + systemd.resource-control(5) man page for more + information.''; + }; + }; + }; + + mkBindFlag = d: let flagPrefix = if d.isReadOnly then " --bind-ro=" else " --bind="; mountstr = if d.hostPath != null then "${d.hostPath}:${d.mountPoint}" else "${d.mountPoint}"; @@ -302,6 +361,8 @@ let dummyConfig = { extraVeths = {}; + additionalCapabilities = []; + allowedDevices = []; hostAddress = null; hostAddress6 = null; localAddress = null; @@ -368,6 +429,26 @@ in ''; }; + additionalCapabilities = mkOption { + type = types.listOf types.str; + default = []; + example = [ "CAP_NET_ADMIN" "CAP_MKNOD" ]; + description = '' + Grant additional capabilities to the container. See the + capabilities(7) and systemd-nspawn(1) man pages for more + information. + ''; + }; + enableTun = mkOption { + type = types.bool; + default = false; + description = '' + Allows the container to create and setup tunnel interfaces + by granting the NET_ADMIN capability and + enabling access to /dev/net/tun. + ''; + }; + privateNetwork = mkOption { type = types.bool; default = false; @@ -422,6 +503,16 @@ in ''; }; + allowedDevices = mkOption { + type = types.listOf types.optionSet; + options = [ allowedDeviceOpts ]; + default = []; + example = [ { node = "/dev/net/tun"; modifier = "rw"; } ]; + description = '' + A list of device nodes to which the containers has access to. + ''; + }; + } // networkOptions; config = mkMerge @@ -488,59 +579,39 @@ in restartIfChanged = false; - serviceConfig = { - ExecReload = pkgs.writeScript "reload-container" - '' - #! ${pkgs.stdenv.shell} -e - ${pkgs.nixos-container}/bin/nixos-container run "$INSTANCE" -- \ - bash --login -c "''${SYSTEM_PATH:-/nix/var/nix/profiles/system}/bin/switch-to-configuration test" - ''; - - SyslogIdentifier = "container %i"; - - EnvironmentFile = "-/etc/containers/%i.conf"; - - Type = "notify"; - - # Note that on reboot, systemd-nspawn returns 133, so this - # unit will be restarted. On poweroff, it returns 0, so the - # unit won't be restarted. - RestartForceExitStatus = "133"; - SuccessExitStatus = "133"; - - Restart = "on-failure"; - - # Hack: we don't want to kill systemd-nspawn, since we call - # "machinectl poweroff" in preStop to shut down the - # container cleanly. But systemd requires sending a signal - # (at least if we want remaining processes to be killed - # after the timeout). So send an ignored signal. - KillMode = "mixed"; - KillSignal = "WINCH"; - - DevicePolicy = "closed"; - }; + serviceConfig = serviceDirectives dummyConfig; }; in { systemd.services = listToAttrs (filter (x: x.value != null) ( # The generic container template used by imperative containers [{ name = "container@"; value = unit; }] # declarative containers - ++ (mapAttrsToList (name: cfg: nameValuePair "container@${name}" ( - unit // { - preStart = preStartScript cfg; - script = startScript cfg; - postStart = postStartScript cfg; - } // ( - if cfg.autoStart then - { - wantedBy = [ "multi-user.target" ]; - wants = [ "network.target" ]; - after = [ "network.target" ]; - restartTriggers = [ cfg.path ]; - reloadIfChanged = true; - } - else {}) + ++ (mapAttrsToList (name: cfg: nameValuePair "container@${name}" (let + config = cfg // ( + if cfg.enableTun then + { + allowedDevices = cfg.allowedDevices + ++ [ { node = "/dev/net/tun"; modifier = "rw"; } ]; + additionalCapabilities = cfg.additionalCapabilities + ++ [ "CAP_NET_ADMIN" ]; + } + else {}); + in + unit // { + preStart = preStartScript config; + script = startScript config; + postStart = postStartScript config; + serviceConfig = serviceDirectives config; + } // ( + if config.autoStart then + { + wantedBy = [ "multi-user.target" ]; + wants = [ "network.target" ]; + after = [ "network.target" ]; + restartTriggers = [ config.path ]; + reloadIfChanged = true; + } + else {}) )) config.containers) ));