Skip to content

Commit

Permalink
nixos/qemu-vm: use persistent block device names
Browse files Browse the repository at this point in the history
This change removes the bespoke logic around identifying block devices.
Instead of trying to find the right device by iterating over
`qemu.drives` and guessing the right partition number (e.g.
/dev/vda{1,2}), devices are now identified by persistent names provided
by udev in /dev/disk/by-*.

Before this change, the root device was formatted on demand in the
initrd. However, this makes it impossible to use filesystem identifiers
to identify devices. Now, the formatting step is performed before the VM
is started. Because some tests, however, rely on this behaviour, a
utility function to replace this behaviour is added in
/nixos/tests/common/auto-format-root-device.nix.

Devices that contain neither a partition table nor a filesystem are
identified by their hardware serial number which is injected via QEMU
(and is thus persistent and predictable). PCI paths are not a reliable
way to identify devices because their availability and numbering depends
on the QEMU machine type.

This change makes the module more robust against changes in QEMU and the
kernel (non-persistent device naming) and by decoupling abstractions
(i.e. rootDevice, bootPartition, and bootLoaderDevice) enables further
improvement down the line.
  • Loading branch information
nikstur committed Jun 12, 2023
1 parent aa33756 commit ee60d30
Show file tree
Hide file tree
Showing 13 changed files with 118 additions and 101 deletions.
1 change: 1 addition & 0 deletions nixos/lib/make-disk-image.nix
Original file line number Diff line number Diff line change
Expand Up @@ -573,6 +573,7 @@ let format' = format; in let
# In this throwaway resource, we only have /dev/vda, but the actual VM may refer to another disk for bootloader, e.g. /dev/vdb
# Use this option to create a symlink from vda to any arbitrary device you want.
${optionalString (config.boot.loader.grub.device != "/dev/vda") ''
mkdir -p $(dirname ${config.boot.loader.grub.device})
ln -s /dev/vda ${config.boot.loader.grub.device}
''}
Expand Down
163 changes: 65 additions & 98 deletions nixos/modules/virtualisation/qemu-vm.nix
Original file line number Diff line number Diff line change
Expand Up @@ -81,25 +81,6 @@ let

drivesCmdLine = drives: concatStringsSep "\\\n " (imap1 driveCmdline drives);


# Creates a device name from a 1-based numerical index, e.g.
# * `driveDeviceName 1` -> `/dev/vda`
# * `driveDeviceName 2` -> `/dev/vdb`
driveDeviceName = idx:
let letter = elemAt lowerChars (idx - 1);
in if cfg.qemu.diskInterface == "scsi" then
"/dev/sd${letter}"
else
"/dev/vd${letter}";

lookupDriveDeviceName = driveName: driveList:
(findSingle (drive: drive.name == driveName)
(throw "Drive ${driveName} not found")
(throw "Multiple drives named ${driveName}") driveList).device;

addDeviceNames =
imap1 (idx: drive: drive // { device = driveDeviceName idx; });

# Shell script to start the VM.
startVM =
''
Expand All @@ -109,25 +90,41 @@ let
set -e
# Create an empty ext4 filesystem image. A filesystem image does not
# contain a partition table but just a filesystem.
createEmptyFilesystemImage() {
local name=$1
local size=$2
local temp=$(mktemp)
${qemu}/bin/qemu-img create -f raw "$temp" "$size"
${pkgs.e2fsprogs}/bin/mkfs.ext4 -L ${rootFilesystemLabel} "$temp"
${qemu}/bin/qemu-img convert -f raw -O qcow2 "$temp" "$name"
rm "$temp"
}
NIX_DISK_IMAGE=$(readlink -f "''${NIX_DISK_IMAGE:-${toString config.virtualisation.diskImage}}") || test -z "$NIX_DISK_IMAGE"
if test -n "$NIX_DISK_IMAGE" && ! test -e "$NIX_DISK_IMAGE"; then
echo "Disk image do not exist, creating the virtualisation disk image..."
# If we are using a bootloader and default filesystems layout.
# We have to reuse the system image layout as a backing image format (CoW)
# So we can write on the top of it.
# If we are not using the default FS layout, potentially, we are interested into
# performing operations in postDeviceCommands or at early boot on the raw device.
# We can still boot through QEMU direct kernel boot feature.
# CoW prevent size to be attributed to an image.
# FIXME: raise this issue to upstream.
${qemu}/bin/qemu-img create \
${concatStringsSep " \\\n" ([ "-f qcow2" ]
++ optional (cfg.useBootLoader && cfg.useDefaultFilesystems) "-F qcow2 -b ${systemImage}/nixos.qcow2"
++ optional (!(cfg.useBootLoader && cfg.useDefaultFilesystems)) "-o size=${toString config.virtualisation.diskSize}M"
++ [ ''"$NIX_DISK_IMAGE"'' ])}
${if (cfg.useBootLoader && cfg.useDefaultFilesystems) then ''
# Create a writable qcow2 image using the systemImage as a backing
# image.
# CoW prevent size to be attributed to an image.
# FIXME: raise this issue to upstream.
${qemu}/bin/qemu-img create \
-f qcow2 \
-b ${systemImage}/nixos.qcow2 \
-F qcow2 \
"$NIX_DISK_IMAGE"
'' else if cfg.useDefaultFilesystems then ''
createEmptyFilesystemImage "$NIX_DISK_IMAGE" "${toString cfg.diskSize}M"
'' else ''
# Create an empty disk image without a filesystem.
${qemu}/bin/qemu-img create -f qcow2 "$NIX_DISK_IMAGE" "${toString cfg.diskSize}M"
''
}
echo "Virtualisation disk image created."
fi
Expand All @@ -148,6 +145,7 @@ let
${pkgs.erofs-utils}/bin/mkfs.erofs \
--force-uid=0 \
--force-gid=0 \
-L ${nixStoreFilesystemLabel} \
-U eb176051-bd15-49b7-9e6b-462e0b467019 \
-T 0 \
--exclude-regex="$(
Expand Down Expand Up @@ -218,13 +216,27 @@ let

regInfo = pkgs.closureInfo { rootPaths = config.virtualisation.additionalPaths; };

# Use well-defined and persistent filesystem labels to identify block devices.
rootFilesystemLabel = "nixos";
espFilesystemLabel = "ESP"; # Hard-coded by make-disk-image.nix
nixStoreFilesystemLabel = "nix-store";

# The root drive is a raw disk which does not necessarily contain a
# filesystem or partition table. It thus cannot be identified via the typical
# persistent naming schemes (e.g. /dev/disk/by-{label, uuid, partlabel,
# partuuid}. Instead, supply a well-defined and persistent serial attribute
# via QEMU. Inside the running system, the disk can then be identified via
# the /dev/disk/by-id scheme.
rootDriveSerialAttr = "root";

# System image is akin to a complete NixOS install with
# a boot partition and root partition.
systemImage = import ../../lib/make-disk-image.nix {
inherit pkgs config lib;
additionalPaths = [ regInfo ];
format = "qcow2";
onlyNixStore = false;
label = rootFilesystemLabel;
partitionTableType = selectPartitionTableLayout { inherit (cfg) useDefaultFilesystems useEFIBoot; };
# Bootloader should be installed on the system image only if we are booting through bootloaders.
# Though, if a user is not using our default filesystems, it is possible to not have any ESP
Expand All @@ -247,6 +259,7 @@ let
additionalPaths = [ regInfo ];
format = "qcow2";
onlyNixStore = true;
label = nixStoreFilesystemLabel;
partitionTableType = "none";
installBootLoader = false;
touchEFIVars = false;
Expand All @@ -255,28 +268,6 @@ let
copyChannel = false;
};

bootConfiguration =
if cfg.useDefaultFilesystems
then
if cfg.useBootLoader
then
if cfg.useEFIBoot then "efi_bootloading_with_default_fs"
else "legacy_bootloading_with_default_fs"
else
if cfg.directBoot.enable then "direct_boot_with_default_fs"
else "custom"
else
"custom";
suggestedRootDevice = {
"efi_bootloading_with_default_fs" = "${cfg.bootLoaderDevice}2";
"legacy_bootloading_with_default_fs" = "${cfg.bootLoaderDevice}1";
"direct_boot_with_default_fs" = cfg.bootLoaderDevice;
# This will enforce a NixOS module type checking error
# to ask explicitly the user to set a rootDevice.
# As it will look like `rootDevice = lib.mkDefault null;` after
# all "computations".
"custom" = null;
}.${bootConfiguration};
in

{
Expand Down Expand Up @@ -343,44 +334,39 @@ in
virtualisation.bootLoaderDevice =
mkOption {
type = types.path;
default = lookupDriveDeviceName "root" cfg.qemu.drives;
defaultText = literalExpression ''lookupDriveDeviceName "root" cfg.qemu.drives'';
example = "/dev/vda";
default = "/dev/disk/by-id/virtio-${rootDriveSerialAttr}";
defaultText = literalExpression ''/dev/disk/by-id/virtio-${rootDriveSerialAttr}'';
example = "/dev/disk/by-id/virtio-boot-loader-device";
description =
lib.mdDoc ''
The disk to be used for the boot filesystem.
By default, it is the same disk as the root filesystem.
The path (inside the VM) to the device to boot from when legacy booting.
'';
};

virtualisation.bootPartition =
mkOption {
type = types.nullOr types.path;
default = if cfg.useEFIBoot then "${cfg.bootLoaderDevice}1" else null;
defaultText = literalExpression ''if cfg.useEFIBoot then "''${cfg.bootLoaderDevice}1" else null'';
example = "/dev/vda1";
default = if cfg.useEFIBoot then "/dev/disk/by-label/${espFilesystemLabel}" else null;
defaultText = literalExpression ''if cfg.useEFIBoot then "/dev/disk/by-label/${espFilesystemLabel}" else null'';
example = "/dev/disk/by-label/esp";
description =
lib.mdDoc ''
The boot partition to be used to mount /boot filesystem.
In legacy boots, this should be null.
By default, in EFI boot, it is the first partition of the boot device.
The path (inside the VM) to the device containing the EFI System Partition (ESP).
If you are *not* booting from a UEFI firmware, this value is, by
default, `null`. The ESP is mounted under `/boot`.
'';
};

virtualisation.rootDevice =
mkOption {
type = types.nullOr types.path;
example = "/dev/vda2";
default = "/dev/disk/by-label/${rootFilesystemLabel}";
defaultText = literalExpression ''/dev/disk/by-label/${rootFilesystemLabel}'';
example = "/dev/disk/by-label/nixos";
description =
lib.mdDoc ''
The disk or partition to be used for the root filesystem.
By default (read the source code for more details):
- under EFI with a bootloader: 2nd partition of the boot disk
- in legacy boot with a bootloader: 1st partition of the boot disk
- in direct boot (i.e. without a bootloader): whole disk
In case you are not using a default boot device or a default filesystem, you have to set explicitly your root device.
The path (inside the VM) to the device containing the root filesystem.
'';
};

Expand Down Expand Up @@ -711,7 +697,6 @@ in
mkOption {
type = types.listOf (types.submodule driveOpts);
description = lib.mdDoc "Drives passed to qemu.";
apply = addDeviceNames;
};

diskInterface =
Expand Down Expand Up @@ -975,29 +960,11 @@ in
# FIXME: make a sense of this mess wrt to multiple ESP present in the system, probably use boot.efiSysMountpoint?
boot.loader.grub.device = mkVMOverride (if cfg.useEFIBoot then "nodev" else cfg.bootLoaderDevice);
boot.loader.grub.gfxmodeBios = with cfg.resolution; "${toString x}x${toString y}";
virtualisation.rootDevice = mkDefault suggestedRootDevice;

boot.initrd.kernelModules = optionals (cfg.useNixStoreImage && !cfg.writableStore) [ "erofs" ];

boot.loader.supportsInitrdSecrets = mkIf (!cfg.useBootLoader) (mkVMOverride false);

boot.initrd.extraUtilsCommands = lib.mkIf (cfg.useDefaultFilesystems && !config.boot.initrd.systemd.enable)
''
# We need mke2fs in the initrd.
copy_bin_and_libs ${pkgs.e2fsprogs}/bin/mke2fs
'';

boot.initrd.postDeviceCommands = lib.mkIf (cfg.useDefaultFilesystems && !config.boot.initrd.systemd.enable)
''
# If the disk image appears to be empty, run mke2fs to
# initialise.
FSTYPE=$(blkid -o value -s TYPE ${cfg.rootDevice} || true)
PARTTYPE=$(blkid -o value -s PTTYPE ${cfg.rootDevice} || true)
if test -z "$FSTYPE" -a -z "$PARTTYPE"; then
mke2fs -t ext4 ${cfg.rootDevice}
fi
'';

boot.initrd.postMountCommands = lib.mkIf (!config.boot.initrd.systemd.enable)
''
# Mark this as a NixOS machine.
Expand Down Expand Up @@ -1112,6 +1079,7 @@ in
driveExtraOpts.cache = "writeback";
driveExtraOpts.werror = "report";
deviceExtraOpts.bootindex = "1";
deviceExtraOpts.serial = rootDriveSerialAttr;
}])
(mkIf cfg.useNixStoreImage [{
name = "nix-store";
Expand Down Expand Up @@ -1154,7 +1122,6 @@ in
} else {
device = cfg.rootDevice;
fsType = "ext4";
autoFormat = true;
});
"/tmp" = lib.mkIf config.boot.tmp.useTmpfs {
device = "tmpfs";
Expand All @@ -1164,7 +1131,7 @@ in
options = [ "mode=1777" "strictatime" "nosuid" "nodev" "size=${toString config.boot.tmp.tmpfsSize}" ];
};
"/nix/${if cfg.writableStore then ".ro-store" else "store"}" = lib.mkIf cfg.useNixStoreImage {
device = "${lookupDriveDeviceName "nix-store" cfg.qemu.drives}";
device = "/dev/disk/by-label/${nixStoreFilesystemLabel}";
neededForBoot = true;
options = [ "ro" ];
};
Expand All @@ -1174,7 +1141,7 @@ in
neededForBoot = true;
};
"/boot" = lib.mkIf (cfg.useBootLoader && cfg.bootPartition != null) {
device = cfg.bootPartition; # 1 for e.g. `vda1`, as created in `systemImage`
device = cfg.bootPartition;
fsType = "vfat";
noCheck = true; # fsck fails on a r/o filesystem
};
Expand Down
29 changes: 29 additions & 0 deletions nixos/tests/common/auto-format-root-device.nix
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# This is a test utility that automatically formats
# `config.virtualisation.rootDevice` in the initrd.
# Note that when you are using
# `boot.initrd.systemd.enable = true`, you can use
# `virtualisation.fileSystems."/".autoFormat = true`
# instead.
#
# The initrd hooks below are only set when `boot.initrd.systemd.enable`
# is false, so this module can be imported unconditionally by tests
# that are parameterised over the stage 1 implementation.

{ config, lib, pkgs, ... }:

let
  # Path (inside the VM) of the device that backs the root filesystem.
  rootDevice = config.virtualisation.rootDevice;

  # True when systemd stage 1 is disabled. Both hooks below are guarded
  # by this, matching the condition qemu-vm.nix applied to the same
  # hooks before this logic was moved into this file.
  scriptedInitrd = !config.boot.initrd.systemd.enable;
in
{

  boot.initrd.extraUtilsCommands = lib.mkIf scriptedInitrd ''
    # We need mke2fs in the initrd.
    copy_bin_and_libs ${pkgs.e2fsprogs}/bin/mke2fs
  '';

  boot.initrd.postDeviceCommands = lib.mkIf scriptedInitrd ''
    # If the disk image appears to be empty, run mke2fs to
    # initialise.
    FSTYPE=$(blkid -o value -s TYPE ${rootDevice} || true)
    PARTTYPE=$(blkid -o value -s PTTYPE ${rootDevice} || true)
    if test -z "$FSTYPE" -a -z "$PARTTYPE"; then
      mke2fs -t ext4 ${rootDevice}
    fi
  '';
}
10 changes: 7 additions & 3 deletions nixos/tests/fsck.nix
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,17 @@ import ./make-test-python.nix {
boot.initrd.systemd.enable = systemdStage1;
};

testScript = ''
testScript = { nodes, ...}:
let
rootDevice = nodes.machine.virtualisation.rootDevice;
in
''
machine.wait_for_unit("default.target")
with subtest("root fs is fsckd"):
machine.succeed("journalctl -b | grep '${if systemdStage1
then "fsck.*vda.*clean"
else "fsck.ext4.*/dev/vda"}'")
then "fsck.*${builtins.baseNameOf rootDevice}.*clean"
else "fsck.ext4.*${rootDevice}"}'")
with subtest("mnt fs is fsckd"):
machine.succeed("journalctl -b | grep 'fsck.*vdb.*clean'")
Expand Down
1 change: 1 addition & 0 deletions nixos/tests/hibernate.nix
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ in makeTest {
imports = [
../modules/profiles/installation-device.nix
../modules/profiles/base.nix
./common/auto-format-root-device.nix
];

nix.settings = {
Expand Down
3 changes: 3 additions & 0 deletions nixos/tests/initrd-luks-empty-passphrase.nix
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ in {
name = "initrd-luks-empty-passphrase";

nodes.machine = { pkgs, ... }: {
imports = lib.optionals (!systemdStage1) [ ./common/auto-format-root-device.nix ];

virtualisation = {
emptyDiskImages = [ 512 ];
useBootLoader = true;
Expand All @@ -23,6 +25,7 @@ in {
# the new root device is /dev/vdb
# an empty 512MiB drive, containing no Nix store.
mountHostNixStore = true;
fileSystems."/".autoFormat = lib.mkIf systemdStage1 true;
};

boot.loader.systemd-boot.enable = true;
Expand Down
5 changes: 5 additions & 0 deletions nixos/tests/installer.nix
Original file line number Diff line number Diff line change
Expand Up @@ -298,8 +298,13 @@ let
../modules/profiles/installation-device.nix
../modules/profiles/base.nix
extraInstallerConfig
./common/auto-format-root-device.nix
];

# In systemdStage1, also automatically format the device backing the
# root filesystem.
virtualisation.fileSystems."/".autoFormat = systemdStage1;

# builds stuff in the VM, needs more juice
virtualisation.diskSize = 8 * 1024;
virtualisation.cores = 8;
Expand Down
2 changes: 2 additions & 0 deletions nixos/tests/luks.nix
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ import ./make-test-python.nix ({ lib, pkgs, ... }: {
name = "luks";

nodes.machine = { pkgs, ... }: {
imports = [ ./common/auto-format-root-device.nix ];

# Use systemd-boot
virtualisation = {
emptyDiskImages = [ 512 512 ];
Expand Down
Loading

0 comments on commit ee60d30

Please sign in to comment.