#######################################
# Preflight check for the cgroup v2 'cpu' controller.
#
# On a cgroup v2 host this verifies that the 'cpu' controller is listed in
# <root>/cgroup.controllers and enabled in <root>/cgroup.subtree_control.
# When it is available but not enabled, the function tries to enable it by
# writing '+cpu' to cgroup.subtree_control. Hosts whose cgroup root is not a
# cgroup2 filesystem are accepted without further checks.
#
# Globals:   none
# Arguments: $1 - cgroup mount point to inspect (optional; defaults to
#                 /sys/fs/cgroup, which is what the installer uses)
# Outputs:   progress messages via log(); failure details via die()
# Returns:   0 when the controller is usable (or the host is not cgroup v2);
#            otherwise does not return normally: die() is called.
#######################################
check_cgroup_cpu_preflight() {
  local cgroot="${1:-/sys/fs/cgroup}"
  local fstype
  fstype="$(stat -fc %T "${cgroot}" 2>/dev/null || echo unknown)"

  # cgroup v1 systems still work via the v1 handle in cubelet; only validate
  # cgroup v2 hosts here (which is what every recent distro defaults to).
  if [[ "${fstype}" != "cgroup2fs" ]]; then
    return 0
  fi

  # Controllers the kernel offers at this level. An unreadable file is treated
  # the same as an empty list, which fails the check below.
  local controllers=""
  if [[ -r "${cgroot}/cgroup.controllers" ]]; then
    controllers="$(cat "${cgroot}/cgroup.controllers" 2>/dev/null || true)"
  fi
  # -w keeps 'cpuset' from being mistaken for 'cpu'.
  if ! grep -qw cpu <<<"${controllers}"; then
    die "Kernel cgroup v2 does not expose the 'cpu' controller (cgroup.controllers='${controllers:-}').
  cubelet cannot set CPU quotas without it.
  See: https://github.com/TencentCloud/CubeSandbox/issues/366"
  fi

  # Controllers already enabled for child cgroups.
  local subtree=""
  if [[ -r "${cgroot}/cgroup.subtree_control" ]]; then
    subtree="$(cat "${cgroot}/cgroup.subtree_control" 2>/dev/null || true)"
  fi
  if grep -qw cpu <<<"${subtree}"; then
    return 0
  fi

  log "cgroup v2 'cpu' controller not enabled on ${cgroot}/cgroup.subtree_control; trying to enable it"

  # Capture stderr of both the redirection (open) and printf (write) instead
  # of discarding it, so the real cause is reported on failure rather than
  # only the multipathd guess below.
  local write_err=""
  if write_err="$({ printf '+cpu\n' >"${cgroot}/cgroup.subtree_control"; } 2>&1)"; then
    log "enabled '+cpu' on ${cgroot}/cgroup.subtree_control"
    return 0
  fi

  # Shell diagnostics look like "<script>: line N: <what>: <reason>"; keep
  # only the trailing reason text (e.g. "Invalid argument").
  local reason="${write_err##*: }"

  die "Failed to enable the cgroup v2 'cpu' controller on ${cgroot}/cgroup.subtree_control${reason:+ (${reason})}.
  On Ubuntu / Debian this is usually caused by 'multipathd' (or another service) running real-time threads under the root cgroup, which blocks '+cpu' with 'Invalid argument'.
  Quick fix:
    systemctl disable --now multipathd.service multipathd.socket
    echo +cpu > ${cgroot}/cgroup.subtree_control
  Full repro and fix: https://github.com/TencentCloud/CubeSandbox/issues/366"
}
cgroup v2 'cpu' controller check (mirrors check_cgroup_cpu_preflight in install.sh) + local cgroot="/sys/fs/cgroup" + local cg_fstype + cg_fstype="$(stat -fc %T "${cgroot}" 2>/dev/null || echo unknown)" + if [[ "${cg_fstype}" == "cgroup2fs" ]]; then + local cg_controllers="" + if [[ -r "${cgroot}/cgroup.controllers" ]]; then + cg_controllers="$(cat "${cgroot}/cgroup.controllers" 2>/dev/null || true)" + fi + if ! grep -qw cpu <<<"${cg_controllers}"; then + echo "[online-install] ERROR: Kernel cgroup v2 does not expose the 'cpu' controller (cgroup.controllers='${cg_controllers:-}')." >&2 + echo "[online-install] cubelet cannot set CPU quotas without it." >&2 + echo "[online-install] See: https://github.com/TencentCloud/CubeSandbox/issues/366" >&2 + exit 3 + fi + local cg_subtree="" + if [[ -r "${cgroot}/cgroup.subtree_control" ]]; then + cg_subtree="$(cat "${cgroot}/cgroup.subtree_control" 2>/dev/null || true)" + fi + if ! grep -qw cpu <<<"${cg_subtree}"; then + echo "[online-install] cgroup v2 'cpu' controller not enabled on ${cgroot}/cgroup.subtree_control; trying to enable it" >&2 + if ! printf '+cpu\n' >"${cgroot}/cgroup.subtree_control" 2>/dev/null; then + echo "[online-install] ERROR: Failed to enable the cgroup v2 'cpu' controller on ${cgroot}/cgroup.subtree_control." >&2 + echo "[online-install] On Ubuntu / Debian this is usually caused by 'multipathd' (or another service) running real-time threads under the root cgroup, which blocks '+cpu' with 'Invalid argument'." >&2 + echo "[online-install] Quick fix:" >&2 + echo "[online-install] systemctl disable --now multipathd.service multipathd.socket" >&2 + echo "[online-install] echo +cpu > ${cgroot}/cgroup.subtree_control" >&2 + echo "[online-install] Full repro and fix: https://github.com/TencentCloud/CubeSandbox/issues/366" >&2 + exit 3 + fi + echo "[online-install] enabled '+cpu' on ${cgroot}/cgroup.subtree_control" >&2 + fi + fi + + # 9. 
Check deployment role early and check Docker/DNS installability (for control role) local deploy_role="${ONE_CLICK_DEPLOY_ROLE:-control}" case "${deploy_role}" in control|compute) ;; diff --git a/docs/guide/troubleshooting/deployment.md b/docs/guide/troubleshooting/deployment.md index 14dde1ca..8ea42d53 100644 --- a/docs/guide/troubleshooting/deployment.md +++ b/docs/guide/troubleshooting/deployment.md @@ -9,3 +9,4 @@ lang: en-US | --- | --- | --- | | `/data/cubelet` must be on XFS (reflink) | `cubelet` stores container writable layers under `/data/cubelet` and relies on XFS reflink. Deploying on ext4-rooted hosts (Ubuntu / Debian / WSL) makes the one-click pre-flight reject with `not XFS`. Workaround: mount a loopback `.img` formatted as XFS at `/data/cubelet`. For production, attach a dedicated XFS data disk (100–300 GiB). For fresh installs prefer OpenCloudOS 9 / RHEL family. | [#311](https://github.com/TencentCloud/CubeSandbox/issues/311), [#245](https://github.com/TencentCloud/CubeSandbox/issues/245) | | Template Creation Times Out When the Sandbox CIDR Overlaps the LAN | The one-click deployment defaults the sandbox network to `192.168.0.0/18`. If the host LAN also uses `192.168.1.x`, Cube may allocate sandbox IPs that overlap the physical network, causing template creation or port probing to fail with `context deadline exceeded`. Change the Cubelet CIDR to a non-overlapping range and remove the old TAP devices plus `cube-dev` before restarting. | [Guide](./local-network-cidr-conflict.md) | +| cgroup v2 `cpu` controller not enabled on Ubuntu, cubelet CPU quotas don't take effect | Ubuntu / Debian cloud images don't delegate the cgroup v2 `cpu` controller to child cgroups by default, and `multipathd`'s RT threads make `+cpu` writes fail with `Invalid argument`. See the issue for full repro and fix. 
| [#366](https://github.com/TencentCloud/CubeSandbox/issues/366) | diff --git a/docs/zh/guide/troubleshooting/deployment.md b/docs/zh/guide/troubleshooting/deployment.md index 69b85980..c18685dc 100644 --- a/docs/zh/guide/troubleshooting/deployment.md +++ b/docs/zh/guide/troubleshooting/deployment.md @@ -9,3 +9,4 @@ lang: zh-CN | --- | --- | --- | | `/data/cubelet` 必须是 XFS(reflink) | `cubelet` 把 `/data/cubelet` 作为容器可写层的存储目录,依赖 XFS 的 reflink 特性。在 Ubuntu / Debian / WSL 等 ext4 根盘的环境上部署,one-click 前置检查会以 `not XFS` 报错退出。Workaround:用 loopback `.img` 格式化为 XFS 后挂到 `/data/cubelet`;生产建议挂独立 XFS 数据盘(100–300 GiB);新装机器推荐 OpenCloudOS 9 / RHEL 系。 | [#311](https://github.com/TencentCloud/CubeSandbox/issues/311), [#245](https://github.com/TencentCloud/CubeSandbox/issues/245) | | 沙箱网段和局域网冲突导致创建模板超时 | one-click 部署默认沙箱网段是 `192.168.0.0/18`。如果宿主机局域网也使用 `192.168.1.x`,Cube 可能给沙箱分配到和真实局域网重叠的 IP 导致模板创建或端口探测以 `context deadline exceeded` 失败。将 Cubelet CIDR 改成不冲突的网段,并在重启前清理旧 TAP 网卡和 `cube-dev`。 | [指南](./local-network-cidr-conflict.md) | +| Ubuntu 上 cgroup v2 没启用 `cpu` controller,cubelet CPU quota 不生效 | Ubuntu / Debian 云镜像默认不会把 cgroup v2 的 `cpu` controller 委托到子 cgroup,且 `multipathd` 的 RT 线程会让 `+cpu` 写入返回 `Invalid argument`。详细复现和修复见 issue。 | [#366](https://github.com/TencentCloud/CubeSandbox/issues/366) |