Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 44 additions & 0 deletions deploy/one-click/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,49 @@ check_cubelet_fs_preflight() {
fi
}

#######################################
# Pre-flight check: make sure the cgroup v2 'cpu' controller is usable.
#
# When the cgroup root is a cgroup2fs mount, verifies that 'cpu' is listed in
# <root>/cgroup.controllers and enabled in <root>/cgroup.subtree_control,
# writing '+cpu' to the latter when it is not yet enabled. Any other
# filesystem type at the cgroup root is skipped.
#
# Arguments:
#   $1 - cgroup root to inspect (optional; defaults to /sys/fs/cgroup).
# Outputs:
#   Progress messages via log; failure details via die (both are defined
#   elsewhere in this script).
# Returns:
#   0 when the root is not cgroup v2, or when 'cpu' is (now) enabled.
#   On failure die is invoked -- presumably it terminates the script.
#######################################
check_cgroup_cpu_preflight() {
  local cgroot="${1:-/sys/fs/cgroup}"
  local controllers_file="${cgroot}/cgroup.controllers"
  local subtree_file="${cgroot}/cgroup.subtree_control"

  # Declared separately from the assignment so 'local' cannot mask the
  # substitution's exit status; a failing stat degrades to "unknown".
  local fstype
  fstype="$(stat -fc %T "${cgroot}" 2>/dev/null || echo unknown)"

  # cgroup v1 systems still work via the v1 handle in cubelet; only validate
  # cgroup v2 hosts here (which is what every recent distro defaults to).
  if [[ "${fstype}" != "cgroup2fs" ]]; then
    return 0
  fi

  # An unreadable or missing controllers file is treated as "no controllers".
  local controllers=""
  if [[ -r "${controllers_file}" ]]; then
    controllers="$(cat "${controllers_file}" 2>/dev/null || true)"
  fi
  # -w matches 'cpu' as a whole word, so 'cpuset' alone does not satisfy it.
  if ! grep -qw cpu <<<"${controllers}"; then
    die "Kernel cgroup v2 does not expose the 'cpu' controller (cgroup.controllers='${controllers:-<empty>}').
cubelet cannot set CPU quotas without it.
See: https://github.com/TencentCloud/CubeSandbox/issues/366"
  fi

  local subtree=""
  if [[ -r "${subtree_file}" ]]; then
    subtree="$(cat "${subtree_file}" 2>/dev/null || true)"
  fi
  if grep -qw cpu <<<"${subtree}"; then
    return 0
  fi

  log "cgroup v2 'cpu' controller not enabled on ${subtree_file}; trying to enable it"

  # Capture the shell/kernel error text instead of discarding it, so the
  # failure report shows the real cause (e.g. 'Invalid argument' vs.
  # 'Read-only file system') rather than only the most common guess.
  # The braces make '2>&1' also cover a failure to open the file itself.
  local write_err=""
  if write_err="$( { printf '+cpu\n' >"${subtree_file}"; } 2>&1 )"; then
    log "enabled '+cpu' on ${subtree_file}"
    return 0
  fi

  die "Failed to enable the cgroup v2 'cpu' controller on ${subtree_file}.
Reported error: ${write_err:-<none>}
On Ubuntu / Debian this is usually caused by 'multipathd' (or another service) running real-time threads under the root cgroup, which blocks '+cpu' with 'Invalid argument'.
Quick fix:
systemctl disable --now multipathd.service multipathd.socket
echo +cpu > ${subtree_file}
Full repro and fix: https://github.com/TencentCloud/CubeSandbox/issues/366"
}

check_install_preflight() {
# install.sh itself.
require_cmd tar
Expand Down Expand Up @@ -374,6 +417,7 @@ require_root
# to ensure we fail fast before installing or modifying any local system packages.
check_hardware_preflight
check_cubelet_fs_preflight
check_cgroup_cpu_preflight

CUBE_SANDBOX_NODE_IP="$(detect_node_ip)"
export CUBE_SANDBOX_NODE_IP
Expand Down
36 changes: 35 additions & 1 deletion deploy/one-click/online-install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,41 @@ check_early_preflight() {
exit 3
fi

# 8. Check deployment role early and check Docker/DNS installability (for control role)
# 8. cgroup v2 'cpu' controller check (mirrors check_cgroup_cpu_preflight in install.sh)
local cgroot="/sys/fs/cgroup"
local cg_fstype
cg_fstype="$(stat -fc %T "${cgroot}" 2>/dev/null || echo unknown)"
if [[ "${cg_fstype}" == "cgroup2fs" ]]; then
local cg_controllers=""
if [[ -r "${cgroot}/cgroup.controllers" ]]; then
cg_controllers="$(cat "${cgroot}/cgroup.controllers" 2>/dev/null || true)"
fi
if ! grep -qw cpu <<<"${cg_controllers}"; then
echo "[online-install] ERROR: Kernel cgroup v2 does not expose the 'cpu' controller (cgroup.controllers='${cg_controllers:-<empty>}')." >&2
echo "[online-install] cubelet cannot set CPU quotas without it." >&2
echo "[online-install] See: https://github.com/TencentCloud/CubeSandbox/issues/366" >&2
exit 3
fi
local cg_subtree=""
if [[ -r "${cgroot}/cgroup.subtree_control" ]]; then
cg_subtree="$(cat "${cgroot}/cgroup.subtree_control" 2>/dev/null || true)"
fi
if ! grep -qw cpu <<<"${cg_subtree}"; then
echo "[online-install] cgroup v2 'cpu' controller not enabled on ${cgroot}/cgroup.subtree_control; trying to enable it" >&2
if ! printf '+cpu\n' >"${cgroot}/cgroup.subtree_control" 2>/dev/null; then
echo "[online-install] ERROR: Failed to enable the cgroup v2 'cpu' controller on ${cgroot}/cgroup.subtree_control." >&2
echo "[online-install] On Ubuntu / Debian this is usually caused by 'multipathd' (or another service) running real-time threads under the root cgroup, which blocks '+cpu' with 'Invalid argument'." >&2
echo "[online-install] Quick fix:" >&2
echo "[online-install] systemctl disable --now multipathd.service multipathd.socket" >&2
echo "[online-install] echo +cpu > ${cgroot}/cgroup.subtree_control" >&2
echo "[online-install] Full repro and fix: https://github.com/TencentCloud/CubeSandbox/issues/366" >&2
exit 3
fi
echo "[online-install] enabled '+cpu' on ${cgroot}/cgroup.subtree_control" >&2
fi
fi

# 9. Check deployment role early and check Docker/DNS installability (for control role)
local deploy_role="${ONE_CLICK_DEPLOY_ROLE:-control}"
case "${deploy_role}" in
control|compute) ;;
Expand Down
1 change: 1 addition & 0 deletions docs/guide/troubleshooting/deployment.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ lang: en-US
| --- | --- | --- |
| `/data/cubelet` must be on XFS (reflink) | `cubelet` stores container writable layers under `/data/cubelet` and relies on XFS reflink. Deploying on ext4-rooted hosts (Ubuntu / Debian / WSL) makes the one-click pre-flight reject with `not XFS`. Workaround: mount a loopback `.img` formatted as XFS at `/data/cubelet`. For production, attach a dedicated XFS data disk (100–300 GiB). For fresh installs prefer OpenCloudOS 9 / RHEL family. | [#311](https://github.com/TencentCloud/CubeSandbox/issues/311), [#245](https://github.com/TencentCloud/CubeSandbox/issues/245) |
| Template Creation Times Out When the Sandbox CIDR Overlaps the LAN | The one-click deployment defaults the sandbox network to `192.168.0.0/18`. If the host LAN also uses `192.168.1.x`, Cube may allocate sandbox IPs that overlap the physical network, causing template creation or port probing to fail with `context deadline exceeded`. Change the Cubelet CIDR to a non-overlapping range and remove the old TAP devices plus `cube-dev` before restarting. | [Guide](./local-network-cidr-conflict.md) |
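| cgroup v2 `cpu` controller not enabled on Ubuntu / Debian; cubelet CPU quotas don't take effect | Ubuntu / Debian cloud images don't delegate the cgroup v2 `cpu` controller to child cgroups by default, and `multipathd`'s real-time (RT) threads make `+cpu` writes to `/sys/fs/cgroup/cgroup.subtree_control` fail with `Invalid argument`. Quick fix: run `systemctl disable --now multipathd.service multipathd.socket`, then `echo +cpu > /sys/fs/cgroup/cgroup.subtree_control`. See the issue for the full repro and fix. | [#366](https://github.com/TencentCloud/CubeSandbox/issues/366) |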
1 change: 1 addition & 0 deletions docs/zh/guide/troubleshooting/deployment.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ lang: zh-CN
| --- | --- | --- |
| `/data/cubelet` 必须是 XFS(reflink) | `cubelet` 把 `/data/cubelet` 作为容器可写层的存储目录,依赖 XFS 的 reflink 特性。在 Ubuntu / Debian / WSL 等 ext4 根盘的环境上部署,one-click 前置检查会以 `not XFS` 报错退出。Workaround:用 loopback `.img` 格式化为 XFS 后挂到 `/data/cubelet`;生产建议挂独立 XFS 数据盘(100–300 GiB);新装机器推荐 OpenCloudOS 9 / RHEL 系。 | [#311](https://github.com/TencentCloud/CubeSandbox/issues/311), [#245](https://github.com/TencentCloud/CubeSandbox/issues/245) |
| 沙箱网段和局域网冲突导致创建模板超时 | one-click 部署默认沙箱网段是 `192.168.0.0/18`。如果宿主机局域网也使用 `192.168.1.x`,Cube 可能给沙箱分配到和真实局域网重叠的 IP 导致模板创建或端口探测以 `context deadline exceeded` 失败。将 Cubelet CIDR 改成不冲突的网段,并在重启前清理旧 TAP 网卡和 `cube-dev`。 | [指南](./local-network-cidr-conflict.md) |
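| Ubuntu / Debian 上 cgroup v2 未启用 `cpu` controller,cubelet CPU quota 不生效 | Ubuntu / Debian 云镜像默认不会把 cgroup v2 的 `cpu` controller 委托到子 cgroup,且 `multipathd` 的实时(RT)线程会让向 `/sys/fs/cgroup/cgroup.subtree_control` 写入 `+cpu` 时返回 `Invalid argument`。快速修复:先执行 `systemctl disable --now multipathd.service multipathd.socket`,再执行 `echo +cpu > /sys/fs/cgroup/cgroup.subtree_control`。详细复现和修复见 issue。 | [#366](https://github.com/TencentCloud/CubeSandbox/issues/366) |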
Loading