Skip to content

Commit

Permalink
Enable on demand GPU scheduling support for GKE (#1407)
Browse files Browse the repository at this point in the history
  • Loading branch information
jpurusho65 committed Jun 6, 2023
1 parent 740261a commit ca3b9c0
Show file tree
Hide file tree
Showing 6 changed files with 82 additions and 4 deletions.
6 changes: 4 additions & 2 deletions src/terraform/gke/nodepools.tf
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ resource "google_container_node_pool" "primary_nodes" {
oauth_scopes = [
"https://www.googleapis.com/auth/logging.write",
"https://www.googleapis.com/auth/monitoring",
"https://www.googleapis.com/auth/devstorage.read_only"
]

labels = {
Expand Down Expand Up @@ -60,6 +61,7 @@ resource "google_container_node_pool" "gpu_nodes" {
oauth_scopes = [
"https://www.googleapis.com/auth/logging.write",
"https://www.googleapis.com/auth/monitoring",
"https://www.googleapis.com/auth/devstorage.read_only"
]

labels = {
Expand All @@ -69,7 +71,7 @@ resource "google_container_node_pool" "gpu_nodes" {

disk_size_gb = var.disk_size_in_gb
preemptible = true
machine_type = var.gpu_node_type
machine_type = var.cpu_node_type
tags = ["gke-node", "${var.project_id}-gke"]
metadata = {
disable-legacy-endpoints = "true"
Expand All @@ -90,4 +92,4 @@ resource "google_container_node_pool" "gpu_nodes" {
max_surge = 1
max_unavailable = 0
}
}
}
29 changes: 29 additions & 0 deletions src/terraform/gke/plugin.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Looks up the GKE cluster named var.cluster_name in var.region of
# var.project_id. The explicit depends_on ties this lookup to
# google_container_cluster.primary so it is ordered after that resource.
data "google_container_cluster" "cluster" {
  name     = var.cluster_name
  location = var.region
  project  = var.project_id

  depends_on = [google_container_cluster.primary]
}

# Client configuration of the Google provider; its access_token is referenced
# by the kubernetes and kubectl provider blocks in this file.
data "google_client_config" "default" {}

# Kubernetes provider wired to the cluster data source: it takes the cluster
# endpoint as host, the Google client access token as bearer token, and the
# base64-decoded cluster CA certificate from the first master_auth entry.
provider "kubernetes" {
  host  = data.google_container_cluster.cluster.endpoint
  token = data.google_client_config.default.access_token

  cluster_ca_certificate = base64decode(
    data.google_container_cluster.cluster.master_auth[0].cluster_ca_certificate
  )
}

# kubectl provider configured with the same endpoint, token and CA
# certificate expressions as the kubernetes provider block.
# load_config_file is switched off.
# NOTE(review): presumably this stops the provider from reading a local
# kubeconfig, so only the values below are used - confirm against the
# provider documentation.
provider "kubectl" {
  load_config_file = false

  host  = data.google_container_cluster.cluster.endpoint
  token = data.google_client_config.default.access_token

  cluster_ca_certificate = base64decode(
    data.google_container_cluster.cluster.master_auth[0].cluster_ca_certificate
  )
}

# Fetches the NVIDIA driver installer manifest (the cos/daemonset-preloaded.yaml
# file) over HTTP from the GoogleCloudPlatform/container-engine-accelerators
# repository.
# NOTE(review): the URL points at the `master` branch, so the downloaded
# manifest can change between applies - consider pinning to a tag or commit.
data "http" "nvidia_driver_installer_manifest" {
url = "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/cos/daemonset-preloaded.yaml"
}

# Applies the manifest downloaded by data.http.nvidia_driver_installer_manifest
# as a kubectl_manifest resource.
# NOTE(review): newer hashicorp/http provider releases deprecate `body` in
# favour of `response_body` - confirm the provider version in use before
# switching.
resource "kubectl_manifest" "nvidia_driver_installer" {
yaml_body = data.http.nvidia_driver_installer_manifest.body
}
4 changes: 2 additions & 2 deletions src/terraform/gke/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -63,13 +63,13 @@ variable "max_gpu_node" {
variable "initial_node_count" {
description = "Initial number of nodes in this pool"
type = number
default = 1
default = 1
}

variable "create_gpu_node_pool" {
description = "Decide if this resource pool has to be created"
type = bool
default = false
default = true
}

variable "disk_size_in_gb" {
Expand Down
5 changes: 5 additions & 0 deletions src/terraform/gke/versions.tf
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@ terraform {
version = "4.64.0"
}

kubectl = {
source = "gavinbunney/kubectl"
version = "1.14.0"
}

kubernetes = {
source = "hashicorp/kubernetes"
}
Expand Down
39 changes: 39 additions & 0 deletions src/ui/common/src/components/resources/dialogs/gcpDialog.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,11 @@ const Placeholders: OndemandGKEConfig = {
gcp_config_serialized: '',
keepalive: '1200',
cpu_node_type: 'n1-standard-4',
gpu_node_type: 'nvidia-tesla-t4',
min_cpu_node: '1',
max_cpu_node: '1',
min_gpu_node: '0',
max_gpu_node: '1',
};

const GCPPlaceholders: GCPConfig = {
Expand Down Expand Up @@ -81,6 +84,18 @@ export const GCPDialog: React.FC<
}}
/>

<ResourceTextInputField
  // Optional text input for the GPU accelerator type. Each change writes the
  // typed value into the form's `gpu_node_type` field via setValue.
  name="gpu_node_type"
  spellCheck={false}
  required={false}
  label="GPU Accelerator Type"
  description="Type of compatible GPU accelerator for the compute node."
  placeholder={Placeholders.gpu_node_type}
  onChange={(event) => {
    setValue('gpu_node_type', event.target.value);
  }}
/>

<ResourceTextInputField
name="min_cpu_node"
spellCheck={false}
Expand All @@ -104,6 +119,30 @@ export const GCPDialog: React.FC<
setValue('max_cpu_node', event.target.value);
}}
/>

<ResourceTextInputField
name="min_gpu_node"
spellCheck={false}
required={false}
label="Min GPU node"
description="Minimum number of nodes in the GPU node group."
placeholder={Placeholders.min_gpu_node}
onChange={(event) => {
setValue('min_gpu_node', event.target.value);
}}
/>

<ResourceTextInputField
name="max_gpu_node"
spellCheck={false}
required={false}
label="Max GPU node"
description="Maximum number of nodes in the GPU node group."
placeholder={Placeholders.max_gpu_node}
onChange={(event) => {
setValue('max_gpu_node', event.target.value);
}}
/>
</Box>
);

Expand Down
3 changes: 3 additions & 0 deletions src/ui/common/src/utils/resources.ts
Original file line number Diff line number Diff line change
Expand Up @@ -272,8 +272,11 @@ export type OndemandGKEConfig = {
gcp_config_serialized: string;
keepalive: string;
cpu_node_type: string;
gpu_node_type: string;
min_cpu_node: string;
max_cpu_node: string;
min_gpu_node: string;
max_gpu_node: string;
};

export type GCPConfig = {
Expand Down

0 comments on commit ca3b9c0

Please sign in to comment.