From adbcbba1bf5c9c301b63a76381bc67dada88cafe Mon Sep 17 00:00:00 2001 From: Ankush Agarwal Date: Wed, 30 May 2018 14:59:54 -0700 Subject: [PATCH] Add GKE Security Features to Deployment Manager config (#879) * Add GKE Security Features to bootstrapper Use gke-default oauth scopes Update to gkeApiVersion Add securityConfig Enable iam api in dm * Add oauth scopes manually. gke-default does not work * Fix oauthscopes * Add PodSecurityPolicy only to v1beta1 --- docs/gke/configs/cluster-kubeflow.yaml | 32 +++++- docs/gke/configs/cluster.jinja | 137 ++++++++++++++++--------- 2 files changed, 114 insertions(+), 55 deletions(-) diff --git a/docs/gke/configs/cluster-kubeflow.yaml b/docs/gke/configs/cluster-kubeflow.yaml index 52d63fcaac6..14280436113 100644 --- a/docs/gke/configs/cluster-kubeflow.yaml +++ b/docs/gke/configs/cluster-kubeflow.yaml @@ -21,13 +21,15 @@ resources: # 1. Use a single template (.jinja file for all resources) or # 2. Create two separate deployments and launch the boot strapper # after the cluster is created. - # - # Two separate deployments doesn't make much sense; we could just use + # + # Two separate deployments doesn't make much sense; we could just use # kubectl at that point. So we put all resources in a single deployment. - name: kubeflow type: cluster.jinja properties: zone: us-east1-d + # Set this to v1beta1 to use beta features such as private clusters, + gkeApiVersion: v1 # An arbitrary string appending to name of nodepools # bump this if you want to modify the node pools. # This will cause existing node pools to be deleted and new ones to be created. @@ -35,14 +37,34 @@ resources: pool-version: v1 # Two is small enough to fit within default quota. 
cpu-pool-initialNodeCount: 2 - gpu-pool-initialNodeCount: 0 + gpu-pool-initialNodeCount: 0 # Whether to deploy the new Stackdriver Kubernetes agents stackdriver-kubernetes: false + securityConfig: + # Whether to use a cluster with private IPs + # Use v1beta1 api + privatecluster: false + # masterIpv4CidrBlock for private clusters, if enabled + # Use v1beta1 api + masterIpv4CidrBlock: 172.16.0.16/28 + # Protect worker node metadata from pods + # Use v1beta1 api + secureNodeMetadata: false + # Whether to enable Pod Security Policy Admission Controller + # Use v1beta1 api + podSecurityPolicy: false + masterAuthorizedNetworksConfig: + cidrBlocks: + - cidrBlock: 1.2.3.4/32 + - cidrBlock: 5.6.7.8/32 + enabled: false # Path for the bootstrapper image. bootstrapperImage: gcr.io/kubeflow-images-public/bootstrapper:latest # This is the name of the GCP static ip address to reserve for your domain. # This must be different for each Kubeflow deployment in your project. ipName: kubeflow-ip + # Name of the service account to use for k8s worker node pools + vmServiceAccountName: kubeflow-service-account # Provide the config for the bootstrapper. This should be a string # containing the YAML spec for the bootstrapper. # @@ -58,7 +80,7 @@ resources: # Project - This is the name of your GCP provided project. bootstrapperConfig: | # Apps only apply if on GKE - app: + app: packages: - name: core - name: tf-serving @@ -90,4 +112,4 @@ resources: value: kubeflow.endpoints..cloud.goog - component: kubeflow-core name: jupyterHubAuthenticator - value: iap \ No newline at end of file + value: iap diff --git a/docs/gke/configs/cluster.jinja b/docs/gke/configs/cluster.jinja index 9ddb808dbdb..7dc1457a65c 100644 --- a/docs/gke/configs/cluster.jinja +++ b/docs/gke/configs/cluster.jinja @@ -25,7 +25,9 @@ limitations under the License. 
{% set RBAC_TYPE_NAME = TYPE_NAME + '-rbac-v1' %} {% set APPS_TYPE_NAME = TYPE_NAME + '-apps-v1' %} -{# A dictionary mapping type name suffixes to the corresponding +{% set VM_OAUTH_SCOPES = ['https://www.googleapis.com/auth/logging.write', 'https://www.googleapis.com/auth/monitoring'] %} + +{# A dictionary mapping type name suffixes to the corresponding Kubernetes API endpoint. #} {% set K8S_ENDPOINTS = {'': 'api/v1', '-v1beta1-extensions': 'apis/extensions/v1beta1', '-rbac-v1': 'apis/rbac.authorization.k8s.io/v1', '-apps-v1': 'apis/apps/v1/'} %} @@ -56,15 +58,25 @@ resources: properties: accountId: {{ KF_ADMIN_NAME }} displayName: Service Account used for Kubeflow admin actions. +- name: kubeflow-cluster-vm-service-account + type: iam.v1.serviceAccount + properties: + accountId: {{ properties['vmServiceAccountName'] }} + displayName: GCP Service Account to use as VM Service Account for Kubeflow Cluster VMs - name: {{ CLUSTER_NAME }} + {% if properties['gkeApiVersion'] == 'v1beta1' %} + type: gcp-types/container-v1beta1:projects.locations.clusters + {% else %} type: container.v1.cluster + {% endif %} properties: + parent: projects/{{ env['project'] }}/locations/{{ properties['zone'] }} zone: {{ properties['zone'] }} cluster: name: {{ CLUSTER_NAME }} # Create a very small minimal pool. Actual nodes will be managed - # as additional node pools. This makes it easier to + # as additional node pools. This makes it easier to initialNodeCount: 1 {% if properties['stackdriver-kubernetes'] %} # TODO: remove alpha when 10.2 is public. 
@@ -79,59 +91,83 @@ resources: monitoringService: monitoring.googleapis.com/kubernetes {% else %} initialClusterVersion: 1.9.6-gke.1 - {% endif %} + {% endif %} + {% if properties['gkeApiVersion'] == 'v1beta1' %} + podSecurityPolicyConfig: + enabled: {{ properties['securityConfig']['podSecurityPolicy'] }} + {% endif %} + {% if properties['securityConfig']['privatecluster'] %} + ipAllocationPolicy: + createSubnetwork: true + useIpAliases: true + masterIpv4CidrBlock: {{ properties['securityConfig']['masterIpv4CidrBlock'] }} + privateCluster: true + masterAuthorizedNetworksConfig: {{ properties['securityConfig']['masterAuthorizedNetworksConfig'] }} + {% endif %} nodeConfig: machineType: n1-standard-1 - oauthScopes: - - https://www.googleapis.com/auth/compute - - https://www.googleapis.com/auth/devstorage.read_only - - https://www.googleapis.com/auth/logging.write - - https://www.googleapis.com/auth/monitoring + serviceAccount: {{ properties['vmServiceAccountName'] }}@{{ env['project'] }}.iam.gserviceaccount.com + {% if properties['securityConfig']['secureNodeMetadata'] %} + workloadMetadataConfig: + nodeMetadata: SECURE + {% endif %} + oauthScopes: {{ VM_OAUTH_SCOPES }} + metadata: + dependsOn: + - kubeflow-cluster-vm-service-account # We manage the node pools as separate resources. # We do this so that if we want to make changes we can delete the existing resource and then recreate it. # Updating doesn't work so well because we are limited in what changes GKE's update method supports. 
 - name: {{ CPU_POOL }}
+  {% if properties['gkeApiVersion'] == 'v1beta1' %}
+  type: gcp-types/container-v1beta1:projects.locations.clusters.nodePools
+  {% else %}
   type: container.v1.nodePool
+  {% endif %}
   properties:
+    parent: projects/{{ env['project'] }}/locations/{{ properties['zone'] }}/clusters/{{ CLUSTER_NAME }}
     project: {{ properties['project'] }}
     zone: {{ properties['zone'] }}
     clusterId: {{ CLUSTER_NAME }}
     nodePool:
-      name: cpu-pool 
+      name: cpu-pool
       initialNodeCount: {{ properties['cpu-pool-initialNodeCount'] }}
-      config: 
+      config:
+        {% if properties['securityConfig']['secureNodeMetadata'] %}
+        workloadMetadataConfig:
+          nodeMetadata: SECURE
+        {% endif %}
         machineType: n1-standard-8
-        oauthScopes:
-        - https://www.googleapis.com/auth/compute
-        - https://www.googleapis.com/auth/devstorage.read_only
-        - https://www.googleapis.com/auth/logging.write
-        - https://www.googleapis.com/auth/monitoring
-
+        serviceAccount: {{ properties['vmServiceAccountName'] }}@{{ env['project'] }}.iam.gserviceaccount.com
+        oauthScopes: {{ VM_OAUTH_SCOPES }}
   metadata:
     dependsOn:
     - {{ CLUSTER_NAME }}
 - name: {{ GPU_POOL }}
+  {% if properties['gkeApiVersion'] == 'v1beta1' %}
+  type: gcp-types/container-v1beta1:projects.locations.clusters.nodePools
+  {% else %}
   type: container.v1.nodePool
+  {% endif %}
   properties:
-    project: {{ properties['project'] }}
+    parent: projects/{{ env['project'] }}/locations/{{ properties['zone'] }}/clusters/{{ CLUSTER_NAME }}
+    project: {{ properties['project'] }}
     zone: {{ properties['zone'] }}
     clusterId: {{ CLUSTER_NAME }}
     nodePool:
-      name: gpu-pool 
+      name: gpu-pool
       initialNodeCount: {{ properties['gpu-pool-initialNodeCount'] }}
-      config: 
+      config:
+        {% if properties['securityConfig']['secureNodeMetadata'] %}
+        workloadMetadataConfig:
+          nodeMetadata: SECURE
+        {% endif %}
         machineType: n1-standard-8
-        oauthScopes:
-        # Attaching cloud-platform scope to nodes is not good practice
-        # But it simplifies demos. 
- - https://www.googleapis.com/auth/cloud-platform - - https://www.googleapis.com/auth/compute - - https://www.googleapis.com/auth/devstorage.read_only - - https://www.googleapis.com/auth/logging.write - - https://www.googleapis.com/auth/monitoring + serviceAccount: {{ properties['vmServiceAccountName'] }}@{{ env['project'] }}.iam.gserviceaccount.com + oauthScopes: {{ VM_OAUTH_SCOPES }} accelerators: - acceleratorCount: 1 acceleratorType: nvidia-tesla-k80 @@ -143,13 +179,13 @@ resources: {# Project defaults to the project of the deployment. #} - name: {{ properties['ipName'] }} - type: compute.v1.globalAddress + type: compute.v1.globalAddress properties: description: "Static IP for Kubeflow ingress." {# -Define TypeProviders for different K8s endpoints. +Define TypeProviders for different K8s endpoints. https://cloud.google.com/deployment-manager/docs/configuration/type-providers/process-adding-api This allows K8s resources to be created using Deployment manager. We use this to create the minimal resources needed to startup and deploy Kubeflow via the bootstrapper; @@ -192,12 +228,12 @@ e.g. creating namespaces, service accounts, stateful set to run the bootstrapper If activating multiple APIs you might want to serialize them. We use an action and not the type deploymentmanager.v2.virtual.enableService - because we only want to create it; we don't want to delete it. + because we only want to create it; we don't want to delete it. Deleting the service corresponds to deactivating the API and that causes problems. 
#} - name: resource-manager-api action: 'gcp-types/servicemanagement-v1:servicemanagement.services.enable' - properties: + properties: consumerId: {{ 'project:' + env['project'] }} serviceName: cloudresourcemanager.googleapis.com @@ -207,30 +243,31 @@ TODO(jlewi): Do we need to serialize API activation #} - name: endpoints-api action: 'gcp-types/servicemanagement-v1:servicemanagement.services.enable' - properties: + properties: consumerId: {{ 'project:' + env['project'] }} serviceName: endpoints.googleapis.com - name: iam-api action: 'gcp-types/servicemanagement-v1:servicemanagement.services.enable' - properties: + properties: consumerId: {{ 'project:' + env['project'] }} serviceName: iam.googleapis.com {# Get the IAM policy first so that we do not remove any existing bindings. #} - name: get-iam-policy action: gcp-types/cloudresourcemanager-v1:cloudresourcemanager.projects.getIamPolicy - properties: + properties: resource: {{ env['project'] }} - + metadata: dependsOn: - resource-manager-api - runtimePolicy: - - UPDATE_ALWAYS + - iam-api + runtimePolicy: + - UPDATE_ALWAYS {# Set the IAM policy patching the existing policy with what ever is currently in the - config. + config. We need to make the cloudservices account a GKE cluster admin because deployment manager users the cloudservices account; so this will be the identity used with the K*s cluster. 
@@ -240,16 +277,16 @@ TODO(jlewi): Do we need to serialize API activation #} - name: patch-iam-policy action: gcp-types/cloudresourcemanager-v1:cloudresourcemanager.projects.setIamPolicy - properties: + properties: resource: {{ env['project'] }} policy: $(ref.get-iam-policy) gcpIamPolicyPatch: - add: + add: - role: roles/container.admin members: - {{ 'serviceAccount:' + env['project_number'] + '@cloudservices.gserviceaccount.com' }} - - role: roles/servicemanagement.admin + - role: roles/servicemanagement.admin members: - {{ 'serviceAccount:' + KF_ADMIN_NAME + '@' + env['project'] + '.iam.gserviceaccount.com' }} @@ -260,8 +297,8 @@ TODO(jlewi): Do we need to serialize API activation - get-iam-policy - iam-api - {{ KF_ADMIN_NAME }} - runtimePolicy: - - UPDATE_ALWAYS + runtimePolicy: + - UPDATE_ALWAYS {# A note about K8s resources. The type value should be defined using a reference to the corresponding type provider. @@ -278,7 +315,7 @@ the corresponding type provider. apiVersion: v1 kind: Namespace metadata: - name: kubeflow-admin + name: kubeflow-admin spec: {# The deployment manager uses the cloudservices account. We need to create @@ -333,11 +370,11 @@ the corresponding type provider. properties: apiVersion: v1 kind: PersistentVolumeClaim - {# Namespace is a property because its used by deployment manager in + {# Namespace is a property because its used by deployment manager in the URL #} namespace: kubeflow-admin metadata: - name: kubeflow-ksonnet-pvc + name: kubeflow-ksonnet-pvc labels: app: kubeflow-ksonnet spec: @@ -356,7 +393,7 @@ the corresponding type provider. type: {{ env['project'] }}/$(ref.{{ TYPE_NAME }}.name):{{ CM_COLLECTION }} properties: apiVersion: v1 - {# Namespace is a property because its used bye deployment manager in + {# Namespace is a property because its used bye deployment manager in the URL #} kind: ConfigMap namespace: kubeflow-admin @@ -376,7 +413,7 @@ the corresponding type provider. 
     type: {{ env['project'] }}/$(ref.{{ APPS_TYPE_NAME }}.name):{{ STATEFULSETS_COLLECTION }}
   properties:
     apiVersion: apps/v1
-    {# Namespace is a property because its used bye deployment manager in 
+    {# Namespace is a property because it's used by deployment manager in
     the URL #}
     kind: StatefulSet
     namespace: kubeflow-admin
@@ -398,7 +435,7 @@ the corresponding type provider.
       - name: kubeflow-bootstrapper
         image: {{ properties["bootstrapperImage"] }}
         workingDir: /opt/bootstrap
-        command: 
+        command:
         - /opt/kubeflow/bootstrapper
         - --in-cluster=true
         - --apply=true
@@ -414,7 +451,7 @@ the corresponding type provider.
         persistentVolumeClaim:
           claimName: kubeflow-ksonnet-pvc
       - name: kubeflow-bootstrapper
-        configMap: 
+        configMap:
           name: kubeflow-bootstrapper
   metadata: