Skip to content

Commit

Permalink
Move EKS jobs cleanup to cleanup_k8s.py script
Browse files Browse the repository at this point in the history
  • Loading branch information
ilausuch committed Mar 20, 2023
1 parent 35456a6 commit 1d86445
Show file tree
Hide file tree
Showing 8 changed files with 225 additions and 249 deletions.
3 changes: 3 additions & 0 deletions cleanup_k8s.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import logging
from ocw.lib.gke import GKE
from ocw.lib.eks import EKS
from ocw.enums import ProviderChoice
from webui.PCWConfig import PCWConfig

Expand All @@ -12,6 +13,8 @@ def main():
try:
if ProviderChoice.GCE in providers:
GKE(namespace).cleanup_k8s_jobs()
if ProviderChoice.EC2 in providers:
EKS(namespace).cleanup_k8s_jobs()
except Exception:
logger.exception("[%s] Cleanup failed!", namespace)

Expand Down
85 changes: 0 additions & 85 deletions ocw/lib/EC2.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,12 @@
import os
import traceback
import time
from datetime import date, datetime, timedelta, timezone
from typing import Dict
import boto3
from botocore.exceptions import ClientError
from dateutil.parser import parse
import kubernetes
from webui.PCWConfig import PCWConfig, ConfigFile
from ocw.lib.emailnotify import send_mail
from ocw.lib.k8s import clean_jobs
from .provider import Provider
from ..models import Instance

Expand All @@ -34,9 +31,7 @@ def __new__(cls, vault_namespace: str):
if vault_namespace not in EC2.__instances:
EC2.__instances[vault_namespace] = self = object.__new__(cls)
self.__ec2_client = {}
self.__eks_client = {}
self.__ec2_resource = {}
self.__kubectl_client = {}
self.__secret = None
self.__key = None

Expand Down Expand Up @@ -69,50 +64,6 @@ def ec2_client(self, region: str) -> "boto3.session.Session.client":
region_name=region)
return self.__ec2_client[region]

def eks_client(self, region: str) -> "boto3.session.Session.client":
    """Return a boto3 EKS client for *region*, creating and caching it on first use."""
    client = self.__eks_client.get(region)
    if client is None:
        client = boto3.client('eks',
                              aws_access_key_id=self.__key,
                              aws_secret_access_key=self.__secret,
                              region_name=region)
        self.__eks_client[region] = client
    return client

def kubectl_client(self, region: str, cluster_name: str):
    """Return a cached kubernetes BatchV1Api client for the given EKS cluster.

    The kubeconfig is generated on demand via the AWS CLI. Returns None when
    the kubeconfig for the cluster cannot be obtained (the error is logged).
    """
    cache_key = f"{region}/{cluster_name}"
    if cache_key in self.__kubectl_client:
        return self.__kubectl_client[cache_key]

    kubeconfig = f"~/.kube/eks_config_{region}_{cluster_name}"
    res = self.cmd_exec(f"aws eks update-kubeconfig --region {region} --name {cluster_name} " +
                        f"--kubeconfig {kubeconfig}")
    if res.returncode != 0:
        self.log_err(f"Cannot get the kubeconfig for the cluster {cluster_name} on region {region}")
        return None

    kubernetes.config.load_kube_config(config_file=kubeconfig)
    self.__kubectl_client[cache_key] = kubernetes.client.BatchV1Api()
    return self.__kubectl_client[cache_key]

def all_clusters(self) -> dict:
    """Collect EKS cluster names per region, skipping clusters tagged to be ignored.

    Returns a dict mapping region name to a non-empty list of cluster names;
    regions without (kept) clusters are omitted.
    """
    clusters = {}
    for region in self.cluster_regions:
        self.log_dbg("Checking clusters in {}", region)
        response = self.eks_client(region).list_clusters()
        found = response.get('clusters') or []
        if not found:
            continue
        self.log_dbg("Found {} clusters in {}", len(found), region)
        kept = []
        for cluster in found:
            description = self.eks_client(region).describe_cluster(name=cluster)
            if 'cluster' not in description or 'tags' not in description['cluster']:
                self.log_err("Unexpected cluster description: {}", description)
            elif Instance.TAG_IGNORE not in description['cluster']['tags']:
                kept.append(cluster)
        if kept:
            clusters[region] = kept
    return clusters

@staticmethod
def is_outdated(creation_time: datetime, valid_period_days: float) -> bool:
return datetime.date(creation_time) < (date.today() - timedelta(days=valid_period_days))
Expand Down Expand Up @@ -376,39 +327,3 @@ def cleanup_images(self, valid_period_days: float) -> None:
else:
self.log_info("Delete image '{}' (ami:{})".format(img['Name'], img['ImageId']))
self.ec2_client(region).deregister_image(ImageId=img['ImageId'], DryRun=False)

def cleanup_k8s_jobs(self) -> None:
    """Clean up finished Kubernetes jobs in every EKS cluster of every configured region.

    Credentials are materialized first because the ``aws eks update-kubeconfig``
    call inside kubectl_client() reads ~/.aws/credentials.
    """
    self.log_dbg('Call cleanup_k8s_jobs')
    try:
        self.create_credentials_file()
    except Exception as exception:
        # Without valid credentials no cluster can be reached; log and give up.
        self.log_err(str(exception))
        return

    # Bug fix: the original built a `clusters` dict of empty lists that was
    # never read (dead code copied from all_clusters); it has been removed.
    for region in self.cluster_regions:
        response = self.eks_client(region).list_clusters()
        if 'clusters' in response and len(response['clusters']) > 0:
            self.log_dbg("Found {} clusters in {}", len(response['clusters']), region)
            for cluster_name in response['clusters']:
                client = self.kubectl_client(region, cluster_name)
                if client is None:
                    # kubectl_client already logged the failure; do not crash
                    # on this cluster so the remaining ones are still cleaned.
                    continue
                clean_jobs(self, client, cluster_name)

def create_credentials_file(self, user_home_dir: str = "/root") -> None:
    """Write AWS credentials for the default profile and verify them.

    Args:
        user_home_dir: home directory receiving the ``.aws`` folder.

    Raises:
        RuntimeError: when ``aws sts get-caller-identity`` rejects the credentials.
        (RuntimeError is a subclass of Exception, so existing callers that
        catch Exception keep working.)
    """
    aws_dir = f"{user_home_dir}/.aws"
    creds_file = f"{aws_dir}/credentials"

    if not os.path.exists(creds_file):
        os.makedirs(aws_dir, exist_ok=True)

        # Security fix: the file holds a secret key, so create it with 0600
        # instead of the default (world-readable) mode.
        fd = os.open(creds_file, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
        with os.fdopen(fd, "w", encoding="utf8") as file_handle:
            file_handle.write("[default]\n")
            file_handle.write(f"aws_access_key_id={self.__key}\n")
            file_handle.write(f"aws_secret_access_key={self.__secret}\n")

    res = self.cmd_exec("aws sts get-caller-identity")
    if res.returncode != 0:
        # Bug fix: the original message lacked the space between "by" and the
        # quoted command ("...verified by'aws sts...").
        raise RuntimeError("Invalid credentials, the credentials cannot be verified by "
                           f"'aws sts get-caller-identity' with the error: {res.stderr}")
16 changes: 0 additions & 16 deletions ocw/lib/cleanup.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,22 +43,6 @@ def list_clusters():
send_mail('{} on List clusters in [{}]'.format(type(ex).__name__, namespace), traceback.format_exc())


def cleanup_k8s():
    """Run the Kubernetes job cleanup for every namespace configured under 'k8sclusters'."""
    for namespace in PCWConfig.get_namespaces_for('k8sclusters'):
        try:
            providers = PCWConfig.get_providers_for('k8sclusters', namespace)
            logger.debug("[%s] Run k8s cleanup for %s", namespace, ','.join(providers))
            if ProviderChoice.EC2 in providers:
                EC2(namespace).cleanup_k8s_jobs()
        except Exception as exception:
            # One failing namespace must not abort the cleanup of the others.
            logger.exception("[%s] k8s cleanup failed!", namespace)
            subject = '{} on k8s cleanup in [{}]'.format(type(exception).__name__, namespace)
            send_mail(subject, traceback.format_exc())


def init_cron():
    """Register the periodic cleanup jobs on the shared scheduler."""
    scheduler = getScheduler()
    scheduler.add_job(cleanup_run, trigger='interval', minutes=60, id='cleanup_all', misfire_grace_time=1800)
    scheduler.add_job(list_clusters, trigger='interval', hours=18, id='list_clusters', misfire_grace_time=10000)
    scheduler.add_job(cleanup_k8s, trigger='interval', minutes=1440, id='cleanup_k8s_all',
                      misfire_grace_time=1800)
90 changes: 90 additions & 0 deletions ocw/lib/eks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import os
import json
import kubernetes
import boto3
from ocw.lib.provider import Provider
from ocw.lib.k8s import clean_jobs

TAG_IGNORE = 'pcw_ignore'


class EKS(Provider):
    """Amazon EKS access used by the k8s cleanup; one cached instance per vault namespace."""
    __instances = {}
    # Region used only to bootstrap the region discovery query.
    default_region: str = 'eu-central-1'

    def __new__(cls, vault_namespace):
        # Singleton per vault namespace so the boto3/kubectl clients and the
        # region list are created once and reused.
        if vault_namespace not in EKS.__instances:
            EKS.__instances[vault_namespace] = self = object.__new__(cls)
            self.__eks_client = {}
            self.__kubectl_client = {}
            self.__cluster_regions = None
            self.__aws_dir = None

        return EKS.__instances[vault_namespace]

    def __init__(self, namespace: str):
        super().__init__(namespace)
        self.create_credentials_file()

    def aws_dir(self) -> str:
        """Return (and cache) the expanded ``~/.aws`` directory path."""
        if self.__aws_dir is None:
            self.__aws_dir = os.path.expanduser("~/.aws")
        return self.__aws_dir

    def list_regions(self) -> list:
        """Return (and cache) all AWS region names, discovered via the AWS CLI."""
        if self.__cluster_regions is None:
            regions_query = self.cmd_exec(
                f"aws ec2 describe-regions --query 'Regions[].RegionName' --output json --region {EKS.default_region}")
            self.__cluster_regions = json.loads(regions_query.stdout)
        return self.__cluster_regions

    def create_credentials_file(self) -> None:
        """Write ``~/.aws/credentials`` for the default profile and verify the credentials.

        Raises:
            RuntimeError: when ``aws sts get-caller-identity`` cannot verify them.
        """
        creds_file = f"{self.aws_dir()}/credentials"

        if not os.path.exists(creds_file):
            if not os.path.exists(self.aws_dir()):
                os.mkdir(self.aws_dir())

            # Security fix: the file holds a secret key, so create it with
            # mode 0600 instead of the default (world-readable) mode.
            fd = os.open(creds_file, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
            with os.fdopen(fd, "w", encoding="utf8") as file_handle:
                file_handle.write("[default]\n")
                file_handle.write(f"aws_access_key_id={self.auth_json['access_key']}\n")
                file_handle.write(f"aws_secret_access_key={self.auth_json['secret_key']}\n")

        res = self.cmd_exec("aws sts get-caller-identity")
        if res.returncode != 0:
            # Bug fix: the original message lacked the space between "by" and
            # the quoted command ("...verified by'aws sts...").
            raise RuntimeError("Invalid credentials, the credentials cannot be verified by "
                               f"'aws sts get-caller-identity' with the error: {res.stderr}")

    def eks_client(self, region: str) -> "boto3.session.Session.client":
        """Return a boto3 EKS client for *region*, creating and caching it on first use."""
        if region not in self.__eks_client:
            self.__eks_client[region] = boto3.client('eks',
                                                     aws_access_key_id=self.auth_json['access_key'],
                                                     aws_secret_access_key=self.auth_json['secret_key'],
                                                     region_name=region)
        return self.__eks_client[region]

    def kubectl_client(self, region: str, cluster_name: str):
        """Return a cached kubernetes BatchV1Api client for the given EKS cluster.

        The kubeconfig is generated on demand via the AWS CLI.

        Raises:
            RuntimeError: when ``aws eks update-kubeconfig`` fails for the cluster.
        """
        region_cluster = f"{region}/{cluster_name}"

        if region_cluster not in self.__kubectl_client:
            kubeconfig = os.path.expanduser(f"~/.kube/eks_config_{region}_{cluster_name}")

            res = self.cmd_exec(f"aws eks update-kubeconfig --region {region} --name {cluster_name} " +
                                f"--kubeconfig {kubeconfig}")
            if res.returncode != 0:
                raise RuntimeError(f"Cannot get the kubeconfig for the cluster {cluster_name} on region {region}")

            kubernetes.config.load_kube_config(config_file=kubeconfig)
            self.__kubectl_client[region_cluster] = kubernetes.client.BatchV1Api()

        return self.__kubectl_client[region_cluster]

    def cleanup_k8s_jobs(self) -> None:
        """Delete finished Kubernetes jobs in every EKS cluster of every region."""
        # NOTE(review): unlike EC2.all_clusters, clusters tagged 'pcw_ignore'
        # (TAG_IGNORE) are NOT skipped here — confirm this is intended.
        for region in self.list_regions():
            self.log_dbg(f"Region {region}")
            clusters = self.eks_client(region).list_clusters()['clusters']
            for cluster_name in clusters:
                self.log_dbg(f"Clean up of cluster {cluster_name} in region {region}")
                client = self.kubectl_client(region, cluster_name)
                clean_jobs(self, client, cluster_name)
9 changes: 0 additions & 9 deletions ocw/management/commands/cleanupk8s.py

This file was deleted.

1 change: 1 addition & 0 deletions requirements_k8s.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
oauth2client
kubernetes
google-api-python-client==2.55.0
boto3
Loading

0 comments on commit 1d86445

Please sign in to comment.