## 混部集群性能分析
    混部：在线电商+离线ODPS作业
    
    在线电商：容器化/JAVA应用等，数量好几万
    离线ODPS作业：每天几十万-几百万作业
    
    
    本脚本用于拉取并分析单个集群（AY49C）的混部性能数据

In [None]:
import pandas as pd
import numpy as np
import os
import sys
%matplotlib inline
import matplotlib.pyplot as plt

# Put the directory three levels up on the import path; presumably this is the
# repository root that holds the `pets` package imported below — TODO confirm.
sys.path.append("../../../")

# Raise pandas' display limits so long/wide frames and long cell values are
# shown in full, and wide frames are not wrapped across several line blocks.
pd.set_option("display.max_rows", 999)
pd.set_option("display.max_columns", 999)
pd.set_option("display.max_colwidth", 4000)
pd.set_option("display.expand_frame_repr", False)

# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before each cell executes so edits to local packages take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2



In [None]:
from pets.core.data_loader.res_eff.data_source import get_vm_pdf, get_nc_pdf
from pets.core.data_loader.walle.data_source import get_walle_odps_host_cpu_util, get_walle_online_offline_cpu_util, get_walle_odps_container_cpu_util
import pets.core.data_loader.sunfire.data_source as sunfire
import pets.core.data_loader.kmon.data_source as kmon
import pets.core.data_loader.fuxi.data_source as fuxi

In [None]:
# App-group name(s) selecting the target cluster; passed to get_nc_pdf() as
# `app_group_names`.
cluster_group_name = ["phyhost-ecs_na610_ay49c"]

In [None]:
# Date partition(s) as YYYYMMDD strings; passed as `ds` to the data loaders in
# this notebook. NOTE: start_time/end_time further down hard-code the same day,
# so change them together.
dates = ['20220222']

### Node Info

In [None]:
nc_pdf = get_nc_pdf(ds=dates, 
                    app_group_names=cluster_group_name)

In [None]:
nc_pdf.head(2)

In [None]:
nc_pdf.shape

### POD Info

In [None]:
nc_ips = nc_pdf.nc_ip.unique().tolist()
len(nc_ips)

In [None]:
pod_pdf = get_vm_pdf(ds=dates, 
                     nc_ips=nc_ips)

In [None]:
pod_pdf[pod_pdf["nc_ip"]=="33.9.176.199"]

In [None]:
pod_pdf.head(10)

In [None]:
pod_pdf.container_app_group_name.nunique()

### 电商容器性能数据

In [None]:
# Query window handed to the metric loaders below as start_time / end_time;
# it spans the single day listed in `dates`.
start_time, end_time = "2022-02-22 00:00:00", "2022-02-23 00:00:00"

In [None]:
buy2_ips = pod_pdf[pod_pdf["container_app_group_name"]=="ay49c_tubo_server"
                  ].container_ip.unique().tolist()
len(buy2_ips)

In [None]:
buy2_sunfire_metric_df = sunfire.get_sunfire_odps_df(ds=dates,
                                                       ips=buy2_ips,
                                                       start_time=start_time,
                                                       end_time=end_time)

In [None]:
# Pods on one specific node, then Sunfire metrics for their container IPs over
# the same start_time/end_time window.
target = pod_pdf.loc[pod_pdf["nc_ip"] == "33.9.160.123"]
container_ip = list(target["container_ip"].values)
target_machine = sunfire.get_sunfire_odps_df(
    ds=dates,
    ips=container_ip,
    start_time=start_time,
    end_time=end_time,
)

In [None]:
# Sort the rows by container IP, sample time and throughput type (in place),
# write them to CSV, and only then preview them. A notebook cell renders just
# its last expression, so the preview has to come last to be shown at all;
# as the first statement its result was silently discarded.
target_machine.sort_values(by=["container_ip", "sample_time__m", "throughput_type"], inplace=True)
target_machine.to_csv("service_performance.csv")
target_machine.head(20)

In [None]:
buy2_sunfire_metric_df.head(10)

In [None]:
buy2_sunfire_metric_df.to_csv("./buy2host.csv")

In [None]:
buy2_sunfire_metric_df.throughput_type.unique()

### 离线作业性能数据
    ay49c_tubo_server 为离线分组

In [None]:
# Distinct container IPs of the "ay49c_tubo_server" app group (described in the
# section heading as the offline group), followed by the matching pod rows for
# inspection.
odps_kata_ips = pod_pdf.loc[
    pod_pdf["container_app_group_name"] == "ay49c_tubo_server", "container_ip"
].unique().tolist()
pod_pdf.loc[pod_pdf["container_app_group_name"] == "ay49c_tubo_server"]

In [None]:
# fuxi jobs
eval_fuxi_job_pdf = fuxi.get_fuxi_job_df(ds=dates,
                                        cluster_name=["ay49c"],
                                        start_time=start_time,
                                        end_time=end_time).to_pandas()

In [None]:
eval_fuxi_job_pdf.head(10)

In [None]:
eval_fuxi_job_pdf.shape

In [None]:
fuxi_job_names = eval_fuxi_job_pdf.head(100).fuxi_job_name.unique().tolist()
len(fuxi_job_names)

In [None]:
# fuxi task/fuxi_instance_df
eval_fuxi_instance_pdf = fuxi.get_fuxi_instance_df(ds=dates,
                                                   fuxi_job_name=fuxi_job_names,
                                                   start_time=start_time,
                                                   end_time=end_time).to_pandas()

In [None]:
eval_fuxi_instance_pdf.drop(columns=["terminated_message","profile_info"], 
                            inplace=True)

In [None]:
eval_fuxi_instance_pdf.head(10)

In [None]:
eval_fuxi_instance_pdf.shape

In [None]:
# fuxi sensor

In [None]:
# Fetch Fuxi sensor samples for the sampled job names over the query window.
# NOTE(review): the job query earlier passes cluster_name=["ay49c"] (lower
# case) while this call passes ['AY49C'] — confirm each loader expects the
# casing it is given.
fuxi_sensor_df = fuxi.get_fuxi_sensor_df(ds=dates,
                                         start_time=start_time,
                                         end_time=end_time,
                                         fuxi_job_name=fuxi_job_names,
                                         cluster_name=['AY49C'])

# Collapse the samples to one row per (hostname, jobname, rolename, name):
# keep the latest sample time and sum the CPU and memory columns.
# NOTE(review): the memory columns are summed across all samples of a group —
# confirm a sum (rather than max/mean) is what the analysis expects.
fuxi_sensor_df = fuxi_sensor_df.groupby(["hostname", "jobname", "rolename", "name"]) \
                               .agg(
                                   fuxi_sensor_df["time"].max().rename("timestamp"),
                                   fuxi_sensor_df["system_cpu_cgroup_usage_nano"].sum().rename("cpu_time__ns"),
                                   fuxi_sensor_df["system_mem_cgroup_cache"].sum().rename("system_mem_cgroup_cache"),
                                   fuxi_sensor_df["system_mem_cgroup_rss"].sum().rename("system_mem_cgroup_rss"),
                                   fuxi_sensor_df["system_mem_cgroup_total"].sum().rename("system_mem_cgroup_total"),
                               )

# Nanoseconds -> milliseconds is a division by 1e6; dividing by 1e3 yields
# microseconds, which would be mislabelled by the "__ms" column name.
fuxi_sensor_df["cpu_time__ms"] = fuxi_sensor_df["cpu_time__ns"] / 1e6

# Project the output columns, renaming the four grouping keys to the
# nc_/fuxi_ naming used for the result.
fuxi_sensor_df = fuxi_sensor_df[
    fuxi_sensor_df.hostname.rename("nc_hostname"),
    fuxi_sensor_df.jobname.rename("fuxi_job_name"),
    fuxi_sensor_df.rolename.rename("fuxi_task_name"),
    fuxi_sensor_df.name.rename("fuxi_worker_name"),
    "timestamp",
    "cpu_time__ms",
    "system_mem_cgroup_cache",
    "system_mem_cgroup_rss",
    "system_mem_cgroup_total"
]

# Convert the aggregated result with to_pandas() for local inspection.
fuxi_sensor_pdf = fuxi_sensor_df.to_pandas()

In [None]:
# Display the aggregated per-worker sensor table produced by the previous cell.
fuxi_sensor_pdf

### Node的性能数据

In [None]:
nc_top_10_sns = nc_pdf.head(10).nc_sn.unique().tolist()
len(nc_top_10_sns)

In [None]:
host_cpu_util_pdf = get_walle_odps_host_cpu_util(ds=dates, 
                                                 start_time=start_time, 
                                                 end_time=end_time, 
                                                 nc_sn=nc_top_10_sns).to_pandas()