# Data Visualization

In [18]:
from query_tool import M100DataClient
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import calendar

plt.style.use("seaborn-v0_8")

GRAPHS = False

date_dataset = datetime(2022, 9, 1)

dataset_path = "./dataset/22-09/year_month=" + date_dataset.strftime("%y-%m")
client = M100DataClient(dataset_path)

# Available plugins
plugins = list(client.metrics_per_plugin.keys())
print(plugins)

['ipmi', 'ganglia', 'vertiv', 'schneider', 'weather', 'logics', 'nagios', 'slurm', 'job_table']


In [19]:
all_metrics = {plugin: client.metrics_per_plugin[plugin] for plugin in plugins}

In [20]:
(metrics_per_plugin, tags_per_metric, dtype_per_metric, _, _, _) = client.getMetadata()

In [21]:
dtype_per_metric

{'p1_core18_temp': DataType(int32),
 'dimm2_temp': DataType(int32),
 'p1_core5_temp': DataType(int32),
 'ps0_input_power': DataType(int32),
 'p0_core12_temp': DataType(int32),
 'p1_power': DataType(int32),
 'gpu3_core_temp': DataType(int32),
 'ps0_output_volta': DataType(float),
 'p0_core14_temp': DataType(int32),
 'dimm11_temp': DataType(int32),
 'ps0_output_curre': DataType(int32),
 'dimm5_temp': DataType(int32),
 'p0_core2_temp': DataType(int32),
 'gpu1_core_temp': DataType(int32),
 'dimm0_temp': DataType(int32),
 'p0_core9_temp': DataType(int32),
 'p1_core13_temp': DataType(int32),
 'p0_core18_temp': DataType(int32),
 'p0_core13_temp': DataType(int32),
 'dimm13_temp': DataType(int32),
 'p0_core23_temp': DataType(int32),
 'ambient': DataType(float),
 'gv100card0': DataType(int32),
 'ps0_input_voltag': DataType(int32),
 'p1_core20_temp': DataType(int32),
 'dimm14_temp': DataType(int32),
 'p1_core16_temp': DataType(int32),
 'p0_core5_temp': DataType(int32),
 'gv100card4': DataType(int

In [22]:
tags_per_metric

{'p1_core18_temp': ['plugin',
  'metric',
  'year_month',
  'timestamp',
  'value',
  'node'],
 'dimm2_temp': ['plugin',
  'metric',
  'year_month',
  'timestamp',
  'value',
  'node'],
 'p1_core5_temp': ['plugin',
  'metric',
  'year_month',
  'timestamp',
  'value',
  'node'],
 'ps0_input_power': ['plugin',
  'metric',
  'year_month',
  'timestamp',
  'value',
  'node'],
 'p0_core12_temp': ['plugin',
  'metric',
  'year_month',
  'timestamp',
  'value',
  'node'],
 'p1_power': ['plugin', 'metric', 'year_month', 'timestamp', 'value', 'node'],
 'gpu3_core_temp': ['plugin',
  'metric',
  'year_month',
  'timestamp',
  'value',
  'node'],
 'ps0_output_volta': ['plugin',
  'metric',
  'year_month',
  'timestamp',
  'value',
  'node'],
 'p0_core14_temp': ['plugin',
  'metric',
  'year_month',
  'timestamp',
  'value',
  'node'],
 'dimm11_temp': ['plugin',
  'metric',
  'year_month',
  'timestamp',
  'value',
  'node'],
 'ps0_output_curre': ['plugin',
  'metric',
  'year_month',
  'timestam

In [23]:
metrics_per_plugin

{'ipmi': ['p1_core18_temp',
  'dimm2_temp',
  'p1_core5_temp',
  'ps0_input_power',
  'p0_core12_temp',
  'p1_power',
  'gpu3_core_temp',
  'ps0_output_volta',
  'p0_core14_temp',
  'dimm11_temp',
  'ps0_output_curre',
  'dimm5_temp',
  'p0_core2_temp',
  'gpu1_core_temp',
  'dimm0_temp',
  'p0_core9_temp',
  'p1_core13_temp',
  'p0_core18_temp',
  'p0_core13_temp',
  'dimm13_temp',
  'p0_core23_temp',
  'ambient',
  'gv100card0',
  'ps0_input_voltag',
  'p1_core20_temp',
  'dimm14_temp',
  'p1_core16_temp',
  'p0_core5_temp',
  'gv100card4',
  'gpu3_mem_temp',
  'fan1_0',
  'p0_vdd_temp',
  'ps1_input_voltag',
  'p1_core19_temp',
  'p1_mem_power',
  'p1_core22_temp',
  'p0_mem_power',
  'p1_core7_temp',
  'dimm10_temp',
  'p0_core16_temp',
  'p1_core15_temp',
  'p0_io_power',
  'fan3_1',
  'p0_core22_temp',
  'dimm4_temp',
  'p0_core20_temp',
  'dimm7_temp',
  'dimm15_temp',
  'p0_core0_temp',
  'p0_core11_temp',
  'gpu4_mem_temp',
  'p1_core23_temp',
  'p1_core2_temp',
  'gpu1_mem_te

In [4]:
xticks_days = [
    date_dataset.replace(day=i).strftime("%Y-%m-%d")
    for i in range(1, calendar.monthrange(date_dataset.year, date_dataset.month)[1] + 1)
]

# Nodes with the most data available 

In [5]:
%%script false --no-raise-error

highest_nodeID = 1162
nodes_rows = pd.DataFrame({"rows": [0] * highest_nodeID}, dtype=int)
fill_na_rows = pd.DataFrame({"node": range(0, highest_nodeID + 1, 1)}, dtype=int)

for plugin, metrics in all_metrics.items():
    for met in metrics:
        # Create dataset with {nodeID, number of rows of the metric}
        results = (
            client.query(metrics=[met], columns=["node"])
            .groupby("node", observed=False)
            .size()
            .reset_index(name="rows")
            .astype({"node": int})
            .sort_values("node")
            .reset_index(drop=True)
        )
        results = pd.merge(fill_na_rows, results, on="node", how="left")
        nodes_rows["rows"] += results["rows"].fillna(0)

display(
    nodes_rows.sort_values(by="rows", ascending=False)
    .head(10)
    .style.set_caption("Nodes with most data available")
)

# # Results:
# 3	    28246529
# 1	    28246382
# 2	    28243448
# 0	    28219495
# 880	27026042
# 703	27023216
# 334	27020755
# 41	27011895
# 961	27008536
# 585	27003582

# Nodes with most Nagios anomalies

In [6]:
%%script false --no-raise-error
results = (
    client.query_plugins(plugins=["nagios"], columns=["node", "value"])
    .query("value in [1,2,3]")
    .groupby("node", observed=False)
    .size()
    .reset_index(name="rows")
    .astype({"node": int})
    .sort_values("node")
    .reset_index(drop=True)
)

results = nodes_rows.iloc[
    results.sort_values(by="rows", ascending=False).iloc[0:10, 0].values, :
]

display(
    results.sort_values(by="rows", ascending=False)
    .head(10)
    .style.set_caption("Nodes with most data available")
)

# Results
# 19	26281005
# 949	25556767
# 298	25544516
# 322	24589350
# 38	24106684
# 335	24094853
# 10	23496599
# 1007	52237
# 1008	10668
# 1002	5334

In [7]:
# good examples: 10, 38, 335
node_selected = "10"

# Nagios

In [8]:
# df_nagios = client.query_plugins(plugins="nagios", node=node_selected).sort_values(by="timestamp", ascending=True)
df_nagios = client.query_plugins(plugins="nagios").sort_values(by="timestamp", ascending=True).iloc[[1], :]

df_nagios.info()
display(df_nagios.head(5))

Future exception was never retrieved
future: <Future finished exception=BrokenPipeError(32, 'Broken pipe')>
Traceback (most recent call last):
  File "/usr/lib/python3.10/asyncio/unix_events.py", line 676, in write
    n = os.write(self._fileno, data)
BrokenPipeError: [Errno 32] Broken pipe


<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, 68616 to 68616
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype              
---  ------         --------------  -----              
 0   state_type     1 non-null      string             
 1   year_month     0 non-null      category           
 2   timestamp      1 non-null      datetime64[ms, UTC]
 3   plugin         1 non-null      category           
 4   value          1 non-null      Int32              
 5   host_group     1 non-null      category           
 6   metric         1 non-null      category           
 7   node           1 non-null      category           
 8   description    1 non-null      category           
 9   nagiosdrained  1 non-null      string             
dtypes: Int32(1), category(6), datetime64[ms, UTC](1), string(2)
memory usage: 65.7 KB


Unnamed: 0,state_type,year_month,timestamp,plugin,value,host_group,metric,node,description,nagiosdrained
68616,1,,2022-09-01 00:00:00+00:00,nagios_pub,0,compute,state,823,alive::ping,0


In [9]:
if GRAPHS:
    tags = df_nagios["description"].sort_values().unique()

    for tag in tags:
        _, axes = plt.subplots(figsize=(16, 3))
        df_nagios_plot = df_nagios.loc[df_nagios["description"] == tag][["timestamp", "value", "nagiosdrained"]]
        axes.plot(df_nagios_plot["timestamp"], df_nagios_plot["value"], label="state")
        axes.plot(df_nagios_plot["timestamp"], df_nagios_plot["nagiosdrained"].astype(int), label="nagiosdrained")
        axes.set_title("{}\n {}\n (node {})".format("nagios", tag, node_selected))
        axes.legend()
        axes.set_xticks(xticks_days), axes.set_yticks([0, 1, 2, 3])
        axes.tick_params(axis="x", labelrotation=45)
        plt.tight_layout()
        plt.show()

In [10]:
%reset_selective -f df_nagios
%reset_selective -f df_nagios_plot
%reset_selective -f tag
%reset_selective -f tags
%reset_selective -f ax
%reset_selective -f axes

# Ganglia Metric

In [11]:
# df_ganglia = client.query_plugins(plugins="ganglia", node=node_selected).sort_values(by="timestamp", ascending=True)
df_ganglia = client.query_plugins(plugins="ganglia").sort_values(by="timestamp", ascending=True).iloc[[1], :]

df_ganglia.info()
display(df_ganglia.head(5))

: 

In [52]:
if GRAPHS:
    df_ganglia = df_ganglia[["timestamp", "value", "metric"]]

    # Divide into different categories
    ganglia_metrics_available = df_ganglia["metric"].unique()
    ganglia_metrics_dict = {
        "gpu0": [s for s in ganglia_metrics_available if s.startswith("Gpu0")],
        "gpu1": [s for s in ganglia_metrics_available if s.startswith("Gpu1")],
        "gpu2": [s for s in ganglia_metrics_available if s.startswith("Gpu2")],
        "gpu3": [s for s in ganglia_metrics_available if s.startswith("Gpu3")],
        "system": [s for s in ganglia_metrics_available if s in ["boottime", "machine_type", "os_name", "os_release"]],
        "network": [s for s in ganglia_metrics_available if s in ["bytes_in", "bytes_out", "pkts_in", "pkts_out"]],
        "cpu": [s for s in ganglia_metrics_available if s.startswith("cpu")],
        "disk": [s for s in ganglia_metrics_available if s.startswith("disk") or s in ["part_max_used"]],
        "core": [s for s in ganglia_metrics_available if s in ["gexec"]],
        "load": [s for s in ganglia_metrics_available if s.startswith("load")],
        "memory": [s for s in ganglia_metrics_available if s.startswith("mem") or s.startswith("swap")],
        "process": [s for s in ganglia_metrics_available if s.startswith("proc")],
    }

In [53]:
from sklearn.preprocessing import MinMaxScaler

if GRAPHS:
    for metric_name, val in ganglia_metrics_dict.items():
        _, axes = plt.subplots(figsize=(16, 5))
        empty = True
        for met in val:
            df_metric = df_ganglia.loc[df_ganglia["metric"] == met][["timestamp", "value"]]

            # Remove metrics with unmuted values
            if (df_metric["value"] == df_metric["value"].tolist()[0]).all():
                continue

            # Remove metrics non numerical
            if ~df_metric["value"].apply(lambda x: pd.to_numeric(x, errors="coerce")).notnull().all():
                continue

            empty = False

            # Scaling for plotting
            df_metric["value"] = MinMaxScaler().fit_transform(df_metric[["value"]].astype(float))
            axes.scatter(df_metric["timestamp"], df_metric["value"], label=met)
            axes.set_title("{} - (node {})".format(metric_name, node_selected))
            axes.set_xticks(xticks_days)
            axes.legend(loc="upper left")
            axes.tick_params(axis="x", labelrotation=45)

        if empty:
            axes.remove()
        plt.tight_layout()
        plt.show()

In [54]:
%reset_selective -f ganglia_metrics_dict
%reset_selective -f ganglia_metrics_available
%reset_selective -f df_ganglia
%reset_selective -f metric_name
%reset_selective -f val
%reset_selective -f met
%reset_selective -f df_metric
%reset_selective -f ax
%reset_selective -f axes

# IPMI

In [55]:
df_ipmi = client.query_plugins(plugins="ipmi", node=node_selected).sort_values(by="timestamp", ascending=True)

df_ipmi.info()
display(df_ipmi.head(5))

<class 'pandas.core.frame.DataFrame'>
Index: 10547519 entries, 3146510 to 2512647
Data columns (total 6 columns):
 #   Column      Dtype              
---  ------      -----              
 0   value       Float64            
 1   plugin      category           
 2   timestamp   datetime64[ms, UTC]
 3   node        category           
 4   metric      category           
 5   year_month  category           
dtypes: Float64(1), category(4), datetime64[ms, UTC](1)
memory usage: 311.9 MB


Unnamed: 0,value,plugin,timestamp,node,metric,year_month
3146510,33.0,ipmi_pub,2022-09-01 00:00:00+00:00,10,gpu0_mem_temp,
7225737,42.0,ipmi_pub,2022-09-01 00:00:00+00:00,10,p1_core15_temp,
6588083,18.0,ipmi_pub,2022-09-01 00:00:00+00:00,10,p0_mem_power,
154409,31.0,ipmi_pub,2022-09-01 00:00:00+00:00,10,dimm10_temp,
6870054,42.0,ipmi_pub,2022-09-01 00:00:00+00:00,10,p1_core10_temp,


In [56]:
if GRAPHS:
    df_ipmi = df_ipmi[["value", "metric", "timestamp"]]

    tags = df_ipmi["metric"].sort_values().unique()

    for tag in tags:
        _, axes = plt.subplots(figsize=(16, 3))
        df_ipmi_plot = df_ipmi.loc[df_ipmi["metric"] == tag][["timestamp", "value"]]
        axes.plot(df_ipmi_plot["timestamp"], df_ipmi_plot["value"])
        axes.set_title("{} - {} - (node {})".format("ipmi", tag, node_selected))
        axes.set_xticks(xticks_days)
        axes.tick_params(axis="x", labelrotation=45)
        plt.tight_layout()
        plt.show()

In [57]:
%reset_selective -f df_ipmi
%reset_selective -f df_ipmi_plot
%reset_selective -f tags
%reset_selective -f tag
%reset_selective -f ax
%reset_selective -f axes

# Job table

In [58]:
df_jobtable = (
    client.query_plugins(plugins="job_table")
    .loc[lambda df: df["nodes"].str.contains(node_selected)]
    .sort_values("submit_time")
)

df_jobtable.info()
display(df_jobtable.head(5))

<class 'pandas.core.frame.DataFrame'>
Index: 4022 entries, 62218 to 81337
Data columns (total 92 columns):
 #   Column                 Non-Null Count  Dtype              
---  ------                 --------------  -----              
 0   start_time             4022 non-null   datetime64[ms, UTC]
 1   tres_per_job           0 non-null      string             
 2   cpus_allocated         0 non-null      string             
 3   req_switch             0 non-null      UInt16             
 4   cpus_per_task          0 non-null      UInt32             
 5   metric                 4022 non-null   category           
 6   sockets_per_board      0 non-null      UInt16             
 7   show_flags             0 non-null      UInt16             
 8   accrue_time            0 non-null      datetime64[ms, UTC]
 9   resv_name              40 non-null     string             
 10  batch_host             0 non-null      string             
 11  shared                 0 non-null      string           

Unnamed: 0,start_time,tres_per_job,cpus_allocated,req_switch,cpus_per_task,metric,sockets_per_board,show_flags,accrue_time,resv_name,...,array_max_tasks,wait4switch,boards_per_node,ntasks_per_board,resize_time,partition,array_job_id,sched_nodes,pn_min_cpus,threads_per_core
62218,2022-09-03 02:29:25+00:00,,,,,job_info_marconi100,,,NaT,,...,,,,,NaT,1,2048434,,,
137213,2022-09-10 12:46:35+00:00,,,,,job_info_marconi100,,,NaT,,...,,,,,NaT,1,1259399,,,
169598,2022-09-16 04:42:26+00:00,,,,,job_info_marconi100,,,NaT,,...,,,,,NaT,1,5054747,,,
181243,2022-09-13 04:37:58+00:00,,,,,job_info_marconi100,,,NaT,,...,,,,,NaT,1,6132524,,,
203969,2022-09-02 03:47:04+00:00,,,,,job_info_marconi100,,,NaT,,...,,,,,NaT,1,1978696,,,


In [59]:
if GRAPHS:
    df_jobtable = df_jobtable[["job_state", "start_time", "end_time"]]

    job_states = df_jobtable["job_state"].unique()

    _, ax = plt.subplots(len(job_states), figsize=(16, 10))
    for tag, axes in zip(job_states, ax.ravel()):
        df_jobtable_plot = df_jobtable.loc[df_jobtable["job_state"] == tag]
        axes.scatter(df_jobtable_plot["start_time"], df_jobtable_plot["job_state"], label="start")
        axes.scatter(df_jobtable_plot["end_time"], df_jobtable_plot["job_state"], label="end")
        axes.legend()
        axes.set_title("{}\n {}\n (node {})".format("job_state", tag, node_selected))
        axes.set_xticks(xticks_days)
        axes.tick_params(axis="x", labelrotation=45)
    plt.tight_layout()
    plt.show()

In [60]:
%reset_selective -f df_jobtable
%reset_selective -f df_jobtable_plot
%reset_selective -f job_states
%reset_selective -f tag
%reset_selective -f ax
%reset_selective -f axes

# Logics

In [7]:
df_logics = client.query_plugins(plugins="logics")

df_logics.info()
display(df_logics.head(5))

# Data of the position of the node w.r.t. the panel code is not available

  df = pd.concat([df, table.to_pandas(**to_pandas_kwargs)], ignore_index=True, axis=0)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38835580 entries, 0 to 38835579
Data columns (total 7 columns):
 #   Column      Dtype              
---  ------      -----              
 0   metric      category           
 1   value       Float64            
 2   plugin      category           
 3   timestamp   datetime64[ms, UTC]
 4   year_month  category           
 5   panel       object             
 6   device      object             
dtypes: Float64(1), category(3), datetime64[ms, UTC](1), object(2)
memory usage: 1.3+ GB


Unnamed: 0,metric,value,plugin,timestamp,year_month,panel,device
0,Bad_values,1.0,logics_pub,2022-09-04 12:25:50+00:00,,marconi-a3,qe-20a-n
1,Bad_values,0.0,logics_pub,2022-09-04 12:26:50+00:00,,marconi-a3,qe-20a-n
2,Bad_values,1.0,logics_pub,2022-09-04 12:33:53+00:00,,marconi-a3,qe-20a-n
3,Bad_values,0.0,logics_pub,2022-09-04 12:42:56+00:00,,marconi-a3,qe-20a-n
4,Bad_values,1.0,logics_pub,2022-09-04 12:43:56+00:00,,marconi-a3,qe-20a-n


# Schneider

In [62]:
df_schneider = client.query_plugins(plugins="schneider")

df_schneider.info()
display(df_schneider.head(5))

# Data of the position of the node w.r.t. the panel code is not available

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33554432 entries, 0 to 33554431
Data columns (total 6 columns):
 #   Column      Dtype              
---  ------      -----              
 0   value       Int32              
 1   plugin      category           
 2   timestamp   datetime64[ms, UTC]
 3   panel       category           
 4   metric      category           
 5   year_month  category           
dtypes: Int32(1), category(4), datetime64[ms, UTC](1)
memory usage: 576.0 MB


Unnamed: 0,value,plugin,timestamp,panel,metric,year_month
0,0,schneider_pub,2022-09-23 22:00:00+00:00,Q101,Alm_TY141,
1,0,schneider_pub,2022-09-23 23:00:00+00:00,Q101,Alm_TY141,
2,0,schneider_pub,2022-09-02 22:00:00+00:00,Q101,Alm_TY141,
3,0,schneider_pub,2022-09-02 23:00:00+00:00,Q101,Alm_TY141,
4,0,schneider_pub,2022-09-03 22:00:00+00:00,Q101,Alm_TY141,


# Slurm 

In [63]:
df_slurm = client.query_plugins(plugins="slurm")

df_slurm.info()
display(df_slurm.head(5))

# Data of the position of the node w.r.t. the partition code is not available

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59574539 entries, 0 to 59574538
Data columns (total 9 columns):
 #   Column      Dtype              
---  ------      -----              
 0   value       Float64            
 1   year_month  category           
 2   timestamp   datetime64[ms, UTC]
 3   qos         category           
 4   user_id     UInt32             
 5   job_state   category           
 6   plugin      category           
 7   partition   category           
 8   metric      category           
dtypes: Float64(1), UInt32(1), category(6), datetime64[ms, UTC](1)
memory usage: 1.6 GB


Unnamed: 0,value,year_month,timestamp,qos,user_id,job_state,plugin,partition,metric
0,1348166.0,,2022-09-14 22:00:00+00:00,,393,FAILED,slurm_pub,1,job_id
1,1653075.0,,2022-09-07 22:00:00+00:00,,366,COMPLETED,slurm_pub,1,job_id
2,1147583.0,,2022-09-13 23:00:00+00:00,,478,COMPLETED,slurm_pub,1,job_id
3,3995923.0,,2022-09-25 23:00:00+00:00,,1474,OUT_OF_MEMORY,slurm_pub,1,job_id
4,4080670.0,,2022-09-04 22:00:00+00:00,,1737,COMPLETED,slurm_pub,1,job_id
