In [1]:
import glob
import os
import pandas as pd
import numpy as np
from IPython.display import display
pd.set_option('display.max_columns', None)

In [2]:
#UTILITY FUNCTIONS
def swap_columns(df, col1, col2):
    col_list = list(df.columns)
    x, y = col_list.index(col1), col_list.index(col2)
    col_list[y], col_list[x] = col_list[x], col_list[y]
    df = df[col_list]
    return df



# BUILDING sensors_measures.csv FILE (all samples from all sensors on all nodes)

In [3]:

path = r'./sensors_data/' 
all_files = glob.glob(os.path.join(path , "*.csv"))
li = []
for filename in all_files:
    print("Working on: "+filename)
    df = pd.read_csv(filename, index_col=None,header=0,sep='\t', dtype={'note': "string"})
    df = df.drop('tempo', axis=1)
    li.append(df)

df = pd.concat(li, axis=0, ignore_index=True)
df = swap_columns(df, 'nodename', 'timestamp_measure')
df= df.rename(columns={"timestamp_measure": "measure_ts", "note": "notes", "dcenergy": "node_energy"})

print("Found "+str(len(df))+ " samples.")
df.drop_duplicates(inplace=True, subset=['jobid','measure_ts','nodename'],)
print("Found "+str(len(df))+ " unique samples.")
df.head()


Working on: ./sensors_data/20240106_measures.csv
Working on: ./sensors_data/20231217_measures.csv
Working on: ./sensors_data/20231210_measures.csv
Working on: ./sensors_data/20231205_measures.csv
Working on: ./sensors_data/20231202_measures.csv
Working on: ./sensors_data/20231226_measures.csv
Working on: ./sensors_data/20231221_measures.csv
Working on: ./sensors_data/20231203_measures.csv
Working on: ./sensors_data/20231204_measures.csv
Working on: ./sensors_data/20231211_measures.csv
Working on: ./sensors_data/20231216_measures.csv
Working on: ./sensors_data/20231220_measures.csv
Working on: ./sensors_data/20231227_measures.csv
Working on: ./sensors_data/20231126_measures.csv
Working on: ./sensors_data/20231121_measures.csv
Working on: ./sensors_data/20231209_measures.csv
Working on: ./sensors_data/20231208_measures.csv
Working on: ./sensors_data/20231127_measures.csv
Working on: ./sensors_data/20231129_measures-2.csv
Working on: ./sensors_data/20231229_measures.csv
Working on: ./sens

Unnamed: 0,jobid,measure_ts,nodename,sys_power,cpu_power,mem_power,sys_util,cpu_util,mem_util,io_util,amb_temp,cpu1_temp,cpu2_temp,exh_temp,sysairflow,fan1a,fan1b,fan2a,fan2b,fan3a,fan3b,fan4a,fan4b,fan5a,fan5b,node_energy,notes,delta_e
0,944853,1704495940,cresco6x187,130.0,100.0,10.0,85.0,97.0,0.0,0.0,21.0,55.0,53.0,31.0,14.0,6400.0,5504.0,8320.0,8064.0,8448.0,8064.0,8320.0,8064.0,6400.0,5504.0,394.8489,,4e-05
1,944853,1704495940,cresco6x328,80.0,50.0,5.0,54.0,97.0,0.0,0.0,18.0,65.0,55.0,41.0,20.0,6272.0,6272.0,8320.0,8704.0,8320.0,8704.0,8320.0,8704.0,6144.0,6272.0,264.67395,,5e-05
2,944853,1704495940,cresco6x132,120.0,90.0,9.0,81.0,98.0,40.0,0.0,21.0,75.0,54.0,44.0,23.0,6272.0,6272.0,8320.0,8704.0,8320.0,8704.0,8320.0,8704.0,6144.0,6272.0,400.0363,,7e-05
3,944853,1704495938,cresco6x187,130.0,100.0,9.0,85.0,97.0,0.0,0.0,21.0,61.0,53.0,31.0,23.0,6400.0,5504.0,8320.0,8064.0,8448.0,8064.0,8320.0,8064.0,6400.0,5504.0,394.84886,,8e-05
4,944853,1704495938,cresco6x328,80.0,50.0,5.0,98.0,97.0,0.0,0.0,18.0,65.0,55.0,41.0,20.0,6272.0,6272.0,8320.0,8704.0,8320.0,8704.0,8320.0,8704.0,6144.0,6272.0,264.6739,,5e-05


In [4]:
# FIXING some glitches in delta_e column

temp=df.loc[df.groupby(['jobid','nodename']).measure_ts.idxmin()]
#display(temp)
for index, row in temp.iterrows():
    if row['delta_e']!=0:
        #print('resetting index '+str(index))
        df.at[index,'delta_e']=0
#print("DONE")    

#REMOVING REDUNDANT INFO
df['nodename']=df['nodename'].replace("cresco6x","", regex=True)
df = df.astype(str)
df = df.replace(to_replace = r'\.0+$',value = '', regex = True)
df['delta_e']=df['delta_e'].astype(np.float64)
df.head()


Unnamed: 0,jobid,measure_ts,nodename,sys_power,cpu_power,mem_power,sys_util,cpu_util,mem_util,io_util,amb_temp,cpu1_temp,cpu2_temp,exh_temp,sysairflow,fan1a,fan1b,fan2a,fan2b,fan3a,fan3b,fan4a,fan4b,fan5a,fan5b,node_energy,notes,delta_e
0,944853,1704495940,187,130,100,10,85,97,0,0,21,55,53,31,14,6400,5504,8320,8064,8448,8064,8320,8064,6400,5504,394.8489,,4e-05
1,944853,1704495940,328,80,50,5,54,97,0,0,18,65,55,41,20,6272,6272,8320,8704,8320,8704,8320,8704,6144,6272,264.67395,,5e-05
2,944853,1704495940,132,120,90,9,81,98,40,0,21,75,54,44,23,6272,6272,8320,8704,8320,8704,8320,8704,6144,6272,400.0363,,7e-05
3,944853,1704495938,187,130,100,9,85,97,0,0,21,61,53,31,23,6400,5504,8320,8064,8448,8064,8320,8064,6400,5504,394.84886,,8e-05
4,944853,1704495938,328,80,50,5,98,97,0,0,18,65,55,41,20,6272,6272,8320,8704,8320,8704,8320,8704,6144,6272,264.6739,,5e-05


In [5]:
# WRITE ON SINGLE FILE
#df.to_csv('sensors_measures.csv',index=False, float_format='%f')
#print("sensors_measures.csv DONE!")



In [14]:
# WRITE ON FILE CHUNKS
num_chunks=6
print(str(len(df))+" measure points")
indexes= ((len(df))//num_chunks)+1
print(str(indexes)+" lines in each csv")
for grp, each_csv in df.groupby(df.index // indexes):
    each_csv.to_csv(f"sensors_measures_{grp}.csv",index=False, float_format='%f')
    print(f"sensors_measures_{grp}.csv DONE!")


4340910 measure points
723486 lines in each csv
sensors_measures_0.csv DONE!
sensors_measures_1.csv DONE!
sensors_measures_2.csv DONE!
sensors_measures_3.csv DONE!
sensors_measures_4.csv DONE!
sensors_measures_5.csv DONE!
