## Process manifests and obtain information about individual nodes used in experiments

This notebook processes a csv file with manifests: `data/manifests.csv`. It also uses `data/nodes.csv` to map node names to hardware types. The first file contains sensitive user information and is excluded from this public repo. Without it, this notebook documents the process of extracting node info from the manifests.

In [1]:
import os
import pandas as pd
import mysql.connector as sql
import configparser
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as mticker
import re
import hashlib
import time
import datetime as dt
import sys
import multiprocessing as mp

sys.path.append("geni-lib/")
from geni.rspec.pgmanifest import *

%reload_ext autoreload
%autoreload 2

source_dir = "data/"

if not os.path.isdir(source_dir):
    os.mkdir(dest_dir)
    
manifests = pd.read_csv(source_dir + "manifests.csv", low_memory=False)    
print "Loaded table/dataframe with manifests. Length: %d" % (len(manifests))
display(manifests.head())

nodes = pd.read_csv(source_dir + "nodes.csv")    
print "Loaded table/dataframe with node info. Length: %d" % (len(nodes))
display(nodes.head())

Loaded table/dataframe with manifests. Length: 116924


Unnamed: 0,uuid,manifest
0,15ebd4de-0229-11e5-bd13-38eaa71273fa,"<rspec xmlns=""http://www.geni.net/resources/rs..."
1,15ebd4de-0229-11e5-bd13-38eaa71273fa,"<rspec xmlns=""http://www.geni.net/resources/rs..."
2,afed9c2d-056f-11e5-bd13-38eaa71273fa,"<rspec xmlns:xsi=""http://www.w3.org/2001/XMLSc..."
3,71fd74f6-058c-11e5-bd13-38eaa71273fa,"<rspec xmlns:xsi=""http://www.w3.org/2001/XMLSc..."
4,de8f893a-050e-11e5-bd13-38eaa71273fa,"<rspec xmlns=""http://www.geni.net/resources/rs..."


Loaded table/dataframe with node info. Length: 3169


Unnamed: 0,node_id,type,phys_nodeid,role,inception,temp_boot_osid_vers,next_boot_osid,next_boot_osid_vers,startstatus,ready,...,next_op_mode,osid,ntpdrift,jailflag,rtabid,boot_errno,reserved_pid,uuid,reserved_memory,nonfsmounts
0,c220g1-031103,c220g1,c220g1-031103,testnode,2016-06-30 15:25:01,0,,0,none,0,...,,9997.0,25.306,0,0,0,,b8f86cf0-3f00-11e6-b435-eb3e622294a9,0,0
1,c220g2-011102,c220g2,c220g2-011102,testnode,2016-03-12 22:30:27,0,,0,none,0,...,,9997.0,12.846,0,0,0,,4f998dce-e8d4-11e5-a488-a3bb994732a6,0,0
2,c220g1-031120,c220g1,c220g1-031120,testnode,2016-06-29 22:12:03,0,,0,none,0,...,,10030.0,14.959,0,0,0,,6b165724-3e70-11e6-b435-eb3e622294a9,0,0
3,c220g1-031102,c220g1,c220g1-031102,testnode,2016-06-29 22:12:03,0,,0,none,0,...,,9997.0,7.545,0,0,0,,6ad9998c-3e70-11e6-b435-eb3e622294a9,0,0
4,c220g1-031125,c220g1,c220g1-031125,testnode,2016-06-29 22:12:03,0,,0,none,0,...,,9997.0,28.103,0,0,0,,6af92484-3e70-11e6-b435-eb3e622294a9,0,0


In [2]:
# This code was used to process individual manifests; replaced with parallel code that processes batches of manifests
# -----------

# def process_manifest(manifest_xml, row, uuid):
#     """ Process a single given manifest corresponding to the given uuid """

#     nodes_df = []
    
#     try:
#         mt = Manifest(xml=manifest_xml)

#         # Process all nodes (vnode tags in manifests)
#         for n in mt.nodes:
#             #print "Found node"
#             #hw = n.vnode_hardware_type
#             hw = n.hardware_type
#             if hw:
#                 node_dict = {}
#                 node_dict["hardware_type"] = hw
#                 node_dict["uuid"] = uuid         
#                 node_dict["node_or_host"] = "node"
#                 nodes_df.append(node_dict)

#         # Process all hosts (vhost tags in manifests)
#         for h in mt.hosts:
#             #print "Found host"
#             hw = n.hardware_type
#             if hw:
#                 node_dict = {}
#                 node_dict["hardware_type"] = hw
#                 node_dict["uuid"] = uuid         
#                 node_dict["node_or_host"] = "host"
#                 nodes_df.append(node_dict)
        
#     except Exception as e: 
#         print "Can't process manifest (%s, %s). Error:\n" % (row, uuid), e

#     return pd.DataFrame(nodes_df)

# row = 1357

# print manifests["manifest"][row]
# print "\n****\nProcessing manifest:\n****"

# print process_manifest(manifests["manifest"][row], row, manifests["uuid"][row])

In [3]:
# Sequentially process a number of experiments; replaced with parallel version below
# --------

# all_nodes = pd.DataFrame()

# #for row in range(len(manifests)):
# for row in range(200):
    
#     if len(str(manifests["manifest"][row])) > 50:
#         res = process_manifest(manifests["manifest"][row], row, manifests["uuid"][row])
#         all_nodes = pd.concat([all_nodes, res])

# all_nodes.reset_index(drop=True, inplace=True)
# display(all_nodes)

In [4]:
# Process all manifests in parallel (using multiprocessing module)

def process_manifest_batch(manifest_row_ids):
    """ Process a batch of manifests corresponding to specified row IDs in the manifests dataframe """

    nodes_df = []
    
    for id in manifest_row_ids:
        
        row = manifests.iloc[id]
        
        # It is not a manifest if if is no loger than 50 chars
        if len(str(row.manifest)) > 50:
            try:
                mt = Manifest(xml=row.manifest)

                # Process all nodes (vnode tags in manifests)
                for n in mt.nodes:
                    #print "Found node"
                    #hw = n.vnode_hardware_type
                    hw = n.hardware_type
                    if hw:
                        node_dict = {}
                        node_dict["hardware_type"] = hw
                        node_dict["uuid"] = row.uuid         
                        node_dict["node_or_host"] = "node"
                        nodes_df.append(node_dict)

                # Process all hosts (vhost tags in manifests)
                for h in mt.hosts:
                    #print "Found host"
                    hw = n.hardware_type
                    if hw:
                        node_dict = {}
                        node_dict["hardware_type"] = hw
                        node_dict["uuid"] = row.uuid         
                        node_dict["node_or_host"] = "host"
                        nodes_df.append(node_dict)

            except Exception as e: 
                print "Can't process manifest (%s, %s). Error:\n" % (id, row.uuid), e

    return pd.DataFrame(nodes_df)

# Small test
process_manifest_batch(range(50))
print "Small test ran fine!"

def split_seq(seq, size):
    newseq = []
    splitsize = 1.0/size*len(seq)
    for i in range(size):
        newseq.append(seq[int(round(i*splitsize)):int(round((i+1)*splitsize))])
    return newseq

# Allows for the safe importing of the main module (needed by multiprocessing)
if __name__ == '__main__':
    
    startTime = time.time()
    
    # count()-2 -- avoid starving other processes and do not use all cores 
    use_cpus = mp.cpu_count() - 2
    
    chunks = split_seq(manifests.index.tolist(), use_cpus)
    
    p = mp.Pool(use_cpus)
    
    print "\n***\nStarting to process all manifests on %d processors" % (use_cpus)
   
    tasks_output = p.map(process_manifest_batch, chunks)
    print "All tasks completed."
    
    endTime = time.time()
    
    # Construct summary dataframe with all results
    manifest_nodes = pd.concat(tasks_output)  
    print "Number of rows in combined dataframe:", len(manifest_nodes)   
    
    p.terminate()
    
    print "Finished processing all reservations in (seconds): %.3f" % (endTime-startTime)

Small test ran fine!

***
Starting to process all manifests on 22 processors
All tasks completed.
Number of rows in combined dataframe: 597921
Finished processing all reservations in (seconds): 49.219


In [5]:
display(manifest_nodes.sample(10))

print manifest_nodes.node_or_host.value_counts()

Unnamed: 0,hardware_type,node_or_host,uuid
5598,"(name, c220g2-011103)",node,6d56b1d3-f826-11e5-b570-99cadac50270
7017,"(type, d430)",node,bba5d611-e8f4-11e8-b339-90e2ba22fee4
105777,"(name, hpvm084-102)",node,7e87fcfb-5464-11e8-b228-90e2ba22fee4
14582,"(name, pc-c220m4vm0418-42)",node,815613bd-8bfa-11e5-88c8-277b2fdb9c32
53586,"(name, hpvm133-179)",node,6411cfd7-5265-11e8-b226-90e2ba22fee4
18498,"(name, pcvm784-4)",node,23eff41a-ad67-11e8-b228-90e2ba22fee4
3130,"(name, pc322)",node,a4cfb34f-e7d8-11e6-ac8d-90e2ba22fee4
23860,"(type, m510)",node,c094d1d8-b608-11e8-b228-90e2ba22fee4
9180,"(type, r320)",node,c61bb26b-3a43-11e9-897b-e4434b2381fc
59780,"(name, pcvm840-37)",node,1cd327c8-fca9-11e7-b222-90e2ba22fee4


node    588945
host      8976
Name: node_or_host, dtype: int64


In [6]:
# Process info in the hardware_type column and map node names to hardware types

unidentified_nodes = []

node2type = {n:t for r, (n, t) in nodes[["node_id", "type"]].iterrows()}

def get_type(x):
    t, v = x
    if t == "name":
        if v in node2type.keys(): 
            return node2type[v]
        else:
            if 'vm' in v:
                return 'vm'
            else:
                unidentified_nodes.append(v)
                return None
    else:
        return v
    
manifest_nodes["type"] = manifest_nodes["hardware_type"].apply(get_type)

display(manifest_nodes.sample(10))
display(pd.DataFrame(manifest_nodes["type"].value_counts()).T)

Unnamed: 0,hardware_type,node_or_host,uuid,type
3622,"(name, ms0136)",node,7ea33041-743e-11e5-bf03-38eaa71273fa,m400
2022,"(type, xl170)",node,a78a491a-5846-11e8-b228-90e2ba22fee4,xl170
11297,"(name, c220g1-030608)",node,794ddca9-0638-11e7-ac8d-90e2ba22fee4,c220g1
9202,"(name, pcvm754-8)",node,ad6dd5fa-b9b9-11e6-ac8d-90e2ba22fee4,vm
8769,"(type, m400)",node,afe04754-bdfc-11e7-b179-90e2ba22fee4,m400
13461,"(name, pcvm702-21)",node,92460432-f256-11e7-b222-90e2ba22fee4,vm
6511,"(type, xl170)",node,b5a56d8c-38fa-11e9-897b-e4434b2381fc,xl170
8343,"(type, c220g5)",node,c5dd35d2-ea8e-11e8-b339-90e2ba22fee4,c220g5
1801,"(name, c220g2-010828vm-3)",node,afcc0d29-4f1e-11e9-897b-e4434b2381fc,pcvm
6740,"(name, c220g5-110432)",node,6638c207-8c48-11e8-b228-90e2ba22fee4,c220g5


Unnamed: 0,vm,m400,m510,r320,c8220,c220g2,d430,c6320,c220g1,xl170,...,c220g5,c240g1,nuc7100,nuc8559,d840,m510.1,faros-sfp,c8220-vm,C220M4-vm,pc2400w
type,237236,47530,45316,38619,38292,26815,24307,19916,19351,18870,...,5,4,3,3,2,2,1,1,1,1


In [7]:
# Inspect unidentified_nodes

print "Number of None in type:", manifest_nodes.type.isna().sum()
na_counts = manifest_nodes[manifest_nodes.type.isna()].hardware_type.value_counts()

print "\ntype==None: breakdown by type extracted information (all names):"
print pd.Series([t for t,v in na_counts.index]).value_counts()

print "\ntype==None: names"
na_names = pd.Series([v for t,v in na_counts.index])
print na_names[na_names.apply(lambda x: "vm" not in x)].tolist()

Number of None in type: 3930

type==None: breakdown by type extracted information (all names):
name    128
dtype: int64

type==None: names
['pc-c220m4-r01-04', 'pc-c220m4-r02-08', 'pc-c220m4-r01-18', 'pc-c220m4-r02-10', 'pc-c220m4-r04-03', 'pc-c220m4-r04-19', 'pc-c220m4-r02-12', 'pc-c220m4-r02-06', 'pc-c220m4-r04-01', 'pc-c220m4-r02-14', 'pc-c220m4-r02-07', 'pc-c220m4-r04-05', 'pc-c220m4-r01-17', 'pc-c220m4-r02-17', 'pc-c220m4-r04-13', 'pc-c220m4-r02-04', 'pc-c220m4-r02-03', 'pc-c220m4-r03-01', 'pc-c220m4-r02-16', 'pc-c220m4-r02-20', 'pc-c220m4-r03-08', 'pc-c220m4-r01-05', 'pc-c220m4-r03-09', 'pc-c220m4-r04-17', 'pc-c220m4-r04-02', 'pc-c220m4-r02-01', 'pc-c220m4-r01-08', 'pc-c220m4-r01-07', 'pc-c220m4-r02-18', 'pc-c220m4-r04-09', 'pc-c220m4-r02-05', 'pc-c220m4-r04-14', 'pc-c220m4-r01-20', 'pc-c220m4-r01-13', 'pc-c220m4-r02-15', 'pc-c220m4-r04-18', 'pc-c220m4-r04-11', 'pc-c220m4-r02-09', 'pc-c220m4-r01-01', 'pc-c220m4-r01-06', 'pc-c220m4-r03-15', 'pc-c220m4-r01-03', 'pc-c220m4-r03-02', 

In [8]:
# Save uuid and used hardware as a csv file for identified nodes

manifest_nodes = manifest_nodes[~manifest_nodes.type.isna()]
manifest_nodes[["uuid", "type"]].to_csv("data/used_hardware.csv", index=False)
print "Saved dataframe with %d rows" % len(manifest_nodes)

Saved dataframe with 593991 rows
