Summary:

This is the "debug_stats_report_template.ipynb"

In [None]:
import matplotlib as mpl
mpl.use('agg')
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
import json
import importlib
import sys
import inspect
from collections import defaultdict
from IPython.core.display import display, HTML
import os
import logging
import argparse
import shlex

logging.basicConfig(level=logging.ERROR)
log= logging.getLogger( __name__ )

In [3]:
# Set Pandas to display dataframe w/o truncation
pd.set_option('display.max_colwidth', -1)

In [4]:
def parse_cmd_args(nb_args):
    """
    Parse the notebook argument string into an argparse namespace.

    :param nb_args: shell-style argument string; it is tokenised with shlex.split,
                    so arguments may be spread over several lines
    :return: argparse.Namespace with ``plugin_analyzer_output`` (None when omitted),
             ``ex_lib_path`` ("" when omitted) and ``ex_libs`` (list of names, "" when omitted)
    """
    parser = argparse.ArgumentParser()
    # The option string must not carry trailing whitespace: with a trailing space the
    # flag is only recognised through argparse's prefix (abbreviation) matching.
    parser.add_argument("--plugin_analyzer_output", dest='plugin_analyzer_output', type=str)
    parser.add_argument("--ex_lib_path", dest='ex_lib_path', type=str, default="")
    parser.add_argument("--ex_libs", dest='ex_libs', nargs='+', type=str, default="")
    return parser.parse_args(shlex.split(nb_args))

In [7]:
# HideMe
# Input arguments
# The argument string is read from the NB_ARGS environment variable; it is "" when the variable is unset.
nb_args = os.environ.get('NB_ARGS', "")
# NOTE: logging is configured with level=logging.ERROR in the imports cell, so this
# info message is not emitted unless that level is lowered.
log.info("nb_args = %s", nb_args)

if not nb_args:
    nb_args = """
--plugin_analyzer_output /1 
--ex_lib_path /1/libs 
--ex_libs   insights_analysis_stats.egg
            insights_core-1.45.0-py2.7.egg
            insights_plugins-1.43.0-py2.7.egg
            Jinja2-2.9.6-py2.7.egg
            MarkupSafe-1.0-py2.7-linux-x86_64.egg
            PyYAML-3.12-py2.7.egg 
    """.strip()

In [8]:
# ReplaceString=@nb_args
# nb_args = "@nb_args"

In [9]:
# Common func

In [10]:
def read_pd_df_json(json_file):
    """
    Load a line-delimited JSON file (one record per line) into a pandas DataFrame.

    :param json_file: path of the JSON-lines file
    :return: DataFrame produced by pd.read_json
    :raises ValueError: when nothing exists at json_file
    """
    if os.path.exists(json_file):
        return pd.read_json(json_file, lines=True, orient='records')
    raise ValueError("there is no file at %s" % json_file)

In [11]:
def display_no_index(df):
    """Show ``df`` as an HTML table with the index column left out."""
    table_markup = df.to_html(index=False)
    display(HTML(table_markup))

In [12]:
def print_highlight(text, high_lights=None):
    """
    Display ``text`` as preformatted notebook output with selected substrings marked.

    The text is HTML-escaped before display, so characters such as ``<`` and ``&`` in
    file content are shown literally instead of being interpreted as markup. All
    highlights are applied in a single pass over the original text, so one highlight
    can never match inside the ``<mark>`` tag inserted for another.

    :param text: raw text to display
    :param high_lights: iterable of substrings to wrap in ``<mark>``; None, empty
                        iterables and empty strings highlight nothing
    :return: None; the HTML is sent to the notebook through display()
    """
    import re  # local import: the notebook's import cells do not provide it

    def _escape(raw):
        # '&' must be replaced first, otherwise the entities produced below get re-escaped.
        return raw.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")

    # Longest first, so that a highlight is preferred over any of its own prefixes.
    needles = sorted(set(h for h in (high_lights or []) if h), key=lambda h: (-len(h), h))

    if needles:
        pattern = re.compile("|".join(re.escape(needle) for needle in needles))
        pieces = []
        cursor = 0
        for match in pattern.finditer(text):
            pieces.append(_escape(text[cursor:match.start()]))
            pieces.append("<mark>%s</mark>" % _escape(match.group(0)))
            cursor = match.end()
        pieces.append(_escape(text[cursor:]))
        body = "".join(pieces)
    else:
        body = _escape(text)

    # The closing tag was previously written as '</div' (missing '>').
    display(HTML('<div class="output_subarea output_stream output_stdout output_text"><pre>%s</pre></div>' % body))

# I. Stat App Data

### 1. Configuration

In [13]:
args = parse_cmd_args(nb_args)

In [14]:
args

Namespace(ex_lib_path='/1/libs', ex_libs=['insights_analysis_stats.egg', 'insights_core-1.45.0-py2.7.egg', 'insights_plugins-1.43.0-py2.7.egg', 'Jinja2-2.9.6-py2.7.egg', 'MarkupSafe-1.0-py2.7-linux-x86_64.egg', 'PyYAML-3.12-py2.7.egg'], plugin_analyzer_output='/1')

In [15]:
# set up libs
def add_lib(lib_dir, lib_files=None):
    """
    Make extra library files importable locally and ship them to Spark.

    Every file name is joined onto ``lib_dir``; the resulting path is added to
    ``sys.path`` (once, so re-running the cell does not pile up duplicates) and
    passed to ``spark.sparkContext.addPyFile``.

    :param lib_dir: directory that contains the library files
    :param lib_files: iterable of file names relative to lib_dir; None or an empty
                      value adds nothing
    :return: None
    :raises NameError: when no global ``spark`` exists. NOTE(review): ``spark`` is not
                       defined in this notebook and is presumably injected by the
                       Spark kernel -- confirm.
    """
    for file_name in lib_files or []:
        lib_path = os.path.join(lib_dir, file_name)
        if lib_path not in sys.path:
            sys.path.append(lib_path)
        log.info("add file to classpath = %s", lib_path)
        spark.sparkContext.addPyFile(lib_path)
    
add_lib(args.ex_lib_path, args.ex_libs)

NameError: global name 'spark' is not defined

In [None]:
# archive location
#Convert=code
stat_data_dir = args.plugin_analyzer_output
if not os.path.exists(stat_data_dir):
    raise ValueError("there is no directory at %s" % stat_data_dir)
    
print "stat_data_dir = '%s'" % args.plugin_analyzer_output

In [None]:
spark.read.parquet("file:%s" % os.path.join(args.plugin_analyzer_output, 'data/rule_analysis')).registerTempTable("ra")

In [None]:
# HideMe
nb_args

In [None]:
rm = read_pd_df_json(os.path.join(args.plugin_analyzer_output, 'data/rule_meta.json'))

In [None]:
# HideMe
rm

In [None]:
rule_module = str(rm['input_rule_pkgs'][0])

In [None]:
# HideMe
rule_module

In [None]:
importlib.import_module(rule_module)

### 2. Archive Sync Time

In [None]:
"Archive Sync time = %s" % sql("""select max(upload_time) as mut from ra""").collect()[0]['mut']

### 3. Data structure

In [None]:
# Show the schema of the report analysis data

# system_id       : a system identifier. A system usually uploads an archive to us once a day
# stats_upload_id : an archive id, so if a system uploads 2 archives, we will have 1 system_id and 2 stats_upload_id values

# type      : the component type: rule/condition/incident
# name      : your function name for the annotated rule/condition/incident
# value     : a JSON value of your function output. If the output is not JSON-serializable, it will be "unjsonable"
# is_fire   : indicates the result of bool(your_function_output)
# error_key : error key generated from the rule response

# upload_time : the time the archive was uploaded to s3

sql("""select * from ra""").printSchema()

### 4. Viewing components of your rule

In [None]:
# Show the names of all your condition/incident/rule methods, with a row count per (type, name)
sql("""select type, name, count(0) as count from ra group by name, type""").show()

### 5. Collect sample problem archives

In [None]:
import json

def get_exception_samples(exception_df):
    """
    Group exception rows by their details and list the systems affected by each group.

    Every column except ``system_id`` is serialised to JSON and used as the grouping
    key, so rows that differ only in ``system_id`` fall into the same group.

    :param exception_df: DataFrame with a ``system_id`` column plus the columns
                         describing the exception
    :return: list of dicts with keys ``sample_type`` (always 'exception'), ``desc``
             (the JSON grouping key), ``count`` (number of rows in the group) and
             ``system_ids`` (comma separated ids)
    """
    systems_by_exception = defaultdict(list)
    for _, row in exception_df.iterrows():
        details = row.to_dict()
        system_id = details.pop('system_id')
        # sort_keys keeps the key independent of dict ordering; default=str keeps
        # values json cannot encode natively from aborting the whole summary.
        exception_key = json.dumps(details, sort_keys=True, default=str)
        # str() so that ids parsed as numbers can still be joined below.
        systems_by_exception[exception_key].append(str(system_id))

    return [
        {'sample_type': 'exception', 'desc': desc, 'count': len(system_ids), 'system_ids': ",".join(system_ids)}
        for desc, system_ids in systems_by_exception.items()
    ]

    
def get_sample_archives_df(stat_data_dir):
    """
    Summarise the sample files found under ``<stat_data_dir>/data``.

    The directory tree is walked for files named ``<sample_type>_df.json``. Only the
    'exception' sample type is summarised (through get_exception_samples); files of
    any other sample type are skipped without being read.

    :param stat_data_dir: analyzer output directory
    :return: DataFrame with the columns sample_type, count, desc and system_ids; it is
             empty (but still has those columns) when no samples are found
    """
    columns = ['sample_type', 'count', 'desc', 'system_ids']
    base_archive_dir = os.path.join(stat_data_dir, 'data')

    sample_archives = []
    for root, _, filenames in os.walk(base_archive_dir):
        for filename in filenames:
            if not filename.endswith("_df.json"):
                continue
            sample_type = filename.split("_df.json")[0]
            if sample_type != 'exception':
                continue
            df = read_pd_df_json(os.path.join(root, filename))
            if not df.empty:
                sample_archives.extend(get_exception_samples(df))

    # Passing `columns` fixes the column order and, unlike selecting the columns
    # afterwards, does not raise KeyError when there are no samples at all.
    return pd.DataFrame(sample_archives, columns=columns)

In [None]:
sample_archives_df = get_sample_archives_df(stat_data_dir)

In [None]:
sample_archives_df[['sample_type', 'count', 'desc']]

# II. Analyze Stat Data

### 1. Common functions

In [None]:
def get_rule_components(rule_module, component_type):
    """
    Collect the dependencies of the rule's components of one type.

    The reducer registered under the last dotted segment of ``rule_module`` is looked
    up; for each of its dependencies that is registered under ``component_type``,
    that dependency's own dependencies are gathered.

    :param rule_module: dotted module name of the rule
    :param component_type: key into plugins.COMPONENTS_BY_TYPE
    :return: set of the gathered dependencies
    """
    reducer_key = rule_module.split(".")[-1]
    reducer = plugins.REDUCERS[reducer_key]
    typed_components = plugins.COMPONENTS_BY_TYPE[component_type]

    return set(
        parser
        for dep in plugins.COMPONENT_DEPENDENCIES[reducer]
        if dep in typed_components
        for parser in plugins.COMPONENT_DEPENDENCIES[dep]
    )

In [None]:
def get_system_id_from_archive_path(path):
    """Return the last '/'-separated component of ``path``, cut at its first dot."""
    file_name = path.rsplit("/", 1)[-1]
    system_id, _, _ = file_name.partition(".")
    return system_id

In [None]:
def print_spec_files(archives, spec_name, high_lights = [], filter_high_lights=False):
    """
    Show the content stored for ``spec_name`` in each of the given archives.

    :param archives: archive paths, opened one after another
    :param spec_name: spec name that is looked up through SpecMapper
    :param high_lights: strings passed on to print_highlight for marking
    :param filter_high_lights: when true, only lines containing at least one of
                               high_lights are shown
    :return: None; everything is printed/displayed directly
    """
    for archive in archives:
        with TarExtractor().from_path(archive, extract_dir=tmp_extract_dir) as extracted:
            mapper = SpecMapper(extracted)
            if not mapper.exists(spec_name):
                # Only a notice: the content lookup below still runs.
                print("There is no file corresponding to spec_name = %s" % spec_name)

            if not filter_high_lights:
                content = mapper.get_content(path=spec_name, split=False)
            else:
                matching_lines = [
                    line
                    for line in mapper.get_content(path=spec_name, split=True)
                    if any(marker in line for marker in high_lights)
                ]
                content = "\n".join(matching_lines)

            if not content:
                continue

            print("==============================================================================================")
            print("==== spec : %s, system_id = %s" % (spec_name, get_system_id_from_archive_path(archive)))
            print("==============================================================================================")
            print_highlight(content, high_lights)

### 2. Viewing Parser Content

In [None]:
from insights.core.archives import TarExtractor
from insights.core.specs import SpecMapper
from insights.core import plugins

In [None]:
def print_incident_parser_content(archive, rule_module):
    """
    Print, for one archive, the spec files behind the rule's incident parsers.

    For every parser returned by get_rule_components(rule_module, plugins.incident),
    each of its symbolic names is passed to print_spec_files with the parser's
    filters as highlights.

    NOTE(review): print_spec_files has no return statement, so the ``break`` below is
    never reached and every symbolic name is printed -- confirm whether stopping
    after the first printed name was intended.

    :param archive: path of the archive to inspect
    :param rule_module: dotted module name of the rule
    :return: None
    """
    incident_parsers = get_rule_components(rule_module, plugins.incident)
    if not incident_parsers:
        print("there is no incident parser")

    for parser in incident_parsers:
        for name in parser.symbolic_names:
            if print_spec_files([archive], name, parser.filters):
                break

* Select a sample archive here. There is a convenience method **`print_incident_parser_content`** to view the incident parser content.

In [None]:
def get_archive_file_paths(sample_archives_df, index):
    """
    Build the archive paths for the systems listed in one row of the samples frame.

    :param sample_archives_df: DataFrame with a comma separated ``system_ids`` column
    :param index: position (0-based) of the row to use
    :return: list of ``<stat_data_dir>/data/samples/archives/<system_id>.tar.gz`` paths,
             one per system id in that row
    """
    # Positional lookup: honours `index` (row 0 used to be hard-coded) and does not
    # depend on the frame having an index label 0.
    system_ids = sample_archives_df.iloc[index]['system_ids'].split(",")
    return [os.path.join(stat_data_dir, 'data/samples/archives/%s.tar.gz' % sid) for sid in system_ids]

In [None]:
tmp_extract_dir='/tmp'

In [None]:
# Pick the first archive of the first sample group; sample_archive stays None when
# no samples were collected, and later cells check it before use.
sample_archive = None
if not sample_archives_df.empty:
    sample_archive = get_archive_file_paths(sample_archives_df, 0)[0]

    # Positional access, so the lookup does not depend on an index label 0.
    sample_type = sample_archives_df.iloc[0]['sample_type']

    if 'misdiagnose' == sample_type:
        print_incident_parser_content(sample_archive, rule_module)
    elif 'exception' == sample_type:
        # just view any spec that you want to investigate, choosing uname because it is the most common one
        # for example, we want to highlight 'x86' and 'Linux'
        print_spec_files([sample_archive], 'uname', high_lights=['x86', 'Linux'], filter_high_lights=False)
        

* You can view other spec files as shown below

In [None]:
if sample_archive:
    # print with text high light
    print_spec_files([sample_archive], 'uname', high_lights=['x86', 'Linux'], filter_high_lights=False)

* You can also view a spec across multiple archives

In [None]:
# Print the 'uname' spec from every archive of the first sample group.
# Guarded like the neighbouring cells: with no samples there is no first row to read.
if not sample_archives_df.empty:
    print_spec_files(get_archive_file_paths(sample_archives_df, 0), 'uname')

### 3. Viewing Rule Component Output

In [None]:
if sample_archive:    
    system_id = get_system_id_from_archive_path(sample_archive)     
    display_no_index(sql("""
    select rhel_major_ver as rhel, type, name, is_fire, value 
    from ra where system_id = '%s'""" % system_id).toPandas())

# III. Modify your rule and test it with sample archives

In [None]:
def clear_reducer():
    """
    Reset the plugin registries so a previously loaded reducer is not evaluated again.

    Without this, the old reducer would be computed together with the new one, which
    makes the result harder to read. Non-empty ``filters`` on the shared parsers are
    reset to an empty list, then the component registries are emptied in place.

    :return: None
    """
    for parser in plugins.SHARED_PARSERS:
        if hasattr(parser, 'filters') and parser.filters:
            parser.filters = []

    registries = [
        plugins.TYPE_OF_COMPONENT, plugins.COMPONENTS_BY_TYPE,
        plugins.COMPONENT_DEPENDENCIES, plugins.EMITTERS,
        plugins.DELEGATES, plugins.REDUCERS, plugins.PARSERS,
        plugins.PARSER_FUNCS,
    ]
    for registry in registries:
        registry.clear()

### 1. Your code

In [None]:
# Assign plugin_src
#Convert=code
# Resolve the rule module right here: no `module` variable is defined anywhere in this
# notebook, so referring to one raised NameError. import_module returns the module
# object for rule_module (the already-imported one when it was imported before).
src = inspect.getsource(importlib.import_module(rule_module))
# Prepend a clear_reducer() call so that executing the source starts from empty registries.
src = """
# Clearning up reducer so your previous reducer doesn't compute again\nclear_reducer()
#===================Your code start from here===================

%s""" % src
# Runs the rule source in the notebook namespace, then shows what was executed.
exec(src)
print(src)

In [None]:
# ReplaceString=@rule_code
# Clearning up reducer so your previous reducer doesn't compute again
clear_reducer()
#===================Your code start from here===================

"@rule_code"

In [None]:
reducer_func=plugins.REDUCERS['__main__']

In [None]:
from insights_analysis_stats.core.evaluator.stats_evaluator import StatsSingleEvaluator, get_register_components

### 2. Execute your code

In [None]:
# Evaluate the freshly loaded rule against the selected sample archive and show both
# the evaluator response and the per-component stats output.
if sample_archive:
    with TarExtractor().from_path(sample_archive, extract_dir=tmp_extract_dir) as ex:
        system_id = get_system_id_from_archive_path(sample_archive)
        registered_components = get_register_components(
            reducer_func.__module__,
            [plugins.rule, plugins.condition, plugins.incident])
        spec_mapper = SpecMapper(ex)
        p = StatsSingleEvaluator(spec_mapper)
        evaluator_response = p.process()

        print("=================================")
        print("==== evaluator response")
        print("=================================")
        print(evaluator_response)
        print("")
        print("=================================")
        print("==== stats response")
        print("=================================")

        stats_response = p.get_rule_outputs(
            stats_upload_id=system_id,
            module_name=rule_module,
            registered_components=registered_components)
        # The upload id and module columns are the same for every row here, so they are dropped.
        stats_table = pd.DataFrame(stats_response).drop(['stats_upload_id', 'module'], axis=1)
        display_no_index(stats_table)