# Generates a Feature Importance Rank Heatmap (for STREAMLINE)
Takes the feature importance scores from each model and generates a feature importance 'rank' heatmap across all algorithms for the target datasets. The heatmaps are output as interactive HTML visualizations using Bokeh.

This notebook requires additional installation of the bokeh package: 

pip install bokeh

The code for this visualization was written and provided by Sy Hwang in September 2021.


## Import Required Packages

In [1]:

# pandas is used to read the feature importance CSV files and build the rank table.
import pandas as pd
# Display every row when a DataFrame is shown (no truncation).
pd.set_option('display.max_rows', None)
import os

# Bokeh components for building and saving the interactive heatmap.
# NOTE(review): several of these names (e.g. export_png, BasicTicker, ColorBar, transform)
# are not referenced in the cells visible in this notebook — confirm before pruning.
from bokeh.io import output_file, save, export_png
from bokeh.models import (BasicTicker, ColorBar, ColumnDataSource,
                          ContinuousColorMapper, LinearColorMapper, HoverTool)
from bokeh.plotting import figure
from bokeh.transform import transform
from bokeh.palettes import Cividis256
# pickle is used to load metadata.pickle and algInfo.pickle from the experiment folder.
import pickle

# Globally silence Python warnings for the rest of the kernel session.
import warnings
warnings.filterwarnings('ignore')

# Jupyter Notebook Hack: This code ensures that the results of multiple commands within a given cell are all displayed, rather than just the last.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Set Run Parameters

In [2]:
# Root folder of the experiment to visualize. It must contain metadata.pickle and
# algInfo.pickle plus one subfolder per analyzed dataset. Use forward slashes.
experiment_path = "C:/Users/ryanu/Documents/Analysis/STREAMLINE_Experiments/hcc_demo"
# Name of the single analyzed dataset to visualize, or the *string* 'None' (not the
# None object) to generate a heatmap for every analyzed dataset.
targetDataName = 'None' # 'None' if user wants to generate visualizations for all analyzed datasets
# Modeling algorithms to include; an empty list means every algorithm that was run.
# NOTE(review): confirm that the cell which loads algInfo.pickle honours a non-empty list.
algorithms = [] #use empty list if user wishes re-evaluate all modeling algorithms that were run in pipeline.

In [3]:
# Load the settings and algorithm bookkeeping that earlier pipeline phases pickled
# into the experiment folder.
# SECURITY NOTE: pickle.load can execute arbitrary code while loading; only point
# experiment_path at experiment folders you trust.

#Unpickle metadata from previous phase (the 'with' block closes the file even if loading fails)
with open(experiment_path+'/'+"metadata.pickle", 'rb') as file:
    metadata = pickle.load(file)
#Load variables specified earlier in the pipeline from metadata
do_NB = metadata['Naive Bayes']
do_LR = metadata['Logistic Regression']
do_DT = metadata['Decision Tree']
do_RF = metadata['Random Forest']
do_GB = metadata['Gradient Boosting']
do_XGB = metadata['Extreme Gradient Boosting']
do_LGB = metadata['Light Gradient Boosting']
do_SVM = metadata['Support Vector Machine']
do_ANN = metadata['Artificial Neural Network']
# NOTE(review): the key spelling ('Neightbors') is kept exactly as-is; it presumably
# matches what the earlier phase wrote into the pickle — confirm before correcting it.
do_KNN = metadata['K-Nearest Neightbors']
do_eLCS = metadata['eLCS']
do_XCS = metadata['XCS']
do_ExSTraCS = metadata['ExSTraCS']

#Unpickle algorithm information from previous phase
with open(experiment_path+'/'+"algInfo.pickle", 'rb') as file:
    algInfo = pickle.load(file)

# Respect the user's selection from the run-parameter cell instead of discarding it:
# an empty selection keeps every algorithm that was run; otherwise only the requested
# algorithms that were actually run are kept.
requested_algorithms = list(algorithms)
algorithms = []
abbrev = {}
for key in algInfo:
    was_run = algInfo[key][0] # First entry flags whether that algorithm was used
    if not was_run:
        continue
    if requested_algorithms and key not in requested_algorithms:
        continue
    algorithms.append(key)
    abbrev[key] = algInfo[key][1] # Second entry is the abbreviation used in output file names

# Tell the user about requested names that cannot be visualized rather than dropping them silently.
unavailable = [name for name in requested_algorithms if name not in algorithms]
if unavailable:
    print("Requested algorithms not run in this experiment (skipped): "+str(unavailable))

print(algorithms)

['Naive Bayes', 'Logistic Regression', 'Decision Tree']


## Automatically Detect Dataset Names

In [4]:
# Get dataset names for all completed dataset analyses in the experiment folder.
# A dataset is any subfolder of experiment_path that is not one of the pipeline's own
# bookkeeping entries listed below.

# normpath makes this robust to a trailing slash, which would otherwise yield an empty name.
experiment_name = os.path.basename(os.path.normpath(experiment_path)) #Name of experiment folder

# Entries the pipeline itself writes into the experiment folder. Each one is excluded
# independently, so a missing 'logs' folder no longer prevents 'jobs' from being skipped.
non_dataset_entries = {
    'metadata.csv',
    'metadata.pickle',
    'algInfo.pickle',
    'jobsCompleted',
    'UsefulNotebooks',
    'logs',
    'jobs',
    'DatasetComparisons', #If it has been run previously
    'KeyFileCopy', #If it has been run previously
    experiment_name+'_ML_Pipeline_Report.pdf', #If it has been run previously
}

# Only directories can be datasets (later cells read files from inside each dataset
# folder), so stray files dropped into the experiment folder are ignored.
# Sorting ensures consistent ordering of datasets.
datasets = sorted(
    entry for entry in os.listdir(experiment_path)
    if entry not in non_dataset_entries
    and os.path.isdir(os.path.join(experiment_path, entry))
)
print("Analyzed Datasets: "+str(datasets))

Analyzed Datasets: ['hcc-data_example', 'hcc-data_example_no_covariates']


In [5]:
def _mean_rank_table(fi_frames):
    """Build the feature-by-algorithm rank table used for the heatmap.

    Parameters
    ----------
    fi_frames : dict
        Maps algorithm name -> DataFrame read from that algorithm's '_FI.csv' file,
        with one column per feature.

    Returns
    -------
    pandas.DataFrame
        Indexed by feature name (index name 'feats'), one column per algorithm plus a
        'MeanRank' column (columns name 'algos'), sorted by ascending 'MeanRank'.
        Rank 1 is the feature with the largest mean absolute importance.

    Raises
    ------
    ValueError
        If fi_frames is empty.
    """
    if not fi_frames:
        raise ValueError("No feature importance files to rank: the list of algorithms is empty.")
    rank_columns = []
    for algorithm, fi_scores in fi_frames.items():
        # Rank features by the column-wise mean of absolute importance (largest -> rank 1).
        ranks = fi_scores.abs().mean().rank(ascending=False)
        rank_columns.append(ranks.rename(algorithm.partition('_')[0]))
    # Each Series is indexed by feature name, so concat aligns ranks by feature rather
    # than by column position; files with a different column order stay correct.
    table = pd.concat(rank_columns, axis=1)
    table.index.name = 'feats'
    table['MeanRank'] = table.mean(axis=1)
    table = table.sort_values(by='MeanRank')
    table.columns.name = 'algos'
    return table


if not targetDataName == 'None': # User specified one analyzed dataset above (if more than one were analyzed)
    if targetDataName not in datasets:
        print("Warning: '"+str(targetDataName)+"' was not found among the analyzed datasets.")
    # Build a new list instead of removing items while iterating, which skips elements
    # and used to leave non-target datasets in the list.
    datasets = [each for each in datasets if each == targetDataName]
    print("Vizualized Datasets: "+str(datasets))

for each in datasets:
    print("---------------------------------------")
    print(each)
    print("---------------------------------------")
    full_path = experiment_path+'/'+each
    fi_dir = full_path+'/model_evaluation/feature_importance/'

    # Read one feature importance file per algorithm, named by the algorithm's abbreviation.
    fi_frames = {}
    for algorithm in algorithms:
        fi_frames[algorithm] = pd.read_csv(fi_dir+abbrev[algorithm]+'_FI.csv')

    finaldf = _mean_rank_table(fi_frames)
    # Long format for Bokeh: one row per (feature, algorithm) pair with its rank.
    inputdf = finaldf.stack().rename('ranked').reset_index()

    source = ColumnDataSource(inputdf)
    mapper = LinearColorMapper(palette=Cividis256, low=inputdf.ranked.min(), high=inputdf.ranked.max())

    tools=["wheel_zoom", "pan", "reset"]
    # width/height replace plot_width/plot_height, which were removed in Bokeh 3.
    p = figure(width=900,
                height=1600,
                title="FI Heatmap (All Variables)",
                x_range=list(finaldf.columns),
                y_range=list(reversed(finaldf.index)), # reversed so the best mean rank is drawn at the top
                tools=tools,
                toolbar_location='left',
                x_axis_location="above"
                )
    p.rect(x="algos",
            y="feats",
            width=1,
            height=1,
            source=source,
            line_color="white",
            fill_color={"field":"ranked", "transform": mapper},
            )
    tooltips = [("algo", "@algos"),
                ("feature", "@feats"),
                ("rank", "@ranked")]

    hover = HoverTool(tooltips = tooltips)
    p.add_tools(hover)
    p.axis.axis_line_color = None
    p.axis.major_tick_line_color = None
    p.axis.major_label_text_font_size = "14px"
    p.title.text_font_size = '24px'
    p.axis.major_label_standoff = 0
    p.xaxis.major_label_orientation = 1.0

    # Write the interactive heatmap next to the feature importance files it summarizes.
    output_file(fi_dir+'FI_Rank_Heatmap.html')
    save(p)
 

---------------------------------------
hcc-data_example
---------------------------------------


'C:\\Users\\ryanu\\Documents\\Analysis\\STREAMLINE_Experiments\\hcc_demo\\hcc-data_example\\model_evaluation\\feature_importance\\FI_Rank_Heatmap.html'

---------------------------------------
hcc-data_example_no_covariates
---------------------------------------


'C:\\Users\\ryanu\\Documents\\Analysis\\STREAMLINE_Experiments\\hcc_demo\\hcc-data_example_no_covariates\\model_evaluation\\feature_importance\\FI_Rank_Heatmap.html'