In [1]:
# Print the signature and docstring of whatif_tool; the function itself is
# defined elsewhere in the session (help reports it as living in __main__).
help(whatif_tool)

Help on function whatif_tool in module __main__:

whatif_tool(df, feature_cols, predict_func, compare_predict_func=None, label_vocab=None, max_n_rows=1000)
    This is a what-if tool wrapper for non-tensorflow users. If you are a tensorflow user,
    just follow the instruction on the official website and have fun.
    
    Args:
        df:           A pandas dataframe to analyse.
        feature_cols: A list of string, cols in which will be used as features.
        predict_func: Predict function to be performed on df. Usually, the data 
                      preprocessing procedure are embedded in-side by a closure.
        compare_predict_func: Same as predict_func. Used to compare with the result 
                      produced by `predict_func`.
        label_vocab:  Label names for display.
        max_n_rows:   max row limit to protect your browser, ^-^.



In [1]:
import numpy as np
def make_label_column_numeric(df, label_column, test):
    """Overwrite ``df[label_column]`` in place with a 0/1 encoding.

    Args:
        df:           DataFrame whose label column is rewritten.
        label_column: Name of the column to convert.
        test:         Callable applied once to the whole column; its result
                      is used as the condition of ``np.where``.

    Returns:
        None. ``df`` is modified in place: 1 where the condition holds,
        0 everywhere else.
    """
    is_positive = test(df[label_column])
    df[label_column] = np.where(is_positive, 1, 0)

In [2]:
# load train data
import pandas as pd

# Set the path to the CSV containing the dataset to train on.
csv_path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'

# Set the column names for the columns in the CSV. If the CSV's first line is a header line containing
# the column names, then set this to None.
csv_columns = [
  "Age", "Workclass", "fnlwgt", "Education", "Education-Num", "Marital-Status",
  "Occupation", "Relationship", "Race", "Sex", "Capital-Gain", "Capital-Loss",
  "Hours-per-week", "Country", "Over-50K"]

# Read the dataset from the provided CSV.
# skipinitialspace=True strips the blank that follows each delimiter, so the
# string fields can be compared against literals such as '>50K' directly.
df = pd.read_csv(csv_path, names=csv_columns, skipinitialspace=True)
label_column = 'Over-50K'
# Binarise the label in place: 1 where the raw value equals '>50K', else 0.
make_label_column_numeric(df, label_column, lambda val: val == '>50K')
# Preview only the first rows instead of rendering the whole frame.
df.head(10)

Unnamed: 0,Age,Workclass,fnlwgt,Education,Education-Num,Marital-Status,Occupation,Relationship,Race,Sex,Capital-Gain,Capital-Loss,Hours-per-week,Country,Over-50K
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,0
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,0
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,1
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,1
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,1


In [3]:
# Columns of the dataset that are fed to the models as inputs.
input_features = [
    'Age',
    'Workclass',
    'Education',
    'Marital-Status',
    'Occupation',
    'Relationship',
    'Race',
    'Sex',
    'Capital-Gain',
    'Capital-Loss',
    'Hours-per-week',
    'Country',
]

# Every model input followed by the label column.
features_and_labels = [*input_features, label_column]

In [4]:
# classify the columns by data type
def get_cat_num_cols(df):
    """Split the columns of ``df`` into categorical and numeric names by dtype.

    A column counts as categorical when its dtype compares equal to
    ``'object'``; every other column is reported as numeric.

    Args:
        df: A pandas DataFrame.

    Returns:
        A ``(cat_cols, num_cols)`` tuple of lists of column labels, each in
        the order the columns appear in ``df``.
    """
    cat_cols = []
    num_cols = []
    # Pair every column label with its own dtype. Indexing ``df.dtypes`` with
    # a running counter is a label lookup when the column labels are integers
    # (returning another column's dtype) and relies on a deprecated positional
    # fallback when they are strings.
    for col, dtype in df.dtypes.items():
        if dtype == 'object':
            cat_cols.append(col)
        else:
            num_cols.append(col)
    return cat_cols, num_cols
# Classify only the model input columns; the label column is left out.
cat_cols, num_cols = get_cat_num_cols(df[input_features])

In [5]:
# Encode the categorical inputs and assemble the training matrix.
cat_data = df[cat_cols]
num_data = df[num_cols]

from sklearn.preprocessing import OrdinalEncoder
# Step 1: replace every category value with its ordinal code.
oren = OrdinalEncoder()
mid_data = oren.fit_transform(cat_data)

from sklearn.preprocessing import OneHotEncoder
# Step 2: expand the ordinal codes into one-hot columns and densify the result.
ohe = OneHotEncoder()
ohe.fit(mid_data)
cat_data_encoded = ohe.transform(mid_data).toarray()

import numpy as np
# Column layout: one-hot encoded categoricals first, raw numeric columns last.
train_data = np.hstack((cat_data_encoded, num_data.values))

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [6]:
# train lr model
from sklearn.linear_model import LogisticRegression
# Construct the estimator with default settings and fit it in one expression;
# fit() hands back the estimator itself, so `lr` is the trained model.
lr = LogisticRegression().fit(train_data, df[label_column])



In [7]:
from sklearn.ensemble import RandomForestClassifier
# Pin the seed: a random forest is fitted stochastically, so without a fixed
# random_state the model (and every comparison made with it) changes on each run.
rf = RandomForestClassifier(random_state=42)
rf = rf.fit(train_data, df[label_column])



In [8]:
# This factory returns a closure that serves as a predict function.
# Basically, you seal the preprocessing context and the estimator inside it.
def make_prediction_func(estimator):
    """Build a predict function that preprocesses a DataFrame for ``estimator``.

    The column lists and fitted encoders are read from the module-level
    names ``cat_cols``, ``num_cols``, ``oren`` and ``ohe`` once, when this
    factory is called; rebinding those names afterwards does not affect a
    function that was already created.

    Args:
        estimator: Object exposing ``predict_proba``.

    Returns:
        A function taking a DataFrame and returning
        ``estimator.predict_proba`` of the encoded feature matrix.
    """
    # Snapshot the preprocessing context at creation time.
    categorical, numeric = cat_cols, num_cols
    ordinal_encoder, onehot_encoder = oren, ohe

    def _predict_func(df):
        categorical_part = df[categorical]
        numeric_part = df[numeric]
        codes = ordinal_encoder.transform(categorical_part)
        onehot = onehot_encoder.transform(codes).toarray()
        # Same column layout as the training matrix: one-hot block, then numerics.
        features = np.concatenate((onehot, numeric_part.values), axis=1)
        return estimator.predict_proba(features)

    return _predict_func

In [9]:
# One prediction closure per fitted model, both built by the same factory.
predict_func_lr, predict_func_rf = (
    make_prediction_func(lr),
    make_prediction_func(rf),
)

In [10]:
# load test data
test_csv_path = (
    'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test'
)
test_df = pd.read_csv(
    test_csv_path,
    names=csv_columns,
    skipinitialspace=True,
    skiprows=1,  # the first line of the file is dropped before parsing
)
# The positive label is matched as '>50K.' (with a trailing period) here,
# whereas the training data is matched against '>50K'.
make_label_column_numeric(test_df, label_column, lambda labels: labels == '>50K.')

In [12]:
# Open the what-if widget on the first 1000 test rows, with the logistic
# regression as the main model and the random forest as the comparison model.
whatif_tool(
    test_df.iloc[:1000],
    input_features,
    predict_func_lr,
    compare_predict_func=predict_func_rf,
    label_vocab=['Under 50K', 'Over 50K'],
    max_n_rows=20000,
)

WitWidget(config={'are_sequence_examples': False, 'model_type': 'classification', 'label_vocab': ['Under 50K',…