In [134]:
from sklearn.datasets.mldata import fetch_mldata
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
from sklearn import metrics
from skater.core.global_interpretation.interpretable_models.brlc import BRLC
%matplotlib inline

In [135]:
feature_labels = ["Pregnant","Glucose_concentration_test","Blood_pressure(mmHg)","Triceps_skin_fold_thickness(mm)","2-Hour_serum_insulin_(mu_U/ml)","Body_mass_index","Diabetes_pedigree_function","Age_(years)"]
    
data = fetch_mldata("diabetes") # get dataset
y = (data.target+1)/2 # to bound the target label from (-1, 1) -> (0, 1)
print(data.DESCR)

mldata.org dataset: diabetes


In [136]:
data_df = pd.DataFrame(data.data, columns=feature_labels)

In [137]:
data_df.shape

(768, 8)

## Observation:
If we take a quick look at the data, we will notice that this dataset has a bunch of continuous features. There might be a need for for discretization to get better accuracy

In [138]:
data_df.head()

Unnamed: 0,Pregnant,Glucose_concentration_test,Blood_pressure(mmHg),Triceps_skin_fold_thickness(mm),2-Hour_serum_insulin_(mu_U/ml),Body_mass_index,Diabetes_pedigree_function,Age_(years)
0,6.0,148.0,72.0,35.0,0.0,33.599998,0.627,50.0
1,1.0,85.0,66.0,29.0,0.0,26.6,0.351,31.0
2,8.0,183.0,64.0,0.0,0.0,23.299999,0.672,32.0
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0
4,0.0,137.0,40.0,35.0,168.0,43.099998,2.288,33.0


In [130]:
# Sample code for filtering pandas dataframe by column
# data_df = data_df[["Pregnant","Glucose_concentration_test"]]

In [139]:
# Default test split-size = 0.25
Xtrain, Xtest, ytrain, ytest = train_test_split(data_df, y) # split
# Target Labels: 1:positive 0:negative
np.unique(ytrain)

array([ 0.,  1.])

In [140]:
print(Xtrain.shape)
print(Xtest.shape)

(576, 8)
(192, 8)


In [141]:
print(Xtrain.head())
print(Xtrain.shape)
print(ytrain.shape)

     Pregnant  Glucose_concentration_test  Blood_pressure(mmHg)  \
30        5.0                       109.0                  75.0   
729       2.0                        92.0                  52.0   
381       0.0                       105.0                  68.0   
741       3.0                       102.0                  44.0   
303       5.0                       115.0                  98.0   

     Triceps_skin_fold_thickness(mm)  2-Hour_serum_insulin_(mu_U/ml)  \
30                              26.0                             0.0   
729                              0.0                             0.0   
381                             22.0                             0.0   
741                             20.0                            94.0   
303                              0.0                             0.0   

     Body_mass_index  Diabetes_pedigree_function  Age_(years)  
30         36.000000                       0.546         60.0  
729        30.100000                

In [142]:
# Features to be used for training a Rule List model
print(Xtrain.columns)

Index(['Pregnant', 'Glucose_concentration_test', 'Blood_pressure(mmHg)',
       'Triceps_skin_fold_thickness(mm)', '2-Hour_serum_insulin_(mu_U/ml)',
       'Body_mass_index', 'Diabetes_pedigree_function', 'Age_(years)'],
      dtype='object')


### Train an interpretable model

In [153]:
import time
start_time = time.time()
# Create an instance of the estimator
sbrl_model = BRLC(min_rule_len=1, max_rule_len=10, iterations=10000, n_chains=20, drop_features=True)
# Train a model, by default discretizer is enabled. So, you wish to exclude features then exclude them using 
# the undiscretize_feature_list parameter

#BRLC?

model = sbrl_model.fit(Xtrain, ytrain, bin_labels="default")
print("--- {} seconds ---".format(time.time() - start_time))

Eclat


parameter specification:


 tidLists
 support
 minlen
 maxlen
            target
   ext


    FALSE
     0.1
      1
     10
 frequent itemsets
 FALSE



algorithmic control:


 sparse
 sort
 verbose


      7
   -2
    TRUE



Absolute minimum support count:
 
38
 



create itemset ... 

set transactions ...
[32 item(s),
 380 transaction(s)] done 
[0.00s].


sorting and recoding items ... 
[32 item(s)] 
done [0.00s].


creating bit matrix ... 
[32 row(s), 380 column(s)] 
done [0.00s].

writing  ... 
[75 set(s)] done 
[0.00s].

Creating S4 object  ... 
done 
[0.00s].

Eclat


parameter specification:


 tidLists
 support
 minlen
 maxlen
            target
   ext


    FALSE
     0.1
      1
     10
 frequent itemsets
 FALSE



algorithmic control:


 sparse
 sort
 verbose


      7
   -2
    TRUE



Absolute minimum support count:
 
19
 



create itemset ... 

set transactions ...
[32 item(s),
 196 transaction(s)] done 
[0.00s].


sorting and recoding items ... 
[29 item(s)] 

In [144]:
sbrl_model.feature_names

Index(['Pregnant_q_label', 'Glucose_concentration_test_q_label',
       'Blood_pressure(mmHg)_q_label',
       'Triceps_skin_fold_thickness(mm)_q_label',
       '2-Hour_serum_insulin_(mu_U/ml)_q_label', 'Body_mass_index_q_label',
       'Diabetes_pedigree_function_q_label', 'Age_(years)_q_label'],
      dtype='object')

In [145]:
sbrl_model.print_model()

The rules list is : 

If      {Glucose_concentration_test_q_label=4} (rule[77]) then positive probability = 0.31724138

else if {Body_mass_index_q_label=1} (rule[25]) then positive probability = 0.95384615

else if {Pregnant_q_label=4,Diabetes_pedigree_function_q_label=4,Age_.years._q_label=4} (rule[103]) then positive probability = 0.07142857

else if {Glucose_concentration_test_q_label=1} (rule[52]) then positive probability = 0.87096774

else  (default rule)  then positive probability = 0.64321608



### Persist the model for future use

In [146]:
# Save the trained model as a pickled object
sbrl_model.save_model("diabetes_model.pkl", compress=True)

# load the model and move ahead
sbrl_model.load_model("diabetes_model.pkl")

In [147]:
# Test set
features_to_descritize = Xtest.columns
Xtest_filtered = sbrl_model.discretizer(Xtest, features_to_descritize, labels_for_bin="default")      
Xtest_filtered.head(3)

Unnamed: 0,Pregnant_q_label,Glucose_concentration_test_q_label,Blood_pressure(mmHg)_q_label,Triceps_skin_fold_thickness(mm)_q_label,2-Hour_serum_insulin_(mu_U/ml)_q_label,Body_mass_index_q_label,Diabetes_pedigree_function_q_label,Age_(years)_q_label
481,1,3,4,4,1,3,1,2
49,4,2,1,1,1,1,2,1
734,2,2,3,1,1,1,3,4


In [148]:
# Train set
features_to_descritize = Xtrain.columns
Xtrain_filtered = sbrl_model.discretizer(Xtrain, features_to_descritize, labels_for_bin="default")      
Xtrain_filtered.head(3)

Unnamed: 0,Pregnant_q_label,Glucose_concentration_test_q_label,Blood_pressure(mmHg)_q_label,Triceps_skin_fold_thickness(mm)_q_label,2-Hour_serum_insulin_(mu_U/ml)_q_label,Body_mass_index_q_label,Diabetes_pedigree_function_q_label,Age_(years)_q_label
30,3,2,3,3,1,3,3,4
729,2,1,1,1,1,2,1,1
381,1,2,2,2,1,1,1,1


In [149]:
results_train_sbrl = sbrl_model.predict_proba(Xtrain_filtered)
fpr_sbrl, tpr_sbrl, thresholds_sbrl = metrics.roc_curve(ytrain, results_train_sbrl[1], pos_label=1)
roc_auc_sbrl = metrics.auc(fpr_sbrl, tpr_sbrl)
print("AUC-ROC using SBRL(Train): {}".format(roc_auc_sbrl))
print("Accuracy(Train): {}".format(metrics.accuracy_score(ytrain, sbrl_model.predict(Xtrain_filtered)[1])))

print("---------------------------------\n")

results_test_sbrl = sbrl_model.predict_proba(Xtest_filtered)
fpr_sbrl, tpr_sbrl, thresholds_sbrl = metrics.roc_curve(ytest, results_test_sbrl[1], pos_label=1)
roc_auc_sbrl = metrics.auc(fpr_sbrl, tpr_sbrl)
print("AUC-ROC using SBRL(Test): {}".format(roc_auc_sbrl))
print("Accuracy(Test): {}".format(metrics.accuracy_score(ytest, sbrl_model.predict(Xtest_filtered)[1])))

AUC-ROC using SBRL(Train): 0.8108418367346938
Accuracy(Train): 0.7690972222222222
---------------------------------

AUC-ROC using SBRL(Test): 0.7623842592592593
Accuracy(Test): 0.71875


In [150]:
data_df.head()

Unnamed: 0,Pregnant,Glucose_concentration_test,Blood_pressure(mmHg),Triceps_skin_fold_thickness(mm),2-Hour_serum_insulin_(mu_U/ml),Body_mass_index,Diabetes_pedigree_function,Age_(years)
0,6.0,148.0,72.0,35.0,0.0,33.599998,0.627,50.0
1,1.0,85.0,66.0,29.0,0.0,26.6,0.351,31.0
2,8.0,183.0,64.0,0.0,0.0,23.299999,0.672,32.0
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0
4,0.0,137.0,40.0,35.0,168.0,43.099998,2.288,33.0


In [154]:
from sklearn.ensemble import RandomForestClassifier
x_train, x_test, y_train, y_test = train_test_split(data_df, y)

import time
start_time = time.time()
rf_model = RandomForestClassifier(n_estimators=100, random_state=123).fit(x_train, y_train)
print("--- {} seconds ---".format((time.time() - start_time)))


results_test_rf = pd.DataFrame(rf_model.predict_proba(x_test))
result_train_rf = pd.DataFrame(rf_model.predict_proba(x_train))

--- 0.15580415725708008 seconds ---


In [155]:
fpr_rf, tpr_rf, thresholds_rf = metrics.roc_curve(y_train, result_train_rf[1], pos_label=1)
roc_auc_rf = metrics.auc(fpr_rf, tpr_rf)
print("AUC-ROC using Random Forest(Train): {}".format(roc_auc_rf))
print("Accuracy(Train): {}".format(metrics.accuracy_score(y_train, rf_model.predict(x_train))))

print("---------------------------------\n")

fpr_rf, tpr_rf, thresholds_rf = metrics.roc_curve(y_test, results_test_rf[1], pos_label=1)
roc_auc_rf = metrics.auc(fpr_rf, tpr_rf)
print("AUC-ROC using Random Forest(Test): {}".format(roc_auc_rf))
print("Accuracy(Test): {}".format(metrics.accuracy_score(y_test, rf_model.predict(x_test))))

AUC-ROC using Random Forest(Train): 1.0
Accuracy(Train): 1.0
---------------------------------

AUC-ROC using Random Forest(Test): 0.8001262626262626
Accuracy(Test): 0.7395833333333334


In [157]:
# access all rules, or filter rules as needed for debugging and building intuition
sbrl_model.access_learned_rules('all')

['{Age_.years._q_label=1}',
 '{Age_.years._q_label=2}',
 '{Age_.years._q_label=3}',
 '{Age_.years._q_label=4}',
 '{Blood_pressure.mmHg._q_label=1,Age_.years._q_label=1}',
 '{Blood_pressure.mmHg._q_label=1,Body_mass_index_q_label=1}',
 '{Blood_pressure.mmHg._q_label=1,Triceps_skin_fold_thickness.mm._q_label=2}',
 '{Blood_pressure.mmHg._q_label=1,X2.Hour_serum_insulin_.mu_U.ml._q_label=3}',
 '{Blood_pressure.mmHg._q_label=1}',
 '{Blood_pressure.mmHg._q_label=2}',
 '{Blood_pressure.mmHg._q_label=3,Age_.years._q_label=4}',
 '{Blood_pressure.mmHg._q_label=3,Body_mass_index_q_label=4}',
 '{Blood_pressure.mmHg._q_label=3,Diabetes_pedigree_function_q_label=4}',
 '{Blood_pressure.mmHg._q_label=3,X2.Hour_serum_insulin_.mu_U.ml._q_label=4}',
 '{Blood_pressure.mmHg._q_label=3}',
 '{Blood_pressure.mmHg._q_label=4,Age_.years._q_label=4}',
 '{Blood_pressure.mmHg._q_label=4,Body_mass_index_q_label=3}',
 '{Blood_pressure.mmHg._q_label=4,Body_mass_index_q_label=4}',
 '{Blood_pressure.mmHg._q_label=4,Dia