%%bash

# Install eli5 from the conda-forge channel.
# --yes: a %%bash cell has no interactive stdin, so conda's confirmation prompt could never be answered.
conda install --yes -c conda-forge eli5

In [1]:
import eli5
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the train/test feature matrices and their labels, each indexed by the "index" column.
X_train = pd.read_csv("../output/X_train.csv", index_col="index")
# The label files are read with explicitly supplied column names ("index", "klasa").
y_train = pd.read_csv("../output/y_train.csv", names=["index", "klasa"], index_col="index")

X_test = pd.read_csv("../output/X_test.csv", index_col="index")
y_test = pd.read_csv("../output/y_test.csv", names=["index", "klasa"], index_col="index")

# Features and labels must describe the same observations in the same order.
# Index.equals returns False when the lengths differ, whereas an element-wise `==`
# would raise ValueError before the assert could report anything useful.
assert X_train.index.equals(y_train.index), "X_train and y_train indices differ"
assert X_test.index.equals(y_test.index), "X_test and y_test indices differ"

In [3]:
# Hyperparameters shared by the decision tree and the random forest built below.
hiperparametry = dict(
    criterion="entropy",
    max_depth=4,
    min_samples_leaf=5,
    min_samples_split=10,
)

In [4]:
# Single decision tree configured from the shared hyperparameters.
# `hiperparametry` holds exactly the four keyword arguments the tree is given
# (criterion, max_depth, min_samples_leaf, min_samples_split), so it is unpacked directly.
drzewo = DecisionTreeClassifier(random_state=42, **hiperparametry)

In [5]:
# Fit the tree; the labels are passed as a 1-D array taken from the single "klasa" column.
drzewo.fit(X_train, y_train["klasa"].values)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=10,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')

In [6]:
# Raw feature importances of the fitted tree, one value per X_train column
# (the eli5 table in the next cell shows the same numbers with feature names attached).
drzewo.feature_importances_

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.07240611, 0.        , 0.02329509, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.64390235, 0.04423903, 0.        , 0.01217696, 0.        ,
       0.        , 0.04066717, 0.16331328, 0.        , 0.        ])

In [7]:
# Render the tree's feature weights, labelling features with the training column
# names and passing "Z" / "Ł" as the target names.
eli5.show_weights(
    drzewo,
    target_names=["Z", "Ł"],
    feature_names=X_train.columns.tolist(),
)

Weight,Feature
0.6439,worst radius
0.1633,worst concave points
0.0724,texture error
0.0442,worst texture
0.0407,worst concavity
0.0233,area error
0.0122,worst area
0,compactness error
0,mean area
0,mean smoothness


In [8]:
# First test observation; this row is the example explained with eli5.show_prediction further down.
X_test.iloc[0]

mean radius                  19.550000
mean texture                 28.770000
mean perimeter              133.600000
mean area                  1207.000000
mean smoothness               0.092600
mean compactness              0.206300
mean concavity                0.178400
mean concave points           0.114400
mean symmetry                 0.189300
mean fractal dimension        0.062320
radius error                  0.842600
texture error                 1.199000
perimeter error               7.158000
area error                  106.400000
smoothness error              0.006356
compactness error             0.047650
concavity error               0.038630
concave points error          0.015190
symmetry error                0.019360
fractal dimension error       0.005252
worst radius                 25.050000
worst texture                36.270000
worst perimeter             178.600000
worst area                 1926.000000
worst smoothness              0.128100
worst compactness        

In [9]:
# Label stored in y_test for the first test row.
y_test.iloc[0]

klasa    Z
Name: 88649001, dtype: object

In [10]:
# Per-feature contributions to the decision tree's prediction for the first test row.
eli5.show_prediction(drzewo, doc = X_test.iloc[0])

Contribution?,Feature
0.573,worst radius
0.374,<BIAS>
0.032,texture error
0.021,worst concavity


In [11]:
# Random forest of 100 bootstrapped trees with out-of-bag scoring enabled.
# The per-tree settings come from the same `hiperparametry` dict used for the
# single tree; it holds exactly the four keyword arguments that were spelled out here.
las = RandomForestClassifier(
    n_estimators=100,
    bootstrap=True,
    oob_score=True,
    random_state=42,
    **hiperparametry
)

In [12]:
# Fit the forest; the labels are passed as a 1-D array taken from the single "klasa" column.
las.fit(X_train, y_train["klasa"].values)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=4, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=True, random_state=42, verbose=0, warm_start=False)

In [13]:
# Render the forest's feature weights, labelled with the training column names.
eli5.show_weights(las, feature_names=X_train.columns.tolist())

Weight,Feature
0.1560  ± 0.5029,worst area
0.1384  ± 0.4616,worst concave points
0.1012  ± 0.4243,mean concave points
0.0922  ± 0.4186,worst radius
0.0768  ± 0.3415,worst perimeter
0.0693  ± 0.3336,mean radius
0.0689  ± 0.3399,mean perimeter
0.0561  ± 0.2881,mean concavity
0.0434  ± 0.2647,mean area
0.0360  ± 0.1806,worst concavity


In [14]:
# Per-feature contributions to the random forest's prediction for the first test row.
eli5.show_prediction(las, doc = X_test.iloc[0])

Contribution?,Feature
0.376,<BIAS>
0.112,worst area
0.077,worst concave points
0.068,worst radius
0.067,mean concave points
0.06,mean radius
0.057,mean perimeter
0.044,worst perimeter
0.036,mean concavity
0.031,mean area
