# Explainable AI: COMPAS Predictions

Importing libraries

In [None]:
import pandas as pd

# for imputing modes and medians
from sklearn.impute import SimpleImputer

# for random forest
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

import shap

# ! Assign target !

In [119]:
# Name of the label column. Later cells use this variable to keep, filter on,
# and split off the column the model predicts.
target = "is_recid"

Importing data

In [120]:
# Location of the input CSV; edit this if you are using a different version of the data.
dataPath = "cox-violent-parsed_filt.csv"
dfOriginal = pd.read_csv(dataPath)

Remove duplicates, only one row per name

In [121]:
# Keep the first row seen for each name and discard any later repeats,
# then display the non-null count of every column.
dfProcessed = dfOriginal.drop_duplicates(subset='name', keep='first')
dfProcessed.count()

id                          6560
name                       10855
first                      10855
last                       10855
sex                        10855
dob                        10855
age                        10855
age_cat                    10855
race                       10855
juv_fel_count              10855
decile_score               10855
juv_misd_count             10855
juv_other_count            10855
priors_count               10855
days_b_screening_arrest     9781
c_jail_in                   9781
c_jail_out                  9781
c_days_from_compas         10185
c_charge_degree            10185
c_charge_desc              10178
is_recid                   10855
r_charge_degree             3425
r_days_from_arrest          2281
r_offense_date              3425
r_charge_desc               3368
r_jail_in                   2281
violent_recid                  0
is_violent_recid           10855
vr_charge_degree             809
vr_offense_date              809
vr_charge_

Remove unused columns

In [122]:
# Columns fed to the model, followed by the label column; everything else is dropped.
keepCols = [
    'sex', 'age', 'race',
    'juv_fel_count', 'juv_misd_count', 'juv_other_count',
    'priors_count', 'c_charge_degree',
    target,
]
dfProcessed = dfProcessed[keepCols]
dfProcessed.count()

sex                10855
age                10855
race               10855
juv_fel_count      10855
juv_misd_count     10855
juv_other_count    10855
priors_count       10855
c_charge_degree    10185
is_recid           10855
dtype: int64

Remove rows where is_recid is -1 (the target must be binary)

In [123]:
# Sanity check only: collect the rows whose target is negative, i.e. the rows
# we expect the following filter to remove, and show how many there are.
negativeTarget = dfProcessed[target] < 0
dfCheck = dfProcessed.loc[negativeTarget]
dfCheck.count()

sex                648
age                648
race               648
juv_fel_count      648
juv_misd_count     648
juv_other_count    648
priors_count       648
c_charge_degree      1
is_recid           648
dtype: int64

In [124]:
# Keep only the rows whose target value is valid (greater than -1),
# then display the remaining non-null counts.
validTarget = dfProcessed[target] > -1
dfProcessed = dfProcessed.loc[validTarget]
dfProcessed.count()

sex                10207
age                10207
race               10207
juv_fel_count      10207
juv_misd_count     10207
juv_other_count    10207
priors_count       10207
c_charge_degree    10184
is_recid           10207
dtype: int64

Missing value strategy
1. Numerical values --> MEDIAN imputation
2. Categorical values --> MODE imputation

In [125]:
# Is there a missing value anywhere in the frame?
# (Added as a double check because a heatmap of the data looked empty.)
hasNaN = dfProcessed.isna().to_numpy().any()
print("Any NaN values?\n", hasNaN)

Any NaN values?
 True


# DATA LEAKAGE 😰😰😰😰

In [126]:
# Work on an explicit copy so the column assignments below modify this frame
# itself rather than a filtered view of an earlier frame.
dfProcessed = dfProcessed.copy()

# Learn the fill values from the TRAINING rows only and then apply them to
# every row, so that test-set values never influence the imputation
# (this is what removes the data leakage).
# With shuffling and no stratification, train_test_split chooses rows by
# position from (number of rows, test_size, random_state), so the settings
# below select the same training rows as the split in the Random Forest
# section. Keep the two places in sync if either value is changed.
testSize = 0.3
randNum = 44
trainRows, heldOutRows = train_test_split(dfProcessed, test_size=testSize, random_state=randNum)

# Numerical columns --> median of the training rows
numCols = ['age', 'juv_fel_count', 'juv_misd_count', 'juv_other_count', 'priors_count']
numImputer = SimpleImputer(strategy = 'median')
numImputer.fit(trainRows[numCols])
dfProcessed[numCols] = numImputer.transform(dfProcessed[numCols])

# Categorical columns --> mode of the training rows
catCols = ['sex', 'race', 'c_charge_degree']
catImputer = SimpleImputer(strategy = 'most_frequent') # most_frequent = mode
catImputer.fit(trainRows[catCols])
dfProcessed[catCols] = catImputer.transform(dfProcessed[catCols])

# Confirm that the imputation left no missing values behind
print("Any NaN values?\n", dfProcessed.isnull().values.any())

Any NaN values?
 False


SMOTE? Impute from another dataset? Impute from the same dataset?

In [127]:
# We should discuss what the plan is for this part

Dummy Model
(Not sure whether this is the right approach?)

In [128]:
# all target = 0

## Random Forest
!Note!

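The Random Forest implementation used here requires every feature to be numeric, so the categorical columns must be encoded as numbers before training. We do this with one-hot encoding (`pd.get_dummies`), which creates one column per category. This is fine for our situation because the categorical values are not unique to each individual: each column has only a small set of categories shared across people, so it converts cleanly into a handful of numeric columns.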

In [None]:
# One-hot encode the categorical columns (pd.get_dummies) after separating
# the label column from the model inputs.
featureFrame = dfProcessed.drop(columns=[target])
X = pd.get_dummies(featureFrame)
y = dfProcessed[target]

# Tunable settings for the split and the model
testSize = 0.3 # fraction of rows held out for testing
randNum = 44 # random_state shared by the split and the forest

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=testSize,
    random_state=randNum,
)

# class_weight='balanced' is passed so the forest re-weights the two classes
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=10,
    min_samples_leaf=5,
    class_weight='balanced',
    random_state=randNum,
)
rf.fit(X_train, y_train)

0,1,2
,"n_estimators  n_estimators: int, default=100 The number of trees in the forest. .. versionchanged:: 0.22  The default value of ``n_estimators`` changed from 10 to 100  in 0.22.",300
,"criterion  criterion: {""gini"", ""entropy"", ""log_loss""}, default=""gini"" The function to measure the quality of a split. Supported criteria are ""gini"" for the Gini impurity and ""log_loss"" and ""entropy"" both for the Shannon information gain, see :ref:`tree_mathematical_formulation`. Note: This parameter is tree-specific.",'gini'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",10
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",5
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: {""sqrt"", ""log2"", None}, int or float, default=""sqrt"" The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at each  split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. .. versionchanged:: 1.1  The default of `max_features` changed from `""auto""` to `""sqrt""`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.",'sqrt'
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0
,"bootstrap  bootstrap: bool, default=True Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.",True


In [130]:
# Predicted class label for every row of the held-out test set
predictions = rf.predict(X=X_test)
predictions

array([0, 1, 0, ..., 1, 1, 0], shape=(3063,))

In [131]:
# Per-class probabilities behind each test-set prediction
# (one row per test sample, one column per class)
rf.predict_proba(X=X_test)

array([[0.5792188 , 0.4207812 ],
       [0.34951405, 0.65048595],
       [0.71898156, 0.28101844],
       ...,
       [0.30348349, 0.69651651],
       [0.49229964, 0.50770036],
       [0.77061865, 0.22938135]], shape=(3063, 2))

In [132]:
# Print the forest's importance score for every encoded feature column
importances = rf.feature_importances_
cols = X.columns

for colName, score in zip(cols, importances):
    print(f'Importance of {colName} = {round(score * 100, 2)}%.')

# Note: get_dummies split each categorical feature into one column per
#   category, so each category gets its own importance here.
# They are kept separate for now so we can see how
#   different races and charge degrees are treated.

Importance of age = 28.45%.
Importance of juv_fel_count = 2.66%.
Importance of juv_misd_count = 4.8%.
Importance of juv_other_count = 5.53%.
Importance of priors_count = 38.59%.
Importance of sex_Female = 2.13%.
Importance of sex_Male = 2.48%.
Importance of race_African-American = 4.26%.
Importance of race_Asian = 0.26%.
Importance of race_Caucasian = 1.63%.
Importance of race_Hispanic = 1.09%.
Importance of race_Native American = 0.03%.
Importance of race_Other = 0.68%.
Importance of c_charge_degree_(CO3) = 0.0%.
Importance of c_charge_degree_(CT) = 0.0%.
Importance of c_charge_degree_(F1) = 0.88%.
Importance of c_charge_degree_(F2) = 1.09%.
Importance of c_charge_degree_(F3) = 2.32%.
Importance of c_charge_degree_(F5) = 0.0%.
Importance of c_charge_degree_(F6) = 0.01%.
Importance of c_charge_degree_(F7) = 0.5%.
Importance of c_charge_degree_(M1) = 1.48%.
Importance of c_charge_degree_(M2) = 0.93%.
Importance of c_charge_degree_(MO3) = 0.17%.
Importance of c_charge_degree_(NI0) = 0.0%

In [133]:
# Precision, recall, F1 and support per class on the test set
from sklearn.metrics import classification_report

testReport = classification_report(y_test, rf.predict(X_test))
print(testReport)

              precision    recall  f1-score   support

           0       0.79      0.66      0.72      2047
           1       0.49      0.65      0.56      1016

    accuracy                           0.66      3063
   macro avg       0.64      0.66      0.64      3063
weighted avg       0.69      0.66      0.67      3063



# TreeSHAP

In [None]:
# Explain the fitted forest on the test set with TreeSHAP
explainer = shap.TreeExplainer(rf)

shapValues = explainer.shap_values(X_test)

# Select the SHAP values for the positive class (is_recid = 1).
# The return type depends on the SHAP version:
#   - older releases: a list with one (n_samples, n_features) array per class
#   - newer releases: one array of shape (n_samples, n_features, n_classes)
# Plain `shapValues[1]` is only correct for the list form; on the array form
# it would pick the second SAMPLE instead of the second class.
if isinstance(shapValues, list):
    sv = shapValues[1] # 0 for negative is_recid, 1 for positive
else:
    sv = shapValues[:, :, 1] # last axis: 0 for negative is_recid, 1 for positive

In [None]:
# Bar chart of the SHAP values per feature
shap.summary_plot(shap_values=sv, features=X_test, plot_type="bar")
# Default summary plot of the same SHAP values
shap.summary_plot(shap_values=sv, features=X_test)

# ANCHORS

In [None]:
# anchors