In [None]:
pip install -r "..\requirements.txt"

In [1]:
# Zelle 1 - Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
from alibi.explainers import AnchorTabular
from alibi.utils import gen_category_map
import random

# Reproducibility: one shared seed for Python's and NumPy's global random generators.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)





In [2]:
# Cell 2 - load the data
# The CSV is read relative to the notebook's working directory.
data_raw = pd.read_csv(filepath_or_buffer="loan_data.csv")

# Overview: show the first five rows as the cell output.
data_raw.head(n=5)


Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.0,female,Master,71948.0,0,RENT,35000.0,PERSONAL,16.02,0.49,3.0,561,No,1
1,21.0,female,High School,12282.0,0,OWN,1000.0,EDUCATION,11.14,0.08,2.0,504,Yes,0
2,25.0,female,High School,12438.0,3,MORTGAGE,5500.0,MEDICAL,12.87,0.44,3.0,635,No,1
3,23.0,female,Bachelor,79753.0,0,RENT,35000.0,MEDICAL,15.23,0.44,2.0,675,No,1
4,24.0,male,Master,66135.0,1,RENT,35000.0,MEDICAL,14.27,0.53,4.0,586,No,1


In [3]:
# Cell 3 - define the columns
# Name of the column holding the prediction target.
label_col = "loan_status"

# Columns listed here are handed to gen_category_map as the categorical ones.
categorical_features = [
    "person_gender", "person_education", "person_home_ownership",
    "loan_intent", "previous_loan_defaults_on_file",
]

# Remaining feature columns.
numeric_features = [
    "person_age", "person_income", "person_emp_exp", "loan_amnt",
    "loan_int_rate", "loan_percent_income", "cb_person_cred_hist_length",
    "credit_score",
]

# Separate the label from the features and keep both as NumPy arrays.
target = data_raw[label_col].to_numpy()
data_df = data_raw.drop(columns=[label_col])
feature_names = list(data_df.columns)
data = data_df.to_numpy()

# category_map is consumed later as {column position: list of category values}.
category_map = gen_category_map(data=data_df, categorical_columns=categorical_features)

In [4]:
# Map the categorical features to integers using category_map
# (column position -> list of categories; a category's list position becomes its code).
#
# Start from a fresh, writable copy of the feature matrix so this cell is idempotent:
# encoding the already-encoded array again would look up integer codes in a dict keyed
# by the original category values and silently replace every entry with None.
data = data_df.to_numpy(copy=True)
for col, categories in category_map.items():
    cat_to_int = {cat: idx for idx, cat in enumerate(categories)}
    data[:, col] = np.vectorize(cat_to_int.get)(data[:, col])

In [5]:
# Train/test split: hold out 20 % of the rows; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=SEED,
)

In [6]:
from sklearn.impute import SimpleImputer
# StandardScaler is part of the public sklearn.preprocessing API; importing it from
# sklearn.discriminant_analysis only works because that module happens to use it internally.
from sklearn.preprocessing import StandardScaler


# Column positions that are NOT keys of category_map, i.e. the non-categorical features.
# They are median-imputed and then standardized.
ordinal_features = [x for x in range(len(feature_names)) if x not in category_map]
ordinal_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                                      ('scaler', StandardScaler())])

# NOTE: from here on `categorical_features` holds column positions (the keys of
# category_map) instead of the column names it was first defined with.
# These columns are imputed with the most frequent value and one-hot encoded;
# handle_unknown='ignore' avoids an error on categories not seen during fit.
categorical_features = list(category_map.keys())
categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                                          ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [7]:
# Route each group of column positions through its own pipeline, then fit the whole
# preprocessor on the training rows only.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', ordinal_transformer, ordinal_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)
preprocessor.fit(x_train)

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [8]:
from sklearn.ensemble import RandomForestClassifier


np.random.seed(0)
# Alternative model kept for reference:
# clf = RandomForestClassifier(n_estimators=50)

# Small MLP with a single hidden layer of 32 units, trained on the preprocessed
# training rows; early stopping ends training after 5 epochs without improvement.
clf = MLPClassifier(
    hidden_layer_sizes=(32,),
    solver='adam',
    batch_size=256,
    max_iter=100,
    early_stopping=True,
    n_iter_no_change=5,
    random_state=SEED,
    verbose=False,
)
clf.fit(preprocessor.transform(x_train), y_train)

0,1,2
,hidden_layer_sizes,"(32,)"
,activation,'relu'
,solver,'adam'
,alpha,0.0001
,batch_size,256
,learning_rate,'constant'
,learning_rate_init,0.001
,power_t,0.5
,max_iter,100
,shuffle,True


In [9]:
from sklearn.metrics import accuracy_score


def predict_fn(x):
    """Predict class labels for rows in the un-preprocessed feature layout.

    The rows are passed through the fitted ``preprocessor`` before being handed
    to ``clf``, so callers can work with the same arrays as ``x_train``/``x_test``.
    """
    return clf.predict(preprocessor.transform(x))


# Compare accuracy on both splits.
train_accuracy = accuracy_score(y_train, predict_fn(x_train))
test_accuracy = accuracy_score(y_test, predict_fn(x_test))
print('Train accuracy: ', train_accuracy)
print('Test accuracy: ', test_accuracy)

Train accuracy:  0.9143888888888889
Test accuracy:  0.9112222222222223


In [10]:
# Build the anchor explainer around the black-box prediction function; category_map
# supplies the category values for the categorical column positions.
explainer = AnchorTabular(
    predictor=predict_fn,
    feature_names=feature_names,
    categorical_names=category_map,
    seed=1,
)

In [11]:
# Fit the explainer on the training rows, passing the quartile percentiles (25/50/75)
# as disc_perc.
explainer.fit(train_data=x_train, disc_perc=[25, 50, 75])

AnchorTabular(meta={
  'name': 'AnchorTabular',
  'type': ['blackbox'],
  'explanations': ['local'],
  'params': {'seed': 1, 'disc_perc': [25, 50, 75]},
  'version': '0.9.6'}
)

In [12]:
# Pick one test row and show the model's decision for it in readable form.
idx = 200
class_names = ['Not Approved', 'Approved']
# The predictor is called with a 2-D batch, so reshape the single row to (1, n_features).
instance_batch = x_test[idx].reshape(1, -1)
predicted_class = explainer.predictor(instance_batch)[0]
print('Prediction: ', class_names[predicted_class])

Prediction:  Not Approved


In [13]:
# Show the feature vector of the selected test row (categoricals are integer codes).
print(x_test[idx, :])

[48.0 1 0 84772.0 28 0 21600.0 2 7.51 0.25 11.0 664 1]


In [14]:
# Only explain when the explainer in memory was built for this dataset's feature list;
# otherwise tell the user to re-run the cells that define and fit it.
if explainer.feature_names != feature_names:
    print("Error: The explainer is not fitted for the loan dataset. Please re-run the cells that define and fit the explainer for the loan data.")
else:
    # Request an anchor with a precision threshold of 0.95 for the selected row.
    explanation = explainer.explain(x_test[idx], threshold=0.95)
    anchor_rule = ' AND '.join(explanation.anchor)
    print('Anchor: %s' % anchor_rule)
    print('Precision: %.2f' % explanation.precision)
    print('Coverage: %.2f' % explanation.coverage)

Anchor: previous_loan_defaults_on_file = Yes
Precision: 1.00
Coverage: 0.50
