# Code Example: Build Decision Tree Models to Identify Key Risk Indicators 

### 1. Define a function to calculate Weight of Evidence (WOE) and Information Value (IV) for feature selection

In [None]:
# Load packages
import pandas as pd
import numpy as np

# Read data: '.' entries are parsed as missing values, and the unused column is dropped right away
data = pd.read_csv('~/raw_data.csv', sep = ',', na_values = ['.']).drop(['unused_feature1'], axis = 1)

# Define function to calculate WOE and IV
def calc_woe_iv(data, target, grp_nbr = 10, print_iv = False):
    """Compute Weight of Evidence (WOE) and Information Value (IV) per feature.

    Every column of ``data`` other than ``target`` is treated as a feature.
    A feature with more than 10 distinct values is cut into at most
    ``grp_nbr`` quantile bins (duplicate bin edges are dropped); any other
    feature is grouped on its raw values.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the feature columns and the target column.
    target : str
        Name of the target column. Its per-group sum is used as the "bad"
        count, so it is expected to be coded 0/1 with 1 meaning "bad".
    grp_nbr : int, default 10
        Number of quantile bins requested for high-cardinality features.
    print_iv : bool, default False
        When true, print the final WOE and IV tables once before returning.

    Returns
    -------
    all_woe : pandas.DataFrame
        One row per (feature, group) with the columns ``feature``, ``group``,
        ``tot_cnt``, ``bad_cnt``, ``bad_percent``, ``good_cnt``,
        ``good_percent``, ``woe`` and ``iv``.
    all_iv : pandas.DataFrame
        One row per feature with the columns ``feature`` and ``iv`` (the sum
        of the per-group ``iv`` values). Both frames are empty when ``data``
        has no feature columns.
    """
    # Per-feature tables are collected here and concatenated once at the end
    woe_tables, iv_tables = [], []

    # Get column names from raw data
    cols = data.columns

    # Calculate WOE and IV for all independent variables
    for feature in cols[~cols.isin([target])]:
        values = data[feature]

        # nunique(dropna = False) counts NaN as one value and, unlike
        # np.unique, does not fail on object columns that contain NaN
        if values.nunique(dropna = False) > 10:
            x_grp = pd.qcut(values, grp_nbr, duplicates = 'drop')
        else:
            x_grp = values
        data_grp = pd.DataFrame({'x': x_grp, 'y': data[target]})

        # observed = True keeps only groups that actually occur, so unused
        # categorical levels do not receive pseudo-counts below
        data_woe = (
            data_grp.groupby('x', as_index = False, observed = True)
            .agg(tot_cnt = ('y', 'count'), bad_cnt = ('y', 'sum'))
            .rename(columns = {'x': 'group'})
        )

        # Adjusted WOE is used to avoid zero bad or zero good:
        # a group count of 0 is floored to 0.5 before taking the share
        data_woe['bad_percent'] = np.maximum(data_woe['bad_cnt'], 0.5) / data_woe['bad_cnt'].sum()
        data_woe['good_cnt'] = data_woe['tot_cnt'] - data_woe['bad_cnt']
        data_woe['good_percent'] = np.maximum(data_woe['good_cnt'], 0.5) / data_woe['good_cnt'].sum()
        data_woe['woe'] = np.log(data_woe['good_percent'] / data_woe['bad_percent'])
        data_woe['iv'] = (data_woe['good_percent'] - data_woe['bad_percent']) * data_woe['woe']

        # Tag the rows with the feature name and keep the per-feature results
        data_woe.insert(loc = 0, column = 'feature', value = feature)
        woe_tables.append(data_woe)
        iv_tables.append(pd.DataFrame({'feature': [feature], 'iv': [data_woe['iv'].sum()]}))

    # pd.concat rejects an empty list, so fall back to empty frames
    all_woe = pd.concat(woe_tables, axis = 0) if woe_tables else pd.DataFrame()
    all_iv = pd.concat(iv_tables, axis = 0) if iv_tables else pd.DataFrame()

    # Show WOE and IV tables (once, after all features are processed)
    if print_iv:
        print(all_woe)
        print(all_iv)
    return all_woe, all_iv

### 2. Conduct feature selection based on Information Value - using 0.1 as threshold

In [None]:
# Call function to get WOE and IV for all independent variables
# calc_woe_iv returns (WOE table, IV table) in that order
weight_of_evi, info_value = calc_woe_iv(data = data, target = 'target', grp_nbr = 10, print_iv = False)

# Remove features with IV < 0.1 (uses the per-feature IV table, one row per feature)
feature_drop = info_value.loc[info_value['iv'] < 0.1, 'feature'].to_list()

# Keep every column that is not on the drop list; the target column is never on it
selected_data = data.loc[:, ~data.columns.isin(feature_drop)]

### 3. Build decision trees

In [None]:
from sklearn import tree
from sklearn.model_selection import train_test_split

# Separate the predictors from the response column
X = selected_data.drop(columns = ['target'])
Y = selected_data['target']

# Hold out 30% of the rows as a test set
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3)

# Fit a decision tree with default settings, then predict labels for the held-out rows
clf = tree.DecisionTreeClassifier().fit(X_train, y_train)
y_pred = clf.predict(X_test)

### 4. Evaluate model performance and visualize trees

In [None]:
import matplotlib.pyplot as plt
from sklearn import metrics

# Calculate and print confusion matrix, classification report and accuracy
print(metrics.confusion_matrix(y_test, y_pred))

# Classification report includes precision, recall, f1-score, and support
print(metrics.classification_report(y_test, y_pred))
print("accuracy:", metrics.accuracy_score(y_test, y_pred))

# Calculate and visualize ROC curve
# Rank with the predicted probability of the second class (column 1 of predict_proba);
# hard 0/1 labels would collapse the ROC curve to a single operating point.
# NOTE(review): assumes a binary target whose positive class is clf.classes_[1] - confirm
y_score = clf.predict_proba(X_test)[:, 1]
roc_auc = metrics.roc_auc_score(y_test, y_score)
print("roc_auc:", roc_auc)
fpr, tpr, threshold = metrics.roc_curve(y_test, y_score)
plt.figure()
plt.plot(fpr, tpr)
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('Receiver Operating Characteristic')
plt.show()

# Plot trees (feature names are passed as a plain list of column labels)
plt_tree = tree.plot_tree(decision_tree = clf, feature_names = list(X.columns))