Import essential libraries

In [21]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

Import data from file

In [22]:
# Read the loan dataset from 'loan_data.csv' (path is relative to the
# notebook's working directory) into a DataFrame.
data_df = pd.read_csv('loan_data.csv')
# Bare last expression: the notebook renders the frame as the cell output.
data_df

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.0,female,Master,71948.0,0,RENT,35000.0,PERSONAL,16.02,0.49,3.0,561,No,1
1,21.0,female,High School,12282.0,0,OWN,1000.0,EDUCATION,11.14,0.08,2.0,504,Yes,0
2,25.0,female,High School,12438.0,3,MORTGAGE,5500.0,MEDICAL,12.87,0.44,3.0,635,No,1
3,23.0,female,Bachelor,79753.0,0,RENT,35000.0,MEDICAL,15.23,0.44,2.0,675,No,1
4,24.0,male,Master,66135.0,1,RENT,35000.0,MEDICAL,14.27,0.53,4.0,586,No,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44995,27.0,male,Associate,47971.0,6,RENT,15000.0,MEDICAL,15.66,0.31,3.0,645,No,1
44996,37.0,female,Associate,65800.0,17,RENT,9000.0,HOMEIMPROVEMENT,14.07,0.14,11.0,621,No,1
44997,33.0,male,Associate,56942.0,7,RENT,2771.0,DEBTCONSOLIDATION,10.02,0.05,10.0,668,No,1
44998,29.0,male,Bachelor,33164.0,4,RENT,12000.0,EDUCATION,13.23,0.36,6.0,604,No,1


Check the percentage of missing (N/A) values in each column

In [23]:
# isna() flags each missing cell; the column-wise mean of those flags,
# multiplied by 100, is the percentage of missing values per column.
data_df.isna().mean() * 100

person_age                        0.0
person_gender                     0.0
person_education                  0.0
person_income                     0.0
person_emp_exp                    0.0
person_home_ownership             0.0
loan_amnt                         0.0
loan_intent                       0.0
loan_int_rate                     0.0
loan_percent_income               0.0
cb_person_cred_hist_length        0.0
credit_score                      0.0
previous_loan_defaults_on_file    0.0
loan_status                       0.0
dtype: float64

Define a decision tree classifier that chooses splits by information gain (entropy or Gini impurity)

In [24]:
# Class for Nodes in the Decision Tree
class TreeNode:
    """A single node of the decision tree.

    A node is a leaf when ``label`` is set. Otherwise it routes a sample to
    one of ``children`` based on the sample's value for ``attribute``:

    * numeric node: ``split_value`` is a threshold; values ``<=`` it go to
      ``children[0]``, everything else to ``children[1]``.
    * categorical node: ``split_value`` is a sequence of groups, paired
      positionally with ``children``.
    """

    def __init__(self, attribute=None, split_value=None, is_numeric=None, children=None, label=None):
        self.label = label
        self.attribute = attribute
        self.is_numeric = is_numeric
        self.split_value = split_value
        # Never share a default list between nodes.
        self.children = children if children else []

    def is_leaf(self):
        """Return True when this node carries a class label."""
        return not (self.label is None)

    def get_branch(self, sample):
        """Return the child that ``sample`` falls into.

        Returns None for a leaf, or when the sample's categorical value is
        not contained in any group of ``split_value``.
        """
        if self.is_leaf():
            return None

        value = sample[self.attribute]

        if self.is_numeric:
            index = 0 if value <= self.split_value else 1
            return self.children[index]

        # Categorical: the first group containing the value decides the branch.
        matches = (
            child
            for child, group in zip(self.children, self.split_value)
            if value in group
        )
        return next(matches, None)

# Decision Tree class for creating and displaying the tree
class DecisionTree:
    """Decision tree classifier supporting numeric and multiway categorical splits.

    Candidate splits are computed once in ``fit``: thresholds (midpoints
    between consecutive unique values) for numeric columns and set partitions
    of the observed categories for object columns. The tree is then grown
    recursively, picking at each node the candidate with the highest
    impurity reduction. An attribute is used at most once along any
    root-to-leaf path.

    Parameters
    ----------
    max_depth : int or None
        Maximum depth of the tree; ``None`` means no limit.
    min_samples_split : int
        Nodes with fewer samples than this become leaves.
    criterion : str
        ``'entropy'`` uses Shannon entropy; any other value uses Gini impurity.
    numerical_precision : float
        Controls how many numeric thresholds are kept: every ``k``-th midpoint
        is used, with ``k = int(n_midpoints * (1 - numerical_precision)) + 1``.
        Larger values keep more thresholds.
    max_splits : int
        Categorical partitions with ``max_splits`` or more groups are discarded.
    """

    def __init__(self, max_depth=None, min_samples_split=2, criterion='entropy', numerical_precision=0.75, max_splits=10):
        self.root = None
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.criterion = criterion
        self.numerical_precision = numerical_precision
        self.max_splits = max_splits

    def calculate_entropy(self, y):
        """Return the base-2 Shannon entropy of the labels in ``y``."""
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / counts.sum()
        return -np.sum(probabilities * np.log2(probabilities))

    def calculate_gini(self, y):
        """Return the Gini impurity of the labels in ``y``."""
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / counts.sum()
        return 1 - np.sum(probabilities ** 2)

    def information_gain(self, X, y, attribute, split_value, is_numeric):
        """Return the impurity reduction achieved by splitting on ``attribute``.

        The impurity measure is entropy when ``self.criterion == 'entropy'``
        and Gini impurity otherwise. The result is the parent impurity minus
        the size-weighted impurity of the subsets produced by ``split_data``.
        """
        # Select the impurity function once so both criteria share one code path.
        impurity = self.calculate_entropy if self.criterion == 'entropy' else self.calculate_gini
        total = len(y)
        subsets = self.split_data(X, y, attribute, split_value, is_numeric)
        weighted_impurity = sum((len(subset_y) / total) * impurity(subset_y) for _, subset_y in subsets)
        return impurity(y) - weighted_impurity

    # Split the data according to the attribute and split value
    def split_data(self, X, y, attribute, split_value, is_numeric):
        """Partition ``(X, y)`` by ``attribute``.

        Numeric: returns ``[(<= threshold), (> threshold)]``.
        Categorical: ``split_value`` is a list of groups; returns one
        ``(X_subset, y_subset)`` pair per group, in group order.
        """
        column = X[attribute]
        if is_numeric:
            left_mask = column <= split_value
            right_mask = column > split_value
            return [(X[left_mask], y[left_mask]), (X[right_mask], y[right_mask])]

        subsets = []
        for group in split_value:
            mask = column.isin(group)  # computed once and reused for X and y
            subsets.append((X[mask], y[mask]))
        return subsets

    # Find the best split attribute and value
    def best_split(self, X, y, splits_dict):
        """Return ``(attribute, split_value, is_numeric)`` with the highest gain.

        ``splits_dict`` maps attribute name to its list of candidate splits.
        Attributes with no candidates are skipped. Returns
        ``(None, None, None)`` when there is no candidate at all.
        """
        best_attribute, best_value, best_gain, best_is_numeric = None, None, -float("inf"), None

        for attribute, split_values in splits_dict.items():
            if len(split_values) == 0:
                # e.g. a numeric column with a single unique value
                continue
            # Numeric candidates are scalar thresholds; categorical candidates are
            # lists of groups. np.number also covers NumPy scalars that do not
            # subclass the Python int/float types.
            is_numeric = isinstance(split_values[0], (int, float, np.number))
            for split_value in split_values:
                gain = self.information_gain(X, y, attribute, split_value, is_numeric)
                if gain > best_gain:
                    best_attribute, best_value, best_gain, best_is_numeric = attribute, split_value, gain, is_numeric

        return best_attribute, best_value, best_is_numeric

    # Recursive function to build the decision tree
    def build_tree(self, X, y, splits_dict, depth=0):
        """Recursively grow the subtree for ``(X, y)`` and return its root node.

        Recursion stops with a majority-label leaf when the node is pure, the
        depth limit is reached, the node has fewer than ``min_samples_split``
        samples, or no candidate split is left. ``splits_dict`` is not modified.
        """
        # Compare against None explicitly so that max_depth=0 is a real limit.
        depth_limit_reached = self.max_depth is not None and depth >= self.max_depth
        if len(y.unique()) == 1 or depth_limit_reached or len(y) < self.min_samples_split:
            return TreeNode(label=y.mode()[0])

        best_attribute, best_value, is_numeric = self.best_split(X, y, splits_dict)
        majority_label = y.mode()[0]
        if best_attribute is None:
            return TreeNode(label=majority_label)

        # Descendants may not reuse the chosen attribute. Build a new dict
        # instead of popping from the caller's.
        remaining_splits = {name: values for name, values in splits_dict.items() if name != best_attribute}

        children = []
        for subset_X, subset_y in self.split_data(X, y, best_attribute, best_value, is_numeric):
            if len(subset_y) == 0:
                # No training samples reached this branch: fall back to the
                # parent's majority label rather than recursing on empty data.
                children.append(TreeNode(label=majority_label))
            else:
                children.append(self.build_tree(subset_X, subset_y, remaining_splits, depth + 1))
        return TreeNode(attribute=best_attribute, split_value=best_value, is_numeric=is_numeric, children=children)

    # Predict the class for samples X
    def predict(self, X):
        """Return an array with one predicted label per row of ``X``.

        A row whose categorical value matches no branch gets ``None``.

        Raises
        ------
        RuntimeError
            If the tree has not been fitted.
        """
        if self.root is None:
            raise RuntimeError("DecisionTree must be fitted before calling predict()")

        y_pred = []
        for _, sample in X.iterrows():
            node = self.root
            # Walk down until a leaf is reached or no branch matches.
            while node is not None and not node.is_leaf():
                node = node.get_branch(sample)
            y_pred.append(None if node is None else node.label)
        return np.array(y_pred)

    def calculate_numerical_splits(self, X):
        """Return ``{column: [thresholds]}`` for every numeric column of ``X``.

        Thresholds are midpoints between consecutive sorted unique values,
        thinned by a stride derived from ``numerical_precision``.
        """
        numerical_cols = X.select_dtypes(include=["number"]).columns
        numerical_splits_dict = {}
        for numerical_col in numerical_cols:
            sorted_values = sorted(X[numerical_col].dropna().unique())
            midpoints = [(a + b) / 2 for a, b in zip(sorted_values, sorted_values[1:])]
            # Stride >= 1; a higher numerical_precision gives a smaller stride.
            stride = int(len(midpoints) * (1 - self.numerical_precision)) + 1
            numerical_splits_dict[numerical_col] = midpoints[::stride]
        return numerical_splits_dict

    def categorical_splits(self, arr):
        """Return the set partitions of ``arr`` having fewer than ``max_splits`` groups.

        Each partition is a list of groups (lists of category values). An
        empty ``arr`` yields no partitions. The number of partitions grows
        quickly with ``len(arr)``.
        """
        arr = list(arr)
        if len(arr) == 0:
            # Nothing to partition; avoid emitting a degenerate empty split.
            return []

        possible_splits = []

        def generate_splits(subset, remaining):
            # Prune partitions that already have too many groups.
            if len(subset) >= self.max_splits:
                return

            if not remaining:
                possible_splits.append(subset)
                return

            head, tail = remaining[0], remaining[1:]
            # Either add the next value to an existing group ...
            for i in range(len(subset)):
                generate_splits(subset[:i] + [subset[i] + [head]] + subset[i+1:], tail)
            # ... or open a new group for it.
            generate_splits(subset + [[head]], tail)

        generate_splits([], arr)
        return possible_splits

    def calculate_categorical_splits(self, X):
        """Return ``{column: [partitions]}`` for every object-dtype column of ``X``."""
        categorical_cols = X.select_dtypes(include=["object"]).columns
        categorical_splits_dict = {}
        for categorical_col in categorical_cols:
            unique_values = list(X[categorical_col].dropna().unique())
            categorical_splits_dict[categorical_col] = self.categorical_splits(unique_values)
        return categorical_splits_dict

    def fit(self, X, y):
        """Build the tree from features ``X`` (DataFrame) and labels ``y`` (Series).

        Raises
        ------
        ValueError
            If ``y`` is empty.
        """
        if len(y) == 0:
            raise ValueError("Cannot fit a DecisionTree on an empty dataset")

        # Compute numerical and categorical splits
        numerical_splits_dict = self.calculate_numerical_splits(X)
        categorical_splits_dict = self.calculate_categorical_splits(X)
        splits_dict = {**numerical_splits_dict, **categorical_splits_dict}

        self.root = self.build_tree(X, y, splits_dict)

    def evaluate(self, y_true, y_pred):
        """Return accuracy, precision, recall and F1 for binary labels.

        The positive class is the label equal to ``1`` (or ``True``).
        Predictions of ``None`` count as incorrect and as non-positive.
        A metric whose denominator is zero is reported as ``0.0``.
        """
        y_true = np.asarray(y_true)
        y_pred = np.asarray(y_pred)

        accuracy = np.mean(y_pred == y_true)

        # Explicit boolean masks instead of bitwise operations on raw labels.
        predicted_positive = y_pred == 1
        actual_positive = y_true == 1

        tp = np.sum(predicted_positive & actual_positive)
        fp = np.sum(predicted_positive & ~actual_positive)
        fn = np.sum(~predicted_positive & actual_positive)

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        if (precision + recall) > 0:
            f1_score = 2 * (precision * recall) / (precision + recall)
        else:
            f1_score = 0.0

        return {"accuracy": accuracy, "precision": precision, "recall": recall, "F1-Score": f1_score}

    def print_tree(self, node=None, depth=0):
        """Print the subtree rooted at ``node`` (default: the whole tree).

        Raises
        ------
        RuntimeError
            If called without a node before the tree has been fitted.
        """
        if node is None:
            node = self.root
            if node is None:
                raise RuntimeError("DecisionTree must be fitted before calling print_tree()")

        if node.is_leaf():
            print("  " * depth + f"Leaf: {node.label}")
        else:
            operator = "≤" if node.is_numeric else "in"
            print("  " * depth + f"[{node.attribute} {operator} {node.split_value}]")
            for i, child in enumerate(node.children):
                print("  " * (depth + 1) + f"├─ Branch {i + 1}:")
                self.print_tree(child, depth + 2)

Split features and target:

In [25]:
# Features: every column except the target 'loan_status'.
X = data_df.drop(columns=['loan_status'])
# Target: the 'loan_status' column.
y = data_df['loan_status']
# Display both as a tuple for a quick visual check.
X, y

(       person_age person_gender person_education  person_income  \
 0            22.0        female           Master        71948.0   
 1            21.0        female      High School        12282.0   
 2            25.0        female      High School        12438.0   
 3            23.0        female         Bachelor        79753.0   
 4            24.0          male           Master        66135.0   
 ...           ...           ...              ...            ...   
 44995        27.0          male        Associate        47971.0   
 44996        37.0        female        Associate        65800.0   
 44997        33.0          male        Associate        56942.0   
 44998        29.0          male         Bachelor        33164.0   
 44999        24.0          male      High School        51609.0   
 
        person_emp_exp person_home_ownership  loan_amnt        loan_intent  \
 0                   0                  RENT    35000.0           PERSONAL   
 1                   0    

Split the dataset into train, test, and validation subsets

In [26]:
# First split: 80% train / 20% test, stratified on the target so both parts
# keep the same class balance; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=10, stratify=y)
# Second split: carve a validation set (10%) out of the training portion.
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, train_size=0.9, random_state=10, stratify=y_train)
# Show the resulting shapes as a sanity check.
X_train.shape, y_train.shape, X_valid.shape, y_valid.shape, X_test.shape, y_test.shape

((32400, 13), (32400,), (3600, 13), (3600,), (9000, 13), (9000,))

Train the custom decision tree, print its structure, and evaluate it on the test set

In [27]:
# Custom tree: depth limited to 7, numerical_precision=0.95 and max_splits=7
# are passed to the DecisionTree constructor defined in this notebook.
tree_classifier = DecisionTree(max_depth=7, numerical_precision=0.95, max_splits=7)
tree_classifier.fit(X_train, y_train)
# Print the learned tree structure.
tree_classifier.print_tree()
# Predict on the held-out test set and report accuracy/precision/recall/F1.
y_pred = tree_classifier.predict(X_test)
tree_classifier.evaluate(y_test, y_pred)

[previous_loan_defaults_on_file in [['No'], ['Yes']]]
  ├─ Branch 1:
    [loan_percent_income ≤ 0.245]
      ├─ Branch 1:
        [loan_int_rate ≤ 13.715]
          ├─ Branch 1:
            [person_income ≤ 26884.0]
              ├─ Branch 1:
                [loan_amnt ≤ 2458.5]
                  ├─ Branch 1:
                    [credit_score ≤ 552.5]
                      ├─ Branch 1:
                        Leaf: 1
                      ├─ Branch 2:
                        [loan_intent in [['EDUCATION'], ['DEBTCONSOLIDATION'], ['MEDICAL'], ['PERSONAL'], ['VENTURE'], ['HOMEIMPROVEMENT']]]
                          ├─ Branch 1:
                            Leaf: 1
                          ├─ Branch 2:
                            Leaf: 0
                          ├─ Branch 3:
                            Leaf: 1
                          ├─ Branch 4:
                            Leaf: 1
                          ├─ Branch 5:
                            Leaf: 1
                          ├─

{'accuracy': np.float64(0.9166666666666666),
 'precision': np.float64(0.8729116945107399),
 'recall': np.float64(0.7315),
 'F1-Score': np.float64(0.7959738846572362)}

Encoding categorical features as one-hot encoded values

In [28]:
# Columns with object dtype are treated as categorical.
categorical_cols = X.select_dtypes(include=["object"]).columns
# One-hot encode them; drop_first=True drops the first level of each
# category so the dummy columns are not redundant.
# NOTE(review): this rebinds X, so any earlier cell re-run after this one
# will see the encoded frame instead of the raw features.
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)
X

Unnamed: 0,person_age,person_income,person_emp_exp,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,person_gender_male,person_education_Bachelor,...,person_education_Master,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT,loan_intent_EDUCATION,loan_intent_HOMEIMPROVEMENT,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,previous_loan_defaults_on_file_Yes
0,22.0,71948.0,0,35000.0,16.02,0.49,3.0,561,False,False,...,True,False,False,True,False,False,False,True,False,False
1,21.0,12282.0,0,1000.0,11.14,0.08,2.0,504,False,False,...,False,False,True,False,True,False,False,False,False,True
2,25.0,12438.0,3,5500.0,12.87,0.44,3.0,635,False,False,...,False,False,False,False,False,False,True,False,False,False
3,23.0,79753.0,0,35000.0,15.23,0.44,2.0,675,False,True,...,False,False,False,True,False,False,True,False,False,False
4,24.0,66135.0,1,35000.0,14.27,0.53,4.0,586,True,False,...,True,False,False,True,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44995,27.0,47971.0,6,15000.0,15.66,0.31,3.0,645,True,False,...,False,False,False,True,False,False,True,False,False,False
44996,37.0,65800.0,17,9000.0,14.07,0.14,11.0,621,False,False,...,False,False,False,True,False,True,False,False,False,False
44997,33.0,56942.0,7,2771.0,10.02,0.05,10.0,668,True,False,...,False,False,False,True,False,False,False,False,False,False
44998,29.0,33164.0,4,12000.0,13.23,0.36,6.0,604,True,True,...,False,False,False,True,True,False,False,False,False,False


Split the dataset into train, test, and validation subsets

In [29]:
# Same split parameters as for the raw features (same random_state and
# stratification), now applied to the one-hot encoded X.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=10, stratify=y)
# Carve a validation set (10%) out of the training portion.
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, train_size=0.9, random_state=10, stratify=y_train)
# Show the resulting shapes as a sanity check.
X_train.shape, y_train.shape, X_valid.shape, y_valid.shape, X_test.shape, y_test.shape

((32400, 22), (32400,), (3600, 22), (3600,), (9000, 22), (9000,))

Comparing my model with scikit-learn's DecisionTreeClassifier

In [30]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Baseline: scikit-learn's decision tree with the same depth limit as the
# custom tree. random_state is fixed (same seed as the data splits) so the
# fitted tree, and therefore the metrics below, are reproducible across runs.
# NOTE(review): scikit-learn's default criterion is 'gini' while the custom
# tree above was trained with its default 'entropy'; pass criterion='entropy'
# here if a like-for-like comparison is wanted.
scikit_tree_classifier = DecisionTreeClassifier(max_depth=7, random_state=10)
scikit_tree_classifier.fit(X_train, y_train)

y_pred = scikit_tree_classifier.predict(X_test)

# Binary metrics, with label 1 as the positive class.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='binary')
recall = recall_score(y_test, y_pred, average='binary')
f1 = f1_score(y_test, y_pred, average='binary')

print(f"accuracy: {accuracy}\nprecision: {precision}\nrecall: {recall}\nF1-Score: {f1}")

accuracy: 0.9175555555555556
precision: 0.8744047619047619
recall: 0.7345
F1-Score: 0.7983695652173913
