## Importing Necessary Libraries

In [35]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

## Loading and Preprocessing the dataset

In [3]:
# Read the CSV (path is relative to the notebook's working directory) into a
# DataFrame and show the first rows as a quick sanity check.
df=pd.read_csv("AER_credit_card_data.csv")
df.head()

Unnamed: 0,card,reports,age,income,share,expenditure,owner,selfemp,dependents,months,majorcards,active
0,yes,0,37.66667,4.52,0.03327,124.9833,yes,no,3,54,1,12
1,yes,0,33.25,2.42,0.005217,9.854167,no,no,3,34,1,13
2,yes,0,33.66667,4.5,0.004156,15.0,yes,no,4,58,1,5
3,yes,0,30.5,2.54,0.065214,137.8692,no,no,0,25,1,7
4,yes,0,32.16667,9.7867,0.067051,546.5033,yes,no,2,64,1,5


<b>Checking for null values</b>

In [4]:
# Count the missing values in every column.
df.isnull().sum()

card           0
reports        0
age            0
income         0
share          0
expenditure    0
owner          0
selfemp        0
dependents     0
months         0
majorcards     0
active         0
dtype: int64

<b>Information about the column data types</b>

In [5]:
# Print each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1319 entries, 0 to 1318
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   card         1319 non-null   object 
 1   reports      1319 non-null   int64  
 2   age          1319 non-null   float64
 3   income       1319 non-null   float64
 4   share        1319 non-null   float64
 5   expenditure  1319 non-null   float64
 6   owner        1319 non-null   object 
 7   selfemp      1319 non-null   object 
 8   dependents   1319 non-null   int64  
 9   months       1319 non-null   int64  
 10  majorcards   1319 non-null   int64  
 11  active       1319 non-null   int64  
dtypes: float64(4), int64(5), object(3)
memory usage: 123.8+ KB


<b>Label encoding the categorical (dtype: object) columns</b>

In [None]:
# Columns stored as strings that must become integers before modelling.
categorical_columns = ['card', 'owner', 'selfemp']

# One dedicated encoder per column, kept around so the integer codes can be
# mapped back to the original labels later on.
label_encoders = {name: LabelEncoder() for name in categorical_columns}

# Fit each encoder on its own column and overwrite that column with the codes.
for col, encoder in label_encoders.items():
    df[col] = encoder.fit_transform(df[col])


<b>The categorical columns are now encoded as integers</b>

In [37]:
# Preview only the columns that were just label-encoded.
df[categorical_columns].head()

Unnamed: 0,card,owner,selfemp
0,1,1,0
1,1,0,0
2,1,1,0
3,1,0,0
4,1,1,0


<b>Scaling all numeric columns so that no single feature dominates the others because of its units. This is not required for Decision Trees and Random Forests, whose threshold-based splits do not depend on the scale of a feature, but it keeps the preprocessed data usable for scale-sensitive models as well.</b>

In [36]:
# Every non-categorical column; the label-encoded columns are left untouched.
numerical_features = [
    'reports',
    'age',
    'income',
    'share',
    'expenditure',
    'dependents',
    'months',
    'majorcards',
    'active',
]

# Standardise the numeric columns in place.
# NOTE(review): the scaler is fitted on the full DataFrame, i.e. before the
# train/test split performed further down, so test-set statistics leak into
# the transform. Consider fitting on the training rows only.
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

<b>Final overview of the dataset</b>

In [9]:
# Re-check dtypes after encoding and scaling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1319 entries, 0 to 1318
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   card         1319 non-null   int32  
 1   reports      1319 non-null   float64
 2   age          1319 non-null   float64
 3   income       1319 non-null   float64
 4   share        1319 non-null   float64
 5   expenditure  1319 non-null   float64
 6   owner        1319 non-null   int32  
 7   selfemp      1319 non-null   int32  
 8   dependents   1319 non-null   float64
 9   months       1319 non-null   float64
 10  majorcards   1319 non-null   float64
 11  active       1319 non-null   float64
dtypes: float64(9), int32(3)
memory usage: 108.3 KB


In [38]:
# Preview the first rows of the preprocessed DataFrame.
df.head()

Unnamed: 0,card,reports,age,income,share,expenditure,owner,selfemp,dependents,months,majorcards,active
0,1,-0.339397,0.439254,0.681894,-0.374787,-0.220766,1,0,1.608362,-0.019135,0.472824,0.793701
1,1,-0.339397,0.003639,-0.558317,-0.671269,-0.643854,0,0,1.608362,-0.321037,0.472824,0.952345
2,1,-0.339397,0.044735,0.670083,-0.682486,-0.624944,1,0,2.410112,0.041245,0.472824,-0.316807
3,1,-0.339397,-0.267592,-0.487448,-0.037185,-0.173411,0,0,-0.796887,-0.456893,0.472824,0.000481
4,1,-0.339397,-0.103209,3.792286,-0.017772,1.328281,1,0,0.806613,0.131816,0.472824,-0.316807


## Defining Decision Tree from scratch

<b>Node class</b>

In [10]:
class Node:
    ''' One node of a binary decision tree.

    A decision node carries the split description (feature_index, threshold,
    info_gain) together with its two children (left, right). A leaf node
    carries only the predicted value; every argument that is not supplied
    stays None.
    '''

    def __init__(self, feature_index=None, threshold=None, left=None,
                 right=None, info_gain=None, value=None):
        ''' Store the fields exactly as given. '''

        # split description and children (decision node)
        self.feature_index, self.threshold = feature_index, threshold
        self.left, self.right = left, right
        self.info_gain = info_gain

        # prediction payload (leaf node)
        self.value = value


<b>Tree class</b>

In [25]:
class DecisionTreeClassifier():
    ''' Decision tree classifier grown by greedy, recursive threshold splits.

    Every split has the form ``feature <= threshold``. Training data is handled
    internally as one 2-D array whose last column holds the labels. Tree nodes
    are instances of the ``Node`` class defined earlier in this notebook.
    '''

    def __init__(self, min_samples_split=2, max_depth=2, criterion="gini"):
        ''' constructor

        min_samples_split: smallest number of rows a node must hold to be split.
        max_depth: a node is split only while its depth is <= max_depth. The root
            has depth 0, so leaves can sit at depth max_depth + 1.
        criterion: impurity measure used to score splits, "gini" or "entropy".

        Raises ValueError for an unknown criterion.
        '''

        if criterion not in ("gini", "entropy"):
            raise ValueError('criterion must be "gini" or "entropy"')

        # initialize the root of the tree
        self.root = None

        # stopping conditions
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth

        # impurity measure forwarded to information_gain
        self.criterion = criterion

    def build_tree(self, dataset, curr_depth=0):
        ''' recursive function to build the tree

        dataset: 2-D array, features in all columns but the last, labels last.
        curr_depth: depth of the node being built (root = 0).
        Returns the Node at the top of the (sub)tree.
        '''

        X, Y = dataset[:, :-1], dataset[:, -1]
        num_samples, num_features = np.shape(X)

        # split until stopping conditions are met
        if num_samples >= self.min_samples_split and curr_depth <= self.max_depth:
            best_split = self.get_best_split(dataset, num_samples, num_features)
            # get_best_split returns {} when no threshold separates the rows
            # (every feature constant), so look the gain up defensively.
            if best_split.get("info_gain", 0) > 0:
                left_subtree = self.build_tree(best_split["dataset_left"], curr_depth + 1)
                right_subtree = self.build_tree(best_split["dataset_right"], curr_depth + 1)
                return Node(best_split["feature_index"], best_split["threshold"],
                            left_subtree, right_subtree, best_split["info_gain"])

        # no useful split: this node becomes a leaf
        return Node(value=self.calculate_leaf_value(Y))

    def get_best_split(self, dataset, num_samples, num_features):
        ''' function to find the best split

        Tries every observed value of every feature as a threshold and keeps the
        split with the highest information gain. Returns a dict with the keys
        feature_index, threshold, dataset_left, dataset_right and info_gain, or
        an empty dict when no threshold yields two non-empty children.
        '''

        best_split = {}
        max_info_gain = -float("inf")
        # the parent labels are the same for every candidate split
        y = dataset[:, -1]

        for feature_index in range(num_features):
            possible_thresholds = np.unique(dataset[:, feature_index])
            for threshold in possible_thresholds:
                dataset_left, dataset_right = self.split(dataset, feature_index, threshold)
                # skip degenerate splits that leave one side empty
                if len(dataset_left) == 0 or len(dataset_right) == 0:
                    continue
                curr_info_gain = self.information_gain(
                    y, dataset_left[:, -1], dataset_right[:, -1], self.criterion)
                # strict '>' keeps the first of several equally good splits
                if curr_info_gain > max_info_gain:
                    best_split = {
                        "feature_index": feature_index,
                        "threshold": threshold,
                        "dataset_left": dataset_left,
                        "dataset_right": dataset_right,
                        "info_gain": curr_info_gain,
                    }
                    max_info_gain = curr_info_gain

        return best_split

    def split(self, dataset, feature_index, threshold):
        ''' function to split the data

        Returns (rows with feature <= threshold, rows with feature > threshold).
        Both results are 2-D arrays, also when one of them has no rows.
        '''

        column = dataset[:, feature_index]
        # two explicit masks (rather than mask / ~mask) so that rows which
        # satisfy neither comparison, e.g. NaN, end up on neither side
        return dataset[column <= threshold], dataset[column > threshold]

    def information_gain(self, parent, l_child, r_child, mode="entropy"):
        ''' function to compute information gain

        Gain = impurity(parent) - size-weighted impurity of the two children.
        mode "gini" selects the Gini index; any other value selects entropy.
        '''

        weight_l = len(l_child) / len(parent)
        weight_r = len(r_child) / len(parent)
        impurity = self.gini_index if mode == "gini" else self.entropy
        return impurity(parent) - (weight_l * impurity(l_child) + weight_r * impurity(r_child))

    def entropy(self, y):
        ''' function to compute entropy (base 2) of the labels in y '''

        # one pass to count every class instead of rescanning y per class
        _, counts = np.unique(y, return_counts=True)
        total = len(y)
        entropy = 0
        for count in counts:
            p_cls = int(count) / total
            entropy += -p_cls * np.log2(p_cls)
        return entropy

    def gini_index(self, y):
        ''' function to compute gini index, 1 - sum(p_cls ** 2), of the labels in y '''

        _, counts = np.unique(y, return_counts=True)
        total = len(y)
        gini = 0
        for count in counts:
            p_cls = int(count) / total
            gini += p_cls**2
        return 1 - gini

    def calculate_leaf_value(self, Y):
        ''' function to compute leaf node

        Returns the most frequent label in Y. Ties go to the label that appears
        first in Y. Raises ValueError when Y is empty.
        '''

        Y = np.asarray(Y)
        _, first_seen, counts = np.unique(Y, return_index=True, return_counts=True)
        # among the labels sharing the top count, take the earliest occurrence
        winner = first_seen[counts == counts.max()].min()
        return Y[winner]

    def print_tree(self, tree=None, indent=" "):
        ''' function to print the tree

        tree: node to start from; defaults to the root of the fitted tree.
        indent: prefix for the children of this node; it doubles per level.
        Raises ValueError when there is nothing to print (tree not fitted).
        '''

        if tree is None:
            tree = self.root
        if tree is None:
            raise ValueError("The tree has not been fitted yet; call fit() first.")

        if tree.value is not None:
            print(tree.value)

        else:
            print("X_"+str(tree.feature_index), "<=", tree.threshold, "?", tree.info_gain)
            print("%sleft:" % (indent), end="")
            self.print_tree(tree.left, indent + indent)
            print("%sright:" % (indent), end="")
            self.print_tree(tree.right, indent + indent)

    def fit(self, X, Y):
        ''' function to train the tree

        X: 2-D array of features, one row per sample.
        Y: labels, either a column vector with one row per sample or a 1-D
           sequence, which is reshaped into a column vector.
        '''

        X = np.asarray(X)
        Y = np.asarray(Y)
        if Y.ndim == 1:
            Y = Y.reshape(-1, 1)
        dataset = np.concatenate((X, Y), axis=1)
        self.root = self.build_tree(dataset)

    def predict(self, X):
        ''' function to predict new dataset

        Returns a list with one predicted label per row of X.
        Raises ValueError when the tree has not been fitted.
        '''

        if self.root is None:
            raise ValueError("The tree has not been fitted yet; call fit() first.")
        predictions = [self.make_prediction(x, self.root) for x in X]
        return predictions

    def make_prediction(self, x, tree):
        ''' function to predict a single data point

        Walks down from `tree`, going left when the tested feature is
        <= the node threshold and right otherwise, until a leaf is reached.
        '''

        node = tree
        # identity test: a leaf is any node whose value is set
        while node.value is None:
            if x[node.feature_index] <= node.threshold:
                node = node.left
            else:
                node = node.right
        return node.value

## Splitting the dataset and training the models

In [12]:
# Feature matrix: every column except the target, as a plain NumPy array.
X = df.drop(columns=['card']).to_numpy()

# Target: the label-encoded `card` column, still a pandas Series at this point.
y = df['card']

In [29]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=41,
)

In [14]:
# Check how many training labels there are (1-D at this point).
y_train.shape

(1055,)

<b>Reshaping y_train into a column vector</b>

In [15]:
# Turn the training labels into an (n_samples, 1) column vector so they can be
# concatenated column-wise with X_train inside DecisionTreeClassifier.fit.
# np.asarray accepts both a pandas Series and an ndarray, so re-running this
# cell is harmless (Series.to_numpy() would fail on the second run).
y_train = np.asarray(y_train).reshape(-1, 1)

<b>Evaluation of the Decision Tree</b>

In [26]:
# Train the from-scratch tree: nodes with fewer than 3 rows are not split, and
# nodes are split only while their depth is <= 3.
decision_tree = DecisionTreeClassifier(
    max_depth=3,
    min_samples_split=3,
)
decision_tree.fit(X_train, y_train)

# Dump the learned splits; X_i is the i-th column of the feature matrix.
decision_tree.print_tree()

X_4 <= -0.680067582792967 ? 0.33149008940208585
 left:X_0 <= -0.3393968014785687 ? 0.0034772152880162416
  left:X_10 <= 0.3177691062974968 ? 0.012424375031437257
    left:X_1 <= -0.8100556113436747 ? 0.0059838366420871675
        left:0.0
        right:0.0
    right:X_5 <= 0.0 ? 0.1519097222222222
        left:1.0
        right:0.0
  right:X_6 <= 0.0 ? 0.0019750071818443616
    left:0.0
    right:X_1 <= 1.4830840679064266 ? 0.21875
        left:0.0
        right:1.0
 right:1.0


In [39]:
# Predict the held-out rows with the from-scratch tree.
y_pred = decision_tree.predict(X_test) 
print("Accuracy of Decision Tree Classifier:")
# Last expression of the cell: the accuracy is shown as the cell's output.
accuracy_score(y_test, y_pred)

Accuracy of Decision Tree Classifier:


0.9659090909090909

<b>Evaluation of the Random Forest Classifier</b>

In [33]:
# Reference model from scikit-learn: 100 trees, fixed seed for reproducibility.
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# scikit-learn expects 1-D labels; passing the column vector prepared for the
# from-scratch tree raises a DataConversionWarning (hidden only by the global
# warnings filter). np.ravel flattens it and also accepts a pandas Series.
rf.fit(X_train, np.ravel(y_train))

In [40]:
# Score the random forest on the same held-out rows used for the decision tree.
y_pred_rf = rf.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Accuracy of Random Forest Classifier: {accuracy_rf}")

Accuracy of Random Forest Classifier: 0.9659090909090909
