## Part A - Data Pre-processing

#### 1. Read in Loan Dataset

In [None]:
import numpy as np
import pandas as pd
import matplotlib as plt

# Load the loan dataset; the relative path is resolved against the current working directory
df = pd.read_csv('loan_dataset.csv')

#### 2. Pre-process the data.

In [None]:
# Discard every row that contains at least one missing value
df = df.dropna()

# Quick look at the cleaned data: first ten rows, then (rows, columns)
print(df.head(10))
print(df.shape)

After dropping rows with missing values, the data frame has 480 samples and 13 columns: the loan ID, 11 features, and the `Loan_Status` label. The data is therefore stored in a 480 by 13 data frame.

#### 3. Extract Features and Label

In [None]:
# Feature data frame.
# Loan_ID is left out since an identifier doesn't help with regression or classification.
# .copy() makes X independent of df so later edits never touch (or warn about) the source frame.
X = df[['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
        'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term',
        'Credit_History', 'Property_Area']].copy()

# Separate all non-numerical columns; copied because its columns are overwritten below
X_cat = X.select_dtypes(exclude=['int64', 'float64']).copy()

# Separate all numerical columns (used read-only)
X_dog = X.select_dtypes(include=['int64', 'float64'])

# Numeric code assigned to each category of every non-numerical column
category_codes = {
    'Gender': {'Female': 0.0, 'Male': 1.0},
    'Married': {'No': 0.0, 'Yes': 1.0},
    'Dependents': {'0': 0.0, '1': 1.0, '2': 2.0, '3': 3.0, '3+': 3.0},
    'Education': {'Not Graduate': 0.0, 'Graduate': 1.0},
    'Self_Employed': {'No': 0.0, 'Yes': 1.0},
    'Property_Area': {'Urban': 0.0, 'Rural': 1.0, 'Semiurban': 2.0},
}

# Transform categorical columns to numerical.
# map() yields a float column directly; categories missing from the table become NaN,
# so fail loudly here instead of letting bad values reach the model.
for column, codes in category_codes.items():
    encoded = X_cat[column].map(codes)
    if encoded.isna().any():
        unexpected = list(X_cat.loc[encoded.isna(), column].unique())
        raise ValueError(f"Unexpected value(s) in column {column!r}: {unexpected}")
    X_cat[column] = encoded

# Merge non-numerical and numerical data back together (categorical columns first)
X = pd.concat([X_cat, X_dog], axis=1)
print(X)

# Label data frame: 'N' -> 0.0, 'Y' -> 1.0
y = df[['Loan_Status']].copy()
y['Loan_Status'] = y['Loan_Status'].map({'N': 0.0, 'Y': 1.0})
if y['Loan_Status'].isna().any():
    raise ValueError("Unexpected value(s) in column 'Loan_Status'")
print(y)


#### 4. Split Data into Training, Validation, and Test Sets

In [None]:
# Convert dataframes to plain 2-D NumPy arrays.
# np.matrix is no longer recommended by NumPy; ndarrays support the same
# row/column indexing that the tree code relies on.
Xnp = np.array(X.values, dtype=np.float64)
ynp = np.array(y.values, dtype=np.float64)

# Define proportions for train, validation, and test sets
train_ratio = 0.7
val_ratio = 0.15
test_ratio = 0.15

# Calculate sizes of each split; the test set takes whatever is left after
# truncating the train and validation sizes to whole samples
n_samples = len(Xnp)
train_size = int(train_ratio * n_samples)
val_size = int(val_ratio * n_samples)
test_size = n_samples - train_size - val_size

# Split the data in its existing row order (no shuffling is applied)
train_end = train_size
val_end = train_size + val_size
X_train, X_val, X_test = Xnp[:train_end], Xnp[train_end:val_end], Xnp[val_end:]
y_train, y_val, y_test = ynp[:train_end], ynp[train_end:val_end], ynp[val_end:]


## Part B - Decision Tree Implementation

#### 1. Regression Tree

##### The accuracies on the validation and test sets are output below

In [None]:
class Node:
    """One node of a binary decision tree.

    A node is described by the feature/threshold pair it splits on, its two
    children, and a ``value`` that is meant for leaf nodes. Every field
    defaults to ``None``.
    """

    def __init__(self, feature_index=None, threshold=None, left=None, right=None, value=None):
        # Split definition: which feature to test and the threshold to test it against
        self.feature_index, self.threshold = feature_index, threshold
        # Child nodes reached on either side of the split
        self.left, self.right = left, right
        # For leaf nodes: mean value of the target variable
        self.value = value

class RegressionTree:
    """Binary regression tree grown by greedily minimising the weighted child MSE.

    Each internal node sends samples with ``x[feature_index] <= threshold`` to
    its left child and all other samples to its right child. Leaves store the
    mean target value of the training samples that reached them.
    """

    def __init__(self, max_depth):
        # Nodes at this depth (root is depth 0) are always turned into leaves
        self.max_depth = max_depth
        # Root of the built tree; set by fit() or assigned from build_tree()
        self.root = None

    def fit(self, X, y):
        """Build the tree from ``X``/``y``, store it in ``self.root`` and return ``self``."""
        self.root = self.build_tree(X, y)
        return self

    def mean_squared_error(self, y):
        """Return the mean squared deviation of ``y`` from its own mean.

        An empty ``y`` yields ``0.0`` so that it contributes nothing to a
        weighted sum (and no NaN / RuntimeWarning is produced).
        """
        y = np.asarray(y).ravel()
        if y.size == 0:
            return 0.0
        return np.mean((y - np.mean(y)) ** 2)

    def splitting_criteria(self, X_column, y):
        """Find the best threshold for one feature column.

        Every distinct value of ``X_column`` is tried as a ``<=`` threshold.
        Candidates that leave one side empty are ignored because they do not
        split the data.

        Returns:
            ``(best_mse, best_threshold)``; ``(inf, None)`` when the column
            offers no threshold that separates the samples.
        """
        X_column = np.asarray(X_column).ravel()
        y = np.asarray(y)
        n_samples = len(y)
        best_mse = float('inf')
        best_threshold = None
        for threshold in np.unique(X_column):
            left_indices = np.where(X_column <= threshold)[0]
            right_indices = np.where(X_column > threshold)[0]
            # A split with an empty side is not a split at all
            if len(left_indices) == 0 or len(right_indices) == 0:
                continue
            left_mse = self.mean_squared_error(y[left_indices])
            right_mse = self.mean_squared_error(y[right_indices])
            # Size-weighted average of the two child errors
            mse = (len(left_indices) * left_mse + len(right_indices) * right_mse) / n_samples
            # Strict comparison keeps the first (smallest) threshold on ties
            if mse < best_mse:
                best_mse = mse
                best_threshold = threshold
        return best_mse, best_threshold

    def split(self, X, y):
        """Pick the feature and threshold with the lowest weighted child MSE.

        Returns:
            ``(feature_index, threshold, left_indices, right_indices)`` where
            the index arrays hold the row positions going to each child.
            When no feature can separate the samples, ``feature_index`` and
            ``threshold`` are ``None`` and both index arrays are empty.
        """
        n_features = X.shape[1]
        best_mse = float('inf')
        best_feature_index = None
        best_threshold = None
        for feature_index in range(n_features):
            # ravel() gives a 1-D column for both ndarray and np.matrix inputs
            X_column = np.asarray(X[:, feature_index]).ravel()
            mse, threshold = self.splitting_criteria(X_column, y)
            # Strict comparison keeps the first feature on ties
            if mse < best_mse:
                best_mse = mse
                best_feature_index = feature_index
                best_threshold = threshold
        if best_feature_index is None:
            no_rows = np.array([], dtype=int)
            return None, None, no_rows, no_rows
        best_column = np.asarray(X[:, best_feature_index]).ravel()
        left_indices = np.where(best_column <= best_threshold)[0]
        right_indices = np.where(best_column > best_threshold)[0]
        return best_feature_index, best_threshold, left_indices, right_indices

    def build_tree(self, X, y, depth=0):
        """Recursively build the (sub)tree for ``X``/``y`` and return its root node."""
        if depth >= self.max_depth or len(np.unique(y)) == 1:
            # Maximum depth reached or all targets identical: create a leaf
            return Node(value=np.mean(y))
        feature_index, threshold, left_indices, right_indices = self.split(X, y)
        if feature_index is None:
            # No feature separates the remaining samples: create a leaf
            return Node(value=np.mean(y))
        left = self.build_tree(X[left_indices], y[left_indices], depth + 1)
        right = self.build_tree(X[right_indices], y[right_indices], depth + 1)
        return Node(feature_index, threshold, left, right)

    def predict_sample(self, x, node):
        """Return the leaf value reached by sample ``x`` when starting at ``node``."""
        x = np.asarray(x).ravel()
        # A node with a value is a leaf; otherwise follow the split downwards
        while node.value is None:
            if x[node.feature_index] <= node.threshold:
                node = node.left
            else:
                node = node.right
        return node.value

    def predict(self, X):
        """Return an array with one prediction per row of ``X``.

        Raises:
            AttributeError: if no tree has been built yet.
        """
        root = getattr(self, 'root', None)
        if root is None:
            raise AttributeError("tree has not been built; call fit() or assign root first")
        return np.array([self.predict_sample(x, root) for x in X])

# Initialize the regression tree; the depth (feature count minus 7) was hand-tuned
regression_tree = RegressionTree(max_depth=(X_train.shape[1]-7))
# Train the tree
regression_tree.root = regression_tree.build_tree(X_train, y_train)


# Predict target values for validation data
predictions_val = regression_tree.predict(X_val)
# Leaves hold the mean of 0/1 labels, so predictions lie between 0 and 1.
# Round at 0.5 to get a loan status; treating every value above 0 as 1 would
# label a leaf with e.g. 10% approvals as "approved".
predictions_val = np.where(predictions_val >= 0.5, 1.0, 0.0)
# Calculate accuracy for validation set
print("Validation set Accuracy of Regression Tree: ", np.mean(predictions_val == y_val.flatten()))



# Predict target values for test data
predictions_test = regression_tree.predict(X_test)
# Same 0.5 rounding as for the validation predictions
predictions_test = np.where(predictions_test >= 0.5, 1.0, 0.0)
# Create DataFrame with predictions
test_results = pd.DataFrame({'Predicted': predictions_test})
# Save predictions to CSV file
test_results.to_csv('Test_Result_1.csv', index=False)
# Calculate accuracy for test set
print("Test set Accuracy of Regression Tree: ", np.mean(predictions_test == y_test.flatten()))


#### 2. Classification Tree
##### The accuracies on the validation and test sets are output below

In [None]:
class ClassificationTree:
    """Binary classification tree grown by greedily minimising weighted Gini impurity.

    Each internal node sends samples with ``x[feature_index] <= threshold`` to
    its left child and all other samples to its right child. Leaves store the
    mean of the training labels that reached them (for 0/1 labels this is the
    fraction of class 1).
    """

    def __init__(self, max_depth):
        # Nodes at this depth (root is depth 0) are always turned into leaves
        self.max_depth = max_depth
        # Root of the built tree; set by fit() or assigned from build_tree()
        self.root = None

    def fit(self, X, y):
        """Build the tree from ``X``/``y``, store it in ``self.root`` and return ``self``."""
        self.root = self.build_tree(X, y)
        return self

    def gini_impurity(self, y):
        """Return the Gini impurity ``1 - sum(p_k ** 2)`` of the labels in ``y``.

        Labels are cast to ``int`` and must be non-negative (``np.bincount``
        is used for counting). An empty ``y`` has impurity ``0``.
        """
        y = np.asarray(y).ravel().astype(int)
        if len(y) == 0:
            return 0
        # Relative frequency of each integer label
        p = np.bincount(y) / len(y)
        return 1 - np.sum(p ** 2)

    def splitting_criteria(self, X_column, y):
        """Find the best threshold for one feature column.

        Every distinct value of ``X_column`` is tried as a ``<=`` threshold.
        Candidates that leave one side empty are ignored: an empty side has
        impurity 0, so such a "split" would otherwise look as good as the
        unsplit node while separating nothing.

        Returns:
            ``(best_gini, best_threshold)``; ``(inf, None)`` when the column
            offers no threshold that separates the samples.
        """
        X_column = np.asarray(X_column).ravel()
        y = np.asarray(y)
        n_samples = len(y)
        best_gini = float('inf')
        best_threshold = None
        for threshold in np.unique(X_column):
            left_indices = np.where(X_column <= threshold)[0]
            right_indices = np.where(X_column > threshold)[0]
            # A split with an empty side is not a split at all
            if len(left_indices) == 0 or len(right_indices) == 0:
                continue
            left_gini = self.gini_impurity(y[left_indices])
            right_gini = self.gini_impurity(y[right_indices])
            # Size-weighted average of the two child impurities
            gini_val = (len(left_indices) * left_gini + len(right_indices) * right_gini) / n_samples
            # Strict comparison keeps the first (smallest) threshold on ties
            if gini_val < best_gini:
                best_gini = gini_val
                best_threshold = threshold
        return best_gini, best_threshold

    def split(self, X, y):
        """Pick the feature and threshold with the lowest weighted Gini impurity.

        Returns:
            ``(feature_index, threshold, left_indices, right_indices)`` where
            the index arrays hold the row positions going to each child.
            When no feature can separate the samples, ``feature_index`` and
            ``threshold`` are ``None`` and both index arrays are empty.
        """
        n_features = X.shape[1]
        best_gini = float('inf')
        best_feature_index = None
        best_threshold = None
        for feature_index in range(n_features):
            # ravel() gives a 1-D column for both ndarray and np.matrix inputs
            X_column = np.asarray(X[:, feature_index]).ravel()
            gini, threshold = self.splitting_criteria(X_column, y)
            # Strict comparison keeps the first feature on ties
            if gini < best_gini:
                best_gini = gini
                best_feature_index = feature_index
                best_threshold = threshold
        if best_feature_index is None:
            no_rows = np.array([], dtype=int)
            return None, None, no_rows, no_rows
        best_column = np.asarray(X[:, best_feature_index]).ravel()
        left_indices = np.where(best_column <= best_threshold)[0]
        right_indices = np.where(best_column > best_threshold)[0]
        return best_feature_index, best_threshold, left_indices, right_indices

    def build_tree(self, X, y, depth=0):
        """Recursively build the (sub)tree for ``X``/``y`` and return its root node."""
        if depth >= self.max_depth or len(np.unique(y)) == 1:
            # Maximum depth reached or all labels identical: create a leaf
            return Node(value=np.mean(y))
        feature_index, threshold, left_indices, right_indices = self.split(X, y)
        if feature_index is None:
            # No feature separates the remaining samples: create a leaf
            return Node(value=np.mean(y))
        left = self.build_tree(X[left_indices], y[left_indices], depth + 1)
        right = self.build_tree(X[right_indices], y[right_indices], depth + 1)
        return Node(feature_index, threshold, left, right)

    def predict_sample(self, x, node):
        """Return the leaf value reached by sample ``x`` when starting at ``node``."""
        x = np.asarray(x).ravel()
        # A node with a value is a leaf; otherwise follow the split downwards
        while node.value is None:
            if x[node.feature_index] <= node.threshold:
                node = node.left
            else:
                node = node.right
        return node.value

    def predict(self, X):
        """Return an array with one leaf value per row of ``X``.

        Raises:
            AttributeError: if no tree has been built yet.
        """
        root = getattr(self, 'root', None)
        if root is None:
            raise AttributeError("tree has not been built; call fit() or assign root first")
        return np.array([self.predict_sample(x, root) for x in X])

# Initialize the classification tree; the depth (feature count minus 7) was hand-tuned
classification_tree = ClassificationTree(max_depth=(X_train.shape[1]-7))
# Train the tree
classification_tree.root = classification_tree.build_tree(X_train, y_train)


# Predict target values for validation data
predictions_val = classification_tree.predict(X_val)
# Leaves hold the mean of 0/1 labels (the fraction of class 1), so predictions
# lie between 0 and 1. Round at 0.5 to pick the majority class; treating every
# value above 0 as 1 would label a leaf with e.g. 10% approvals as "approved".
predictions_val = np.where(predictions_val >= 0.5, 1.0, 0.0)
# Calculate accuracy for validation set
print("Validation set Accuracy of Classification Tree: ", np.mean(predictions_val == y_val.flatten()))

# Predict target values for test data
predictions_test = classification_tree.predict(X_test)
# Same 0.5 rounding as for the validation predictions
predictions_test = np.where(predictions_test >= 0.5, 1.0, 0.0)
# Create DataFrame with predictions
test_results = pd.DataFrame({'Predicted': predictions_test})
# Save predictions to CSV file
test_results.to_csv('Test_Result_2.csv', index=False)
# Calculate accuracy for test set
print("Test set Accuracy of Classification Tree: ", np.mean(predictions_test == y_test.flatten()))


For both models, the only tuning done was adjusting the max depth hyperparameter. I adjusted it until I got the highest possible accuracy.