In [41]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score

In [2]:
loan_df = pd.read_csv('loan_approval_dataset.csv')
loan_df.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [3]:
loan_df.isnull().sum()

loan_id                      0
 no_of_dependents            0
 education                   0
 self_employed               0
 income_annum                0
 loan_amount                 0
 loan_term                   0
 cibil_score                 0
 residential_assets_value    0
 commercial_assets_value     0
 luxury_assets_value         0
 bank_asset_value            0
 loan_status                 0
dtype: int64

In [28]:
loan_df.columns = loan_df.columns.str.strip()
loan_df.columns

Index(['loan_id', 'no_of_dependents', 'education', 'self_employed',
       'income_annum', 'loan_amount', 'loan_term', 'cibil_score',
       'residential_assets_value', 'commercial_assets_value',
       'luxury_assets_value', 'bank_asset_value', 'loan_status'],
      dtype='object')

In [29]:
print(loan_df["education"].unique())
print(loan_df["loan_status"].unique())

[' Graduate' ' Not Graduate']
[' Approved' ' Rejected']


In [30]:
loan_df["education"] = loan_df["education"].str.strip()
loan_df["loan_status"] = loan_df["loan_status"].str.strip()

In [31]:
loan_df["education"] = loan_df["education"].replace({"Graduate": 1, "Not Graduate": 0})
loan_df["loan_status"] = loan_df["loan_status"].replace({"Rejected": 0, "Approved": 1})

  loan_df["education"] = loan_df["education"].replace({"Graduate": 1, "Not Graduate": 0})
  loan_df["loan_status"] = loan_df["loan_status"].replace({"Rejected": 0, "Approved": 1})


In [32]:
loan_df.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,1,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,1
1,2,0,0,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,0
2,3,3,1,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,0
3,4,3,1,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,0
4,5,5,0,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,0


In [33]:
loan_df.columns

Index(['loan_id', 'no_of_dependents', 'education', 'self_employed',
       'income_annum', 'loan_amount', 'loan_term', 'cibil_score',
       'residential_assets_value', 'commercial_assets_value',
       'luxury_assets_value', 'bank_asset_value', 'loan_status'],
      dtype='object')

In [34]:
features = ['education', 'income_annum', 'loan_amount', 'loan_term', 'cibil_score', 'residential_assets_value', 'commercial_assets_value', 'luxury_assets_value', 'bank_asset_value']
print(features)

['education', 'income_annum', 'loan_amount', 'loan_term', 'cibil_score', 'residential_assets_value', 'commercial_assets_value', 'luxury_assets_value', 'bank_asset_value']


In [35]:
X = loan_df[features]

In [37]:
y = loan_df['loan_status']

In [38]:
train_X, test_X, train_y, test_y = train_test_split(X, y)

In [63]:
def result(leaf_nodes, n, train_X, test_X, train_y, test_y):
    model = RandomForestRegressor(max_leaf_nodes=leaf_nodes, n_estimators=n, random_state=0, n_jobs=-1)
    model.fit(train_X, train_y)
    
    pred = model.predict(test_X)
    res = mean_absolute_error(test_y, pred)

    test_y = test_y.astype(int)
    pred = pred.astype(int)

    accuracy = accuracy_score(test_y, pred)

    print(f"n_estimators: {n}, leaf_nodes: {leaf_nodes} --> MAE: {res:.4f}, Accuracy: {accuracy:.4f}")

    return res, accuracy

In [64]:
n_estimators = [100, 500, 1000]
leaf_nodes = [10, 50, 100]

best_mae = float("inf") 
best_acc = 0  
best_mae_params = {}
best_acc_params = {}

for i in n_estimators:
    for j in leaf_nodes:
        mae, acc = result(j, i, train_X, test_X, train_y, test_y)  

        if mae < best_mae:
            best_mae = mae
            best_mae_params = {"n_estimators": i, "leaf_nodes": j}

        if acc > best_acc:
            best_acc = acc
            best_acc_params = {"n_estimators": i, "leaf_nodes": j}

print("\nBest Hyperparameters Found:")
print(f"Best MAE: {best_mae:.4f} with n_estimators = {best_mae_params['n_estimators']}, leaf_nodes = {best_mae_params['leaf_nodes']}")
print(f"Best Accuracy: {best_acc:.4f} with n_estimators = {best_acc_params['n_estimators']}, leaf_nodes = {best_acc_params['leaf_nodes']}")

n_estimators: 100, leaf_nodes: 10 --> MAE: 0.0332, Accuracy: 0.3848
n_estimators: 100, leaf_nodes: 50 --> MAE: 0.0281, Accuracy: 0.3886
n_estimators: 100, leaf_nodes: 100 --> MAE: 0.0281, Accuracy: 0.9120
n_estimators: 500, leaf_nodes: 10 --> MAE: 0.0328, Accuracy: 0.3848
n_estimators: 500, leaf_nodes: 50 --> MAE: 0.0282, Accuracy: 0.3867
n_estimators: 500, leaf_nodes: 100 --> MAE: 0.0282, Accuracy: 0.8933
n_estimators: 1000, leaf_nodes: 10 --> MAE: 0.0328, Accuracy: 0.3848
n_estimators: 1000, leaf_nodes: 50 --> MAE: 0.0282, Accuracy: 0.3848
n_estimators: 1000, leaf_nodes: 100 --> MAE: 0.0283, Accuracy: 0.8801

Best Hyperparameters Found:
Best MAE: 0.0281 with n_estimators = 100, leaf_nodes = 50
Best Accuracy: 0.9120 with n_estimators = 100, leaf_nodes = 100
