In [57]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier # Import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import joblib

In [58]:
df = pd.read_csv('loan.csv')
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [60]:
df.drop('Loan_ID', axis=1, inplace=True)
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [61]:
numerical_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
for col in numerical_cols:
    df[col].fillna(df[col].median(), inplace=True)

In [62]:
categorical_cols = ['Gender', 'Married', 'Dependents', 'Self_Employed', 'Credit_History']
for col in categorical_cols:
    df[col].fillna('Unknown', inplace=True)

In [63]:
for col in categorical_cols + numerical_cols:
    df[col + '_Missing'] = df[col].isnull().astype(int)

In [64]:
print(df.columns)

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status',
       'Gender_Missing', 'Married_Missing', 'Dependents_Missing',
       'Self_Employed_Missing', 'Credit_History_Missing',
       'ApplicantIncome_Missing', 'CoapplicantIncome_Missing',
       'LoanAmount_Missing', 'Loan_Amount_Term_Missing'],
      dtype='object')


In [65]:
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,...,Loan_Status,Gender_Missing,Married_Missing,Dependents_Missing,Self_Employed_Missing,Credit_History_Missing,ApplicantIncome_Missing,CoapplicantIncome_Missing,LoanAmount_Missing,Loan_Amount_Term_Missing
0,Male,No,0,Graduate,No,5849,0.0,128.0,360.0,1.0,...,Y,0,0,0,0,0,0,0,0,0
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,...,N,0,0,0,0,0,0,0,0,0
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,...,Y,0,0,0,0,0,0,0,0,0
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,...,Y,0,0,0,0,0,0,0,0,0
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,...,Y,0,0,0,0,0,0,0,0,0


In [66]:
label_encoder = LabelEncoder()

for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = df[col].astype(str)
    else:
        df[col] = df[col].astype(str)
    df[col] = label_encoder.fit_transform(df[col])

In [67]:
X = df.drop(["Loan_Status"], axis=1)
y = df["Loan_Status"]

In [68]:
selected_features = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
                     'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
                     'Loan_Amount_Term', 'Credit_History', 'Property_Area',
                     'Gender_Missing', 'Married_Missing', 'Dependents_Missing',
                     'Self_Employed_Missing', 'Credit_History_Missing']
X = X[selected_features]

In [69]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.092, random_state=0)

decision_tree = DecisionTreeClassifier(random_state=1, ccp_alpha=0.005)  # Adjust ccp_alpha as needed
decision_tree.fit(X_train, y_train)

y_pred_dt = decision_tree.predict(X_test)
dt_acc = accuracy_score(y_pred_dt, y_test)
print('Decision Tree accuracy: {:.2f}%'.format(dt_acc * 100))

Decision Tree accuracy: 91.23%


In [70]:
joblib.dump(decision_tree, 'loanmodel_decision_tree_pruned.pkl')

['loanmodel_decision_tree_pruned.pkl']