In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import joblib

In [2]:
df = pd.read_csv("loan_data.csv")

In [3]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
1,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
2,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
3,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
4,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y


In [4]:
df.shape

(381, 13)

In [5]:
df.tail()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
376,LP002953,Male,Yes,3+,Graduate,No,5703,0.0,128.0,360.0,1.0,Urban,Y
377,LP002974,Male,Yes,0,Graduate,No,3232,1950.0,108.0,360.0,1.0,Rural,Y
378,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
379,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
380,LP002990,Female,No,0,Graduate,Yes,4583,0.0,133.0,360.0,0.0,Semiurban,N


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381 entries, 0 to 380
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            381 non-null    object 
 1   Gender             376 non-null    object 
 2   Married            381 non-null    object 
 3   Dependents         373 non-null    object 
 4   Education          381 non-null    object 
 5   Self_Employed      360 non-null    object 
 6   ApplicantIncome    381 non-null    int64  
 7   CoapplicantIncome  381 non-null    float64
 8   LoanAmount         381 non-null    float64
 9   Loan_Amount_Term   370 non-null    float64
 10  Credit_History     351 non-null    float64
 11  Property_Area      381 non-null    object 
 12  Loan_Status        381 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 38.8+ KB


In [7]:
#Handling missing values
df.isnull().sum()

Loan_ID               0
Gender                5
Married               0
Dependents            8
Education             0
Self_Employed        21
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term     11
Credit_History       30
Property_Area         0
Loan_Status           0
dtype: int64

In [8]:
df.isnull().mean()*100

Loan_ID              0.000000
Gender               1.312336
Married              0.000000
Dependents           2.099738
Education            0.000000
Self_Employed        5.511811
ApplicantIncome      0.000000
CoapplicantIncome    0.000000
LoanAmount           0.000000
Loan_Amount_Term     2.887139
Credit_History       7.874016
Property_Area        0.000000
Loan_Status          0.000000
dtype: float64

In [9]:
df = df.drop('Loan_ID', axis=1)

In [10]:
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
1,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
2,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
3,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
4,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y


In [11]:
df =df.dropna(subset = ['Gender', 'Dependents', 'Loan_Amount_Term'])

In [12]:
df.shape

(358, 12)

In [13]:
df.isnull().sum()

Gender                0
Married               0
Dependents            0
Education             0
Self_Employed        20
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term      0
Credit_History       30
Property_Area         0
Loan_Status           0
dtype: int64

In [14]:
df['Self_Employed'].unique()

array(['No', 'Yes', nan], dtype=object)

In [15]:
df['Self_Employed'].mode()[0]

'No'

In [16]:
df['Credit_History'].unique()

array([ 1., nan,  0.])

In [17]:
df['Credit_History'].mode()[0]

np.float64(1.0)

In [18]:
df['Self_Employed'].fillna(df['Self_Employed'].mode()[0], inplace=True)
df['Credit_History'].fillna(df['Credit_History'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Self_Employed'].fillna(df['Self_Employed'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Credit_History'].fillna(df['Credit_History'].mode()[0], inplace=True)


In [19]:
df.isnull().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 358 entries, 0 to 380
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             358 non-null    object 
 1   Married            358 non-null    object 
 2   Dependents         358 non-null    object 
 3   Education          358 non-null    object 
 4   Self_Employed      358 non-null    object 
 5   ApplicantIncome    358 non-null    int64  
 6   CoapplicantIncome  358 non-null    float64
 7   LoanAmount         358 non-null    float64
 8   Loan_Amount_Term   358 non-null    float64
 9   Credit_History     358 non-null    float64
 10  Property_Area      358 non-null    object 
 11  Loan_Status        358 non-null    object 
dtypes: float64(4), int64(1), object(7)
memory usage: 36.4+ KB


In [21]:
df['Gender'].unique()

array(['Male', 'Female'], dtype=object)

In [22]:
df['Dependents'].unique()

array(['1', '0', '2', '3+'], dtype=object)

In [23]:
df['Dependents'].replace('3+', '4', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Dependents'].replace('3+', '4', inplace=True)


In [24]:
df['Dependents'].unique()

array(['1', '0', '2', '4'], dtype=object)

In [25]:
df['Married'].unique()

array(['Yes', 'No'], dtype=object)

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 358 entries, 0 to 380
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             358 non-null    object 
 1   Married            358 non-null    object 
 2   Dependents         358 non-null    object 
 3   Education          358 non-null    object 
 4   Self_Employed      358 non-null    object 
 5   ApplicantIncome    358 non-null    int64  
 6   CoapplicantIncome  358 non-null    float64
 7   LoanAmount         358 non-null    float64
 8   Loan_Amount_Term   358 non-null    float64
 9   Credit_History     358 non-null    float64
 10  Property_Area      358 non-null    object 
 11  Loan_Status        358 non-null    object 
dtypes: float64(4), int64(1), object(7)
memory usage: 36.4+ KB


In [27]:
encoding = {
    'Gender' : {'Male': 1, 'Female': 0},
    'Married' : {'Yes': 1, 'No': 0},
    'Dependents' : {'0': 0, '1': 1, '2': 2, '4': 4},
    'Education' : {'Graduate': 1, 'Not Graduate': 0},
    'Self_Employed' : {'Yes': 1, 'No': 0},
    'Property_Area' : {'Rural': 0, 'Semiurban': 2, 'Urban': 1},
    'Loan_Status' : {'Y': 1, 'N': 0}
}

In [28]:
df.replace(encoding, inplace=True)

  df.replace(encoding, inplace=True)


In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 358 entries, 0 to 380
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             358 non-null    int64  
 1   Married            358 non-null    int64  
 2   Dependents         358 non-null    int64  
 3   Education          358 non-null    int64  
 4   Self_Employed      358 non-null    int64  
 5   ApplicantIncome    358 non-null    int64  
 6   CoapplicantIncome  358 non-null    float64
 7   LoanAmount         358 non-null    float64
 8   Loan_Amount_Term   358 non-null    float64
 9   Credit_History     358 non-null    float64
 10  Property_Area      358 non-null    int64  
 11  Loan_Status        358 non-null    int64  
dtypes: float64(4), int64(8)
memory usage: 36.4 KB


In [30]:
def estimate_age(row):
    base_age = 60 - (row['Loan_Amount_Term'] / 12)

    total_income = row['ApplicantIncome'] + row['CoapplicantIncome']
    if total_income < 4000:
        base_age -= 2
    elif total_income > 8000:
        base_age += 3
    
    if row['Self_Employed'] == 1:
        base_age += 5
    
    return int(base_age)

In [31]:
df['Age'] = df.apply(estimate_age, axis = 1)

In [32]:
df['LoanAmount'] = df['LoanAmount'] * 1000

In [33]:
def calculate_emi(row):
    P = row['LoanAmount']
    n = row['Loan_Amount_Term']
    r = (8.0 / 12) / 100

    if r == 0:
        return P / n

    emi = (P * r * (1 + r)**n) / ((1 + r)**n - 1)
    return emi

In [34]:
df['EMI'] = df.apply( lambda x: round(calculate_emi(x), 2), axis=1 )

In [35]:
df['EMI_to_Income'] = df['EMI'] / (df['ApplicantIncome'] + df['CoapplicantIncome'])

In [36]:
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Age,EMI,EMI_to_Income
0,1,1,1,1,0,4583,1508.0,128000.0,360.0,1.0,0,0,30,939.22,0.154198
1,1,1,0,1,1,3000,0.0,66000.0,360.0,1.0,1,1,33,484.28,0.161427
2,1,1,0,0,0,2583,2358.0,120000.0,360.0,1.0,1,1,30,880.52,0.178207
3,1,0,0,1,0,6000,0.0,141000.0,360.0,1.0,1,1,30,1034.61,0.172435
4,1,1,0,0,0,2333,1516.0,95000.0,360.0,1.0,1,1,28,697.08,0.181107


In [37]:
X = df.drop('Loan_Status', axis = 1)
y = df['Loan_Status']

In [38]:
X.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Age,EMI,EMI_to_Income
0,1,1,1,1,0,4583,1508.0,128000.0,360.0,1.0,0,30,939.22,0.154198
1,1,1,0,1,1,3000,0.0,66000.0,360.0,1.0,1,33,484.28,0.161427
2,1,1,0,0,0,2583,2358.0,120000.0,360.0,1.0,1,30,880.52,0.178207
3,1,0,0,1,0,6000,0.0,141000.0,360.0,1.0,1,30,1034.61,0.172435
4,1,1,0,0,0,2333,1516.0,95000.0,360.0,1.0,1,28,697.08,0.181107


In [39]:
num_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'EMI', 'EMI_to_Income']
scalar = StandardScaler()
X[num_cols] = scalar.fit_transform(X[num_cols])

In [40]:
X.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Age,EMI,EMI_to_Income
0,1,1,1,1,0,0.71163,0.092069,0.80598,0.285826,1.0,0,30,0.201667,-0.315997
1,1,1,0,1,1,-0.398856,-0.539332,-1.350425,0.285826,1.0,1,33,-0.638896,-0.234122
2,1,1,0,0,0,-0.691384,0.447965,0.527735,0.285826,1.0,1,30,0.093211,-0.044064
3,1,0,0,1,0,1.705666,-0.539332,1.25813,0.285826,1.0,1,30,0.377913,-0.109438
4,1,1,0,0,0,-0.866761,0.095418,-0.341784,0.285826,1.0,1,28,-0.245719,-0.011218


In [41]:
def evaluate_model(model):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    cross_val = cross_val_score(model, X, y, cv=5)
    avg_cross_val = np.mean(cross_val)
    print(f"{model.__class__.__name__} - Accuracy : {accuracy: .2f}, Cross-Val-Score : {avg_cross_val: .2f}")
    return avg_cross_val

In [42]:
models = {
    LogisticRegression(),
    svm.SVC(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    GradientBoostingClassifier()
}

In [43]:
model_score = {model.__class__.__name__:evaluate_model(model) for model in models}

GradientBoostingClassifier - Accuracy :  0.81, Cross-Val-Score :  0.80
RandomForestClassifier - Accuracy :  0.83, Cross-Val-Score :  0.84
DecisionTreeClassifier - Accuracy :  0.82, Cross-Val-Score :  0.73


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

LogisticRegression - Accuracy :  0.85, Cross-Val-Score :  0.83
SVC - Accuracy :  0.78, Cross-Val-Score :  0.73


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [44]:
def tune_model(model, param_grid):
    tuner = RandomizedSearchCV(model, param_grid, cv = 5, n_iter = 20, verbose = True, random_state = 42)
    tuner.fit(X, y)
    print(f"Best Score for {model.__class__.__name__}: {tuner.best_score_:.2f}")
    print(f"Best Parameter for {model.__class__.__name__}: {tuner.best_params_}")
    return tuner.best_estimator_

In [45]:
log_reg_grid = {'C': np.logspace(-4, 4, 20), "solver": ["liblinear"]}
svc_grid = {'C': [0.25, 0.50, 0.75, 1], "kernel": ['linear']}

rf_grid = {
    'n_estimators': np.arange(10, 1000, 10),
    'max_features': ['log2', 'sqrt'], 
    'max_depth': [None, 3, 5, 10, 20, 30],
    'min_samples_split': [2, 5, 20, 50, 100],
    'min_samples_leaf': [1, 2, 5, 10]
}

In [46]:
best_log_reg = tune_model(LogisticRegression(), log_reg_grid)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Score for LogisticRegression: 0.84
Best Parameter for LogisticRegression: {'solver': 'liblinear', 'C': np.float64(545.5594781168514)}


In [47]:
best_svc_reg = tune_model(svm.SVC(), svc_grid)

Fitting 5 folds for each of 4 candidates, totalling 20 fits




Best Score for SVC: 0.84
Best Parameter for SVC: {'kernel': 'linear', 'C': 0.25}


In [48]:
best_rf = tune_model(RandomForestClassifier(), rf_grid)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Score for RandomForestClassifier: 0.84
Best Parameter for RandomForestClassifier: {'n_estimators': np.int64(210), 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'log2', 'max_depth': 20}


In [49]:
#Logistic Regression Final Model
final_model = best_log_reg

In [50]:
final_model_1 = best_rf

In [51]:
joblib.dump(final_model_1, 'rf.pkl')

['rf.pkl']

In [52]:
joblib.dump(final_model, 'home_loan_status_predictor.pkl')

['home_loan_status_predictor.pkl']

In [53]:
joblib.dump(scalar, 'vector.pkl')

['vector.pkl']

In [54]:
# Prediction System

scaler = joblib.load('vector.pkl')

# Function to calculate EMI
def calculate_emi(loan_amount, loan_term, rate=8.0):
    """Calculate EMI based on loan amount, loan term (in months), and interest rate."""
    rate = rate / (12 * 100)  # Monthly Interest Rate
    if rate == 0:  # Edge case for zero interest (unlikely)
        return loan_amount / loan_term  
    emi = (loan_amount * rate * (1 + rate) ** loan_term) / ((1 + rate) ** loan_term - 1)
    return round(emi, 2)

sample_data = pd.DataFrame({
    'Gender': [1],
    'Married': [1],
    'Dependents': [2],
    'Education': [1],
    'Self_Employed': [1],
    'ApplicantIncome': [3000],
    'CoapplicantIncome': [0.0],
    'LoanAmount': [6600],
    'Loan_Amount_Term': [360],
    'Credit_History': [1],         # Loan gets approved on changing credit history to 1
    'Property_Area': [1],
    'Age' : [35]
})

# Calculate EMI for each row
sample_data['EMI'] = sample_data.apply(lambda x: calculate_emi(x['LoanAmount'], x['Loan_Amount_Term']), axis=1)
EMI_value = sample_data['EMI'].iloc[0]
print(EMI_value)

# Calculate EMI-to-Income Ratio
# sample_data['Total_Income'] = sample_data['ApplicantIncome'] + sample_data['CoapplicantIncome']
sample_data['EMI_to_Income'] = sample_data['EMI'] / (sample_data['ApplicantIncome'] + sample_data['CoapplicantIncome'])
EMI_to_Income_value = sample_data['EMI_to_Income'].iloc[0]
print(EMI_to_Income_value)

num_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'EMI', 'EMI_to_Income']

sample_data[num_cols] = scaler.transform(sample_data[num_cols])
loaded_model = joblib.load('rf.pkl')
prediction = loaded_model.predict(sample_data)
prob = loaded_model.predict_proba(sample_data)
print(f"Probabilty of Loan approval: {prob[0][1]: .2f}\n")

importances = loaded_model.feature_importances_
for feature, importance in zip(X.columns, importances):
    print(f"{feature}: {importance:.3f}")

result = "Loan Approved" if prediction[0] == 1 and EMI_to_Income_value <= 0.4 else "Loan Not Approved"
print(f"\nPrediction Result: {result}")
print(scalar.mean_)
print(scalar.scale_)

48.43
0.016143333333333332
Probabilty of Loan approval:  0.78

Gender: 0.008
Married: 0.017
Dependents: 0.018
Education: 0.014
Self_Employed: 0.004
ApplicantIncome: 0.103
CoapplicantIncome: 0.068
LoanAmount: 0.074
Loan_Amount_Term: 0.014
Credit_History: 0.399
Property_Area: 0.059
Age: 0.029
EMI: 0.079
EMI_to_Income: 0.111

Prediction Result: Loan Approved
[3.56856983e+03 1.28810872e+03 1.04826816e+05 3.40391061e+02
 8.30071173e+02 1.82097190e-01]
[1.42550183e+03 2.38833883e+03 2.87515490e+04 6.86044702e+01
 5.41232735e+02 8.82895258e-02]
