In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import joblib

In [2]:
df = pd.read_csv("loan_data_expanded2.csv")

In [3]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001179,Male,Yes,2,Graduate,No,4520,0,136,180,0,Urban,N
1,LP001811,Male,Yes,0,Not Graduate,No,3152,3915,127,180,0,Semiurban,Y
2,LP001030,Male,Yes,2,Graduate,No,1372,1107,16,240,0,Urban,Y
3,LP001736,Male,Yes,0,Graduate,No,2163,0,71,360,1,Urban,N
4,LP001653,Male,No,0,Not Graduate,No,4921,0,52,240,1,Rural,Y


In [4]:
df.shape

(40000, 13)

In [5]:
df.tail()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
39995,LP001819,Male,Yes,1,Not Graduate,No,6594,0,154,180,1,Urban,Y
39996,LP002141,Male,Yes,3+,Graduate,No,3132,1986,99,360,0,Rural,Y
39997,LP002911,Male,Yes,1,Graduate,No,2585,1854,152,180,1,Rural,N
39998,LP001917,Female,No,0,Graduate,No,1743,1768,56,240,0,Urban,Y
39999,LP001836,Female,No,2,Graduate,No,3397,0,103,360,1,Urban,N


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Loan_ID            40000 non-null  object
 1   Gender             40000 non-null  object
 2   Married            40000 non-null  object
 3   Dependents         40000 non-null  object
 4   Education          40000 non-null  object
 5   Self_Employed      40000 non-null  object
 6   ApplicantIncome    40000 non-null  int64 
 7   CoapplicantIncome  40000 non-null  int64 
 8   LoanAmount         40000 non-null  int64 
 9   Loan_Amount_Term   40000 non-null  int64 
 10  Credit_History     40000 non-null  int64 
 11  Property_Area      40000 non-null  object
 12  Loan_Status        40000 non-null  object
dtypes: int64(5), object(8)
memory usage: 4.0+ MB


In [7]:
df.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [8]:
df = df.drop('Loan_ID', axis=1)

In [9]:
df['Dependents'].replace('3+', '4', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Dependents'].replace('3+', '4', inplace=True)


In [10]:
(df.select_dtypes(include=[np.number]) < 0).any()

ApplicantIncome      False
CoapplicantIncome    False
LoanAmount           False
Loan_Amount_Term     False
Credit_History       False
dtype: bool

In [11]:
df[(df.select_dtypes(include=[np.number]) < 0).any(axis=1)]

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status


In [12]:
median_income = df[df["ApplicantIncome"] > 0]["ApplicantIncome"].median()
df.loc[df["ApplicantIncome"] < 0, "ApplicantIncome"] = int(median_income)

In [13]:
df[(df.select_dtypes(include=[np.number]) < 0).any(axis=1)]

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status


In [14]:
encoding = {
    'Gender' : {'Male': 1, 'Female': 0},
    'Married' : {'Yes': 1, 'No': 0},
    'Dependents' : {'0': 0, '1': 1, '2': 2, '4': 4},
    'Education' : {'Graduate': 1, 'Not Graduate': 0},
    'Self_Employed' : {'Yes': 1, 'No': 0},
    'Property_Area' : {'Rural': 0, 'Semiurban': 2, 'Urban': 1},
    'Loan_Status' : {'Y': 1, 'N': 0}
}

In [15]:
df.replace(encoding, inplace=True)

  df.replace(encoding, inplace=True)


In [16]:
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1,1,2,1,0,4520,0,136,180,0,1,0
1,1,1,0,0,0,3152,3915,127,180,0,2,1
2,1,1,2,1,0,1372,1107,16,240,0,1,1
3,1,1,0,1,0,2163,0,71,360,1,1,0
4,1,0,0,0,0,4921,0,52,240,1,0,1


In [17]:
df["CoapplicantIncome"] = df["CoapplicantIncome"].round().astype(int)
df["LoanAmount"] = df["LoanAmount"].round().astype(int)

In [18]:
def estimate_age(row):
    base_age = 60 - (row['Loan_Amount_Term'] / 12)

    total_income = row['ApplicantIncome'] + row['CoapplicantIncome']
    if total_income < 4000:
        base_age -= 2
    elif total_income > 8000:
        base_age += 3
    
    if row['Self_Employed'] == 1:
        base_age += 5
    
    return int(base_age)

In [19]:
df['Age'] = df.apply(estimate_age, axis = 1)

In [20]:
df['LoanAmount'] = df['LoanAmount'] * 1000

In [21]:
def calculate_emi(row):
    P = row['LoanAmount']
    n = row['Loan_Amount_Term']
    r = (8.0 / 12) / 100

    if r == 0:
        return P / n

    emi = (P * r * (1 + r)**n) / ((1 + r)**n - 1)
    return emi

In [22]:
df['EMI'] = df.apply( lambda x: round(calculate_emi(x), 2), axis=1 )

In [23]:
df['EMI_to_Income'] = df['EMI'] / (df['ApplicantIncome'] + df['CoapplicantIncome'])

In [24]:
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Age,EMI,EMI_to_Income
0,1,1,2,1,0,4520,0,136000,180,0,1,0,45,1299.69,0.287542
1,1,1,0,0,0,3152,3915,127000,180,0,2,1,45,1213.68,0.171739
2,1,1,2,1,0,1372,1107,16000,240,0,1,1,38,133.83,0.053985
3,1,1,0,1,0,2163,0,71000,360,1,1,0,28,520.97,0.240855
4,1,0,0,0,0,4921,0,52000,240,1,0,1,40,434.95,0.088387


In [25]:
X = df.drop('Loan_Status', axis = 1)
y = df['Loan_Status']

In [26]:
num_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'EMI', 'EMI_to_Income']
scalar = StandardScaler()
X[num_cols] = scalar.fit_transform(X[num_cols])

In [27]:
X.tail()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Age,EMI,EMI_to_Income
39995,1,1,1,0,0,1.973523,-0.510139,1.583144,-0.507643,1,1,45,1.409699,0.050139
39996,1,1,4,1,0,-0.30189,0.275934,-0.177424,1.516518,0,0,30,-0.71449,-0.866648
39997,1,1,1,1,0,-0.661408,0.223688,1.519124,-0.507643,1,0,45,1.355231,1.224123
39998,0,0,0,1,0,-1.214816,0.189648,-1.553868,0.167078,0,1,38,-1.449905,-0.962829
39999,0,0,2,1,0,-0.127718,-0.510139,-0.049382,1.516518,1,1,28,-0.630835,0.042205


In [28]:
def tune_model(model, param_grid):
    tuner = RandomizedSearchCV(model, param_grid, cv = 5, n_iter = 20, verbose = True, random_state = 42)
    tuner.fit(X, y)
    print(f"Best Score for {model.__class__.__name__}: {tuner.best_score_:.2f}")
    print(f"Best Parameter for {model.__class__.__name__}: {tuner.best_params_}")
    return tuner.best_estimator_

In [29]:
log_reg_grid = {'C': np.logspace(-4, 4, 20), "solver": ["liblinear"]}
svc_grid = {'C': [0.25, 0.50, 0.75, 1], "kernel": ['linear']}

rf_grid = {
    'n_estimators': np.arange(10, 1000, 10),
    'max_features': ['log2', 'sqrt'], 
    'max_depth': [None, 3, 5, 10, 20, 30],
    'min_samples_split': [2, 5, 20, 50, 100],
    'min_samples_leaf': [1, 2, 5, 10]
}

In [30]:
def evaluate_model(model):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    cross_val = cross_val_score(model, X, y, cv=5)
    avg_cross_val = np.mean(cross_val)
    print(f"{model.__class__.__name__} - Accuracy : {accuracy: .2f}, Cross-Val-Score : {avg_cross_val: .2f}")
    return avg_cross_val

In [31]:
best_rf = tune_model(RandomForestClassifier(), rf_grid)
best_log_reg = tune_model(LogisticRegression(), log_reg_grid)
models = {
    LogisticRegression(),
    svm.SVC(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    best_rf,
    best_log_reg,
    GradientBoostingClassifier()
}

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Score for RandomForestClassifier: 0.95
Best Parameter for RandomForestClassifier: {'n_estimators': np.int64(840), 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 30}
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Score for LogisticRegression: 0.72
Best Parameter for LogisticRegression: {'solver': 'liblinear', 'C': np.float64(0.0018329807108324356)}


In [32]:
model_score = {model.__class__.__name__:evaluate_model(model) for model in models}

RandomForestClassifier - Accuracy :  0.95, Cross-Val-Score :  0.95
GradientBoostingClassifier - Accuracy :  0.78, Cross-Val-Score :  0.78
RandomForestClassifier - Accuracy :  0.95, Cross-Val-Score :  0.95
SVC - Accuracy :  0.72, Cross-Val-Score :  0.71
DecisionTreeClassifier - Accuracy :  0.96, Cross-Val-Score :  0.95


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

LogisticRegression - Accuracy :  0.72, Cross-Val-Score :  0.71
LogisticRegression - Accuracy :  0.73, Cross-Val-Score :  0.72


In [33]:
final_model_1 = best_rf

In [34]:
joblib.dump(final_model_1, 'rfforty.pkl')

['rfforty.pkl']

In [35]:
joblib.dump(scalar, 'vector.pkl')

['vector.pkl']

In [36]:
# Prediction System

scaler = joblib.load('vector.pkl')

# Function to calculate EMI
def calculate_emi(loan_amount, loan_term, rate=8.0):
    """Calculate EMI based on loan amount, loan term (in months), and interest rate."""
    rate = rate / (12 * 100)  # Monthly Interest Rate
    if rate == 0:  # Edge case for zero interest (unlikely)
        return loan_amount / loan_term  
    emi = (loan_amount * rate * (1 + rate) ** loan_term) / ((1 + rate) ** loan_term - 1)
    return round(emi, 2)

sample_data = pd.DataFrame({
    'Gender': [1],
    'Married': [1],
    'Dependents': [2],
    'Education': [1],
    'Self_Employed': [1],
    'ApplicantIncome': [3000],
    'CoapplicantIncome': [0.0],
    'LoanAmount': [6600],
    'Loan_Amount_Term': [360],
    'Credit_History': [1],         # Loan gets approved on changing credit history to 1
    'Property_Area': [1],
    'Age' : [35]
})

# Calculate EMI for each row
sample_data['EMI'] = sample_data.apply(lambda x: calculate_emi(x['LoanAmount'], x['Loan_Amount_Term']), axis=1)
EMI_value = sample_data['EMI'].iloc[0]
print(EMI_value)

# Calculate EMI-to-Income Ratio
# sample_data['Total_Income'] = sample_data['ApplicantIncome'] + sample_data['CoapplicantIncome']
sample_data['EMI_to_Income'] = sample_data['EMI'] / (sample_data['ApplicantIncome'] + sample_data['CoapplicantIncome'])
EMI_to_Income_value = sample_data['EMI_to_Income'].iloc[0]
print(EMI_to_Income_value)

num_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'EMI', 'EMI_to_Income']

sample_data[num_cols] = scaler.transform(sample_data[num_cols])
loaded_model = joblib.load('rf.pkl')
prediction = loaded_model.predict(sample_data)
prob = loaded_model.predict_proba(sample_data)
print(f"Probabilty of Loan approval: {prob[0][1]: .2f}\n")

importances = loaded_model.feature_importances_
for feature, importance in zip(X.columns, importances):
    print(f"{feature}: {importance:.3f}")

result = "Loan Approved" if prediction[0] == 1 and EMI_to_Income_value <= 0.4 else "Loan Not Approved"
print(f"\nPrediction Result: {result}")
print(scalar.mean_)
print(scalar.scale_)

48.43
0.016143333333333332
Probabilty of Loan approval:  0.77

Gender: 0.010
Married: 0.024
Dependents: 0.018
Education: 0.018
Self_Employed: 0.005
ApplicantIncome: 0.098
CoapplicantIncome: 0.064
LoanAmount: 0.084
Loan_Amount_Term: 0.007
Credit_History: 0.388
Property_Area: 0.050
Age: 0.033
EMI: 0.099
EMI_to_Income: 0.103

Prediction Result: Loan Approved
[3.59132040e+03 1.28885690e+03 1.04542700e+05 2.25142500e+02
 9.77108196e+02 2.18744094e-01]
[1.52148224e+03 2.52648244e+03 3.12399196e+04 8.89257257e+01
 3.50849310e+02 8.86263129e-02]
