In [70]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import joblib

In [71]:
df = pd.read_csv("loan_data_expanded1.csv")

In [72]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001003,Male,Yes,1,Graduate,No,4583,1508,128,120,1,Rural,N
1,LP001005,Male,Yes,0,Graduate,Yes,3000,0,66,120,1,Urban,Y
2,LP001006,Male,Yes,0,Not Graduate,No,2583,2358,120,120,1,Urban,Y
3,LP001008,Male,No,0,Graduate,No,6000,0,141,360,1,Urban,Y
4,LP001013,Male,Yes,0,Not Graduate,No,2333,1516,95,240,0,Urban,Y


In [73]:
df.shape

(20000, 13)

In [74]:
df.tail()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
19995,LP002743,Female,No,0,Graduate,No,2282,0,112,180,1,Semiurban,N
19996,LP002308,Male,Yes,0,Not Graduate,No,2234,2071,89,360,0,Urban,Y
19997,LP002187,Male,No,0,Graduate,No,2467,0,88,120,1,Semiurban,N
19998,LP002953,Male,Yes,3+,Graduate,No,4660,0,139,180,0,Urban,Y
19999,LP001750,Male,Yes,0,Graduate,No,5358,0,127,240,1,Semiurban,Y


In [75]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Loan_ID            20000 non-null  object
 1   Gender             20000 non-null  object
 2   Married            20000 non-null  object
 3   Dependents         20000 non-null  object
 4   Education          20000 non-null  object
 5   Self_Employed      20000 non-null  object
 6   ApplicantIncome    20000 non-null  int64 
 7   CoapplicantIncome  20000 non-null  int64 
 8   LoanAmount         20000 non-null  int64 
 9   Loan_Amount_Term   20000 non-null  int64 
 10  Credit_History     20000 non-null  int64 
 11  Property_Area      20000 non-null  object
 12  Loan_Status        20000 non-null  object
dtypes: int64(5), object(8)
memory usage: 2.0+ MB


In [76]:
df.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [77]:
df.isnull().mean()*100

Loan_ID              0.0
Gender               0.0
Married              0.0
Dependents           0.0
Education            0.0
Self_Employed        0.0
ApplicantIncome      0.0
CoapplicantIncome    0.0
LoanAmount           0.0
Loan_Amount_Term     0.0
Credit_History       0.0
Property_Area        0.0
Loan_Status          0.0
dtype: float64

In [79]:
df = df.drop('Loan_ID', axis=1)

In [104]:
df['Dependents'].replace('3+', '4', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Dependents'].replace('3+', '4', inplace=True)


In [80]:
(df.select_dtypes(include=[np.number]) < 0).any()

ApplicantIncome      False
CoapplicantIncome    False
LoanAmount           False
Loan_Amount_Term     False
Credit_History       False
dtype: bool

In [81]:
df[(df.select_dtypes(include=[np.number]) < 0).any(axis=1)]

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status


In [82]:
median_income = df[df["ApplicantIncome"] > 0]["ApplicantIncome"].median()
df.loc[df["ApplicantIncome"] < 0, "ApplicantIncome"] = int(median_income)

In [83]:
df[(df.select_dtypes(include=[np.number]) < 0).any(axis=1)]

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status


In [84]:
encoding = {
    'Gender' : {'Male': 1, 'Female': 0},
    'Married' : {'Yes': 1, 'No': 0},
    'Dependents' : {'0': 0, '1': 1, '2': 2, '4': 4},
    'Education' : {'Graduate': 1, 'Not Graduate': 0},
    'Self_Employed' : {'Yes': 1, 'No': 0},
    'Property_Area' : {'Rural': 0, 'Semiurban': 2, 'Urban': 1},
    'Loan_Status' : {'Y': 1, 'N': 0}
}

In [85]:
df.replace(encoding, inplace=True)

  df.replace(encoding, inplace=True)


In [86]:
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1,1,1,1,0,4583,1508,128,120,1,0,0
1,1,1,0,1,1,3000,0,66,120,1,1,1
2,1,1,0,0,0,2583,2358,120,120,1,1,1
3,1,0,0,1,0,6000,0,141,360,1,1,1
4,1,1,0,0,0,2333,1516,95,240,0,1,1


In [87]:
df["CoapplicantIncome"] = df["CoapplicantIncome"].round().astype(int)
df["LoanAmount"] = df["LoanAmount"].round().astype(int)

In [88]:
def estimate_age(row):
    base_age = 60 - (row['Loan_Amount_Term'] / 12)

    total_income = row['ApplicantIncome'] + row['CoapplicantIncome']
    if total_income < 4000:
        base_age -= 2
    elif total_income > 8000:
        base_age += 3
    
    if row['Self_Employed'] == 1:
        base_age += 5
    
    return int(base_age)

In [89]:
df['Age'] = df.apply(estimate_age, axis = 1)

In [90]:
df['LoanAmount'] = df['LoanAmount'] * 1000

In [91]:
def calculate_emi(row):
    P = row['LoanAmount']
    n = row['Loan_Amount_Term']
    r = (8.0 / 12) / 100

    if r == 0:
        return P / n

    emi = (P * r * (1 + r)**n) / ((1 + r)**n - 1)
    return emi

In [92]:
df['EMI'] = df.apply( lambda x: round(calculate_emi(x), 2), axis=1 )

In [93]:
df['EMI_to_Income'] = df['EMI'] / (df['ApplicantIncome'] + df['CoapplicantIncome'])

In [94]:
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Age,EMI,EMI_to_Income
0,1,1,1,1,0,4583,1508,128000,120,1,0,0,50,1552.99,0.254965
1,1,1,0,1,1,3000,0,66000,120,1,1,1,53,800.76,0.26692
2,1,1,0,0,0,2583,2358,120000,120,1,1,1,50,1455.93,0.294663
3,1,0,0,1,0,6000,0,141000,360,1,1,1,30,1034.61,0.172435
4,1,1,0,0,0,2333,1516,95000,240,0,1,1,38,794.62,0.206448


In [95]:
X = df.drop('Loan_Status', axis = 1)
y = df['Loan_Status']

In [96]:
num_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'EMI', 'EMI_to_Income']
scalar = StandardScaler()
X[num_cols] = scalar.fit_transform(X[num_cols])

In [97]:
X.tail()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Age,EMI,EMI_to_Income
19995,0,0,0,1,0,-0.860556,-0.510139,0.238711,-0.505726,1,2,43,0.262652,2.787083
19996,1,1,0,0,0,-0.892104,0.309578,-0.497527,1.522044,0,1,30,-0.922065,-0.749516
19997,1,0,0,1,0,-0.738964,-0.510139,-0.529537,-1.181649,1,2,48,0.255128,2.383125
19998,1,1,4,1,0,0.702394,-0.510139,1.102989,-0.505726,0,1,45,0.995235,0.736736
19999,1,1,0,1,0,1.161157,-0.510139,0.718865,0.170197,1,2,40,0.239796,-0.230567


In [98]:
def tune_model(model, param_grid):
    tuner = RandomizedSearchCV(model, param_grid, cv = 5, n_iter = 20, verbose = True, random_state = 42)
    tuner.fit(X, y)
    print(f"Best Score for {model.__class__.__name__}: {tuner.best_score_:.2f}")
    print(f"Best Parameter for {model.__class__.__name__}: {tuner.best_params_}")
    return tuner.best_estimator_

In [99]:
log_reg_grid = {'C': np.logspace(-4, 4, 20), "solver": ["liblinear"]}
svc_grid = {'C': [0.25, 0.50, 0.75, 1], "kernel": ['linear']}

rf_grid = {
    'n_estimators': np.arange(10, 1000, 10),
    'max_features': ['log2', 'sqrt'], 
    'max_depth': [None, 3, 5, 10, 20, 30],
    'min_samples_split': [2, 5, 20, 50, 100],
    'min_samples_leaf': [1, 2, 5, 10]
}

In [100]:
def evaluate_model(model):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    cross_val = cross_val_score(model, X, y, cv=5)
    avg_cross_val = np.mean(cross_val)
    print(f"{model.__class__.__name__} - Accuracy : {accuracy: .2f}, Cross-Val-Score : {avg_cross_val: .2f}")
    return avg_cross_val

In [101]:
best_rf = tune_model(RandomForestClassifier(), rf_grid)
best_log_reg = tune_model(LogisticRegression(), log_reg_grid)
models = {
    LogisticRegression(),
    svm.SVC(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    best_rf,
    best_log_reg,
    GradientBoostingClassifier()
}

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Score for RandomForestClassifier: 0.94
Best Parameter for RandomForestClassifier: {'n_estimators': np.int64(840), 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 30}
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Score for LogisticRegression: 0.72
Best Parameter for LogisticRegression: {'solver': 'liblinear', 'C': np.float64(0.0018329807108324356)}


In [102]:
model_score = {model.__class__.__name__:evaluate_model(model) for model in models}

LogisticRegression - Accuracy :  0.71, Cross-Val-Score :  0.72


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

LogisticRegression - Accuracy :  0.70, Cross-Val-Score :  0.71
RandomForestClassifier - Accuracy :  0.93, Cross-Val-Score :  0.94
GradientBoostingClassifier - Accuracy :  0.79, Cross-Val-Score :  0.78
RandomForestClassifier - Accuracy :  0.93, Cross-Val-Score :  0.93
DecisionTreeClassifier - Accuracy :  0.93, Cross-Val-Score :  0.91
SVC - Accuracy :  0.70, Cross-Val-Score :  0.71
