In [1]:
import numpy as np
import pandas as pd

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
%matplotlib inline

In [4]:
data = pd.read_csv('LoanPrediction_train.csv')

In [5]:
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [7]:
# Checking how many types of genders are present in this data
data['Gender'].value_counts()

Male      489
Female    112
Name: Gender, dtype: int64

In [8]:
# Filling missing gender data with the ratio of 4:1 (Male:Female)

def fill_gender_custom_ratio(df, column='Gender', male_label='Male', female_label='Female', ratio=(4,1)):
    null_indices = df[df[column].isna()].index
    total_nulls = len(null_indices)
    
    # Calculate how many Males and Females to insert
    total_units = sum(ratio)
    male_count = int((ratio[0] / total_units) * total_nulls)
    female_count = total_nulls - male_count  # rest are female
    
    # Create the fill values in the required ratio
    fill_values = [male_label]*male_count + [female_label]*female_count
    
    # Fill in the order of appearance
    for i, idx in enumerate(null_indices):
        df.at[idx, column] = fill_values[i]
    
    return df

# Apply it
data = fill_gender_custom_ratio(data)


In [9]:
# Checking the total null values of this column and fixing it
data['Gender'].isna().sum()

0

In [10]:
# Checking the types of marital status
data['Married'].value_counts()

Yes    398
No     213
Name: Married, dtype: int64

In [11]:
# Filling marital status with (2:1) ratio
def fill_married_custom_ratio(df, column='Married', yes_label='Yes', no_label='No', ratio=(2,1)):
    null_indices = df[df[column].isna()].index
    total_nulls = len(null_indices)
    
    # Calculate counts based on ratio
    total_units = sum(ratio)
    yes_count = int((ratio[0] / total_units) * total_nulls)
    no_count = total_nulls - yes_count
    
    # Create fill list in order
    fill_values = [yes_label]*yes_count + [no_label]*no_count

    # Fill in order of appearance
    for i, idx in enumerate(null_indices):
        df.at[idx, column] = fill_values[i]

    return df

# Apply it
data = fill_married_custom_ratio(data)


In [12]:
# Checking the total null values of this column and fixing it
data['Married'].isna().sum()

0

In [13]:
# Checking how many types of dependents are present in this data
data['Dependents'].value_counts()

0     345
1     102
2     101
3+     51
Name: Dependents, dtype: int64

In [14]:
# Filling dependents data 
def fill_dependents_proportionally(df, column='Dependents'):
    # Get value counts (excluding NaNs)
    value_counts = df[column].value_counts(normalize=True)
    
    # Find how many values are missing
    null_indices = df[df[column].isna()].index
    total_nulls = len(null_indices)
    
    # Calculate how many of each category to fill
    fill_counts = (value_counts * total_nulls).round().astype(int)

    # Adjust to make sure the total adds up to total_nulls
    while fill_counts.sum() < total_nulls:
        # Add to the most frequent category
        most_common = fill_counts.idxmax()
        fill_counts[most_common] += 1
    while fill_counts.sum() > total_nulls:
        # Subtract from the most frequent category
        most_common = fill_counts.idxmax()
        fill_counts[most_common] -= 1

    # Create fill values list
    fill_values = []
    for category, count in fill_counts.items():
        fill_values.extend([category] * count)

    # Assign them in the order of null appearance
    for i, idx in enumerate(null_indices):
        df.at[idx, column] = fill_values[i]

    return df

# Apply it
data = fill_dependents_proportionally(data)


In [15]:
# Checking the total null values of this column and fixing it
data['Dependents'].isna().sum()

0

In [16]:
# Values after filling dependents data
data['Dependents'].value_counts()

0     353
1     105
2     104
3+     52
Name: Dependents, dtype: int64

In [17]:
# Checking how many types of education level are present in this data
data['Education'].value_counts()

Graduate        480
Not Graduate    134
Name: Education, dtype: int64

In [18]:
# Checking the total null values of this column and fixing it
data['Education'].isna().sum()

0

In [19]:
# Checking how many types of self-employed are present in this data
data['Self_Employed'].value_counts()

No     500
Yes     82
Name: Self_Employed, dtype: int64

In [20]:
# Filling Self_Employed column in this data
def fill_self_employed_proportionally(df, column='Self_Employed'):
    # Get current value distribution
    value_counts = df[column].value_counts(normalize=True)
    
    # Get indices of missing values
    null_indices = df[df[column].isna()].index
    total_nulls = len(null_indices)
    
    # Calculate how many 'No' and 'Yes' to fill
    fill_counts = (value_counts * total_nulls).round().astype(int)

    # Adjust in case rounding errors occur
    while fill_counts.sum() < total_nulls:
        fill_counts[fill_counts.idxmax()] += 1
    while fill_counts.sum() > total_nulls:
        fill_counts[fill_counts.idxmax()] -= 1

    # Build the fill list
    fill_values = []
    for category, count in fill_counts.items():
        fill_values.extend([category] * count)

    # Fill in order of missing values
    for i, idx in enumerate(null_indices):
        df.at[idx, column] = fill_values[i]
    
    return df

# Apply it
data = fill_self_employed_proportionally(data)


In [21]:
# Checking the total null values of this column and fixing it
data['Self_Employed'].isna().sum()

0

In [22]:
data['Self_Employed'].value_counts()

No     527
Yes     87
Name: Self_Employed, dtype: int64

In [23]:
# Checking the total null values (LoanAmount) of this column and fixing it
data['LoanAmount'].isna().sum()

22

In [24]:
# Fill missing LoanAmount values with the median
data['LoanAmount'].fillna(data['LoanAmount'].median(), inplace=True)

In [25]:
data['LoanAmount'].isna().sum()

0

In [26]:
data['Loan_Amount_Term'].value_counts()

360.0    512
180.0     44
480.0     15
300.0     13
240.0      4
84.0       4
120.0      3
60.0       2
36.0       2
12.0       1
Name: Loan_Amount_Term, dtype: int64

In [27]:
# Checking the total null values (Loan_Amount_term) of this column and fixing it
data['Loan_Amount_Term'].isna().sum()

14

In [28]:
# Filling the Loan_Amount_Term with the most value which is : 360.0
data['Loan_Amount_Term'].fillna(360.0, inplace=True)

In [29]:
data['Loan_Amount_Term'].isna().sum()

0

In [30]:
# Checking how many types of Credit History are present in this data
data['Credit_History'].value_counts()

1.0    475
0.0     89
Name: Credit_History, dtype: int64

In [31]:
# Checking the total null values (Credit History) of this column and fixing it
data['Credit_History'].isna().sum()

50

In [32]:
# Filling Credit_History with proportional value
def fill_credit_history_proportionally(df, column='Credit_History'):
    value_counts = df[column].value_counts(normalize=True)
    null_indices = df[df[column].isna()].index
    total_nulls = len(null_indices)

    # Calculate fill counts based on distribution
    fill_counts = (value_counts * total_nulls).round().astype(int)

    # Adjust to ensure exact match
    while fill_counts.sum() < total_nulls:
        fill_counts[fill_counts.idxmax()] += 1
    while fill_counts.sum() > total_nulls:
        fill_counts[fill_counts.idxmax()] -= 1

    # Build fill values list
    fill_values = []
    for val, count in fill_counts.items():
        fill_values.extend([val] * count)

    # Assign values in order
    for i, idx in enumerate(null_indices):
        df.at[idx, column] = fill_values[i]

    return df

# Apply it
data = fill_credit_history_proportionally(data)


In [33]:
data['Credit_History'].value_counts()

1.0    517
0.0     97
Name: Credit_History, dtype: int64

In [34]:
# Checking how many types of Property Area are present in this data
data['Property_Area'].value_counts()

Semiurban    233
Urban        202
Rural        179
Name: Property_Area, dtype: int64

In [35]:
# Checking the total null values (Property_Area) of this column and fixing it
data['Property_Area'].isna().sum()

0

In [36]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             614 non-null    object 
 2   Married            614 non-null    object 
 3   Dependents         614 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      614 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         614 non-null    float64
 9   Loan_Amount_Term   614 non-null    float64
 10  Credit_History     614 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [37]:
# Loan_ID is not very important feature so drop it.
data.drop('Loan_ID', axis=1, inplace=True)

In [38]:
data.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,128.0,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [39]:
# Scince Dependents has a value '3+' we should convert it in 3.
data['Dependents'] = data['Dependents'].replace('3+', '3')
# Changing the type of Dependents from object to int
data['Dependents'] = data['Dependents'].astype(int)

In [40]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             614 non-null    object 
 1   Married            614 non-null    object 
 2   Dependents         614 non-null    int32  
 3   Education          614 non-null    object 
 4   Self_Employed      614 non-null    object 
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         614 non-null    float64
 8   Loan_Amount_Term   614 non-null    float64
 9   Credit_History     614 non-null    float64
 10  Property_Area      614 non-null    object 
 11  Loan_Status        614 non-null    object 
dtypes: float64(4), int32(1), int64(1), object(6)
memory usage: 55.3+ KB


In [41]:
# Changing the value of Loan_Status from Y & N to 1 & 0
data['Loan_Status'] = data['Loan_Status'].map({'Y': 1, 'N': 0})

In [42]:
data.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,128.0,360.0,1.0,Urban,1
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,0
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,1
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,1
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,1


In [43]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             614 non-null    object 
 1   Married            614 non-null    object 
 2   Dependents         614 non-null    int32  
 3   Education          614 non-null    object 
 4   Self_Employed      614 non-null    object 
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         614 non-null    float64
 8   Loan_Amount_Term   614 non-null    float64
 9   Credit_History     614 non-null    float64
 10  Property_Area      614 non-null    object 
 11  Loan_Status        614 non-null    int64  
dtypes: float64(4), int32(1), int64(2), object(5)
memory usage: 55.3+ KB


In [44]:
# encoding the Gender column
gender_dummies = pd.get_dummies(data['Gender'], prefix='Gender', drop_first=True)
data = pd.concat([data, gender_dummies], axis=1)

In [45]:
data.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Gender_Male
0,Male,No,0,Graduate,No,5849,0.0,128.0,360.0,1.0,Urban,1,1
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,0,1
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,1,1
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,1,1
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,1,1


In [46]:
data['Gender_Male'].value_counts()

1    499
0    115
Name: Gender_Male, dtype: int64

In [47]:
data['Gender'].value_counts()

Male      499
Female    115
Name: Gender, dtype: int64

In [48]:
# One-hot encode 'Married' and 'Education', keeping only one column per category
married_dummies = pd.get_dummies(data['Married'], prefix='Married', drop_first=True)
education_dummies = pd.get_dummies(data['Education'], prefix='Education', drop_first=True)

# Concatenate the new columns to the original DataFrame
data = pd.concat([data, married_dummies, education_dummies], axis=1)

In [49]:
data.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Gender_Male,Married_Yes,Education_Not Graduate
0,Male,No,0,Graduate,No,5849,0.0,128.0,360.0,1.0,Urban,1,1,0,0
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,0,1,1,0
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,1,1,1,0
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,1,1,1,1
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,1,1,0,0


In [50]:
# One-hot encode 'Self_Employed', keeping only one column
self_employed_dummies = pd.get_dummies(data['Self_Employed'], prefix='Self_Employed', drop_first=True)

# Concatenate to the original DataFrame
data = pd.concat([data, self_employed_dummies], axis=1)

In [51]:
data.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Gender_Male,Married_Yes,Education_Not Graduate,Self_Employed_Yes
0,Male,No,0,Graduate,No,5849,0.0,128.0,360.0,1.0,Urban,1,1,0,0,0
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,0,1,1,0,0
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,1,1,1,0,1
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,1,1,1,1,0
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,1,1,0,0,0


In [52]:
# One-hot encode 'Property_Area', keeping two columns (drop_first=True)
property_area_dummies = pd.get_dummies(data['Property_Area'], prefix='Property_Area', drop_first=True)

# Concatenate new dummy columns to the DataFrame
data = pd.concat([data, property_area_dummies], axis=1)

In [53]:
data.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Gender_Male,Married_Yes,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban
0,Male,No,0,Graduate,No,5849,0.0,128.0,360.0,1.0,Urban,1,1,0,0,0,0,1
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,0,1,1,0,0,0,0
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,1,1,1,0,1,0,1
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,1,1,1,1,0,0,1
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,1,1,0,0,0,0,1


In [54]:
# Dropping the unnecessary column remains in the data
data.drop(['Gender','Self_Employed','Married', 'Education','Property_Area'], axis=1, inplace=True)

In [55]:
data.head()

Unnamed: 0,Dependents,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Gender_Male,Married_Yes,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban
0,0,5849,0.0,128.0,360.0,1.0,1,1,0,0,0,0,1
1,1,4583,1508.0,128.0,360.0,1.0,0,1,1,0,0,0,0
2,0,3000,0.0,66.0,360.0,1.0,1,1,1,0,1,0,1
3,0,2583,2358.0,120.0,360.0,1.0,1,1,1,1,0,0,1
4,0,6000,0.0,141.0,360.0,1.0,1,1,0,0,0,0,1


In [56]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Dependents               614 non-null    int32  
 1   ApplicantIncome          614 non-null    int64  
 2   CoapplicantIncome        614 non-null    float64
 3   LoanAmount               614 non-null    float64
 4   Loan_Amount_Term         614 non-null    float64
 5   Credit_History           614 non-null    float64
 6   Loan_Status              614 non-null    int64  
 7   Gender_Male              614 non-null    uint8  
 8   Married_Yes              614 non-null    uint8  
 9   Education_Not Graduate   614 non-null    uint8  
 10  Self_Employed_Yes        614 non-null    uint8  
 11  Property_Area_Semiurban  614 non-null    uint8  
 12  Property_Area_Urban      614 non-null    uint8  
dtypes: float64(4), int32(1), int64(2), uint8(6)
memory usage: 34.9 KB


In [57]:
data.head()

Unnamed: 0,Dependents,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Gender_Male,Married_Yes,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban
0,0,5849,0.0,128.0,360.0,1.0,1,1,0,0,0,0,1
1,1,4583,1508.0,128.0,360.0,1.0,0,1,1,0,0,0,0
2,0,3000,0.0,66.0,360.0,1.0,1,1,1,0,1,0,1
3,0,2583,2358.0,120.0,360.0,1.0,1,1,1,1,0,0,1
4,0,6000,0.0,141.0,360.0,1.0,1,1,0,0,0,0,1


In [58]:
import seaborn as sns

In [59]:
x = data.drop('Loan_Status',axis=1)
y = data['Loan_Status']

In [60]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2)

In [61]:
from sklearn.linear_model import LogisticRegression

In [62]:
modelusing_logistic_regression = LogisticRegression(max_iter=10000)

In [63]:
modelusing_logistic_regression.fit(x_train,y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,10000


In [64]:
predict_loan_status = modelusing_logistic_regression.predict(x_test)

In [65]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,predict_loan_status))
print(classification_report(y_test,predict_loan_status))
print(accuracy_score(y_test,predict_loan_status))

[[16 24]
 [ 2 81]]
              precision    recall  f1-score   support

           0       0.89      0.40      0.55        40
           1       0.77      0.98      0.86        83

    accuracy                           0.79       123
   macro avg       0.83      0.69      0.71       123
weighted avg       0.81      0.79      0.76       123

0.7886178861788617


In [66]:
import joblib

In [67]:
joblib.dump(modelusing_logistic_regression,'iLoan.pkl')

['iLoan.pkl']