In [1]:
import numpy as np
import pandas as pd

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
%matplotlib inline

In [4]:
data = pd.read_csv('LoanPrediction_train.csv')
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [5]:
# Filling missing gender data with the ratio of 4:1 (Male:Female)

def fill_gender_custom_ratio(df, column='Gender', male_label='Male', female_label='Female', ratio=(4,1)):
    null_indices = df[df[column].isna()].index
    total_nulls = len(null_indices)
    
    # Calculate how many Males and Females to insert
    total_units = sum(ratio)
    male_count = int((ratio[0] / total_units) * total_nulls)
    female_count = total_nulls - male_count  # rest are female
    
    # Create the fill values in the required ratio
    fill_values = [male_label]*male_count + [female_label]*female_count
    
    # Fill in the order of appearance
    for i, idx in enumerate(null_indices):
        df.at[idx, column] = fill_values[i]
    
    return df

# Apply it
data = fill_gender_custom_ratio(data)


In [6]:
# Filling marital status with (2:1) ratio
def fill_married_custom_ratio(df, column='Married', yes_label='Yes', no_label='No', ratio=(2,1)):
    null_indices = df[df[column].isna()].index
    total_nulls = len(null_indices)
    
    # Calculate counts based on ratio
    total_units = sum(ratio)
    yes_count = int((ratio[0] / total_units) * total_nulls)
    no_count = total_nulls - yes_count
    
    # Create fill list in order
    fill_values = [yes_label]*yes_count + [no_label]*no_count

    # Fill in order of appearance
    for i, idx in enumerate(null_indices):
        df.at[idx, column] = fill_values[i]

    return df

# Apply it
data = fill_married_custom_ratio(data)


In [7]:
# Filling dependents data 
def fill_dependents_proportionally(df, column='Dependents'):
    # Get value counts (excluding NaNs)
    value_counts = df[column].value_counts(normalize=True)
    
    # Find how many values are missing
    null_indices = df[df[column].isna()].index
    total_nulls = len(null_indices)
    
    # Calculate how many of each category to fill
    fill_counts = (value_counts * total_nulls).round().astype(int)

    # Adjust to make sure the total adds up to total_nulls
    while fill_counts.sum() < total_nulls:
        # Add to the most frequent category
        most_common = fill_counts.idxmax()
        fill_counts[most_common] += 1
    while fill_counts.sum() > total_nulls:
        # Subtract from the most frequent category
        most_common = fill_counts.idxmax()
        fill_counts[most_common] -= 1

    # Create fill values list
    fill_values = []
    for category, count in fill_counts.items():
        fill_values.extend([category] * count)

    # Assign them in the order of null appearance
    for i, idx in enumerate(null_indices):
        df.at[idx, column] = fill_values[i]

    return df

# Apply it
data = fill_dependents_proportionally(data)


In [8]:
# Filling Self_Employed column in this data
def fill_self_employed_proportionally(df, column='Self_Employed'):
    # Get current value distribution
    value_counts = df[column].value_counts(normalize=True)
    
    # Get indices of missing values
    null_indices = df[df[column].isna()].index
    total_nulls = len(null_indices)
    
    # Calculate how many 'No' and 'Yes' to fill
    fill_counts = (value_counts * total_nulls).round().astype(int)

    # Adjust in case rounding errors occur
    while fill_counts.sum() < total_nulls:
        fill_counts[fill_counts.idxmax()] += 1
    while fill_counts.sum() > total_nulls:
        fill_counts[fill_counts.idxmax()] -= 1

    # Build the fill list
    fill_values = []
    for category, count in fill_counts.items():
        fill_values.extend([category] * count)

    # Fill in order of missing values
    for i, idx in enumerate(null_indices):
        df.at[idx, column] = fill_values[i]
    
    return df

# Apply it
data = fill_self_employed_proportionally(data)


In [9]:
# Fill missing LoanAmount values with the median
data['LoanAmount'].fillna(data['LoanAmount'].median(), inplace=True)

In [10]:
# Filling the Loan_Amount_Term with the most value which is : 360.0
data['Loan_Amount_Term'].fillna(360.0, inplace=True)

In [11]:
# Checking the total null values (Credit History) of this column and fixing it
data['Credit_History'].isna().sum()

np.int64(50)

In [12]:
# Filling Credit_History with proportional value
def fill_credit_history_proportionally(df, column='Credit_History'):
    value_counts = df[column].value_counts(normalize=True)
    null_indices = df[df[column].isna()].index
    total_nulls = len(null_indices)

    # Calculate fill counts based on distribution
    fill_counts = (value_counts * total_nulls).round().astype(int)

    # Adjust to ensure exact match
    while fill_counts.sum() < total_nulls:
        fill_counts[fill_counts.idxmax()] += 1
    while fill_counts.sum() > total_nulls:
        fill_counts[fill_counts.idxmax()] -= 1

    # Build fill values list
    fill_values = []
    for val, count in fill_counts.items():
        fill_values.extend([val] * count)

    # Assign values in order
    for i, idx in enumerate(null_indices):
        df.at[idx, column] = fill_values[i]

    return df

# Apply it
data = fill_credit_history_proportionally(data)


In [13]:
# Loan_ID is not very important feature so drop it.
data.drop('Loan_ID', axis=1, inplace=True)

In [14]:
# Scince Dependents has a value '3+' we should convert it in 3.
data['Dependents'] = data['Dependents'].replace('3+', '3')
# Changing the type of Dependents from object to int
data['Dependents'] = data['Dependents'].astype(int)

In [15]:
# Changing the value of Loan_Status from Y & N to 1 & 0
data['Loan_Status'] = data['Loan_Status'].map({'Y': 1, 'N': 0})

In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             614 non-null    object 
 1   Married            614 non-null    object 
 2   Dependents         614 non-null    int64  
 3   Education          614 non-null    object 
 4   Self_Employed      614 non-null    object 
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         614 non-null    float64
 8   Loan_Amount_Term   614 non-null    float64
 9   Credit_History     614 non-null    float64
 10  Property_Area      614 non-null    object 
 11  Loan_Status        614 non-null    int64  
dtypes: float64(4), int64(3), object(5)
memory usage: 57.7+ KB


In [17]:
x = data.drop('Loan_Status',axis=1)
y = data['Loan_Status']

In [18]:
data.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,128.0,360.0,1.0,Urban,1
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,0
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,1
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,1
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,1


In [19]:
data

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,128.0,360.0,1.0,Urban,1
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,0
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,1
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,1
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,1
...,...,...,...,...,...,...,...,...,...,...,...,...
609,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,1
610,Male,Yes,3,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,1
611,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,1
612,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,1


In [20]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Target
# data['Loan_Status'] = data['Loan_Status'].map({'Y': 1, 'N': 0})

# X = data.drop('Loan_Status', axis=1)
# y = data['Loan_Status']

# Identify categorical & numeric columns
categorical_cols = ['Gender', 'Married', 'Dependents', 'Education',
                    'Self_Employed', 'Property_Area']
numeric_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
                'Loan_Amount_Term', 'Credit_History']

# Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', 'passthrough', numeric_cols)
    ])

# Full pipeline = preprocessing + model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=10000))
])

# Split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Train
pipeline.fit(x_train, y_train)

# Evaluate
y_pred = pipeline.predict(x_test)
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

# Save pipeline (this includes preprocessing + model!)
import joblib
joblib.dump(pipeline, "iLoan.pkl")


[[13 20]
 [ 4 86]]
              precision    recall  f1-score   support

           0       0.76      0.39      0.52        33
           1       0.81      0.96      0.88        90

    accuracy                           0.80       123
   macro avg       0.79      0.67      0.70       123
weighted avg       0.80      0.80      0.78       123

0.8048780487804879


['iLoan.pkl']