In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

 ### 'Loan Eligibility Prediction'

In real life, the problem of loan eligibility prediction is crucial for both lenders and borrowers. For lenders, accurately assessing the risk associated with each loan application is essential for making informed decisions and minimizing financial losses due to defaults. For borrowers, on the other hand, access to credit can significantly impact their ability to achieve financial goals such as buying a home, starting a business, or pursuing higher education.


The importance of this problem lies in its direct impact on individuals' financial well-being and on the stability of financial institutions. Inaccurate or biased loan eligibility predictions can lead to unfair lending practices, discrimination, and economic harm.


Therefore, developing reliable and fair loan eligibility prediction models is essential for promoting financial inclusion, reducing credit risk, and fostering a healthy economy.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the training CSV of the loan-eligibility dataset from the Kaggle input mount.
df = pd.read_csv("/kaggle/input/loan-eligible-dataset/loan-train.csv")

In [None]:
# Preview the first rows to confirm the file parsed into the expected columns.
df.head()

1. Loan_ID: This column contains unique identifiers for each loan application. It is used to uniquely identify each record in the dataset.

2. Gender: This column represents the gender of the applicant, where 'Male' indicates male applicants and 'Female' indicates female applicants.

3. Married: This column indicates whether the applicant is married or not. 'Yes' means the applicant is married, and 'No' means the applicant is not married.

4. Dependents: This column indicates the number of dependents (e.g., children, elderly parents) the applicant has. It typically includes categories such as '0', '1', '2', '3+', representing the number of dependents.

5. Education: This column indicates the educational qualification of the applicant, where 'Graduate' indicates the applicant is a graduate and 'Not Graduate' indicates the applicant is not a graduate.

6. Self_Employed: This column indicates whether the applicant is self-employed or not. 'Yes' means the applicant is self-employed, and 'No' means the applicant is not self-employed.

7. ApplicantIncome: This column represents the income of the applicant.

8. CoapplicantIncome: This column represents the income of the co-applicant (if any) who is applying for the loan with the primary applicant.

9. LoanAmount: This column represents the amount of the loan applied for by the applicant.

10. Loan_Amount_Term: This column represents the term (duration) of the loan in months.

11. Credit_History: This column indicates the credit history of the applicant, where '1' means the applicant has a credit history, and '0' means the applicant does not have a credit history.

12. Property_Area: This column represents the location of the property for which the loan is being applied. It typically includes categories such as 'Rural', 'Semiurban', and 'Urban'.

13. Loan_Status: This column indicates whether the loan application was approved or not. 'Y' means the loan was approved, and 'N' means the loan was not approved.

In [None]:
# Column dtypes and non-null counts: a first look at which columns have missing values.
df.info()

In [None]:
# Summary statistics; called with default arguments, so pandas reports the numeric columns only.
df.describe()

**Step 01: Data Preprocessing**

In [None]:
# Number of missing values in each column.
df.isnull().sum()

1. SimpleImputer: This class from sklearn.impute provides a simple strategy for imputing missing values in a dataset. Missing values can be replaced with a constant value (like 0), the mean, median, or most frequent value along each column. It helps handle missing data before feeding it to machine learning algorithms.

2. OneHotEncoder: This class from sklearn.preprocessing is used for one-hot encoding categorical features. Categorical variables are typically encoded as integers before being fed into machine learning algorithms, but this can introduce unintended ordinality. One-hot encoding transforms categorical variables into a binary matrix where each category becomes a separate binary feature.

3. StandardScaler: This class from sklearn.preprocessing is used for standardizing features by removing the mean and scaling to unit variance. Standardization is a common preprocessing step in machine learning workflows, as it helps to center the data around 0 and scale it to have a standard deviation of 1. This ensures that features are on a similar scale, which can be important for some algorithms.

4. ColumnTransformer: This class from sklearn.compose allows for applying different transformations to different columns or subsets of columns in a dataset. It is particularly useful when you have a mix of numerical and categorical features that require different preprocessing steps. ColumnTransformer enables you to create a preprocessing pipeline that handles each type of feature appropriately.

5. Pipeline: This class from sklearn.pipeline is used to sequentially apply a list of transformations to the data. It chains together multiple processing steps, such as imputation, encoding, and scaling, into a single object. Pipeline provides a convenient way to encapsulate the preprocessing steps and the model training step into a single entity, making the workflow more manageable and less error-prone.

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler,LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [None]:
# Step 1: Define preprocessing steps
numeric_features = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # Impute missing values with median
    ('scaler', StandardScaler())  # Scale numerical features
])

categorical_features = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Credit_History', 'Property_Area']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values with mode
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode categorical features
])

# Step 2: Combine preprocessing steps using ColumnTransformer
# sparse_threshold=0 forces a dense array, so building the DataFrame below does
# not depend on how sparse the one-hot block happens to be.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    sparse_threshold=0)

# Step 3: Fit and transform the data
# NOTE(review): the preprocessor is fitted on every row of `df`, i.e. before the
# train/test split performed later in the notebook, so the imputation and
# scaling statistics also see the rows that end up in the test set.
processed_data = preprocessor.fit_transform(df)

# Step 4: Convert processed_data back to a DataFrame
# Passing the original column names yields readable one-hot names such as
# 'Gender_Male' instead of the positional 'x0_Male'.
onehot_columns = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_features)
# Reuse df's index so the target assignment below aligns row by row even when
# `df` no longer has a 0..n-1 index (e.g. after rows have been filtered out).
processed_df = pd.DataFrame(
    processed_data,
    columns=numeric_features + list(onehot_columns),
    index=df.index,
)

# Step 5: Add the target variable 'Loan_Status' to the DataFrame
processed_df['Loan_Status'] = df['Loan_Status']

In [None]:
# Preview the processed frame: scaled numeric columns, one-hot columns and the re-attached target.
processed_df.head()

**Step 02: EDA (Exploratory Data Analysis)**

In [None]:
import plotly.express as px
from plotly.subplots import make_subplots

# Categorical columns to chart (one subplot each, laid out on a 2 x 3 grid)
categorical_features = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']

fig = make_subplots(rows=2, cols=3, subplot_titles=categorical_features)

# One bar trace per feature, filled row by row
for position, feature in enumerate(categorical_features):
    grid_row, grid_col = divmod(position, 3)
    counts = df[feature].value_counts().rename_axis(feature).reset_index(name='count')
    bar_trace = px.bar(counts, x=feature, y='count').data[0]
    fig.add_trace(bar_trace, row=grid_row + 1, col=grid_col + 1)

# Shared title; per-trace legends are hidden
fig.update_layout(title='Count of Categorical Features', showlegend=False)
fig.show()

In [None]:
# Categorical columns to break down by loan outcome
categorical_features = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']

# One grouped bar chart per feature: a bar for each (category, Loan_Status) pair
axis_labels = {'count': 'Count', 'Loan_Status': 'Loan Status'}
for feature in categorical_features:
    counts = df.groupby([feature, 'Loan_Status']).size().rename('count').reset_index()
    fig = px.bar(
        counts,
        x=feature,
        y='count',
        color='Loan_Status',
        barmode='group',
        labels=axis_labels,
        title=f'Count of Loan Status by {feature}',
    )
    fig.show()

**Step 03: Study the features that influence the loan eligibility decision**

In [None]:
# Heading followed by the describe() table for ApplicantIncome
print("Summary Statistics for ApplicantIncome:", df['ApplicantIncome'].describe(), sep="\n")

In [None]:
# Histogram of ApplicantIncome with a KDE overlay
plt.figure(figsize=(8, 6))
sns.histplot(df['ApplicantIncome'], kde=True).set(
    title='Histogram of ApplicantIncome',
    xlabel='ApplicantIncome',
    ylabel='Frequency',
)
plt.show()

In [None]:
# Box plot of ApplicantIncome
plt.figure(figsize=(8, 6))
sns.boxplot(y=df['ApplicantIncome']).set(
    title='Box Plot of ApplicantIncome',
    ylabel='ApplicantIncome',
)
plt.show()

In [None]:
# Heading followed by the describe() table for LoanAmount
print("Summary Statistics for LoanAmount:", df['LoanAmount'].describe(), sep="\n")

In [None]:
# Histogram of LoanAmount with a KDE overlay
plt.figure(figsize=(8, 6))
sns.histplot(df['LoanAmount'], kde=True).set(
    title='Histogram of LoanAmount',
    xlabel='LoanAmount',
    ylabel='Frequency',
)
plt.show()

In [None]:
# Box plot of LoanAmount
plt.figure(figsize=(8, 6))
sns.boxplot(y=df['LoanAmount']).set(
    title='Box Plot of LoanAmount',
    ylabel='LoanAmount',
)
plt.show()

In [None]:
# Heading followed by the describe() table for Loan_Amount_Term
print("Summary Statistics for Loan_Amount_Term:", df['Loan_Amount_Term'].describe(), sep="\n")

In [None]:
# Histogram of Loan_Amount_Term with a KDE overlay
plt.figure(figsize=(8, 6))
sns.histplot(df['Loan_Amount_Term'], kde=True).set(
    title='Histogram of Loan_Amount_Term',
    xlabel='Loan_Amount_Term',
    ylabel='Frequency',
)
plt.show()

**Step 04: Remove outliers from those features**

In [None]:
# Define the numerical features
numerical_features = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']

# Remove outliers above the 99th percentile for each numerical column.
# All thresholds are computed once, from the frame as it is on entry, so the
# result does not depend on the order of `numerical_features`.
percentile_99 = df[numerical_features].quantile(0.99)

# A comparison against NaN evaluates to False, which would silently discard
# every row with a missing value. Missing values are not outliers, so those
# rows are kept explicitly.
numeric_values = df[numerical_features]
is_not_outlier = (numeric_values.le(percentile_99) | numeric_values.isna()).all(axis=1)

# NOTE(review): this rebinds `df`, so re-running the cell trims the already
# trimmed frame again. `processed_df` was built before this step and is not
# affected by the filtering done here.
df = df[is_not_outlier]

# Now you can perform EDA on the updated dataframe without outliers

In [None]:
# Combine numerical and categorical features
all_features = numerical_features + categorical_features

# Summary statistics after removing outliers.
# include='all' is required here: with default arguments describe() reports
# only the numeric columns of a mixed-dtype frame, which would leave out the
# categorical features that were deliberately added to `all_features`.
print("Summary Statistics after Removing Outliers:")
print(df[all_features].describe(include='all'))

**Step 05: Building Logistic Regression Model**

In [None]:
class LogisticRegression:
    """Binary logistic regression trained with full-batch gradient descent.

    The cost and gradient formulas use ``y`` and ``1 - y`` directly, so the
    target is expected to be encoded as 0/1.

    Attributes:
        lr: Step size applied to each gradient update.
        max_iter: Index of the last iteration; ``fit`` performs
            ``max_iter + 1`` updates (iterations ``0 .. max_iter``).
        weight: Coefficient vector, one entry per feature (``None`` until fitted).
        bias: Intercept term (``None`` until fitted).
        cost: Cross-entropy cost recorded after every update of the latest ``fit``.
    """

    def __init__(self, lr=0.1, max_iter=10000):
        self.lr = lr
        self.max_iter = max_iter
        self.weight = None
        self.bias = None
        self.cost = []

    def sigmoid(self, z):
        """Return the logistic function of ``z`` without overflowing.

        ``exp(-z)`` overflows for large negative ``z``; evaluating
        ``exp(-|z|)`` instead keeps the exponent non-positive on both branches.
        """
        z = np.asarray(z, dtype=float)
        decay = np.exp(-np.abs(z))
        return np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    def cost_function(self, X, y):
        """Return the mean binary cross-entropy of the current parameters on (X, y)."""
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float).reshape(-1)
        m = len(y)
        h = self.sigmoid(np.dot(X, self.weight) + self.bias)
        # Saturated predictions (exactly 0 or 1) would produce log(0) and turn
        # the cost into nan/inf; keep them strictly inside (0, 1).
        eps = np.finfo(float).eps
        h = np.clip(h, eps, 1 - eps)
        return -1 / m * (np.dot(y, np.log(h)) + np.dot(1 - y, np.log(1 - h)))

    def fit(self, X, y):
        """Fit weights and bias by gradient descent, starting from zeros.

        Args:
            X: 2-D array-like of shape (n_samples, n_features), numeric.
            y: 1-D array-like of 0/1 labels, one per row of ``X``.

        Raises:
            ValueError: If ``X`` has no rows or ``y`` has a different length.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float).reshape(-1)
        m, n = X.shape
        if m == 0:
            raise ValueError("X must contain at least one sample")
        if y.shape[0] != m:
            raise ValueError(f"X has {m} rows but y has {y.shape[0]} labels")

        self.weight = np.zeros(n)
        self.bias = 0.0
        # Start a fresh history so refitting does not append to an older run.
        self.cost = []

        # max_iter + 1 passes so that the final iteration index is logged below.
        for i in range(self.max_iter + 1):
            h = self.sigmoid(np.dot(X, self.weight) + self.bias)
            residual = y - h

            # Ascent on the log-likelihood (equivalently, descent on the cost).
            self.weight += self.lr * (np.dot(X.T, residual) / m)
            self.bias += self.lr * (np.sum(residual) / m)

            c = self.cost_function(X, y)
            self.cost.append(c)

            if i % 1000 == 0:
                print(f"Iteration {i}: Cost = {c}")

    def predict(self, X):
        """Return a list of 0/1 predictions using a 0.5 probability threshold.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self.weight is None or self.bias is None:
            raise RuntimeError("predict() called before fit()")
        X = np.asarray(X, dtype=float)
        h = self.sigmoid(np.dot(X, self.weight) + self.bias)
        return (h >= 0.5).astype(int).tolist()

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
# Target first, then every remaining column as the feature matrix
y = processed_df['Loan_Status']  # Target variable
X = processed_df.drop(columns=['Loan_Status'])  # Features

# Hold out 20% of the rows for testing; the seed is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Learn the label -> integer mapping from the training target
label_encoder = LabelEncoder().fit(y_train)

# Integer-encoded training target
y_train_encoded = label_encoder.transform(y_train)

# Train the logistic regression model defined above
model = LogisticRegression()
model.fit(X_train, y_train_encoded)

In [None]:
# Training cost per iteration; the x axis is the position in model.cost
plt.plot(model.cost)
plt.gca().set(xlabel='Iteration', ylabel='Cost', title='Cost vs. Iteration')
plt.show()

In [None]:
y_pred_numeric = model.predict(X_test)

# Derive the integer -> label mapping from the encoder fitted on y_train
# instead of hard-coding it, so the decoding cannot drift from the encoding.
label_mapping = dict(enumerate(label_encoder.classes_))
y_pred = np.array([label_mapping[label] for label in y_pred_numeric])

# Evaluate the model's performance on the held-out split
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

**Use sklearn logistic model to compare the result**

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# NOTE(review): the import in the previous cell rebinds the name
# `LogisticRegression`, so from here on it refers to scikit-learn's estimator
# and no longer to the class defined earlier in this notebook. Re-running the
# earlier training cell without restarting the kernel would therefore train
# the scikit-learn model instead.
# `model` is rebound as well, replacing the hand-written model object.
model = LogisticRegression()
# Fitted on the un-encoded target (y_train), unlike the custom model, which
# was given the label-encoded y_train_encoded.
model.fit(X_train, y_train)

# `y_pred` now holds this model's predictions; the cells below evaluate these.
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:
from sklearn.metrics import confusion_matrix
# Calculate confusion matrix.
# The label order is pinned explicitly so that row/column 0 is always 'N' and
# row/column 1 is always 'Y', matching the hard-coded tick labels below, and
# the matrix stays 2 x 2 even if one class is absent from y_test or y_pred.
conf_matrix = confusion_matrix(y_test, y_pred, labels=['N', 'Y'])

# Plot confusion matrix using seaborn
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', 
            xticklabels=['Predicted No', 'Predicted Yes'],
            yticklabels=['Actual No', 'Actual Yes'])
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion Matrix')
plt.show()

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the predictions currently held in y_pred
report = classification_report(y_true=y_test, y_pred=y_pred)

print("Report Summarize", report, sep="\n")