In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
# Read the loan-eligibility CSV into the working frame and show its (rows, columns).
csv_path = 'Datasets/loan-eligibility.csv'
df = pd.read_csv(csv_path)
df.shape

(614, 13)

(614, 13)

In [3]:
# Preview the first rows to check column names and value formats.
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
# Column dtypes and non-null counts; a count below the row total marks a column with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Co

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Remove the Loan_ID column from the working frame.
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell idempotent:
# running it again after the column is already gone no longer raises KeyError.
df = df.drop(columns='Loan_ID', errors='ignore')

# Preprocessing
### Outlier Detection

In [None]:
# Histogram (with KDE overlay) for each column listed in target_columns.
target_columns = ['LoanAmount','Loan_Amount_Term','Credit_History']

# Walk df.columns so the plots follow the frame's column order and any
# listed column that is absent from df is skipped rather than raising.
for column in (name for name in df.columns if name in target_columns):
    # plt.figure (lowercase) opens a new pyplot-managed figure with the requested size.
    # plt.Figure merely constructs a detached Figure object, so the histogram was
    # drawn on a default-sized figure and figsize had no effect.
    plt.figure(figsize=(8,4))
    sns.histplot(df[column],kde = True)
    plt.title(f'Histogram of {column}')
    plt.xlabel(column)
    plt.ylabel('Frequency')
    plt.show()

In [None]:
# Box plot of LoanAmount to visualise points lying beyond the whiskers.
loan_amount = df['LoanAmount']
sns.boxplot(x=loan_amount)

### Z-score to detect outliers

In [None]:
# Mean and standard deviation of LoanAmount, and the band three standard
# deviations either side of the mean that is used as the outlier cut-off.
loan_amount = df['LoanAmount']
mu, sigma = loan_amount.mean(), loan_amount.std()
print(mu, sigma, sep='\n')

spread = 3*sigma
upper_bound, lower_bound = mu + spread, mu - spread
print(upper_bound, lower_bound, sep='\n')

In [None]:
# Display the rows whose LoanAmount lies strictly inside the band (df itself is not modified).
inside_band = df['LoanAmount'].lt(upper_bound) & df['LoanAmount'].gt(lower_bound)
df[inside_band]

In [None]:

# Cap LoanAmount at the bounds: values below lower_bound become lower_bound, values above
# upper_bound become upper_bound, and everything else (missing values included) is kept.
df['LoanAmount'] = df['LoanAmount'].clip(lower=lower_bound, upper=upper_bound)


In [None]:
# Re-display (rows, columns) after capping LoanAmount.
df.shape

In [None]:
# Box plot of LoanAmount again, now that the values have been capped.
loan_amount = df['LoanAmount']
sns.boxplot(x=loan_amount)

In [None]:
# First and third quartiles of LoanAmount and the inter-quartile range between them.
q1, q3 = df['LoanAmount'].quantile(0.25), df['LoanAmount'].quantile(0.75)
iqr = q3 - q1
iqr

In [None]:
# Fences placed 1.5 * IQR below the first quartile and above the third quartile.
fence = 1.5*iqr
lower_limit, upper_limit = q1 - fence, q3 + fence
lower_limit, upper_limit

In [None]:
# Display the rows whose LoanAmount falls within the IQR fences, inclusive (df is not modified).
inside_fences = df['LoanAmount'].ge(lower_limit) & df['LoanAmount'].le(upper_limit)
df[inside_fences]

### Train test split

In [None]:
# Separate the target (Loan_Status) from the feature columns.
y = df['Loan_Status']
X = df.drop(columns='Loan_Status')

In [None]:
# First split: set 15% of the rows aside as a hold-out set (fixed seed for repeatability).
holdout_split = train_test_split(X, y, test_size=0.15, random_state=42)
X_train, X_holdout, y_train, y_holdout = holdout_split
tuple(part.shape for part in holdout_split)

In [None]:
# Second split: carve a 15% test set out of the rows that are not in the hold-out set.
#
# The non-hold-out portion is re-derived here from X and y with the same seed and size as
# the hold-out split, instead of being read back from X_train/y_train. The original form
# consumed and overwrote X_train/y_train, so every re-run of the cell split an already
# split training set and silently shrank it. This version gives the same partition on
# every run.
X_dev, y_dev = train_test_split(X, y, test_size=0.15, random_state=42)[0::2]
X_train, X_test, y_train, y_test = train_test_split(X_dev, y_dev, test_size=0.15, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

### Impute missing values

In [None]:
# Preview the training features before imputation.
X_train.head()

In [None]:
# Mean-impute the numeric columns that contain missing values.
# The columns are selected by name: a positional slice such as iloc[:, 8:11] points at
# different columns depending on whether Loan_ID is still in the frame, and can end up
# including the text column Property_Area, which the 'mean' strategy cannot handle.
mean_impute_cols = ['LoanAmount', 'Loan_Amount_Term', 'Credit_History']
imp = SimpleImputer(missing_values=np.nan,strategy='mean')

# Fit on the training split only, so the fill values are learned from training rows alone.
imp.fit(X_train[mean_impute_cols])

In [None]:
# Fill the numeric gaps in both splits with the means learned from the training split.
# Columns are addressed by name so the assignment does not depend on column positions
# (which shift by one when Loan_ID has been dropped from the frame).
mean_impute_cols = ['LoanAmount', 'Loan_Amount_Term', 'Credit_History']
X_train.loc[:, mean_impute_cols] = imp.transform(X_train[mean_impute_cols])
X_test.loc[:, mean_impute_cols] = imp.transform(X_test[mean_impute_cols])

In [None]:
# Missing values remaining per column after the mean imputation step.
X_train.isna().sum()

### Impute by mode

In [None]:
# Most-frequent-value imputation for the categorical columns that contain missing values.
# The columns are selected by name: the positional list [1, 2, 3, 5] refers to these
# columns only while Loan_ID occupies position 0, and to a different set once it is dropped.
mode_impute_cols = ['Gender', 'Married', 'Dependents', 'Self_Employed']
imp_mode = SimpleImputer(missing_values=np.nan,strategy='most_frequent')

# Fit on the training split only, so the modes are learned from training rows alone.
imp_mode.fit(X_train[mode_impute_cols])

In [None]:
# Fill the categorical gaps in both splits with the modes learned from the training split.
# Columns are addressed by name so the assignment does not depend on column positions
# (which shift by one when Loan_ID has been dropped from the frame).
mode_impute_cols = ['Gender', 'Married', 'Dependents', 'Self_Employed']
X_train.loc[:, mode_impute_cols] = imp_mode.transform(X_train[mode_impute_cols])
X_test.loc[:, mode_impute_cols] = imp_mode.transform(X_test[mode_impute_cols])

In [None]:
# Missing values remaining per column after the most-frequent imputation step.
X_train.isna().sum()

In [None]:
# Imputer that replaces missing values with the constant 0.
# NOTE(review): nothing in the cells shown here fits or applies this object.
imp_constant = SimpleImputer(
    missing_values=np.nan,
    strategy='constant',
    fill_value=0,
)

### Encoding categorical features

In [None]:
# Categorical feature columns to one-hot encode.
columns_to_encode = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']

# Dense int32 one-hot output with the first level of each feature dropped.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old keyword,
# so try the current spelling first and fall back for older installations.
try:
    ohe = OneHotEncoder(drop='first',sparse_output=False,dtype=np.int32)
except TypeError:
    ohe = OneHotEncoder(drop='first',sparse=False,dtype=np.int32)

# Learn the category levels from the training split only.
ohe.fit(X_train[columns_to_encode])


In [None]:
# One-hot encode the categorical training columns with the fitted encoder.
# NOTE(review): encoded_columns is not referenced by any later cell shown here.
encoded_columns = ohe.transform(X_train[columns_to_encode])

In [None]:
# X_train itself is unchanged by the encoding above; preview it again.
X_train.head()

In [None]:
# Fresh (unfitted) one-hot encoder; this rebinds `ohe`.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old keyword,
# so try the current spelling first and fall back for older installations.
try:
    ohe = OneHotEncoder(drop='first',sparse_output=False,dtype=np.int32)
except TypeError:
    ohe = OneHotEncoder(drop='first',sparse=False,dtype=np.int32)

In [None]:
# One-hot encode the categorical columns and pass every other column through unchanged.
# The columns are selected by name (columns_to_encode) rather than by the positions
# [1, 2, 3, 4, 5, 11]: those positions match the categorical columns only while Loan_ID
# sits at position 0, and position 11 does not exist once Loan_ID has been dropped.
tranformer = ColumnTransformer(
    transformers=[('onehot', ohe, columns_to_encode)],
    remainder='passthrough',
)

In [None]:
# Fit the column transformer on the training split only.
tranformer.fit(X_train)

In [None]:
# Apply the fitted column transformer to the training and test features.
X_train_encode, X_test_encode = (
    tranformer.transform(split) for split in (X_train, X_test)
)

In [None]:
# Wrap the encoded training array in a DataFrame (columns get default integer labels) and preview it.
X_train_transformed_df1 = pd.DataFrame(data=X_train_encode)
X_train_transformed_df1.head()

In [None]:
# Remove the passed-through Loan_ID column from the encoded training frame, if there is one.
# The hard-coded label 9 matches Loan_ID only when that column was still present in X_train;
# when it is not, label 9 is a real feature and would be dropped by mistake. The position is
# therefore looked up from the transformer's output names. errors='ignore' plus reassignment
# makes the cell safe to run more than once.
id_positions = [
    position
    for position, feature in enumerate(tranformer.get_feature_names_out())
    if feature.split('__')[-1] == 'Loan_ID'
]
X_train_transformed_df1 = X_train_transformed_df1.drop(columns=id_positions, errors='ignore')

In [None]:
# Wrap the encoded test array in a DataFrame and remove the passed-through Loan_ID column,
# if there is one. The hard-coded label 9 matches Loan_ID only when that column was still
# present in the input frame; otherwise it would remove a real feature, so the position is
# looked up from the transformer's output names instead.
X_test_transformed_df1 = pd.DataFrame(X_test_encode)
id_positions = [
    position
    for position, feature in enumerate(tranformer.get_feature_names_out())
    if feature.split('__')[-1] == 'Loan_ID'
]
X_test_transformed_df1 = X_test_transformed_df1.drop(columns=id_positions, errors='ignore')

In [None]:
# Encode the target labels as integers; the classes are learned from the training labels only.
le= LabelEncoder()

le.fit(y_train)

In [None]:
# Replace the label vectors of all three splits with their integer codes.
# NOTE(review): the original labels are overwritten, so this cell is presumably meant to be
# run once per kernel session — confirm before re-running it on its own.
y_train, y_test, y_holdout = (
    le.transform(labels) for labels in (y_train, y_test, y_holdout)
)

### Scaling features

In [None]:
# Standardise the features; the scaler's statistics are learned from the training frame only.
ss = StandardScaler()

ss.fit(X_train_transformed_df1)

In [None]:
# Apply the fitted scaler to the encoded training and test frames.
X_train_scale, X_test_scale = (
    ss.transform(frame)
    for frame in (X_train_transformed_df1, X_test_transformed_df1)
)

In [None]:
# Display the scaled training features.
X_train_scale

### Training the model

In [None]:
# Logistic-regression classifier created with its default settings.
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

In [None]:
# Train on the scaled training features and the encoded training labels.
lr.fit(X_train_scale,y_train)

In [None]:
# Predict labels for the scaled test features.
y_pred = lr.predict(X_test_scale)

In [None]:
# Plot the confusion matrix of test labels against predictions
# (the trailing semicolon suppresses the display object's repr).
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_predictions(y_test, y_pred);
# Disabled: precision and recall are computed in the cells further down, where
# precision_score and recall_score are actually imported.
# print("Precision Score:", precision_score(y_test, y_pred))
# print("Recall Score:", recall_score(y_test, y_pred))

In [None]:
# Accuracy of the predictions against the test labels.
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

In [None]:
# Import the remaining metric functions and display recall on the test split.
from sklearn.metrics import recall_score, precision_score, f1_score
recall_score(y_test, y_pred)

In [None]:
# Print precision and F1 on the test split, one metric per line.
for label, metric in (("Precision: ", precision_score), ("F1 Score:", f1_score)):
    print(label, metric(y_test, y_pred))


In [None]:
# Per-class precision, recall, F1 and support in a single text report.
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))