In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Read the training data from 'train.csv' (a relative path, so it is resolved against the current working directory)
loan_data = pd.read_csv('train.csv')

In [3]:
# Print each column's dtype and non-null count to see which columns contain missing values
loan_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [4]:
# Preview the first rows (head() defaults to 5) to sanity-check how the columns were parsed
loan_data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [5]:
# Number of rows whose target label (Loan_Status) is missing
loan_data['Loan_Status'].isna().sum()

0

In [6]:
from sklearn.model_selection import train_test_split

# Target is the loan decision; the row identifier and the target itself are not features
y = loan_data['Loan_Status']
X = loan_data.drop(columns=['Loan_ID', 'Loan_Status'])

# Hold out 25% of the rows for validation; the fixed seed keeps the split repeatable
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y, test_size=0.25, random_state=0
)

# Object-typed columns with fewer than 13 distinct values: cheap to one-hot encode later
categorical_cols = [
    name
    for name, col_dtype in X_train_full.dtypes.items()
    if col_dtype == 'object' and X_train_full[name].nunique() < 13
]

# Integer and floating-point feature columns
numerical_cols = [
    name
    for name, col_dtype in X_train_full.dtypes.items()
    if col_dtype in ('int64', 'float64')
]

# Keep only the selected columns, categorical first, then numerical
total_cols = [*categorical_cols, *numerical_cols]

X_train = X_train_full.loc[:, total_cols].copy()
X_valid = X_valid_full.loc[:, total_cols].copy()

In [7]:
# Missing values: count the empty entries in every training column

X_train.isna().sum()

Gender               11
Married               3
Dependents           14
Education             0
Self_Employed        28
Property_Area         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           20
Loan_Amount_Term     11
Credit_History       38
dtype: int64

In [8]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

# Split the selected columns by dtype, preserving their order in X_train.
# (A set difference would give an order that depends on per-process string hashing,
# so the feature order -- and therefore the seeded models fitted on it -- could
# change between kernel restarts.)
cat_cols = [col for col in X_train.columns if X_train[col].dtype == 'object']
num_cols = [col for col in X_train.columns if col not in cat_cols]

# NOTE: the transformers return plain arrays, so every frame built below carries a
# fresh positional RangeIndex rather than the shuffled index of X_train / X_valid.
# That is intentional: the numerical and categorical frames are later combined
# column-wise and must share the same (positional) index to line up row by row.

# --- Numerical columns: impute, then scale -------------------------------------
# SimpleImputer() uses its default strategy (column mean); it is fitted on the
# training split only and then applied unchanged to the validation split.
numerical_imputer = SimpleImputer()
num_X_train = pd.DataFrame(numerical_imputer.fit_transform(X_train[num_cols]), columns = num_cols)
num_X_valid = pd.DataFrame(numerical_imputer.transform(X_valid[num_cols]), columns = num_cols)

# Rescale each numerical column with a MinMaxScaler fitted on the training split only
scaler = MinMaxScaler()
num_X_train = pd.DataFrame(scaler.fit_transform(num_X_train), columns = num_cols)
num_X_valid = pd.DataFrame(scaler.transform(num_X_valid), columns = num_cols)

# --- Categorical columns: impute -----------------------------------------------
# Fill missing categories with the most frequent value seen in the training split
categorical_imputer = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')
cat_X_train = pd.DataFrame(categorical_imputer.fit_transform(X_train[cat_cols]), columns = cat_cols)
cat_X_valid = pd.DataFrame(categorical_imputer.transform(X_valid[cat_cols]), columns = cat_cols)

In [9]:
from sklearn.preprocessing import OneHotEncoder

# 2nd step : One-Hot Encoding for categorical columns
# Categories are learned from the training split only; categories that appear
# only in the validation split are ignored instead of raising an error.
OH_encoder = OneHotEncoder(sparse_output = False, handle_unknown = 'ignore')
OH_encoder.fit(cat_X_train)

# Names of the generated indicator columns
train_feature_names = OH_encoder.get_feature_names_out(cat_X_train.columns)
valid_feature_names = OH_encoder.get_feature_names_out(cat_X_valid.columns)

# Encode both splits and wrap the dense arrays back into labelled DataFrames
OH_train = pd.DataFrame(OH_encoder.transform(cat_X_train), columns = train_feature_names)
OH_valid = pd.DataFrame(OH_encoder.transform(cat_X_valid), columns = valid_feature_names)

In [10]:
# Build the final design matrices: numerical columns first, one-hot columns after,
# placed side by side for the training and validation splits respectively

final_X_train = pd.concat((num_X_train, OH_train), axis = 'columns')
final_X_valid = pd.concat((num_X_valid, OH_valid), axis = 'columns')

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Baseline: a 400-tree random forest with a fixed seed, fitted on the training split
my_model = RandomForestClassifier(random_state=1, n_estimators=400)
predictions = my_model.fit(final_X_train, y_train).predict(final_X_valid)

# Report validation performance: confusion matrix first, then the per-class report
for heading, metric in (
    ("Confusion Matrix:", confusion_matrix),
    ("\nClassification Report:", classification_report),
):
    print(heading)
    print(metric(y_valid, predictions))

Confusion Matrix:
[[ 20  23]
 [  7 104]]

Classification Report:
              precision    recall  f1-score   support

           N       0.74      0.47      0.57        43
           Y       0.82      0.94      0.87       111

    accuracy                           0.81       154
   macro avg       0.78      0.70      0.72       154
weighted avg       0.80      0.81      0.79       154



In [12]:
# Same forest as before, but with class weights set to 'balanced'
my_model = RandomForestClassifier(
    class_weight='balanced',
    n_estimators=400,
    random_state=1,
)
my_model.fit(final_X_train, y_train)
predictions = my_model.predict(final_X_valid)

# Each print emits the heading and the metric on separate lines
print("Confusion Matrix:", confusion_matrix(y_valid, predictions), sep="\n")
print("\nClassification Report:", classification_report(y_valid, predictions), sep="\n")

Confusion Matrix:
[[ 20  23]
 [  6 105]]

Classification Report:
              precision    recall  f1-score   support

           N       0.77      0.47      0.58        43
           Y       0.82      0.95      0.88       111

    accuracy                           0.81       154
   macro avg       0.79      0.71      0.73       154
weighted avg       0.81      0.81      0.80       154



In [13]:
from sklearn.linear_model import LogisticRegression

# Linear alternative: logistic regression (liblinear solver) with balanced class weights
my_model2 = LogisticRegression(
    class_weight='balanced',
    random_state=1,
    solver='liblinear',
)
my_model2.fit(final_X_train, y_train)
predictions = my_model2.predict(final_X_valid)

# Compute both validation summaries up front, then print them under their headings
valid_confusion = confusion_matrix(y_valid, predictions)
valid_report = classification_report(y_valid, predictions)

print("Confusion Matrix:")
print(valid_confusion)
print("\nClassification Report:")
print(valid_report)

Confusion Matrix:
[[24 19]
 [13 98]]

Classification Report:
              precision    recall  f1-score   support

           N       0.65      0.56      0.60        43
           Y       0.84      0.88      0.86       111

    accuracy                           0.79       154
   macro avg       0.74      0.72      0.73       154
weighted avg       0.78      0.79      0.79       154

