# Loan prediction problem

## Import

In [92]:
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.impute import MissingIndicator
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from imblearn.over_sampling import SMOTENC

## Read dataset

In [93]:
raw_df = pd.read_csv("../Datasets/loan_datasets/train_loan_dataset.zip")
labels = raw_df['Loan_Status']

## Data expoloration

In [94]:
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


NB: Some columns has NaN values

In [95]:
raw_df.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [96]:
raw_df.describe(include=np.object)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,Property_Area,Loan_Status
count,614,601,611,599,614,582,614,614
unique,614,2,2,4,2,2,3,2
top,LP001713,Male,Yes,0,Graduate,No,Semiurban,Y
freq,1,489,398,345,480,500,233,422


In [97]:
raw_df

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


## Data processing

### Features selector based on correlation index

In [98]:
# Correlation selector for the features selection 
class CorrSelector(BaseEstimator, TransformerMixin):
    '''
        extract features basing on a pearson correlation index
    '''
    
    def __init__(self, treshold=0.7):
        self.selected_features = []
        self.treshold = treshold

    def fit(self, X, y=None):
        column_number = X.shape[1]
        features_dict = {k:True for k in range(column_number)}
        for f1 in range(column_number):
            for f2 in range(f1+1, column_number):
                if abs(pearsonr(X[:,f1], X[:,f2])[0]) >= self.treshold:
                    features_dict[f2] = False
        self.selected_features = [k for k,v in features_dict.items() if v]
        return self

    def transform(self, X):
        return X[:,self.selected_features]

### Pipeline utilities

In [99]:
categorical_features_processing = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('ordinal', OrdinalEncoder())
])

num_features_processing = Pipeline(steps=[
        ('inputing', KNNImputer(n_neighbors=2, weights="uniform")),
        ('scaling', StandardScaler())]
    )

preprocessor = ColumnTransformer(transformers=[
    ('num', num_features_processing, make_column_selector(dtype_include=np.number)),
    ('cat', categorical_features_processing, make_column_selector(dtype_include=object))
])

### Splitting dataset

In [100]:
X = raw_df.drop(columns=['Loan_Status', 'Loan_ID'])
y = raw_df['Loan_Status']
indexs_of_categorical_features = X.columns.get_indexer(X.select_dtypes('object').columns)
x_train,x_test,y_train,y_test=train_test_split(X, y, test_size=0.2, random_state=0)

### First evaluation of some ML algorithms

In [101]:
models = []
models.append(('Logistic Regression', LogisticRegression(solver='newton-cg', max_iter=100)))
models.append(('KNN', KNeighborsClassifier()))
models.append(('Decision Tree', DecisionTreeClassifier()))
models.append(('Random Forest', RandomForestClassifier(n_estimators=100)))
models.append(('Gradient Boosting', GradientBoostingClassifier(n_estimators=100)))
models.append(('PassiveAggressiveClassifier', PassiveAggressiveClassifier(max_iter=100)))
models.append(('SVM', SVC()))

results = []
names = []
results_mean = []

for name, model in models:
    pipeline = Pipeline([
        ('preprocessing', preprocessor),
        ('pearson', CorrSelector()),
        ('variance', VarianceThreshold(threshold=(.8 * (1 - .8)))),
        ('clf', model)
    ])  
    kfold = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
    cv_results = cross_val_score(pipeline, x_train, y_train, cv=kfold, scoring='f1_macro')
    results.append(cv_results)
    names.append(name)
    results_mean.append(cv_results.mean())
    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))

Logistic Regression: 0.715480 (0.036203)
KNN: 0.719400 (0.033639)
Decision Tree: 0.642978 (0.030822)
Random Forest: 0.732857 (0.014497)
Gradient Boosting: 0.704474 (0.016436)
PassiveAggressiveClassifier: 0.597248 (0.111538)
SVM: 0.722456 (0.036979)


### Pipeline with Random Forest Classifier evaluation

In [102]:
rf_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('pearson', CorrSelector()),
    ('variance', VarianceThreshold(threshold=(.8 * (1 - .8)))),
    ('clf', RandomForestClassifier(n_estimators=100, class_weight='balanced')),
])
rf_pipeline.fit(x_train, y_train)
y_pred = rf_pipeline.predict(x_test)
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           N       0.42      0.67      0.52        21
           Y       0.92      0.81      0.86       102

    accuracy                           0.79       123
   macro avg       0.67      0.74      0.69       123
weighted avg       0.84      0.79      0.81       123



### Pipeline with SVM classifier

In [103]:
svm_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('pearson', CorrSelector()),
    ('variance', VarianceThreshold(threshold=(.8 * (1 - .8)))),
    ('clf', SVC(class_weight='balanced')),
])
fitted_pipeline = pipeline.fit(x_train, y_train)
y_pred = fitted_pipeline.predict(x_test)
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           N       0.42      0.88      0.57        16
           Y       0.98      0.82      0.89       107

    accuracy                           0.83       123
   macro avg       0.70      0.85      0.73       123
weighted avg       0.91      0.83      0.85       123

