# Loan Prediction using Bagging and Boosting Models

#### Predicting whether loan will be approved or not using:
- Voting CLassifier
    -hard voting
    -soft voting
- Bagging Classifier
- Gradient Boosting Classifier
- XGBoost Classifier
- LightGBM Classifier
- CatBoost Classifier

In [2]:
import pandas as pd

In [5]:
data = pd.read_csv('loan_prediction.csv')
df = data.copy()

## Exploratory Data Analysis & Data Preparation

In [6]:
df.head(20)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y
6,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y
7,LP001014,Male,Yes,3+,Graduate,No,3036,2504.0,158.0,360.0,0.0,Semiurban,N
8,LP001018,Male,Yes,2,Graduate,No,4006,1526.0,168.0,360.0,1.0,Urban,Y
9,LP001020,Male,Yes,1,Graduate,No,12841,10968.0,349.0,360.0,1.0,Semiurban,N


In [8]:
df.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

Filling null values with mode and median

In [9]:
df.Gender.fillna(df.Gender.mode()[0], inplace= True)

In [10]:
df.Married.fillna(df.Married.mode()[0], inplace= True)

In [11]:
df.Dependents.fillna(df.Dependents.mode()[0], inplace= True)

In [12]:
df.Self_Employed.fillna(df.Self_Employed.mode()[0], inplace= True)

In [13]:
df.LoanAmount.fillna(df.LoanAmount.median(), inplace= True)

In [14]:
df.Loan_Amount_Term.fillna(df.Loan_Amount_Term.mode()[0], inplace= True)

In [15]:
df.Credit_History.fillna(df.Credit_History.mode()[0], inplace= True)

In [16]:
df.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [17]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

In [18]:
df.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [19]:
df_cat = [ 'Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status']
for col in df_cat:
    df[col] = label_encoder.fit_transform(df[col])

Dropping Loan_ID column since it will not have any positive effect on prediction

In [20]:
df.drop('Loan_ID', axis = 1, inplace = True)

In [22]:
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1,0,0,0,0,5849,0.0,128.0,360.0,1.0,2,1
1,1,1,1,0,0,4583,1508.0,128.0,360.0,1.0,0,0
2,1,1,0,0,1,3000,0.0,66.0,360.0,1.0,2,1
3,1,1,0,1,0,2583,2358.0,120.0,360.0,1.0,2,1
4,1,0,0,0,0,6000,0.0,141.0,360.0,1.0,2,1


In [23]:
X = df.iloc[:, 0:-1]
y = df.iloc[:, -1]

In [25]:
from sklearn.preprocessing import RobustScaler

In [26]:
X = RobustScaler().fit_transform(X)

In [27]:
X

array([[ 0. , -1. ,  0. , ...,  0. ,  0. ,  0.5],
       [ 0. ,  0. ,  1. , ...,  0. ,  0. , -0.5],
       [ 0. ,  0. ,  0. , ...,  0. ,  0. ,  0.5],
       ...,
       [ 0. ,  0. ,  1. , ...,  0. ,  0. ,  0.5],
       [ 0. ,  0. ,  2. , ...,  0. ,  0. ,  0.5],
       [-1. , -1. ,  0. , ...,  0. , -1. ,  0. ]])

In [28]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.3, stratify = y)

## Voting Classifier

In [29]:
from sklearn.ensemble import VotingClassifier

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
model1 = LogisticRegression()
model2 = DecisionTreeClassifier()
model3 = SVC(probability=True)

### Hard Voting Classifier

In [31]:
estimators = [('lr', model1), ('dt', model2) , ('svm', model3) ] 
voting_hard = VotingClassifier(estimators, voting="hard")

In [32]:
import warnings
warnings.filterwarnings("ignore")

In [33]:
voting_hard.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('dt', DecisionTreeClassifier()),
                             ('svm', SVC(probability=True))])

In [34]:
voting_hard_pred = voting_hard.predict(X_test)

In [35]:
from sklearn.metrics import accuracy_score

In [36]:
accuracy_score(y_test, voting_hard_pred)

0.8054054054054054

### Soft Voting Classifier

In [37]:
estimators = [('lr', model1), ('dt', model2) , ('svm', model3) ] 
voting_soft = VotingClassifier(estimators, voting="soft")

In [38]:
voting_soft.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('dt', DecisionTreeClassifier()),
                             ('svm', SVC(probability=True))],
                 voting='soft')

In [39]:
voting_soft_pred = voting_soft.predict(X_test)

In [40]:
accuracy_score(y_test, voting_soft_pred)

0.7297297297297297

## Bagging Classifier

In [41]:
from sklearn.ensemble import BaggingClassifier

In [43]:
from sklearn.svm import SVC
svm = SVC(probability=True)

In [52]:
bag = BaggingClassifier(base_estimator= svm,
                 n_estimators = 10,
                 max_samples = 0.4,
                 max_features = 11,
                 bootstrap = True,
                 bootstrap_features = True,
                 n_jobs = -1,
                 random_state = 123)

In [53]:
X_train.shape

(429, 11)

In [55]:
bag.fit(X_train, y_train)

BaggingClassifier(base_estimator=SVC(probability=True), bootstrap_features=True,
                  max_features=11, max_samples=0.4, n_jobs=-1,
                  random_state=123)

In [56]:
bag_pred = bag.predict(X_test)

In [57]:
accuracy_score(y_test, bag_pred)

0.6864864864864865

## Gradient Boosting Classifier

In [58]:
from sklearn.ensemble import GradientBoostingClassifier

In [59]:
gbm = GradientBoostingClassifier(random_state = 123)

In [60]:
gbm.fit(X_train, y_train)

GradientBoostingClassifier(random_state=123)

In [61]:
gbm_pred = gbm.predict(X_test)

In [63]:
accuracy_score(y_test, gbm_pred)

0.7621621621621621

## XGBoost Classifier

In [64]:
from xgboost import XGBClassifier

In [66]:
xgbt = XGBClassifier(max_depth = 2,
             learning_rate = 0.2,
             objective  = "multi:softmax",
             num_class = 2,
             booster = "gbtree",
             n_estimarors = 10,
             random_state = 123)

In [67]:
xgbt.fit(X_train, y_train)

Parameters: { "n_estimarors" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.2, max_delta_step=0,
              max_depth=2, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimarors=10, n_estimators=100,
              n_jobs=16, num_class=2, num_parallel_tree=1,
              objective='multi:softmax', predictor='auto', random_state=123,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, ...)

In [68]:
xgbt_pred = xgbt.predict(X_test)

In [69]:
accuracy_score(y_test, xgbt_pred)

0.772972972972973

In [70]:
xgbt.score(X_train, y_train)

0.8787878787878788

## LightGBM

In [71]:
# pip install lightgbm

Note: you may need to restart the kernel to use updated packages.


In [72]:
from lightgbm import LGBMClassifier

In [74]:
model = LGBMClassifier()
model.fit(X_train, y_train)

LGBMClassifier()

In [75]:
predd = model.predict(X_test)

In [76]:
model.score(X_train, y_train)

0.9790209790209791

In [90]:
model.score(X_test, y_test)

0.7297297297297297

## Catboost

In [91]:
from catboost import CatBoostClassifier

In [95]:
from catboost import CatBoostRegressor

# Initialize CatBoostRegressor
model = CatBoostClassifier(iterations=2,
                          learning_rate=1,
                          depth=2)
# Fit model
model.fit(X_train, y_train)
# Get predictions
preds = model.predict(X_test)

0:	learn: 0.4779899	total: 156ms	remaining: 156ms
1:	learn: 0.4601689	total: 157ms	remaining: 0us


In [96]:
model.score(X_train, y_train)

0.8181818181818182

In [97]:
model.score(X_test, y_test)

0.7891891891891892