In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


**<h1><center>Model Selection and Tunning</center></h1>**

In this file we are going to select the best model which gives good accuracy. And fine tune that model.

**<h2>Contents</h2>**

1. Importing Libraries

2. Loading Data

3. Splitting train and test set

4. Model Training

4. Selecting and fine Tuning that model

5. Saving Model

**<h3>1. Importing Libraries</h3>**

Here we will import all packages which is neccessary for training purpose.(sklearn, pandas, imblearn, XGboost, Catboost.)

**Install packages xgboost and imblearn if it is not installed.**
1. !pip install xgboost
2. !pip install catboost

In [2]:
!pip install xgboost
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1-cp37-none-manylinux1_x86_64.whl (76.8 MB)
[K     |████████████████████████████████| 76.8 MB 19 kB/s 
Installing collected packages: catboost
Successfully installed catboost-1.1


In [3]:
import pandas as pd

from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.metrics import confusion_matrix,classification_report, f1_score

from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

from catboost import CatBoostClassifier

import numpy as np

In [17]:
import warnings
warnings.filterwarnings('ignore')

**2. Loading Data**

Here we will load preprocessed data. 

In [5]:
# Reading data set
train_data = pd.read_csv("/content/drive/My Drive/Loan Prediction/Data Files/\
processed_data_imbalanced_treated.csv")

train_data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,LoanAmount_log,Total_income
0,LP001005,1,1,0,1,1,360.0,1.0,2,1,4.189655,3000.0
1,LP001006,1,1,0,0,0,360.0,1.0,2,1,4.787492,4941.0
2,LP001008,1,0,0,1,0,360.0,1.0,2,1,4.94876,6000.0
3,LP001011,1,1,2,1,1,360.0,1.0,2,1,5.587249,9613.0
4,LP001013,1,1,0,0,0,360.0,1.0,2,1,4.553877,3849.0


In [6]:
train_data.isnull().sum()

Loan_ID             0
Gender              0
Married             0
Dependents          0
Education           0
Self_Employed       0
Loan_Amount_Term    0
Credit_History      0
Property_Area       0
Loan_Status         0
LoanAmount_log      0
Total_income        0
dtype: int64

**3. Train-Test Splits**

Here we will use Stratified Train-Test Splits 

-  Some classification problems do not have a balanced number of examples for each class label. As such, it is desirable to split the dataset into train and test sets in a way that preserves the same proportions of examples in each class as observed in the original dataset. This is called a stratified train-test split.

- We can achieve this by setting the “stratify” argument to the y component of the original dataset. This will be used by the train_test_split() function to ensure that both the train and test sets have the proportion of examples in each class that is present in the provided “y” array.

In [7]:
X = train_data.drop(['Loan_Status', 'Loan_ID'],axis = 1)
y = train_data['Loan_Status']

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.20, 
                                                    random_state=1, 
                                                    stratify=y)

In [8]:
y_test.value_counts()

0    85
1    78
Name: Loan_Status, dtype: int64

In [9]:
y_train.value_counts()

0    337
1    311
Name: Loan_Status, dtype: int64

In [10]:
X_train

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Loan_Amount_Term,Credit_History,Property_Area,LoanAmount_log,Total_income
167,1,1,2,1,0,360.0,1.0,1,5.010635,6095.0
57,0,1,0,1,0,360.0,1.0,1,4.919981,4786.0
303,1,1,3,1,0,360.0,1.0,1,4.605170,4691.0
125,0,0,0,0,0,360.0,1.0,1,4.787492,4408.0
559,1,1,1,0,0,360.0,1.0,0,4.927254,9352.0
...,...,...,...,...,...,...,...,...,...,...
729,1,0,0,1,1,360.0,0.0,2,5.231109,10416.0
687,1,1,0,1,0,360.0,1.0,0,5.204007,8649.0
798,1,0,0,1,0,480.0,1.0,2,4.262680,3069.0
277,1,0,0,0,0,180.0,1.0,2,4.094345,2717.0


**4. Model Selection**

Here we will load various algorithms and train and we select best algorithms.

In [18]:
import pickle

prediction_data = {
    'Algorithm_name' : [],
    'Accuracy' : [],
    "F1_Score" : [],
    "Cross_validation_mean" : []
}

models = {'Logistic_Regression' : LogisticRegression(),
          "Random_forest": RandomForestClassifier(),
          "Gradient_boosting": GradientBoostingClassifier(),
          "Ada_boost": AdaBoostClassifier(),
          "KNN_algorithm": KNeighborsClassifier(),
          "XGB_boost": XGBClassifier(random_state=6,n_estimators=50),
          "Catboost" : CatBoostClassifier()
}

for name, model in models.items():
    model.fit(X_train, y_train)

    pred = model.predict(X_test)
    result = model.score(X_test,y_test)

    cross_validation_score = cross_val_score(model,
                                             X_train, 
                                             y_train,
                                             cv=10,
                                             scoring="f1_macro"
                                             )
    prediction_data['Algorithm_name'].append(name)
    prediction_data['Accuracy'].append(result)
    prediction_data['F1_Score'].append(f1_score(y_test, pred))
    prediction_data['Cross_validation_mean'].append(np.mean(
        cross_validation_score))

    folder_path = "/content/drive/My Drive/Loan Prediction/Data Files/"
    algo_name = name + ".pkl"

    filepath = folder_path + algo_name

    with open(filepath, "wb") as file_pointer:
        pickle.dump(model, file_pointer)

    print("Accuracy of {} is {}".format(name, result))
    
    print("Cross validation {} mean is {}\n\n".format(name, 
                                                      np.mean(
                                                          cross_validation_score
                                                          )))


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
8:	learn: 0.6583198	total: 13.9ms	remaining: 1.53s
9:	learn: 0.6551250	total: 14.9ms	remaining: 1.47s
10:	learn: 0.6517753	total: 16.3ms	remaining: 1.47s
11:	learn: 0.6481442	total: 17.8ms	remaining: 1.46s
12:	learn: 0.6448930	total: 19.3ms	remaining: 1.47s
13:	learn: 0.6416417	total: 20.8ms	remaining: 1.46s
14:	learn: 0.6387199	total: 22.3ms	remaining: 1.46s
15:	learn: 0.6358230	total: 23.6ms	remaining: 1.45s
16:	learn: 0.6328082	total: 25ms	remaining: 1.45s
17:	learn: 0.6297319	total: 26.5ms	remaining: 1.45s
18:	learn: 0.6267667	total: 28ms	remaining: 1.44s
19:	learn: 0.6240632	total: 29.4ms	remaining: 1.44s
20:	learn: 0.6206700	total: 30.9ms	remaining: 1.44s
21:	learn: 0.6182396	total: 31.9ms	remaining: 1.42s
22:	learn: 0.6158655	total: 32.9ms	remaining: 1.4s
23:	learn: 0.6132267	total: 34.3ms	remaining: 1.4s
24:	learn: 0.6109606	total: 35.2ms	remaining: 1.37s
25:	learn: 0.6085476	total: 36.2ms	remaining: 1.36s
26:	lea

In [19]:
prediction_data = pd.DataFrame(prediction_data)

prediction_data

Unnamed: 0,Algorithm_name,Accuracy,F1_Score,Cross_validation_mean
0,Logistic_Regression,0.723926,0.761905,0.713409
1,Random_forest,0.907975,0.904459,0.896121
2,Gradient_boosting,0.846626,0.850299,0.818612
3,Ada_boost,0.791411,0.795181,0.773602
4,KNN_algorithm,0.656442,0.611111,0.599691
5,XGB_boost,0.785276,0.797688,0.769155
6,Catboost,0.889571,0.888889,0.8438
