In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostClassifier

import matplotlib.pyplot as plt

import dill

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the raw training data. Forward slashes keep this relative path valid on
# every OS; the previous backslash form only worked on Windows and only because
# backslash-l happens not to be a recognised string escape.
data = pd.read_csv('data/loan_sanction_train.csv')
data.head(3)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y


In [3]:
# Summarise each column's dtype and non-null count.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [4]:
# Recode the 'Y'/'N' target labels as 1/0 and discard the Loan_ID column.
# Both statements modify `data` in place, so this cell is meant to run once per load.
status_codes = {'Y': 1, 'N': 0}
data['Loan_Status'] = data['Loan_Status'].map(status_codes)
data.drop(columns=['Loan_ID'], inplace=True)

In [5]:
print('Missing values:')

# isna().sum() counts the NaNs in every column directly. The earlier
# `len - isna().value_counts()[0]` form looked up entry 0 of a True/False
# tally, which is not guaranteed to be the non-missing count (for example
# when NaNs are the majority, or when a column is entirely NaN).
for col, n_missing in data.isna().sum().items():
    print(f'{col} - {n_missing}')

Missing values:
Gender - 13
Married - 3
Dependents - 15
Education - 0
Self_Employed - 32
ApplicantIncome - 0
CoapplicantIncome - 0
LoanAmount - 22
Loan_Amount_Term - 14
Credit_History - 50
Property_Area - 0
Loan_Status - 0


In [6]:
# Column groups; each group is preprocessed differently.
# Numeric columns.
num_features = [
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
]
# Categorical columns that are fully one-hot encoded.
cat_features = [
    'Dependents',
    'Property_Area',
]
# Two-valued columns that are reduced to a single indicator column.
bin_features = [
    'Gender',
    'Married',
    'Education',
    'Self_Employed',
    'Credit_History',
]

In [7]:
# Stratified 70/30 hold-out split with a fixed seed.
# `columns=` replaces the positional axis argument (`drop('Loan_Status', 1)`),
# which pandas deprecated in 1.3 and rejects from 2.0 onwards.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='Loan_Status'),
    data['Loan_Status'],
    stratify=data['Loan_Status'],
    test_size=0.3,
    random_state=1,
)

In [8]:
# Preview the first three rows of the training features.
X_train.head(3)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
475,Male,Yes,2,Graduate,Yes,16525,1014.0,150.0,360.0,1.0,Rural
84,Male,Yes,1,Graduate,No,3988,0.0,50.0,240.0,1.0,Urban
340,Male,Yes,3+,Not Graduate,No,2647,1587.0,173.0,360.0,1.0,Rural


In [9]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that extracts one column by its label.

    ``transform`` indexes the input with the bare label (``X[column]``), so
    the result is whatever that indexing yields for the given input.
    """

    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        """Nothing is learned; the transformer itself is returned."""
        return self

    def transform(self, X, y=None):
        """Return ``X[self.column]``; ``y`` is accepted but ignored."""
        selected = X[self.column]
        return selected
    

class NumberSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that extracts one column, keeping it two-dimensional.

    ``transform`` indexes the input with a one-element list (``X[[key]]``),
    i.e. it selects a single column without collapsing the column axis.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned; the transformer itself is returned."""
        return self

    def transform(self, X):
        """Return ``X[[self.key]]``."""
        wanted = [self.key]
        return X[wanted]
    
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    """One-hot encoder based on ``pd.get_dummies`` with a fixed output layout.

    ``fit`` remembers the dummy column names produced for the fitting data.
    ``transform`` returns exactly those columns in that order: a remembered
    column absent from the new data is added filled with 0, and any dummy
    column not seen during ``fit`` is left out of the result.
    """

    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        """Record the dummy column names generated for ``X``."""
        fitted = pd.get_dummies(X, prefix=self.key)
        self.columns = list(fitted.columns)
        return self

    def transform(self, X):
        """Encode ``X`` and align the result to the columns stored by ``fit``."""
        dummies = pd.get_dummies(X, prefix=self.key)
        present = list(dummies.columns)

        # Add an all-zero column for every remembered level this batch lacks.
        absent = [name for name in self.columns if name not in present]
        for name in absent:
            dummies[name] = 0

        return dummies[self.columns]
    
    
class OHEEncoderBin(BaseEstimator, TransformerMixin):
    """Encode a column as a single indicator column via ``pd.get_dummies``.

    ``fit`` remembers the dummy column names produced for the fitting data.
    ``transform`` aligns the new dummies to those names (absent ones are
    added filled with 0) and returns only the first remembered column, as a
    one-column DataFrame.
    """

    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        """Record the dummy column names generated for ``X``."""
        fitted = pd.get_dummies(X, prefix=self.key)
        self.columns = list(fitted.columns)
        return self

    def transform(self, X):
        """Encode ``X`` and return the first remembered dummy column as a frame."""
        dummies = pd.get_dummies(X, prefix=self.key)
        present = list(dummies.columns)

        # Add an all-zero column for every remembered level this batch lacks.
        absent = [name for name in self.columns if name not in present]
        for name in absent:
            dummies[name] = 0

        aligned = dummies[self.columns]
        first_indicator = aligned.iloc[:, 0]
        return first_indicator.to_frame()
    
    
class TextImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in one column with that column's most frequent value.

    Parameters
    ----------
    key : column label
        The column to impute.

    Attributes
    ----------
    mode : scalar or None
        First mode of ``X[key]`` as seen by ``fit``; ``None`` before fitting.
    """

    def __init__(self, key):
        self.key = key
        self.mode = None

    def fit(self, X, y=None):
        """Store the first mode of ``X[self.key]`` and return ``self``."""
        self.mode = X[self.key].mode()[0]
        return self

    def transform(self, X):
        """Return a copy of ``X`` with NaNs in ``self.key`` replaced by the stored mode.

        The input is copied first: assigning into the caller's frame would
        silently alter the data passed to ``fit``/``predict`` and the frame
        that is handed to every other transformer working on the same input.
        """
        X = X.copy()
        X[self.key] = X[self.key].fillna(self.mode)
        return X
        
        
class NumImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in one column with that column's median.

    Parameters
    ----------
    key : column label
        The column to impute.

    Attributes
    ----------
    median : scalar or None
        Median of ``X[key]`` as seen by ``fit``; ``None`` before fitting.
    """

    def __init__(self, key):
        self.key = key
        self.median = None

    def fit(self, X, y=None):
        """Store the median of ``X[self.key]`` and return ``self``."""
        self.median = X[self.key].median()
        return self

    def transform(self, X):
        """Return a copy of ``X`` with NaNs in ``self.key`` replaced by the stored median.

        The input is copied first: assigning into the caller's frame would
        silently alter the data passed to ``fit``/``predict`` and the frame
        that is handed to every other transformer working on the same input.
        """
        X = X.copy()
        X[self.key] = X[self.key].fillna(self.median)
        return X

In [10]:
# One preprocessing branch per feature. FeatureUnion concatenates the branch
# outputs in the order they are appended: categorical, numeric, binary.
final_transformers = []

# Categorical columns: mode-impute, select the column, one-hot encode it.
for cat_feature in cat_features:
    cat_transformer = Pipeline(steps=[
        ('imputer', TextImputer(key=cat_feature)),
        ('selector', FeatureSelector(column=cat_feature)),
        ('ohe', OHEEncoder(key=cat_feature)),
    ])
    final_transformers += [(cat_feature, cat_transformer)]

# Numeric columns: median-impute, then select the column as a one-column frame.
for num_feature in num_features:
    num_transformer = Pipeline(steps=[
        ('imputer', NumImputer(key=num_feature)),
        ('selector', NumberSelector(key=num_feature)),
    ])
    final_transformers += [(num_feature, num_transformer)]

# Binary columns: mode-impute, select the column, keep one indicator column.
for bin_feature in bin_features:
    bin_transformer = Pipeline(steps=[
        ('imputer', TextImputer(key=bin_feature)),
        ('selector', FeatureSelector(column=bin_feature)),
        ('ohebin', OHEEncoderBin(key=bin_feature)),
    ])
    final_transformers += [(bin_feature, bin_transformer)]

feats = FeatureUnion(final_transformers)

# Full model: feature union followed by a CatBoost classifier with a fixed seed.
pipeline = Pipeline(steps=[
    ('features', feats),
    ('classifier', CatBoostClassifier(depth=3,
                                      iterations=500,
                                      learning_rate=0.11,
                                      random_state=1,
                                      verbose=False)),
])

In [11]:
pipeline.fit(X_train, y_train)

# Predicted probability of the positive class (column 1 of predict_proba).
preds = pipeline.predict_proba(X_test)[:, 1]

# NOTE: the threshold is selected on the same test set it is reported on, so
# the printed metrics are optimistic; tuning it on a separate validation split
# would give an unbiased estimate.
precision, recall, thresholds = precision_recall_curve(y_test, preds)

# Where precision + recall == 0 the F-score is defined as 0 here instead of
# NaN, because np.argmax would otherwise select the NaN entry.
pr_sum = precision + recall
fscore = np.divide(2 * precision * recall, pr_sum,
                   out=np.zeros_like(pr_sum), where=pr_sum > 0)

# precision/recall carry one extra trailing point that has no matching
# threshold, so the search is limited to indices that are valid for `thresholds`.
ix = np.argmax(fscore[:-1])

# precision_recall_curve treats score >= threshold as a positive prediction;
# using >= here keeps the confusion matrix consistent with the reported
# precision and recall.
cnf_matrix = confusion_matrix(y_test, preds >= thresholds[ix])

print('Best Threshold=%.3f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix],
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

Best Threshold=0.126, F-Score=0.868, Precision=0.776, Recall=0.984


In [12]:
# Persist the fitted pipeline and the train/test split.
# Forward slashes keep these relative paths valid on every OS; the previous
# backslash forms contained invalid string escapes and, outside Windows,
# would have produced files with a literal backslash in their names.
with open('models/catboost_class_pipeline.dill', 'wb') as f:
    dill.dump(pipeline, f)

# index=False states explicitly that the row index is not written.
X_train.to_csv('data/X_train.csv', index=False)
X_test.to_csv('data/X_test.csv', index=False)
y_train.to_csv('data/y_train.csv', index=False)
y_test.to_csv('data/y_test.csv', index=False)