In [71]:
# imports
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
%matplotlib inline
import joblib

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from catboost import CatBoostClassifier

from sklearn.metrics import make_scorer
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, precision_score, recall_score

from sklearn.pipeline import Pipeline, TransformerMixin
from sklearn.base import BaseEstimator

import warnings
warnings.filterwarnings('ignore')

from tqdm import tqdm

from graphviz import Source
from sklearn import tree

RANDOM_STATE = 12

# import custom classes for pipeline
from prep import FeatureGemerator, FeatureSelector

In [72]:
# load_data 
df = pd.read_csv("BPL.csv")
df.sample(5)

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
3282,3283,45,21,91,95054,1,4.7,1,0,0,0,0,1,0
233,234,62,37,58,91320,4,1.7,1,0,0,0,0,1,0
4265,4266,27,2,44,93943,4,0.6,2,0,0,1,1,1,0
2216,2217,64,40,89,94707,1,3.8,1,0,0,0,0,0,0
99,100,66,41,15,91711,3,0.1,3,0,0,0,0,1,0


In [73]:
# load grid_search results 
scoring_df = pd.read_csv('scoring_df.csv')
scoring_df

Unnamed: 0,model,accuracy,precision,recall,roc_auc,f1
0,CatBoostClassifier,0.983,0.954023,0.864583,0.930079,0.907104
1,RandomForestClassifier,0.98,0.952381,0.833333,0.914454,0.888889
2,DecisionTreeClassifier,0.98,0.94186,0.84375,0.91911,0.89011
3,GradientBoostingClassifier,0.98,0.94186,0.84375,0.91911,0.89011


In [74]:
# train test split
target_col = ['Personal Loan']

X, y = df.drop(target_col, axis=1), df[target_col]

metrics = {'accuracy': accuracy_score,
           'precision': precision_score,
           'recall': recall_score,
           'roc_auc': roc_auc_score, 
           'f1': f1_score}

In [75]:
# feature_engeneering_pipeline
feature_engeneering_pipeline = joblib.load('data_engeneering_pipeline.pkl')
feature_engeneering_pipeline.transform(X.head(1))

Unnamed: 0,Income,Family,CCAvg,Education,Mortgage,Securities Account,CD Account,Online,CreditCard,IsMortgaged,IsFamily,IsEducated,IsMortgaged75,IsIncome75,IsCCAvg75
0,49,4,1.6,1,0,1,0,0,0,0,1,0,0,0,0


In [76]:
# load model (cb_classifier)
cb_classifier = joblib.load('models\\cb_model.pkl')
cb_classifier

<catboost.core.CatBoostClassifier at 0x175cba4d4c8>

In [77]:
# making predictions
print(cb_classifier.predict(feature_engeneering_pipeline.transform(X.head(1))))

# predict proba
print(cb_classifier.predict_proba(feature_engeneering_pipeline.transform(X.head(1))))

[0]
[[9.99966897e-01 3.31031296e-05]]


### Общий пайплайн
Для упрощения процесса получения предсказаний сделаем pipeline с преобразованием сырых данных и финальной моделью (CatBoostClassifier).

In [78]:
# concat_pipeline
model_pipeline = Pipeline([
        ('feature_engeneering_pipeline', feature_engeneering_pipeline),
        ('cb_classifier', cb_classifier)
    ])

In [79]:
# making predictions with pipeilne
print(model_pipeline.predict(X.head(1)))

# predict proba with pipeline
print(model_pipeline.predict_proba(X.head(1)))

[0]
[[9.99966897e-01 3.31031296e-05]]


### Валидация модели
Для оценки качества модели мы использовали стратифицированную k-Fold валидацию на всем датасете с разбиением на 5 фолдов.


In [80]:
%%time 

# stratified StratifiedKFold
cv = StratifiedKFold(n_splits=11, shuffle=True, random_state=RANDOM_STATE)

# scores
scoring = list(scoring_df.columns)[1:]
# print(f'Scoring columns: {scoring}')

scores = cross_validate(model_pipeline, X, y, cv=cv, scoring=scoring, n_jobs=-1)
scores.keys()

Wall time: 1min 55s


dict_keys(['fit_time', 'score_time', 'test_accuracy', 'test_precision', 'test_recall', 'test_roc_auc', 'test_f1'])

In [81]:
# cv_scores to dateframe
model_cv_scores = pd.DataFrame()
model_cv_scores['fold'] = np.arange(1, 6)

for score in scores:
    if 'test_' in score:
        model_cv_scores[score] = scores[score]
        
model_cv_scores

ValueError: Length of values does not match length of index

In [None]:
# vizualization
std_coef = 1.

plt.figure(figsize=(16, 8))

for column in model_cv_scores:
    if column != 'fold':
        plt.plot(model_cv_scores['fold'], model_cv_scores[column], label=column)
        plt.fill_between(x=model_cv_scores['fold'], 
                         y1=model_cv_scores[column]+std_coef*model_cv_scores[column].std(),
                         y2=model_cv_scores[column]-std_coef*model_cv_scores[column].std(),
                         alpha=.2)
        
        
plt.title('CatBoostClassifier cv-5 validation results', size=18)
plt.xlabel('Fold', size=16)
plt.ylabel('Score', size=16)

plt.legend(loc='lower right', fontsize=14);