In [1]:
import pandas as pd
import numpy as np

from data_prep import DataPipeline

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, classification_report

from catboost import CatBoostClassifier

from eli5.sklearn import PermutationImportance
import eli5

from sklearn.model_selection import KFold

import pickle

In [2]:
def get_scores(report_df, model, X_val, y_val, name):

    report = pd.DataFrame(columns={'ROC-AUC'}, data=[0])
    report['ROC-AUC'] = roc_auc_score(y_val,
                                      model.predict_proba(X_val)[:, 1])
    report['F1'] = f1_score(y_test, model.predict(X_val))
    report['precision_0'] = precision_score(
        y_test, model.predict(X_val), pos_label=0)
    report['precision_1'] = precision_score(
        y_test, model.predict(X_val), pos_label=1)
    report['recall_0'] = recall_score(
        y_test, model.predict(X_val), pos_label=0)
    report['recall_1'] = recall_score(
        y_test, model.predict(X_val), pos_label=1)

    report.index = [name]
    report_df = report_df.append(report)
    return report_df

In [3]:
train_df = pd.read_csv('train.csv')

In [4]:
pipe = DataPipeline()
pipe.fit(train_df)
df = pipe.transform(train_df)

df.to_csv('train_prep.csv', index=False, encoding='utf-8')

  self.medians = df.median()


KeyError: 'Gender'

In [None]:
df.head()

Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,1,1,13,0,1,460,3,4,3,1,...,5,4,3,4,4,5,5,25,18.0,0
1,1,0,25,1,2,235,3,2,3,3,...,1,1,5,3,1,4,1,1,6.0,0
2,0,1,26,1,2,1142,2,2,2,2,...,5,4,3,4,4,4,5,0,0.0,1
3,0,1,25,1,2,562,2,5,5,5,...,2,2,5,3,1,4,2,11,9.0,0
4,1,1,61,1,2,214,3,3,3,3,...,3,3,4,4,3,3,3,0,0.0,1


In [None]:
df.columns.to_list

<bound method IndexOpsMixin.tolist of Index(['Gender', 'Customer Type', 'Age', 'Type of Travel', 'Class',
       'Flight Distance', 'Inflight wifi service',
       'Departure/Arrival time convenient', 'Ease of Online booking',
       'Gate location', 'Food and drink', 'Online boarding', 'Seat comfort',
       'Inflight entertainment', 'On-board service', 'Leg room service',
       'Baggage handling', 'Checkin service', 'Inflight service',
       'Cleanliness', 'Departure Delay in Minutes', 'Arrival Delay in Minutes',
       'satisfaction'],
      dtype='object')>

In [None]:
features = ['Inflight wifi service', 
       'Type of Travel',
       'Customer Type',
       'Baggage handling',
       'Online boarding',
       'Class', 
       'Inflight service', 
       'Checkin service', 
       'Gate location', 
       'Seat comfort', 
       'Age', 
       'Cleanliness']

target = 'satisfaction'

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.3, random_state=42)

In [None]:
model_catb = CatBoostClassifier(silent=True, random_state=21,
                                     eval_metric='F1',
                                     early_stopping_rounds=20,
                                     use_best_model=True,
                                     custom_metric=['Precision', 'Recall']
                                    )

model_catb.fit(X_train, y_train, plot=True, eval_set=(X_test, y_test))

print(model_catb.best_score_)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

{'learn': {'Recall': 0.9442498726439124, 'Logloss': 0.08071689697246501, 'F1': 0.9599287910665156, 'Precision': 0.9761685319289006}, 'validation': {'Recall': 0.9408092825145039, 'Logloss': 0.09030313206734703, 'F1': 0.9567523155064236, 'Precision': 0.9734022342123262}}


In [None]:
y_pred_proba = model_catb.predict_proba(X_test)[:, 1]
y_pred_catb = model_catb.predict(X_test)

In [None]:
df_report = pd.DataFrame(columns={'ROC-AUC'}, data=[0])

df_report['ROC-AUC'] = roc_auc_score(y_test, y_pred_proba)
df_report['F1'] = f1_score(y_test, y_pred_catb)
df_report['precision_0'] = precision_score(y_test, y_pred_catb, pos_label=0)
df_report['recall_0'] = recall_score(y_test, y_pred_catb, pos_label=0)
df_report['precision_1'] = precision_score(y_test, y_pred_catb, pos_label=1)
df_report['recall_1'] = recall_score(y_test, y_pred_catb, pos_label=1)

df_report.index = ['CatBoost']

In [None]:
df_report

Unnamed: 0,ROC-AUC,F1,precision_0,recall_0,precision_1,recall_1
CatBoost,0.994567,0.956752,0.955144,0.980063,0.973402,0.940662


**Permutation Importance**

In [None]:
perm = PermutationImportance(model_catb, scoring='f1').fit(X_test, y_test)

In [None]:
eli5.show_weights(perm, feature_names=X_test.columns.tolist())

Weight,Feature
0.2024  ± 0.0020,Inflight wifi service
0.1585  ± 0.0045,Type of Travel
0.0680  ± 0.0019,Customer Type
0.0482  ± 0.0015,Gate location
0.0297  ± 0.0014,Baggage handling
0.0250  ± 0.0022,Class
0.0217  ± 0.0016,Inflight service
0.0215  ± 0.0011,Online boarding
0.0192  ± 0.0012,Checkin service
0.0143  ± 0.0019,Seat comfort


**Настройка гиперпараметров**

In [None]:
frozen_params = {
     'silent':True,
     'random_state':21,
     'eval_metric':'F1',
     'early_stopping_rounds':20
}
model_catb = CatBoostClassifier(**frozen_params)

params = {'iterations':[50, 200, 500, 700, 1500],
          'max_depth':[3, 5, 7]}

cv = KFold(n_splits=3, random_state=21, shuffle=True)

grid_search = model_catb.grid_search(params, X_train, y_train, cv=cv, stratified=True, plot=True, refit=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))


bestTest = 0.8867695789
bestIteration = 42

0:	loss: 0.8867696	best: 0.8867696 (0)	total: 1.32s	remaining: 18.5s

bestTest = 0.9266025118
bestIteration = 196

1:	loss: 0.9266025	best: 0.9266025 (1)	total: 5.32s	remaining: 34.6s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.9313189057
bestIteration = 288

2:	loss: 0.9313189	best: 0.9313189 (2)	total: 12.4s	remaining: 49.8s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.9313189057
bestIteration = 288

3:	loss: 0.9313189	best: 0.9313189 (2)	total: 18.6s	remaining: 51.1s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.9313189057
bestIteration = 288

4:	loss: 0.9313189	best: 0.9313189 (2)	total: 23.5s	remaining: 47.1s

bestTest = 0.9233913826
bestIteration = 49

5:	loss: 0.9233914	best: 0.9313189 (2)	total: 25s	remaining: 37.5s

bestTest = 0.9452099738
bestIteration = 199

6:	loss: 0.9452100	best: 0.9452100 (6)	total: 29.9s	remaining: 34.2s
Stopped by overfitting detector  (20 

In [None]:
pd.DataFrame(grid_search['cv_results']).sort_values('test-F1-mean', ascending=False).head()

Unnamed: 0,iterations,test-F1-mean,test-F1-std,train-F1-mean,train-F1-std,test-Logloss-mean,test-Logloss-std,train-Logloss-mean,train-Logloss-std
195,195,0.9509,0.003075,0.95338,0.001038,0.107341,0.002751,0.102729,0.00228
198,198,0.95088,0.003283,0.953442,0.001001,0.107034,0.002699,0.102355,0.002366
199,199,0.950875,0.003323,0.953484,0.000926,0.106928,0.002722,0.102228,0.00235
196,196,0.950836,0.003113,0.953396,0.001058,0.107217,0.002714,0.102583,0.002342
197,197,0.950829,0.0032,0.953358,0.000991,0.107092,0.002707,0.102439,0.002359


**Проверка качества**

In [None]:
def get_classification_report(y_train_true, y_train_pred, y_val_true, y_val_pred):
    print('TRAIN\n\n' + classification_report(y_train_true, y_train_pred))
    print('TEST\n\n' + classification_report(y_val_true, y_val_pred))
    print('CONFUSION MATRIX\n')
    print(pd.crosstab(y_val_true, y_val_pred))

def evaluate_preds(model, X_train, X_val, y_train, y_val):
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)

    get_classification_report(y_train, y_train_pred, y_val, y_val_pred)

In [None]:
final_model = CatBoostClassifier(silent=True, random_state=21,
                                     reg_lambda=.5,
                                     n_estimators=200,
                                     max_depth=7,
                                     eval_metric='F1',
                                     early_stopping_rounds=20,
                                     use_best_model=True,
                                     custom_metric=['Precision', 'Recall'],
                                     min_data_in_leaf = 5
                                    )
final_model.fit(X_train, y_train, plot=True, eval_set=(X_test, y_test))

evaluate_preds(final_model, X_train, X_test, y_train, y_test)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

TRAIN

              precision    recall  f1-score   support

           0       0.95      0.98      0.97     41324
           1       0.97      0.94      0.95     31408

    accuracy                           0.96     72732
   macro avg       0.96      0.96      0.96     72732
weighted avg       0.96      0.96      0.96     72732

TEST

              precision    recall  f1-score   support

           0       0.95      0.98      0.97     17555
           1       0.97      0.94      0.95     13617

    accuracy                           0.96     31172
   macro avg       0.96      0.96      0.96     31172
weighted avg       0.96      0.96      0.96     31172

CONFUSION MATRIX

col_0             0      1
satisfaction              
0             17180    375
1               836  12781


In [None]:
#Сохранение модели
with open('../model/model.pkl', 'wb') as file:
    pickle.dump(final_model, file)