In [1]:
import tensorflow as tf
from tensorflow import keras 
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical
from sklearn.metrics import roc_auc_score,accuracy_score
import pandas as pd
import warnings
warnings.simplefilter(action='ignore')
from sklearn.model_selection import KFold, cross_val_score,RandomizedSearchCV
from sklearn.preprocessing import StandardScaler

In [2]:
def replace_target(df,name):
    df_1 = df.copy()
    d = { j:(i+1) for i,j in enumerate(iris_data['Species'].unique())}
    df_1[name] = df_1[name].map(d)
    return df_1

In [3]:
def make_cross_validation_classif_multi_n(X: pd.DataFrame,
                          y: pd.Series,
                          estimator: object,
                          metric: callable,
                          cv_strategy,
                          params,print_is = True):
   
    
    estimators, fold_train_scores, fold_valid_scores = [], [], []
    oof_predictions = np.zeros((X.shape[0],y.shape[1]))   

    for fold_number, (train_idx, valid_idx) in enumerate(cv_strategy.split(X, y)):
        x_train, x_valid = X[train_idx], X[valid_idx]
        y_train, y_valid = y[train_idx], y[valid_idx]

        estimator.fit(x_train, y_train,**params) 

        y_valid_pred = estimator.predict(x_valid)
        y_train_pred = estimator.predict(x_train)
        
        
        fold_train_scores.append(metric(y_train, y_train_pred,multi_class = 'ovr'))
        fold_valid_scores.append(metric(y_valid, y_valid_pred,multi_class = 'ovr'))
        oof_predictions[valid_idx] = y_valid_pred

        msg = (
            f"Fold: {fold_number+1}, train-observations = {len(train_idx)}, "
            f"valid-observations = {len(valid_idx)}\n"
            f"train-score = {round(fold_train_scores[fold_number], 4)}, "
            f"valid-score = {round(fold_valid_scores[fold_number], 4)}" 
        )
        if print_is:
            print(msg)
            print("="*69)
        estimators.append(estimator)
    oof_score = metric(y, oof_predictions,multi_class = 'ovr')
    if print_is:
        print(f"CV-results train: {round(np.mean(fold_train_scores), 4)} +/- {round(np.std(fold_train_scores), 3)}")
        print(f"CV-results valid: {round(np.mean(fold_valid_scores), 4)} +/- {round(np.std(fold_valid_scores), 3)}")
        print(f"OOF-score = {round(oof_score, 4)}")

    return estimators, oof_score, fold_train_scores, fold_valid_scores, oof_predictions  

In [4]:
def enc_target(train,feature,target):
    d = train[train[target]==1].groupby(feature).size()/len(train)*100
    return d

In [5]:
def normalize(X, axis=-1, order=2):
    l2 = np.atleast_1d(np.linalg.norm(X, order, axis))
    l2[l2 == 0] = 1
    return X / np.expand_dims(l2, axis)

In [6]:
def from_one_hot(layer):
    arr = np.argmax(layer,axis=1)+1
    return arr.reshape(-1,1)

In [7]:
cv_strategy = KFold(n_splits=5, shuffle=True, random_state=100)

In [8]:
# Загрузим данные
df = pd.read_csv("train.csv")

In [26]:
df

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Purpose,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default
0,Own Home,482087.0,,0.0,11.0,26.3,685960.0,1.0,,1.0,debt consolidation,Short Term,99999999.0,47386.0,7914.0,749.0,0
1,Own Home,1025487.0,10+ years,0.0,15.0,15.3,1181730.0,0.0,,0.0,debt consolidation,Long Term,264968.0,394972.0,18373.0,737.0,1
2,Home Mortgage,751412.0,8 years,0.0,11.0,35.0,1182434.0,0.0,,0.0,debt consolidation,Short Term,99999999.0,308389.0,13651.0,742.0,0
3,Own Home,805068.0,6 years,0.0,8.0,22.5,147400.0,1.0,,1.0,debt consolidation,Short Term,121396.0,95855.0,11338.0,694.0,0
4,Rent,776264.0,8 years,0.0,13.0,13.6,385836.0,1.0,,0.0,debt consolidation,Short Term,125840.0,93309.0,7180.0,719.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,Rent,402192.0,< 1 year,0.0,3.0,8.5,107866.0,0.0,,0.0,other,Short Term,129360.0,73492.0,1900.0,697.0,0
7496,Home Mortgage,1533984.0,1 year,0.0,10.0,26.5,686312.0,0.0,43.0,0.0,debt consolidation,Long Term,444048.0,456399.0,12783.0,7410.0,1
7497,Rent,1878910.0,6 years,0.0,12.0,32.1,1778920.0,0.0,,0.0,buy a car,Short Term,99999999.0,477812.0,12479.0,748.0,0
7498,Home Mortgage,,,0.0,21.0,26.5,1141250.0,0.0,,0.0,debt consolidation,Short Term,615274.0,476064.0,37118.0,,0


In [9]:
# Обработаем данные

df_work = df.copy()

d={}
for i in df_work['Years in current job'].value_counts().index:
    if i[:2]=='10':
        d[i]=10
        continue
    if i[0]=='<':
        d[i]=0
        continue    
df_work['Years in current job'] = df_work['Years in current job'].map(d) 


df_work.loc[df_work['Maximum Open Credit']>7000000,'Maximum Open Credit']=7000000

df_work.fillna(999,inplace=True)

target_col = ['Credit Default']
categorical_features=['Home Ownership','Tax Liens','Purpose','Term']
discrete_feature = ['Years in current job','Number of Open Accounts',\
                    'Years of Credit History','Number of Credit Problems',\
                   'Months since last delinquent','Bankruptcies','Credit Score']
continuous_feature = ['Annual Income','Maximum Open Credit','Current Loan Amount',\
                      'Current Credit Balance','Monthly Debt']

target = 'Credit Default'

for i in categorical_features:
    d = enc_target(df_work,i,target)
    df_work[i] = df_work[i].map(d)
    df_work[i] = df_work[[i]].fillna(0)

    
X_train =  df_work.drop('Credit Default',1 )
y_train = df_work[target]
# y_train= y_train.map({0:1,1:2})

for i in categorical_features:
    X_train=pd.concat([X_train,pd.get_dummies(X_train[i],prefix=i)],axis=1)
    X_train.drop(i,1,inplace=True)
    
for i in continuous_feature + discrete_feature:
    X_train[i]=StandardScaler().fit_transform(X_train[[i]])    
    

In [10]:
# нормализуем данные
X_train= normalize(X_train.to_numpy())
y_train= to_categorical(y_train.to_numpy())

### Тестируем несколько слоем при одинаковом количестве нейронов

In [11]:
# Создадим 12 моделей с разным количством слоев с 1000 нейронами
n=1000
finish_n = y_train.shape[1]
models = []
input_dim = X_train.shape[1]
hide_levels = [2,3,5,10]
for level in hide_levels:
    n_nueron = (n-finish_n)//(level-1)
    lost_nueron = (n-finish_n)%(level-1)
    if level>2:
        levels_list = [n_nueron+lost_nueron]+[n_nueron for i in range(level-2)]+[finish_n]
    else:
        levels_list =[n-finish_n,finish_n]    
    for fun in['tanh','sigmoid','relu']:
        model_name = f'{level}_level_{"_".join([str(i) for i in levels_list])}_nueron_{fun}'
        denses_list = [Dense(levels_list[0],activation=fun, input_dim=input_dim,name='1')]
        if level>2:
            denses_list+=[ Dense(i,activation=fun,name = str(k+2)) for k,i in enumerate(levels_list[1:-1])] 
        denses_list+=[ Dense(levels_list[-1],activation='softmax',name = str(level))]  
        my_model = Sequential(denses_list,name =model_name )
        models.append((my_model,my_model.get_weights()))
        print(model_name)
print('количесвто моделей - ',len(models))

2_level_998_2_nueron_tanh
2_level_998_2_nueron_sigmoid
2_level_998_2_nueron_relu
3_level_499_499_2_nueron_tanh
3_level_499_499_2_nueron_sigmoid
3_level_499_499_2_nueron_relu
5_level_251_249_249_249_2_nueron_tanh
5_level_251_249_249_249_2_nueron_sigmoid
5_level_251_249_249_249_2_nueron_relu
10_level_118_110_110_110_110_110_110_110_110_2_nueron_tanh
10_level_118_110_110_110_110_110_110_110_110_2_nueron_sigmoid
10_level_118_110_110_110_110_110_110_110_110_2_nueron_relu
количесвто моделей -  12


In [12]:
for model in models:
    model[0].compile(
          optimizer='adam',
          loss='categorical_crossentropy',
          metrics=[keras.metrics.AUC()],
                    )

In [13]:
# Параметры для fit
params={'epochs':15,
       'batch_size':None, 
       'validation_split':0.2,
       'use_multiprocessing' : True,
       'verbose':0} 
data = {'X_train':X_train,
        'y_train':y_train}

In [14]:
# тестировать будем на кросвалидации. делим на 5 фолдоф, оцениваем по OOF-score по всей выборке
make_cross_validation_classif_multi_n(X_train, y_train,models[0][0],\
                                roc_auc_score,cv_strategy,print_is =True,params=params)[1]

Fold: 1, train-observations = 6000, valid-observations = 1500
train-score = 0.7647, valid-score = 0.7657
Fold: 2, train-observations = 6000, valid-observations = 1500
train-score = 0.7662, valid-score = 0.7617
Fold: 3, train-observations = 6000, valid-observations = 1500
train-score = 0.7664, valid-score = 0.7599
Fold: 4, train-observations = 6000, valid-observations = 1500
train-score = 0.7657, valid-score = 0.7623
Fold: 5, train-observations = 6000, valid-observations = 1500
train-score = 0.7701, valid-score = 0.7513
CV-results train: 0.7666 +/- 0.002
CV-results valid: 0.7602 +/- 0.005
OOF-score = 0.7594


0.7593885421697131

In [15]:
# Запускаем тестирование созданых моделей с разными эпохами
result=pd.DataFrame(columns=['model','epoch','result_valid'])
for epoh in [15,30,50,100,300]:
    params['epochs']=epoh
    for model in models:
        model[0].set_weights(model[1])
        answer = make_cross_validation_classif_multi_n(X_train, y_train,model[0],\
                                roc_auc_score,cv_strategy,print_is =False,params=params)[1]
        result = result.append({'model':model[0].name,'epoch':epoh,'result_valid':answer},ignore_index=True)

Wall time: 1h 15min 5s


In [16]:
result.sort_values('result_valid',ascending=False,inplace=True)
result

Unnamed: 0,model,epoch,result_valid
41,3_level_499_499_2_nueron_relu,100,0.877889
29,3_level_499_499_2_nueron_relu,50,0.873823
53,3_level_499_499_2_nueron_relu,300,0.870232
44,5_level_251_249_249_249_2_nueron_relu,100,0.869054
56,5_level_251_249_249_249_2_nueron_relu,300,0.868454
50,2_level_998_2_nueron_relu,300,0.8659
17,3_level_499_499_2_nueron_relu,30,0.864336
32,5_level_251_249_249_249_2_nueron_relu,50,0.862729
20,5_level_251_249_249_249_2_nueron_relu,30,0.861573
47,10_level_118_110_110_110_110_110_110_110_110_2...,100,0.861112


### Тестируем двухслойные модели с разным количеством нейронов

In [17]:
%%time
N=1000
result2=pd.DataFrame()
for n_nueron in range(10,N+1,50):
    for fun in['tanh','sigmoid','relu']:
        dens =[ Dense(n_nueron,input_dim=X_train.shape[1],activation=fun,name= '1'),
                Dense(y_train.shape[1],activation='softmax',name='exit')]
        model_level_2 = Sequential(dens, name=f'{n_nueron}_{fun}')
        model_level_2.compile(optimizer='adam',
                              loss='categorical_crossentropy',
                               metrics=[keras.metrics.AUC()])
        a = model_level_2.get_weights()   
                
        for epoh in [15,30,100]:
            model_level_2.set_weights(a)    
            params['epochs']=epoh
            answer = make_cross_validation_classif_multi_n(X_train, y_train,model_level_2,\
                                roc_auc_score,cv_strategy,print_is =False,params=params)[1]
            result2 = result2.append({'model':model_level_2.name,'epoch':epoh,'result_valid':answer},ignore_index=True)    
                
                

Wall time: 1h 33min 52s


In [18]:
result2.sort_values('result_valid',ascending=False,inplace=True)
result2

Unnamed: 0,epoch,model,result_valid
170,100.0,910_relu,0.852031
179,100.0,960_relu,0.848617
152,100.0,810_relu,0.846014
161,100.0,860_relu,0.845380
143,100.0,760_relu,0.843755
...,...,...,...
121,30.0,660_sigmoid,0.747899
157,30.0,860_sigmoid,0.746000
156,15.0,860_sigmoid,0.745410
138,15.0,760_sigmoid,0.745255


### Тестируем с разными оптимизаторам

In [28]:
ful_result=pd.DataFrame()
for fun in['sigmoid','relu']:
    dens =[ Dense(910,input_dim=X_train.shape[1],activation=fun,name= '1'),
            Dense(y_train.shape[1],activation='softmax',name='exit')]
    model = Sequential(dens, name=f'960_{fun}')
    a = model.get_weights()
    for epoh in [15,30,100]:
        params['epochs'] = epoh
        for opt in [keras.optimizers.RMSprop(),keras.optimizers.SGD(),keras.optimizers.Nadam()]:       
                model.compile(optimizer=opt,
                              loss='categorical_crossentropy',
                              metrics=[keras.metrics.AUC()])
                model.set_weights(a) 
                answer = make_cross_validation_classif_multi_n(X_train, y_train,model,\
                                roc_auc_score,cv_strategy,print_is =False,params=params)[1]
                ful_result = ful_result.append({'model':model.name,\
                                             'epoch':epoh,
                                              'opt':opt,  
                                              'result_valid':answer},ignore_index=True)    

In [None]:
ful_result.sort_values('result_valid',ascending=False,inplace=True)
ful_result

### Тестируем 10 слойную сеть с разными оптимизаторами

In [21]:
ful_result2=pd.DataFrame()
for fun in['sigmoid','relu']:
    dens =[ Dense(100,input_dim=X_train.shape[1],activation=fun,name= '1'),
            Dense(100,activation=fun,name= '2'),
           Dense(100,activation=fun,name= '3'),
           Dense(100,activation=fun,name= '4'),
           Dense(100,activation=fun,name= '5'),
           Dense(100,activation=fun,name= '6'),
           Dense(100,activation=fun,name= '7'),
           Dense(100,activation=fun,name= '8'),
           Dense(100,activation=fun,name= '9'),
            Dense(y_train.shape[1],activation='softmax',name='exit')]
    model = Sequential(dens, name=f'10_100_{fun}')
    a = model.get_weights()
    for epoh in [15,30,100]:
        params['epochs'] = epoh
        for opt in [keras.optimizers.RMSprop(),keras.optimizers.SGD(),keras.optimizers.Nadam()]:       
                model.compile(optimizer=opt,
                              loss='categorical_crossentropy',
                              metrics=[keras.metrics.AUC()])
                model.set_weights(a) 
                answer = make_cross_validation_classif_multi_n(X_train, y_train,model,\
                                roc_auc_score,cv_strategy,print_is =False,params=params)[1]
                ful_result2 = ful_result2.append({'model':model.name,\
                                             'epoch':epoh,
                                              'opt':opt,  
                                              'result_valid':answer},ignore_index=True)    

In [22]:
ful_result2.sort_values('result_valid',ascending=False,inplace=True)
ful_result2

Unnamed: 0,epoch,model,opt,result_valid
14,30.0,10_100_relu,<tensorflow.python.keras.optimizer_v2.nadam.Na...,0.856219
17,100.0,10_100_relu,<tensorflow.python.keras.optimizer_v2.nadam.Na...,0.855623
11,15.0,10_100_relu,<tensorflow.python.keras.optimizer_v2.nadam.Na...,0.827464
15,100.0,10_100_relu,<tensorflow.python.keras.optimizer_v2.rmsprop....,0.81479
16,100.0,10_100_relu,<tensorflow.python.keras.optimizer_v2.gradient...,0.813339
12,30.0,10_100_relu,<tensorflow.python.keras.optimizer_v2.rmsprop....,0.808881
9,15.0,10_100_relu,<tensorflow.python.keras.optimizer_v2.rmsprop....,0.789093
10,15.0,10_100_relu,<tensorflow.python.keras.optimizer_v2.gradient...,0.775781
13,30.0,10_100_relu,<tensorflow.python.keras.optimizer_v2.gradient...,0.769099
3,30.0,10_100_sigmoid,<tensorflow.python.keras.optimizer_v2.rmsprop....,0.503521


### Тестируем 3 слойную сеть с разными оптимизаторами

In [None]:
%%time
ful_result3=pd.DataFrame()
for fun in['sigmoid','relu']:
    dens =[ Dense(499,input_dim=X_train.shape[1],activation=fun,name= '1'),
            Dense(499,activation=fun,name= '2'),
            Dense(y_train.shape[1],activation='softmax',name='exit')]
    model = Sequential(dens, name=f'499_499_10_{fun}')
    a = model.get_weights()
    for epoh in [30,100,200]:
        params['epochs'] = epoh
        for opt in [keras.optimizers.RMSprop(),keras.optimizers.SGD(),keras.optimizers.Nadam(),'adam']:       
                model.compile(optimizer=opt,
                              loss='categorical_crossentropy',
                              metrics=[keras.metrics.AUC()])
                model.set_weights(a) 
                answer = make_cross_validation_classif_multi_n(X_train, y_train,model,\
                                roc_auc_score,cv_strategy,print_is =False,params=params)[1]
                ful_result3 = ful_result3.append({'model':model.name,\
                                             'epoch':epoh,
                                              'opt':opt,  
                                              'result_valid':answer},ignore_index=True)    

In [None]:
ful_result3.sort_values('result_valid',ascending=False,inplace=True)
ful_result3

## Результаты

In [None]:
result.head(10)

In [None]:
result2.head(10)

In [None]:
ful_result.head(10)

In [None]:
ful_result2.head(10)

In [None]:
ful_result3.head(10)

Победитель -     , проверим его результат

In [23]:
params['epochs'] = 15
dens =[ Dense(960,input_dim=X_train.shape[1],activation='relu',name= '1'),
            Dense(y_train.shape[1],activation='softmax',name='exit')]
model = Sequential(dens, name=f'960_{fun}')
model.compile(optimizer=keras.optimizers.RMSprop(),
                loss='categorical_crossentropy',
                metrics=[keras.metrics.AUC()])
cv = make_cross_validation_classif_multi_n(X_train, y_train,models[0][0],\
                                roc_auc_score,cv_strategy,print_is =True,params=params)

Fold: 1, train-observations = 6000, valid-observations = 1500
train-score = 0.8056, valid-score = 0.792
Fold: 2, train-observations = 6000, valid-observations = 1500
train-score = 0.8058, valid-score = 0.7983
Fold: 3, train-observations = 6000, valid-observations = 1500
train-score = 0.8054, valid-score = 0.8047
Fold: 4, train-observations = 6000, valid-observations = 1500
train-score = 0.8065, valid-score = 0.8073
Fold: 5, train-observations = 6000, valid-observations = 1500
train-score = 0.8117, valid-score = 0.7827
CV-results train: 0.807 +/- 0.002
CV-results valid: 0.797 +/- 0.009
OOF-score = 0.7955


0.7954634305247132

In [None]:
cv[-1]

In [None]:
from sklearn.model_selection import train_test_split, ShuffleSplit, cross_val_score, learning_curve
from sklearn.metrics import roc_auc_score, f1_score,r2_score,classification_report,accuracy_score,precision_score, recall_score


In [None]:
def show_proba_calibration_plots(y_predicted_probs, y_true_labels):
    preds_with_true_labels = np.array(list(zip(y_predicted_probs, y_true_labels)))

    thresholds = []
    precisions = []
    recalls = []
    f1_scores = []

    for threshold in np.linspace(0.1, 0.9, 9):
        thresholds.append(threshold)
        precisions.append(precision_score(y_true_labels, list(map(int, y_predicted_probs > threshold))))
        recalls.append(recall_score(y_true_labels, list(map(int, y_predicted_probs > threshold))))
        f1_scores.append(f1_score(y_true_labels, list(map(int, y_predicted_probs > threshold))))

    scores_table = pd.DataFrame({'f1':f1_scores,
                                 'precision':precisions,
                                 'recall':recalls,
                                 'probability':thresholds}).sort_values('f1', ascending=False).round(3)
  
    figure = plt.figure(figsize = (15, 5))

    plt1 = figure.add_subplot(121)
    plt1.plot(thresholds, precisions, label='Precision', linewidth=4)
    plt1.plot(thresholds, recalls, label='Recall', linewidth=4)
    plt1.plot(thresholds, f1_scores, label='F1', linewidth=4)
    plt1.set_ylabel('Scores')
    plt1.set_xlabel('Probability threshold')
    plt1.set_title('Probabilities threshold calibration')
    plt1.legend(bbox_to_anchor=(0.25, 0.25))   
    plt1.table(cellText = scores_table.values,
               colLabels = scores_table.columns, 
               colLoc = 'center', cellLoc = 'center', loc = 'bottom', bbox = [0, -1.3, 1, 1])

    plt2 = figure.add_subplot(122)
    plt2.hist(preds_with_true_labels[preds_with_true_labels[:, 1] == 0][:, 0], 
              label='Another class', color='royalblue', alpha=1)
    plt2.hist(preds_with_true_labels[preds_with_true_labels[:, 1] == 1][:, 0], 
              label='Main class', color='darkcyan', alpha=0.8)
    plt2.set_ylabel('Number of examples')
    plt2.set_xlabel('Probabilities')
    plt2.set_title('Probability histogram')
    plt2.legend(bbox_to_anchor=(1, 1))

    plt.show()

In [None]:
def get_classification_report(y_train_true, y_train_pred, y_test_true, y_test_pred):
    print('TRAIN\n\n' + classification_report(y_train_true, y_train_pred))
    print('TEST\n\n' + classification_report(y_test_true, y_test_pred))
    print('CONFUSION MATRIX\n')
    print(pd.crosstab(y_test_true, y_test_pred))

In [None]:
def get_best_split(pred,y_test):
    d = {}
    for i in np.arange(0.05,0.96,0.01):
        z = np.where(pred < i, 0, 1)
        d[i] = f1_score(y_test,z)
    return sorted(d, key=d.get, reverse=True)[0]

In [None]:
show_proba_calibration_plots(res[4], target)

In [None]:
bs = get_best_split(res[4], target)
bs

In [None]:
y_train_pred=np.where(y_train_pred<0.33,0,1)
y_test_pred = np.where(res[4]<0.33,0,1)

In [None]:
get_classification_report(target, y_train_pred, target, y_test_pred)