![image.png](attachment:b599c08a-3992-40e3-bdde-82b76e20b38e.png)

**머신러닝을 몰라도 "Config"에 5가지만 넣으면, 실행할 수 있게 만들었습니다.<br>
Even if you don't know machine learning, you can run this notebook by filling in just 5 settings in "Config".**

회사에서 많이 사용하게 만들었었고, 도움이 많이 될 겁니다.<br> 
I built this so that it could be used widely at my company, and I hope it will be helpful to you as well.

여러 분류 문제에 대하여 해당 모델을 튜닝 없이 동일하게 사용(다른 문제에 적용한 것을 보시려면 [여기 클릭](https://www.kaggle.com/hadeux/auto-simple-ensemble-model))해 보았고,<br> 
어느정도 좋은 결과가 나오는 것을 확인하였습니다.<br>

I applied this same model, without any tuning, to several classification problems ([click here](https://www.kaggle.com/hadeux/auto-simple-ensemble-model) to see it applied to another problem),<br>
and confirmed that it gives reasonably good results.

# Config

In [None]:
import os

# Base folder that holds the input data files.
dir_ad = "/kaggle/input/titanic"

# File names of the training data and of the data to predict on.
train = "train.csv"
test = "test.csv"

# Tag that identifies which analysis / data extraction this run belongs to.
# Tags are expected to look like 'train_MAU_3M_pred_MAUX_4M'.
tag = "s_result"

# Folder intended for the model and other generated artifacts.
model_dir = os.path.join(dir_ad, f"{tag}_v")

# Label column (the Y value).
target = "Survived"

# Column that identifies each row to be scored.
index_col = "PassengerId"

# Evaluation metric used to pick the decision threshold; the options listed
# are 'Accuracy', 'Precision', 'Recall', 'F1_Score', 'roc_auc' and 'Kappa'.
choice = "Accuracy"

# Import Module and Data

In [None]:
import seaborn as sns
import sys
import csv
import datetime
import operator
import joblib
import warnings
warnings.simplefilter('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
import lightgbm as lgb
from sklearn.svm import SVC

from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from statsmodels.formula.api import ols
from sklearn.metrics import cohen_kappa_score
from collections import OrderedDict
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from scipy.stats import norm, skew, probplot
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import QuantileTransformer
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from category_encoders.target_encoder import TargetEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier

In [None]:
# Load the training set and the set to predict on from the configured folder.
df_train, df_test = (
    pd.read_csv(os.path.join(dir_ad, file_name)) for file_name in (train, test)
)

# Preprocessing

어느 데이터를 사용하여도 유용할수 있는 전처리 방법에 대하여 고민해 보았고,<br>
그 결과 Outlier에 대한 조정과 null값에 대한 조정에 대하여 전처리를 자동으로 하도록 만들었습니다.<br>
I thought about pre-processing steps that would be useful for any dataset and, as a result,<br>
automated two of them: outlier adjustment and null-value imputation.

In [None]:
# Partition the columns of the prediction set by dtype: the listed float/int
# dtypes count as numeric, every other dtype as categorical.
num_cols = [
    name
    for name, dtype in df_test.dtypes.items()
    if dtype in ["float16", "float32", "float64", "int64", "int32"]
]
cat_cols = [
    name
    for name, dtype in df_test.dtypes.items()
    if dtype not in ["float16", "float32", "float64", "int64", "int32"]
]

In [None]:
def detect_outliers(data, col_name, min_outliers=2):
    """Return the index labels of rows that are IQR outliers in several columns.

    For each column in ``col_name`` a value counts as an outlier when it lies
    more than 1.5 * IQR below the first quartile or above the third quartile.

    Args:
        data: DataFrame holding the columns to inspect.
        col_name: Iterable of numeric column labels to check.
        min_outliers: A row is reported only when it is an outlier in strictly
            more than this many columns.

    Returns:
        List of index labels, in the order the rows were first flagged.
    """
    outlier_counts = Counter()
    for col in col_name:
        values = data[col]
        # Quartiles must ignore missing values: a single NaN makes
        # np.percentile return NaN, every comparison below then evaluates to
        # False and the column would never contribute an outlier.
        q1, q3 = np.nanpercentile(values, [25, 75])
        outlier_step = 1.5 * (q3 - q1)
        is_outlier = (values < q1 - outlier_step) | (values > q3 + outlier_step)
        outlier_counts.update(values[is_outlier].index)

    # Keep only the rows flagged in more than `min_outliers` columns.
    return [label for label, hits in outlier_counts.items() if hits > min_outliers]

In [None]:
# Remove the training rows flagged by detect_outliers over the numeric
# columns, then renumber the index from zero.
df_train = df_train.drop(
    index=detect_outliers(df_train, num_cols),
).reset_index(drop=True)

In [None]:
def outlier_replace(data, col_name, q1=0.25, q3=0.75):
    """Clip one column of ``data`` in place to its IQR fences.

    Values below ``Q1 - 1.5 * IQR`` are raised to that limit and values above
    ``Q3 + 1.5 * IQR`` are lowered to that limit, where Q1/Q3 are the ``q1``
    and ``q3`` quantiles of the column.

    Args:
        data: DataFrame to modify in place.
        col_name: Label of the numeric column to clip.
        q1: Lower quantile used as the first "quartile".
        q3: Upper quantile used as the third "quartile".

    Returns:
        None. ``data[col_name]`` is overwritten with the clipped values.
    """
    column = data[col_name]
    quartile1 = column.quantile(q1)
    quartile3 = column.quantile(q3)
    interquantile_range = quartile3 - quartile1

    # A zero (or NaN) IQR means the column is dominated by a single value.
    # Both fences would then equal that value and clipping would flatten the
    # whole feature into a constant, so such columns are left untouched.
    if not interquantile_range > 0:
        return

    up_limit = quartile3 + 1.5 * interquantile_range
    low_limit = quartile1 - 1.5 * interquantile_range
    # clip() handles the int -> float upcast itself when a fractional limit
    # lands in an integer column, instead of writing floats through .loc.
    data[col_name] = column.clip(lower=low_limit, upper=up_limit)

In [None]:
# Clip every numeric column to its own IQR fences, in the training set and in
# the prediction set (each frame uses its own quantiles).
for numeric_col in num_cols:
    for frame in (df_train, df_test):
        outlier_replace(frame, numeric_col)

In [None]:
def _fill_numeric_with_median(column):
    """Fill missing values with the column median; object columns pass through."""
    if column.dtype == "O":
        return column
    return column.fillna(column.median())


# Each frame is imputed with its own medians.
df_train = df_train.apply(_fill_numeric_with_median, axis=0)
df_test = df_test.apply(_fill_numeric_with_median, axis=0)

아시다시피 머신러닝 진행시 숫자로 전부 이루어지도록 해야하므로, One-hot encoding 같은 방식의 변환이 필요합니다.<br> 
여러 기법이 있지만, 이러한 것을 사용시 데이터 유형에 따라 튜닝해줘야하는 문제가 보여서 저는 단순히 get-dummy를 사용해 다 풀어준 후 열의 길이를 맞추는 방법을 사용했습니다.<br>
<br>
As you know, machine-learning models need every input to be numeric, so categorical columns require a conversion such as one-hot encoding.<br>
Several encoding techniques exist, but most of them need tuning for the type of data, so I simply expand every categorical column with `get_dummies` and then align the columns of the train and test sets.<br>

In [None]:
# One-hot encode the remaining non-numeric columns of both frames.
df_train, df_test = (pd.get_dummies(frame) for frame in (df_train, df_test))

In [None]:
# Collect the columns that exist only in the training frame (for example
# dummies of categories never seen in the prediction set) so that they can be
# dropped. The label column is always kept.
test_col = list(df_test.columns)
test_col_lookup = set(test_col)  # O(1) membership on wide dummy frames

# Filtering on `!= target` instead of calling list.remove(target) avoids a
# ValueError when the prediction set happens to carry the label column too.
delete_train = [
    col
    for col in df_train.columns
    if col not in test_col_lookup and col != target
]

In [None]:
# Columns of the prediction frame that are absent from the training frame.
train_col = list(df_train.columns)
delete_test = [col for col in df_test.columns if col not in train_col]

In [None]:
# Drop the columns that are not shared by both frames so that their feature
# sets match.
df_train = df_train.drop(columns=delete_train)
df_test = df_test.drop(columns=delete_test)

# Modeling

모델링은 XG-Boost와 LightGBM으로 진행하였고, Soft Voting방법으로 앙상블 모델을 만들었습니다.<br>
두 모델간의 상관관계가 높을수록 좋은 결과가 나오는 것을 확인하였습니다.<br>


Modeling was done with XGBoost and LightGBM, and an ensemble model was built with the soft-voting method.<br>
I observed that the higher the correlation between the two models' predictions, the better the results.<br>

In [None]:
# Hold out 10 % of the training data as a validation split (fixed seed).
random_state_val = 777
test_size_val = 0.1

df_train, df_val = train_test_split(
    df_train,
    test_size=test_size_val,
    random_state=random_state_val,
)

# The label and the row identifier are not model features.
drop_col = [target, index_col]
y_nm = target

# Feature frames and single-column label frames for both splits.
df_train_x, df_val_x = (frame.drop(columns=drop_col) for frame in (df_train, df_val))
df_train_y, df_val_y = (frame[y_nm].to_frame() for frame in (df_train, df_val))

In [None]:
# Hyper-parameters of the LightGBM binary classifier. `n_estimators` is set
# high on purpose: the fit call relies on early stopping.
# NOTE(review): `silent` and `nthread` look like legacy LightGBM parameter
# names - confirm the installed version still accepts them.
lgbm_params = {
    'objective': 'binary',
    'nthread': 4,
    'n_estimators': 10000,
    'learning_rate': 0.02,
    'num_leaves': 34,
    'colsample_bytree': 0.9497036,
    'subsample': 0.8715623,
    'max_depth': 8,
    'reg_alpha': 0.041545473,
    'reg_lambda': 0.0735294,
    'min_split_gain': 0.0222415,
    'min_child_weight': 39.3259775,
    'silent': -1,
    'random_state': 42,
}
LGBClassifier = lgb.LGBMClassifier(**lgbm_params)

In [None]:
# Fit the LightGBM classifier on the training split and time the run.
# AUC is evaluated on both the training and the validation split, and
# early_stopping_rounds=200 asks LightGBM to stop once the evaluation metric
# has not improved for 200 rounds.
# NOTE(review): recent LightGBM releases (4.x) appear to have removed the
# `early_stopping_rounds` / `verbose` arguments of fit() in favour of
# callbacks - confirm against the installed version before upgrading.
start = datetime.datetime.now()
lgbm = LGBClassifier.fit(df_train_x.values,
                       df_train_y.values.ravel(),
                       eval_set = [(df_train_x.values, df_train_y), (df_val_x.values, df_val_y)], 
                       eval_metric ='auc',
                       early_stopping_rounds = 200,
                       verbose = True)
end = datetime.datetime.now()
# Last expression of the cell: the notebook displays the elapsed training time.
end-start

In [None]:
# Rank the features by LightGBM importance (highest first) and plot the top 10.
importance_pairs = sorted(
    zip(lgbm.feature_importances_, df_train_x.columns),
    reverse=True,
)
feature_imp = pd.DataFrame(importance_pairs, columns=['Value', 'Feature'])
# feature_imp.to_excel("feature_imp.xlsx")

top_features = feature_imp[:10].sort_values(by='Value', ascending=False)

plt.figure(figsize=(7, 5))
sns.barplot(x='Value', y='Feature', data=top_features)
plt.tight_layout()
plt.show()
# plt.savefig('lightGBM_ Importances.png')

In [None]:
# Sweep the decision thresholds 0.01 .. 0.60 for the LightGBM model on the
# validation split and keep the one that maximises the metric named by
# `choice`.
val_y_true = df_val_y.values.ravel()
val_y_prob = lgbm.predict_proba(df_val_x.values)[:, 1]

# ROC AUC does not depend on the threshold, so it is computed once.
fpr, tpr, _ = roc_curve(val_y_true, val_y_prob)
roc_auc = auc(fpr, tpr)

supported_choices = ('Accuracy', 'Precision', 'Recall', 'F1_Score', 'roc_auc', 'Kappa')
if choice not in supported_choices:
    # An unrecognised metric would otherwise leave the threshold at 0, which
    # silently labels every row as positive.
    raise ValueError('choice must be one of %s, got %r' % (', '.join(supported_choices), choice))

result_lst = []
# Start below any attainable score: Cohen's kappa can be negative.
max_value = float('-inf')
opt_threshold = 0.

for n in range(0, 60):
    threshold = round((n + 1) * 0.01, 2)
    pred_yn = np.where(val_y_prob > threshold, 1., 0.)

    precision, recall, f1_score, support = precision_recall_fscore_support(val_y_true, pred_yn, average='binary')
    accuracy = accuracy_score(val_y_true, pred_yn)
    kappa = cohen_kappa_score(val_y_true, pred_yn)

    scores = {'Accuracy': accuracy, 'Precision': precision, 'Recall': recall,
              'F1_Score': f1_score, 'roc_auc': roc_auc, 'Kappa': kappa}
    result_dict = {'Threshold': threshold}
    result_dict.update((name, round(value, 4)) for name, value in scores.items())
    result_lst.append(result_dict)

    # `<=` (not `<`) keeps the highest threshold among tied scores.
    if max_value <= scores[choice]:
        max_value = scores[choice]
        opt_threshold = threshold

# Per-threshold score table.
# NOTE: the XGBoost and ensemble cells write to this same file name, so the
# table saved here is overwritten when they run.
matric_df = pd.DataFrame(result_lst, columns=['Threshold','Accuracy', 'Precision', 'Recall', 'F1_Score','roc_auc' ,'Kappa'])
matric_df.to_csv('REC_scores.csv',sep=',', header=True, index=False, encoding='UTF-8')

print('Max_value =%f, optimized_threshold=%f'%(max_value, opt_threshold))
print('Complete')

In [None]:
# Label the validation rows with the LightGBM model at the chosen threshold.
predict_lgbm = lgbm.predict_proba(df_val_x.values)[:, 1]
pred_val = np.where(predict_lgbm > opt_threshold, 1., 0.)

# With labels=[1, 0] the matrix rows are (true 1, true 0) and the columns
# (predicted 1, predicted 0), i.e. [[tp, fn], [fp, tn]].
(tp, fn), (fp, tn) = confusion_matrix(
    df_val_y.values.ravel(),
    pred_val,
    labels=[1, 0],
)

In [None]:
# Validation report for the LightGBM model: confusion matrix, scikit-learn
# classification report and a summary of rates derived from tp/fn/fp/tn.
val_true = df_val_y.values.ravel()

conf_matrix = pd.DataFrame(
    confusion_matrix(val_true, pred_val),
    index=['True Value 0', 'True Value 1'],
    columns=['Predicted Value 0', 'Predicted Value 1'],
)

print("1. Counfusion Matrix")
# Transposed for display: rows are predictions, columns are true values.
print(conf_matrix.T)
print("")

print("2. Classification Report")
print(classification_report(val_true, pred_val))

total = tp + tn + fp + fn
Accuracy_Rate = (tp + tn) / total
Recall_Rate = tp / (tp + fn)
Precision_Rate = tp / (tp + fp)
Specificity_Rate = tn / (tn + fp)
F1_Score = (Precision_Rate * Recall_Rate) / (Precision_Rate + Recall_Rate) * 2

print("3. Model Metric Sumamry")
summary_lines = [
    (" - Accuracy Rate    : {:2.3f} %", Accuracy_Rate),
    (" - Recall Rate      : {:2.3f} %", Recall_Rate),
    (" - Precision Rate   : {:2.3f} %", Precision_Rate),
    (" - Specificity Rate : {:2.3f} %", Specificity_Rate),
    (" - F1 Score         : {:2.3f} ", F1_Score),
    (" - ROC AUC          : {:2.3f} ", roc_auc),
]
for template, rate in summary_lines:
    print(template.format(rate * 100))

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# ROC curve of the LightGBM validation probabilities.
fpr, tpr, _ = roc_curve(df_val_y.values.ravel(), predict_lgbm)
roc_auc = auc(fpr, tpr)
curve_label = 'ROC curve (area = %0.3f)' % roc_auc

plt.figure()
plt.plot(fpr, tpr, color='darkorange', label=curve_label)
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Hyper-parameters of the XGBoost binary classifier. `n_estimators` is set
# high on purpose: the fit call relies on early stopping.
#
# The module is imported locally under its full name because the fit cell
# rebinds the `xgb` alias to the fitted model (`xgb = XGBClassifier.fit(...)`);
# going through the alias would make this cell fail when it is run again.
# NOTE(review): tree_method='gpu_hist' presumably requires a GPU runtime and
# may be deprecated in newer XGBoost releases - confirm for the target
# environment.
import xgboost

XGBClassifier = xgboost.XGBClassifier(max_depth = 6,
                                      learning_rate = 0.01,
                                      n_estimators = 10000,
                                      objective = 'binary:logistic',
                                      tree_method = 'gpu_hist',
                                      booster = 'gbtree',
                                      seed = 23,
                                      min_child_weight = 35,
                                      subsample = 0.7,
                                      alpha = 0.25,
                                      n_jobs = -1
                                      )

In [None]:
# Fit the XGBoost classifier on the training split and time the run.
# AUC is evaluated on both the training and the validation split, and
# early_stopping_rounds=200 asks XGBoost to stop once the evaluation metric
# has not improved for 200 rounds.
# NOTE: from here on the name `xgb` refers to the fitted model and no longer
# to the xgboost module imported at the top of the notebook.
# NOTE(review): newer XGBoost releases appear to expect `eval_metric` and
# `early_stopping_rounds` on the constructor rather than on fit() - confirm
# against the installed version before upgrading.
start = datetime.datetime.now()
xgb = XGBClassifier.fit(df_train_x.values,
                       df_train_y.values.ravel(),
                       eval_set = [(df_train_x.values, df_train_y), (df_val_x.values, df_val_y)], 
                       eval_metric = 'auc',
                       early_stopping_rounds = 200,
                       verbose = True)
end = datetime.datetime.now()
# Last expression of the cell: the notebook displays the elapsed training time.
end-start

In [None]:
# Split-count ("weight") importances from the fitted XGBoost booster. The
# model was trained on a bare array, so the booster's keys are looked up as
# 'f0', 'f1', ... by column position; features without a score get 0.
fi_vals = xgb.get_booster().get_score(importance_type='weight')
fi_dict = {
    column: float(fi_vals.get('f%d' % position, 0.))
    for position, column in enumerate(df_train_x.columns)
}
feature_importance_ = sorted(fi_dict.items(), key=operator.itemgetter(1), reverse=True)
feature_importance_result = OrderedDict(feature_importance_)

importance = pd.DataFrame(feature_importance_, columns=['feature', 'weight'])
importance.head(10)

In [None]:
# Horizontal bar chart of the ten highest-weighted XGBoost features.
importance_ten = importance.iloc[:10]
(
    importance_ten
    .set_index('feature')
    .sort_values(by='weight')
    .plot.barh(figsize=(5, 5))
)

In [None]:
# Sweep the decision thresholds 0.01 .. 0.60 for the XGBoost model on the
# validation split and keep the one that maximises the metric named by
# `choice`.
val_y_true = df_val_y.values.ravel()
val_y_prob = xgb.predict_proba(df_val_x.values)[:, 1]

# ROC AUC does not depend on the threshold, so it is computed once.
fpr, tpr, _ = roc_curve(val_y_true, val_y_prob)
roc_auc = auc(fpr, tpr)

supported_choices = ('Accuracy', 'Precision', 'Recall', 'F1_Score', 'roc_auc', 'Kappa')
if choice not in supported_choices:
    # An unrecognised metric would otherwise leave the threshold at 0, which
    # silently labels every row as positive.
    raise ValueError('choice must be one of %s, got %r' % (', '.join(supported_choices), choice))

result_lst = []
# Start below any attainable score: Cohen's kappa can be negative.
max_value = float('-inf')
opt_threshold_2 = 0.

for n in range(0, 60):
    threshold = round((n + 1) * 0.01, 2)
    pred_yn = np.where(val_y_prob > threshold, 1., 0.)

    precision, recall, f1_score, support = precision_recall_fscore_support(val_y_true, pred_yn, average='binary')
    accuracy = accuracy_score(val_y_true, pred_yn)
    kappa = cohen_kappa_score(val_y_true, pred_yn)

    scores = {'Accuracy': accuracy, 'Precision': precision, 'Recall': recall,
              'F1_Score': f1_score, 'roc_auc': roc_auc, 'Kappa': kappa}
    result_dict = {'Threshold': threshold}
    result_dict.update((name, round(value, 4)) for name, value in scores.items())
    result_lst.append(result_dict)

    # `<=` (not `<`) keeps the highest threshold among tied scores.
    if max_value <= scores[choice]:
        max_value = scores[choice]
        opt_threshold_2 = threshold

# Per-threshold score table.
# NOTE: the LightGBM and ensemble cells write to this same file name, so only
# the table of the cell that ran last survives on disk.
matric_df = pd.DataFrame(result_lst, columns=['Threshold','Accuracy', 'Precision', 'Recall', 'F1_Score','roc_auc' ,'Kappa'])
matric_df.to_csv('REC_scores.csv',sep=',', header=True, index=False, encoding='UTF-8')

print('Max_value =%f, optimized_threshold=%f'%(max_value, opt_threshold_2))
print('Complete')

In [None]:
# Validation report for the XGBoost model at its chosen threshold.
val_true = df_val_y.values.ravel()
predict_xgb = xgb.predict_proba(df_val_x.values)[:, 1]
pred_val = np.where(predict_xgb > opt_threshold_2, 1., 0.)

# With labels=[1, 0] the matrix is laid out as [[tp, fn], [fp, tn]].
(tp, fn), (fp, tn) = confusion_matrix(val_true, pred_val, labels=[1, 0])

conf_matrix = pd.DataFrame(
    confusion_matrix(val_true, pred_val),
    index=['True Value 0', 'True Value 1'],
    columns=['Predicted Value 0', 'Predicted Value 1'],
)

print("1. Counfusion Matrix")
# Transposed for display: rows are predictions, columns are true values.
print(conf_matrix.T)
print("")

print("2. Classification Report")
print(classification_report(val_true, pred_val))

total = tp + tn + fp + fn
Accuracy_Rate = (tp + tn) / total
Recall_Rate = tp / (tp + fn)
Precision_Rate = tp / (tp + fp)
Specificity_Rate = tn / (tn + fp)
F1_Score = (Precision_Rate * Recall_Rate) / (Precision_Rate + Recall_Rate) * 2

print("3. Model Metric Sumamry")
summary_lines = [
    (" - Accuracy Rate    : {:2.3f} %", Accuracy_Rate),
    (" - Recall Rate      : {:2.3f} %", Recall_Rate),
    (" - Precision Rate   : {:2.3f} %", Precision_Rate),
    (" - Specificity Rate : {:2.3f} %", Specificity_Rate),
    (" - F1 Score         : {:2.3f} ", F1_Score),
    (" - ROC AUC          : {:2.3f} ", roc_auc),
]
for template, rate in summary_lines:
    print(template.format(rate * 100))

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# ROC curve of the XGBoost validation probabilities.
fpr, tpr, _ = roc_curve(df_val_y.values.ravel(), predict_xgb)
roc_auc = auc(fpr, tpr)
curve_label = 'ROC curve (area = %0.3f)' % roc_auc

plt.figure()
plt.plot(fpr, tpr, color='darkorange', label=curve_label)
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Score the prediction set with both base models and look at how strongly
# their predicted labels correlate.
#
# The models were trained on positional arrays (`df_train_x.values`), so the
# prediction features are selected by name in exactly the training column
# order. This also leaves out `index_col`, raises a KeyError if a training
# feature is missing, and keeps the cell safe to re-run (a plain
# `drop(index_col)` fails the second time).
df_test = df_test[list(df_train_x.columns)]

test_xgb = pd.Series(xgb.predict(df_test.values), name="XGB")
test_lgbm = pd.Series(lgbm.predict(df_test.values), name="LGBM")

# Concatenate all classifier results
ensemble_results = pd.concat([test_xgb, test_lgbm], axis=1)

g = sns.heatmap(ensemble_results.corr(), annot=True)

In [None]:
# Soft-voting ensemble of the XGBoost and LightGBM models.
# NOTE(review): VotingClassifier.fit refits clones of both estimators on the
# training split. The eval_set / early-stopping arguments used in the
# individual fit cells are not passed here, so presumably each clone trains
# for the full `n_estimators` rounds - confirm this is intended.
votingC = VotingClassifier(estimators=[('xgb', xgb), ('lgbm', lgbm)], voting='soft')
# Labels are flattened to 1-D like in the individual fit calls; an (n, 1)
# column vector makes scikit-learn emit a DataConversionWarning.
votingC = votingC.fit(df_train_x.values, df_train_y.values.ravel())

In [None]:
# Sweep the decision thresholds 0.01 .. 0.60 for the soft-voting ensemble on
# the validation split and keep the one that maximises the metric named by
# `choice`.
val_y_true = df_val_y.values.ravel()
val_y_prob = votingC.predict_proba(df_val_x.values)[:, 1]

# ROC AUC does not depend on the threshold, so it is computed once.
fpr, tpr, _ = roc_curve(val_y_true, val_y_prob)
roc_auc = auc(fpr, tpr)

supported_choices = ('Accuracy', 'Precision', 'Recall', 'F1_Score', 'roc_auc', 'Kappa')
if choice not in supported_choices:
    # An unrecognised metric would otherwise leave the threshold at 0, which
    # silently labels every row as positive.
    raise ValueError('choice must be one of %s, got %r' % (', '.join(supported_choices), choice))

result_lst = []
# Start below any attainable score: Cohen's kappa can be negative.
max_value = float('-inf')
opt_threshold_3 = 0.

for n in range(0, 60):
    threshold = round((n + 1) * 0.01, 2)
    pred_yn = np.where(val_y_prob > threshold, 1., 0.)

    precision, recall, f1_score, support = precision_recall_fscore_support(val_y_true, pred_yn, average='binary')
    accuracy = accuracy_score(val_y_true, pred_yn)
    kappa = cohen_kappa_score(val_y_true, pred_yn)

    scores = {'Accuracy': accuracy, 'Precision': precision, 'Recall': recall,
              'F1_Score': f1_score, 'roc_auc': roc_auc, 'Kappa': kappa}
    result_dict = {'Threshold': threshold}
    result_dict.update((name, round(value, 4)) for name, value in scores.items())
    result_lst.append(result_dict)

    # `<=` (not `<`) keeps the highest threshold among tied scores.
    if max_value <= scores[choice]:
        max_value = scores[choice]
        opt_threshold_3 = threshold

# Per-threshold score table.
# NOTE: the LightGBM and XGBoost cells write to this same file name, so their
# tables are overwritten by this one.
matric_df = pd.DataFrame(result_lst, columns=['Threshold','Accuracy', 'Precision', 'Recall', 'F1_Score','roc_auc' ,'Kappa'])
matric_df.to_csv('REC_scores.csv',sep=',', header=True, index=False, encoding='UTF-8')

print('Max_value =%f, optimized_threshold=%f'%(max_value, opt_threshold_3))
print('Complete')

In [None]:
# Validation report for the soft-voting ensemble at its chosen threshold.
val_true = df_val_y.values.ravel()
predict_votingC = votingC.predict_proba(df_val_x.values)[:, 1]
pred_val = np.where(predict_votingC > opt_threshold_3, 1., 0.)

# With labels=[1, 0] the matrix is laid out as [[tp, fn], [fp, tn]].
(tp, fn), (fp, tn) = confusion_matrix(val_true, pred_val, labels=[1, 0])

conf_matrix = pd.DataFrame(
    confusion_matrix(val_true, pred_val),
    index=['True Value 0', 'True Value 1'],
    columns=['Predicted Value 0', 'Predicted Value 1'],
)

print("1. Counfusion Matrix")
# Transposed for display: rows are predictions, columns are true values.
print(conf_matrix.T)
print("")

print("2. Classification Report")
print(classification_report(val_true, pred_val))

total = tp + tn + fp + fn
Accuracy_Rate = (tp + tn) / total
Recall_Rate = tp / (tp + fn)
Precision_Rate = tp / (tp + fp)
Specificity_Rate = tn / (tn + fp)
F1_Score = (Precision_Rate * Recall_Rate) / (Precision_Rate + Recall_Rate) * 2

print("3. Model Metric Sumamry")
summary_lines = [
    (" - Accuracy Rate    : {:2.3f} %", Accuracy_Rate),
    (" - Recall Rate      : {:2.3f} %", Recall_Rate),
    (" - Precision Rate   : {:2.3f} %", Precision_Rate),
    (" - Specificity Rate : {:2.3f} %", Specificity_Rate),
    (" - F1 Score         : {:2.3f} ", F1_Score),
    (" - ROC AUC          : {:2.3f} ", roc_auc),
]
for template, rate in summary_lines:
    print(template.format(rate * 100))

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# ROC curve of the ensemble's validation probabilities.
fpr, tpr, _ = roc_curve(df_val_y.values.ravel(), predict_votingC)
roc_auc = auc(fpr, tpr)
curve_label = 'ROC curve (area = %0.3f)' % roc_auc

plt.figure()
plt.plot(fpr, tpr, color='darkorange', label=curve_label)
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Label the prediction set with the ensemble at its validation-chosen
# threshold and write the submission file.
predict_votingC = votingC.predict_proba(df_test.values)[:, 1]
pred_test = np.where(predict_votingC > opt_threshold_3, 1., 0.)

# Re-read the raw prediction file to recover the identifier column; this
# rebinds `df_test` to the unprocessed data.
df_test = pd.read_csv(os.path.join(dir_ad, test))
Id_No = df_test[index_col]

test_result = pd.DataFrame({target: pred_test})
predict = test_result[target]

# The 1.0 / 0.0 labels are stored as nullable integers in the output.
submission = pd.DataFrame({index_col: Id_No, target: predict}).astype({target: 'Int64'})
submission.to_csv('submission.csv', index=False)
submission.head()