In [None]:
import xgboost as xgb
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [None]:
from xgboost import plot_importance
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

dataset= load_breast_cancer()
X_features= dataset.data
y_label= dataset.target

cancer_df= pd.DataFrame(data=X_features, columns=dataset.feature_names)
cancer_df['target']= y_label
cancer_df.head()

In [None]:
cancer_df.target.value_counts()

In [None]:
cancer_df.info()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_features, y_label,
                                                   test_size=0.2, random_state= 156)
print(X_train.shape,X_test.shape)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, \
precision_score, recall_score, f1_score, roc_auc_score

def get_clf_eval(y_test, pred, pred_proba):
    confusion= confusion_matrix(y_test,pred)
    accuracy= accuracy_score(y_test,pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    roc_auc = roc_auc_score(y_test, pred_proba)
    
    print('오차 행렬 >> \n',confusion)
    print('정확도: {:.4f}, 정밀도: {:.4f}, 재현율: {:.4f}, F1: {:.4f}, \
    AUC:{:.4f}'.format(accuracy, precision, recall, f1, roc_auc))

In [None]:

# 사이킷런 Wrapper XGBooster 적용
from xgboost import XGBClassifier
evals = [(X_test, y_test)]
xgb_wrapper = XGBClassifier( n_estimators=400, learning_rate=0.1, max_depth=3) # 객체생성
xgb_wrapper.fit(X_train, y_train, early_stopping_rounds=100,
                eval_set=evals, eval_metric='logloss',verbose=True) # verbose: 진행상황 보이기
ws100_preds = xgb_wrapper.predict(X_test)
ws100_preds_proba = xgb_wrapper.predict_proba(X_test)[:,1]

In [None]:
get_clf_eval(y_test,ws100_preds,ws100_preds_proba)

In [None]:
# 사이킷런 Wrapper XGBooster 적용
from xgboost import XGBClassifier
evals = [(X_test, y_test)]
xgb_wrapper = XGBClassifier(n_estimators=400, learning_rate=0.1,max_depth=3)
xgb_wrapper.fit(X_train, y_train, early_stopping_rounds=400,
                eval_set=evals, eval_metric='logloss',verbose=True) # verbose: 진행상황 보이기
ws400_preds = xgb_wrapper.predict(X_test)
ws400_preds_proba = xgb_wrapper.predict_proba(X_test)[:,1]

In [None]:
get_clf_eval(y_test,ws400_preds,ws400_preds_proba)

In [None]:
from xgboost import plot_importance
import matplotlib.pyplot as plt

fig,ax = plt.subplots(figsize=(10,12))
plot_importance(xgb_wrapper,ax=ax)

In [None]:
import lightgbm 
print(lightgbm.__version__)

In [None]:
from lightgbm import LGBMClassifier
dataset = load_breast_cancer()
ftr= dataset.data
target = dataset.target

X_train, X_test, y_train, y_test = train_test_split(ftr, target,
                                                   test_size=0.2, 
                                                    random_state= 156)

lgbm_wrapper= LGBMClassifier(n_estimators=400)
evals= [(X_test,y_test)]
lgbm_wrapper.fit(X_train, y_train, early_stopping_rounds=100,
                eval_metric= 'logloss',eval_set=evals, verbose=True)
preds = lgbm_wrapper.predict(X_test)
preds_proba= lgbm_wrapper.predict_proba(X_test)[:,1]

In [None]:
get_clf_eval(y_test,preds,preds_proba)

In [None]:
from lightgbm import plot_importance
import matplotlib.pyplot as plt
fig,ax = plt.subplots(figsize=(10,12))
plot_importance(lgbm_wrapper)

In [None]:
[과제]
캐글 산탄데르 고객 만족 예측 
Q. 산탄테르 은행의 고객만족 예측 분석을 수행하세요.
- 370개의 피처로 주어진 데이터 세트
- 클래스 레이블명은 target 1이 불만, 0은 만족 
- 모델의 성능 평가는 ROC_AUC
- 


In [None]:
from xgboost import plot_importance
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

train = pd.read_csv('dataset/santander/train.csv')
test= pd.read_csv('dataset/santander/test.csv')

In [None]:
print(train.shape)
print(test.shape)

In [None]:
train.head(3)

In [None]:
def check_missing_data(df):
    flag=df.isna().sum().any()
    if flag==True:
        total = df.isnull().sum()
        percent = (df.isnull().sum())/(df.isnull().count()*100)
        output = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
        data_type = []
        # written by MJ Bahmani
        for col in df.columns:
            dtype = str(df[col].dtype)
            data_type.append(dtype)
        output['Types'] = data_type
        return(np.transpose(output))
    else:
        return(False)
check_missing_data(train)
check_missing_data(test)

In [None]:
X = train.drop(["TARGET","ID"],axis=1)
y = train["TARGET"]

In [None]:
X_test =test.drop('ID',axis=1)

In [None]:
from sklearn.ensemble import RandomForestClassifier 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=156)


In [None]:
from sklearn.metrics import confusion_matrix, roc_auc_score

def get_clf_eval(y_test, pred, pred_proba):
    roc_auc = roc_auc_score(y_test, pred_proba)
    
    print('오차 행렬 >> \n',confusion)
    print(' AUC:{:.4f}'.format(roc_auc))

In [None]:
# 사이킷런 Wrapper XGBooster 적용
from xgboost import XGBClassifier

evals = [(X_test, y_test)]

xgb_wrapper = XGBClassifier(n_estimators=400, learning_rate=0.1,max_depth=3)
xgb_wrapper.fit(X_train, y_train, early_stopping_rounds=100,
                eval_set=evals, eval_metric='logloss',verbose=True) # verbose: 진행상황 보이기
ws100_preds = xgb_wrapper.predict(X_test)
ws100_preds_proba = xgb_wrapper.predict_proba(X_test)[:,1]

In [1]:
import pandas as pd
import numpy as np

In [2]:
cust_df = pd.read_csv('dataset/santander/train.csv')
print(cust_df.shape)
cust_df.head()

(76020, 371)


Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0


In [3]:
cust_df.TARGET.value_counts()

0    73012
1     3008
Name: TARGET, dtype: int64

In [4]:
# 불만족 count
uns_cnt= cust_df[cust_df['TARGET']==1].TARGET.count()
# 전체 count
total_cnt= cust_df.TARGET.count()

# 불만족 비중
print('불만족 비중>>{:.2f}'.format(uns_cnt/total_cnt))

불만족 비중>>0.04


In [5]:
cust_df.describe()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
count,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,...,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0
mean,75964.050723,-1523.199277,33.212865,86.208265,72.363067,119.529632,3.55913,6.472698,0.412946,0.567352,...,7.935824,1.365146,12.21558,8.784074,31.505324,1.858575,76.026165,56.614351,117235.8,0.039569
std,43781.947379,39033.462364,12.956486,1614.757313,339.315831,546.266294,93.155749,153.737066,30.604864,36.513513,...,455.887218,113.959637,783.207399,538.439211,2013.125393,147.786584,4040.337842,2852.579397,182664.6,0.194945
min,1.0,-999999.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5163.75,0.0
25%,38104.75,2.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67870.61,0.0
50%,76043.0,2.0,28.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,106409.2,0.0
75%,113748.75,2.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,118756.3,0.0
max,151838.0,238.0,105.0,210000.0,12888.03,21024.81,8237.82,11073.57,6600.0,6600.0,...,50003.88,20385.72,138831.63,91778.73,438329.22,24650.01,681462.9,397884.3,22034740.0,1.0


In [6]:
cust_df.var3.value_counts()[:10]

 2         74165
 8           138
-999999      116
 9           110
 3           108
 1           105
 13           98
 7            97
 4            86
 12           85
Name: var3, dtype: int64

In [7]:
# -999999 이상함 --> 제일 많은 값 2로 변경 
cust_df['var3'].replace(-999999,2,inplace=True)
# ID컬럼 필요없으니 drop하기
cust_df.drop('ID',axis=1,inplace=True)

(76020, 369)

In [26]:
# X_features : TARGET 뺴고 다 / y_labels : TARGET 
X_features = cust_df.iloc[:,:-1]
y_labels = cust_df.iloc[:,-1]

X_features.shape
y_labels.shape

(76020,)

In [33]:
from sklearn.model_selection import train_test_split
# train/test 분리 
# stratify= y_label 불만족 비중  train/ test에 공평하게 나눠주기 
X_train,X_test,y_train,y_test = train_test_split(X_features, y_labels,
                                                test_size=0.2,
                                                random_state=0,
                                                stratify= y_labels)
train_cnt= y_train.count()
test_cnt= y_test.count()

# train/ test에 target 분포 비율 
print('train 분포 비율 :\n', y_train.value_counts()/train_cnt)
print('test 분포 비율 :\n', y_test.value_counts()/test_cnt)

train 분포 비율 :
 0    0.960438
1    0.039562
Name: TARGET, dtype: float64
test 분포 비율 :
 0    0.960405
1    0.039595
Name: TARGET, dtype: float64


In [43]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

# xgb에 사용할 tree 수: 500 
xgb_clf = XGBClassifier(n_estimators=500, random_state=156)
# xgb학습   ( eval_metric: 검증에 사용되는 함수 )
xgb_clf.fit(X_train, y_train, early_stopping_rounds=100,
           eval_metric='auc', eval_set=[(X_train,y_train),(X_test,y_test)])

xgb_roc_score= roc_auc_score( y_test, xgb_clf.predict_proba(X_test)[:,1])
print(xgb_roc_score)



[0]	validation_0-auc:0.82569	validation_1-auc:0.79283
[1]	validation_0-auc:0.84010	validation_1-auc:0.80737
[2]	validation_0-auc:0.84361	validation_1-auc:0.81021
[3]	validation_0-auc:0.84783	validation_1-auc:0.81287
[4]	validation_0-auc:0.85123	validation_1-auc:0.81469
[5]	validation_0-auc:0.85518	validation_1-auc:0.81860
[6]	validation_0-auc:0.85922	validation_1-auc:0.81977
[7]	validation_0-auc:0.86238	validation_1-auc:0.82034
[8]	validation_0-auc:0.86570	validation_1-auc:0.82147
[9]	validation_0-auc:0.86798	validation_1-auc:0.82301
[10]	validation_0-auc:0.87104	validation_1-auc:0.82379
[11]	validation_0-auc:0.87448	validation_1-auc:0.82456
[12]	validation_0-auc:0.87687	validation_1-auc:0.82401
[13]	validation_0-auc:0.87918	validation_1-auc:0.82467
[14]	validation_0-auc:0.88081	validation_1-auc:0.82508
[15]	validation_0-auc:0.88331	validation_1-auc:0.82379
[16]	validation_0-auc:0.88569	validation_1-auc:0.82457
[17]	validation_0-auc:0.88674	validation_1-auc:0.82453
[18]	validation_0-au

In [41]:
from sklearn.model_selection import GridSearchCV

xgb_clf = XGBClassifier(n_estimatos=100)
params= {'max_depth':[5,7], 'min_child_weight':[1,3],
        'colsample_bytree':[0.5,0.75]}

 

array([[0.993096  , 0.00690398],
       [0.97350717, 0.02649283],
       [0.9808965 , 0.01910355],
       ...,
       [0.98011357, 0.01988643],
       [0.98821384, 0.01178615],
       [0.99388534, 0.00611465]], dtype=float32)