## 1. 환경설정

In [1]:
import numpy as np
import pandas as pd
import sklearn

from sklearn import preprocessing
from sklearn.ensemble  import RandomForestClassifier
from sklearn import metrics 

import xgboost as xgb   # XGBoost 라이브러리 읽기
from xgboost import XGBClassifier


# 디스플레이 환경 설정 (자료 검사용)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)

import warnings
warnings.filterwarnings(action='ignore')



## 2. 함수설정

In [2]:
# 확률 + 상대순위(입력값)을 조사 결정으로 변환하는 함수

def convert(pred_prob,m):
    """Convert class probabilities into 0/1 inspection decisions for the top share `m`.

    Parameters
    ----------
    pred_prob : sequence of rows (e.g. the array returned by ``predict_proba``);
        element ``[1]`` of each row is used as the score of that case.
    m : float
        Share of cases to flag; the rank cut-off is ``int(round(len(pred_prob) * m))``.

    Returns
    -------
    list of int
        One entry per row: 1 when the row's rank is within the cut-off, else 0.

    Notes
    -----
    * Rank 1 is the highest score. Tied scores all share the best rank of the
      tie, so more than ``cut-off`` rows can be flagged when ties straddle it.
    * A score equal to 0 is forced to the last rank, so it is only flagged
      when the cut-off covers every row.
    """
    scores = [row[1] for row in pred_prob]
    total = len(scores)

    # Map each distinct score to the 1-based position of its first occurrence
    # in descending order. A dictionary lookup replaces the former
    # list.index() call per row, which made the ranking quadratic.
    first_rank = {}
    for position, value in enumerate(sorted(scores, reverse=True), start=1):
        first_rank.setdefault(value, position)

    ranks = [total if score == 0 else first_rank[score] for score in scores]

    cutoff = int(round(total * m))
    return [1 if rank <= cutoff else 0 for rank in ranks]

 # 조사 시 검거 확률과 미스 확률을 반환하는 함수

def performance(X, Y):
    """Return the hit rate among flagged cases and the miss rate among cleared cases.

    Parameters
    ----------
    X : sequence of 0/1 decisions (1 = flagged for inspection), indexed like `Y`.
    Y : sequence of 0/1 true labels (1 = positive case).

    Returns
    -------
    list
        ``[prob_detect, prob_miss]`` where ``prob_detect`` is the share of
        flagged cases (X == 1) whose label is 1, and ``prob_miss`` is the share
        of cleared cases (X == 0) whose label is 1. A rate whose group is empty
        is returned as NaN instead of raising ZeroDivisionError.

    Pairs in which either value is neither 0 nor 1 are ignored.
    """
    detect=[]
    miss=[]
    for i in range(len(Y)):
        x_i=X[i]
        y_i=Y[i]

        # Choose the group from the decision, then record the true label in it.
        if x_i==1:
            bucket=detect
        elif x_i==0:
            bucket=miss
        else:
            continue

        if y_i==1:
            bucket.append(1)
        elif y_i==0:
            bucket.append(0)

    # An empty group has no defined rate (e.g. nothing was flagged at a very
    # small cut-off), so report NaN rather than dividing by zero.
    prob_detect=detect.count(1)/len(detect) if detect else float('nan')
    prob_miss=miss.count(1)/len(miss) if miss else float('nan')

    return [prob_detect,prob_miss]



# 자료 인코딩용 함수 정의 : 아래 함수에서 dataDF는 데이터프레임, list는 라벨화하려는 범주변수의 목록
def encode_labels(list, dataDF):
    """Replace each listed column of `dataDF` with its integer label codes.

    `list` holds the names of the categorical columns to encode (the name
    shadows the builtin inside this function only). A fresh LabelEncoder is
    fitted per column on that column's own values. `dataDF` is modified in
    place and also returned.
    """
    for column in list:
        dataDF[column] = preprocessing.LabelEncoder().fit_transform(dataDF[column])
    return dataDF

## 3. checkpoint quantile의 설정

In [3]:
# Quantile cut-offs to evaluate: each value m is passed to convert() as the
# share of top-ranked test cases to flag (0.5 = top 50%, 0.25 = top 25%, 0.1 = top 10%).

measurements=[0.5, 0.25, 0.1]

## 4. 하이퍼 파라메터 설정 (각 숫자는 차례대로 부정청약,위장전입,부정매매에 대한 파라메터)

In [18]:
#------------- 각 flag는 어떤 test를 할 것인지 설정.  RF는 random forest, XGB는 XGBoost, Y1,Y2,Y3는 부정청약,위장전입,부정매매 test

flag_RF=1
flag_XGB=1

flag_Y1=1 #부정청약
flag_Y2=1 #위장전입
flag_Y3=1 #부정매매

#------- RF parameters----------

max_depth_Y1=10
max_depth_Y2=7
max_depth_Y3=14

min_samples_leaf_Y1=5
min_samples_leaf_Y2=3
min_samples_leaf_Y3=3

max_leaf_nodes_Y1=None
max_leaf_nodes_Y2=None
max_leaf_nodes_Y3=None

max_features_Y1='sqrt'
max_features_Y2='sqrt'
max_features_Y3='sqrt'

min_samples_split_Y1=4
min_samples_split_Y2=4
min_samples_split_Y3=4

bootstrap_Y1=True
bootstrap_Y2=True
bootstrap_Y3=True

warm_start_Y1=False
warm_start_Y2=False
warm_start_Y3=False

class_weight_Y1= {0:1, 1:5}
class_weight_Y2= {0:1, 1:5}
class_weight_Y3= {0:1, 1:5}

#-------XGBoost parameters-------

reg_alpha_Y1 = 0.75
reg_alpha_Y2 = 0.75
reg_alpha_Y3 = 0.75

reg_lambda_Y1 = 0.5
reg_lambda_Y2 = 0.5
reg_lambda_Y3 = 0.5

gamma_Y1 = 0
gamma_Y2 = 0
gamma_Y3 = 0

booster_Y1 = 'gbtree'
booster_Y2 = 'gbtree'
booster_Y3 = 'gbtree'
#  'gbtree', 'gblinear' 'dart'

max_depth_Y1 = 10
max_depth_Y2 = 10
max_depth_Y3 = 10 

objective_Y1 = 'binary:logistic'
objective_Y2 = 'binary:logistic'
objective_Y3 = 'binary:logistic'
#'binary:logistic', 'binary:logitraw', 'binary:hinge'

learning_rate_Y1=0.75
learning_rate_Y2=0.75
learning_rate_Y3=0.75

min_child_weight_Y1=1
min_child_weight_Y2=1
min_child_weight_Y3=1

colsample_bytree_Y1=1
colsample_bytree_Y2=1
colsample_bytree_Y3=1

scale_pos_weight_Y1=1
scale_pos_weight_Y2=1
scale_pos_weight_Y3=1

subsample_Y1=1
subsample_Y2=1
subsample_Y3=1

## 5. 학습 및 성능체크(printing)

In [19]:
# main


for j in range(1,11):
    

#-------------------------- data load---------------------------------------
    
    X_train_df=pd.read_csv(r'X_train_'+str(j)+'.csv', encoding='CP949', sep=",")
    X_test_df=pd.read_csv(r'X_test_'+str(j)+'.csv', encoding='CP949', sep=",")
    Y1_train=pd.read_csv(r'Y1_train_'+str(j)+'.csv', encoding='CP949', sep=",")
    Y1_test=pd.read_csv(r'Y1_test_'+str(j)+'.csv', encoding='CP949', sep=",")
    Y2_train=pd.read_csv(r'Y2_train_'+str(j)+'.csv', encoding='CP949', sep=",")
    Y2_test=pd.read_csv(r'Y2_test_'+str(j)+'.csv', encoding='CP949', sep=",")
    Y3_train=pd.read_csv(r'Y3_train_'+str(j)+'.csv', encoding='CP949', sep=",")
    Y3_test=pd.read_csv(r'Y3_test_'+str(j)+'.csv', encoding='CP949', sep=",")
    X_train_df=X_train_df.drop('Unnamed: 0', axis=1)
    X_test_df=X_test_df.drop('Unnamed: 0', axis=1)
    Y1_train=Y1_train.drop('Unnamed: 0', axis=1)
    Y2_train=Y2_train.drop('Unnamed: 0', axis=1)
    Y3_train=Y3_train.drop('Unnamed: 0', axis=1)
    Y1_test=Y1_test.drop('Unnamed: 0', axis=1)
    Y2_test=Y2_test.drop('Unnamed: 0', axis=1)
    Y3_test=Y3_test.drop('Unnamed: 0', axis=1)


#---------------------변수 선택 및 전처리 등 파트-----------------------------

    X_list=['크기', 'ad_총공급', 'ad_행정변경시점', 'ad_성명생년전화중복', \
            '배우자', '2년청약건수', '세대원수', '분리세대원수', '폰중복횟수_부동산원', 'IP중복신청횟수_부동산원',\
            'ad_IP중복_3자리', 'ad_IP중복_4자리', 'ad_접수시간', 'ad_신청당첨거주일치여부', 'ad_부양가수족수', 'ad_저축가입기간', 'ad_무주택기간',\
            'ad_청약납부회차', 'ad_청약경과기간', 'ad_총점', 'ad_주소일치여부', 'ad_변경시점2',\
            '공급금액', '연령', '세대주관계', '특일동시여부', ' 주소중복횟수', '가점합계', 'ad_신청유형', '기관추천종류',\
            '특이사항', '접수매체'
            ]  

    X_train=X_train_df[X_list]
    X_test=X_test_df[X_list]
    str_list = ['배우자', '세대주관계', 'ad_신청유형', '기관추천종류', '특이사항', '접수매체']
    X_train[ str_list ]=X_train[ str_list ].astype('str')
    X_test[ str_list ]=X_test[ str_list ].astype('str')

#float_list = ['ad_청약납부금액']
#X_train[ float_list ]=X_train[ float_list ].astype('float')
#X_test[ float_list ]=X_test[ float_list ].astype('float')

    Y1_train.loc[X_train['세대원수'].isna(), '부정청약판정'] = None
    Y2_train.loc[X_train['세대원수'].isna(), '부정_위장전입'] = None
    Y3_train.loc[X_train['세대원수'].isna(), '부정_입주자저축증서매매'] = None
    Y1_test.loc[X_test['세대원수'].isna(), '부정청약판정'] = None
    Y2_test.loc[X_test['세대원수'].isna(), '부정_위장전입'] = None
    Y3_test.loc[X_test['세대원수'].isna(), '부정_입주자저축증서매매'] = None
    X_train=X_train.dropna()
    X_test=X_test.dropna()
    Y1_train=Y1_train.dropna()
    Y1_test=Y1_test.dropna()
    Y2_train=Y2_train.dropna()
    Y2_test=Y2_test.dropna()
    Y3_train=Y3_train.dropna()
    Y3_test=Y3_test.dropna()

    X_train=encode_labels(str_list, X_train)
    X_test=encode_labels(str_list, X_test)

    
#-------------------------------RF: training-----------------------------------
    if flag_RF==1:
        if flag_Y1==1:
            rf1=RandomForestClassifier(n_estimators=100, random_state=11, max_depth=max_depth_Y1, min_samples_leaf=min_samples_leaf_Y1,\
                                       max_features=max_features_Y1, max_leaf_nodes=max_leaf_nodes_Y1, min_samples_split=min_samples_split_Y1,\
                                      bootstrap=bootstrap_Y1, warm_start=warm_start_Y1, class_weight=class_weight_Y1)
            rf1.fit(X_train,Y1_train)
            rf1_pred=rf1.predict_proba(X_test)
        if flag_Y2==1:
            rf2=RandomForestClassifier(n_estimators=100, random_state=11, max_depth=max_depth_Y2, min_samples_leaf=min_samples_leaf_Y2,\
                                       max_features=max_features_Y2, max_leaf_nodes=max_leaf_nodes_Y2, min_samples_split=min_samples_split_Y2,\
                                      bootstrap=bootstrap_Y2, warm_start=warm_start_Y2, class_weight=class_weight_Y1)
            rf2.fit(X_train,Y1_train)
            rf2_pred=rf2.predict_proba(X_test)
        if flag_Y3==1:
            rf3=RandomForestClassifier(n_estimators=100, random_state=11, max_depth=max_depth_Y3, min_samples_leaf=min_samples_leaf_Y3,\
                                       max_features=max_features_Y3, max_leaf_nodes=max_leaf_nodes_Y3, min_samples_split=min_samples_split_Y3,\
                                      bootstrap=bootstrap_Y3, warm_start=warm_start_Y3, class_weight=class_weight_Y1)
            rf3.fit(X_train,Y1_train)
            rf3_pred=rf3.predict_proba(X_test)
            
#-------------------------------XGB: training-----------------------------------
    if flag_XGB==1:
        if flag_Y1==1:
            xgb1=XGBClassifier(reg_alpha=reg_alpha_Y1, reg_lambda=reg_lambda_Y1, gamma=gamma_Y1, booster=booster_Y1, max_depth=max_depth_Y1,\
                              objective=objective_Y1, learning_rate=learning_rate_Y1, min_child_weight=min_child_weight_Y1,\
                              colsample_bytree=colsample_bytree_Y1, scale_pos_weight=scale_pos_weight_Y1, subsample_=subsample_Y1,\
                              verbosity = 0)
            xgb1.fit(X_train,Y1_train)
            xgb1_pred=xgb1.predict_proba(X_test)
        if flag_Y2==1:
            xgb2=XGBClassifier(reg_alpha=reg_alpha_Y2,reg_lambda=reg_lambda_Y2, gamma=gamma_Y2, booster=booster_Y2, max_depth=max_depth_Y2,\
                              objective=objective_Y2, learning_rate=learning_rate_Y2, min_child_weight=min_child_weight_Y2,\
                              colsample_bytree=colsample_bytree_Y2, scale_pos_weight=scale_pos_weight_Y2, subsample_=subsample_Y2,\
                              verbosity = 0)
            xgb2.fit(X_train,Y1_train)
            xgb2_pred=xgb2.predict_proba(X_test)
        if flag_Y3==1:
            xgb3=XGBClassifier(reg_alpha=reg_alpha_Y3,reg_lambda=reg_lambda_Y3, gamma=gamma_Y3, booster=booster_Y3, max_depth=max_depth_Y3,\
                              objective=objective_Y3, learning_rate=learning_rate_Y3, min_child_weight=min_child_weight_Y3,\
                              colsample_bytree=colsample_bytree_Y3, scale_pos_weight=scale_pos_weight_Y3, subsample_=subsample_Y3,\
                              verbosity = 0)
            xgb3.fit(X_train,Y1_train)
            xgb3_pred=xgb3.predict_proba(X_test)
            
            
            
            
        
#--------------------------------------- performance---------------------

    print('  ')
    print('  ')
    print('********************************************',j,'th subsample********************************************')   
    print('  ')
    
    if flag_Y1==1:
        print('-------------------------------------부정청약여부----------------------------------')
        Y=list(Y1_test.iloc[:,0])
        print('Y1 benchmark:', Y.count(1)/len(Y))
        for m in measurements:
            if flag_RF==1:
                pred_iter=convert(rf1_pred,m)
                result=performance(pred_iter,Y)
                print('<RF>  prob-detect for m=',m,'  :    ', result[0], '            prob-miss:    ', result[1])
                
            if flag_XGB==1:
                pred_iter=convert(xgb1_pred,m)
                result=performance(pred_iter,Y)
                print('<XGB>  prob-detect for m=',m,'  :    ', result[0], '            prob-miss:    ', result[1])
                
    
    if flag_Y2==1:
        print('-------------------------------------위장전입여부----------------------------------')
        Y=list(Y2_test.iloc[:,0])
        print('Y2 benchmark:', Y.count(1)/len(Y))
        for m in measurements:
            if flag_RF==1:
                pred_iter=convert(rf2_pred,m)
                result=performance(pred_iter,Y)
                print('<RF>  prob-detect for m=',m,'  :    ', result[0], '            prob-miss:    ', result[1])
                
            if flag_XGB==1:
                pred_iter=convert(xgb2_pred,m)
                result=performance(pred_iter,Y)
                print('<XGB>  prob-detect for m=',m,'  :    ', result[0], '            prob-miss:    ', result[1])
    
    if flag_Y3==1:
        print('-------------------------------------매매여부-------------------------------------')    
        Y=list(Y3_test.iloc[:,0])
        print('Y3 benchmark:', Y.count(1)/len(Y))
        for m in measurements:
            if flag_RF==1:
                pred_iter=convert(rf3_pred,m)
                result=performance(pred_iter,Y)
                print('<RF>  prob-detect for m=',m,'  :    ', result[0], '            prob-miss:    ', result[1])
                
            if flag_XGB==1:
                pred_iter=convert(xgb3_pred,m)
                result=performance(pred_iter,Y)
                print('<XGB>  prob-detect for m=',m,'  :    ', result[0], '            prob-miss:    ', result[1])

    

  
  
******************************************** 1 th subsample********************************************
  
-------------------------------------부정청약여부----------------------------------
Y1 benchmark: 0.03864948911594847
<RF>  prob-detect for m= 0.5   :     0.0630550621669627             prob-miss:     0.014222222222222223
<XGB>  prob-detect for m= 0.5   :     0.0630550621669627             prob-miss:     0.014222222222222223
<RF>  prob-detect for m= 0.25   :     0.10479573712255773             prob-miss:     0.016587677725118485
<XGB>  prob-detect for m= 0.25   :     0.0941385435168739             prob-miss:     0.02014218009478673
<RF>  prob-detect for m= 0.1   :     0.19555555555555557             prob-miss:     0.021224086870681145
<XGB>  prob-detect for m= 0.1   :     0.16             prob-miss:     0.025172754195459033
-------------------------------------위장전입여부----------------------------------
Y2 benchmark: 0.027543314082629944
<RF>  prob-detect for m= 0.5   :     0.0470692

<XGB>  prob-detect for m= 0.25   :     0.07130124777183601             prob-miss:     0.016033254156769598
<RF>  prob-detect for m= 0.1   :     0.14285714285714285             prob-miss:     0.017318159327065808
<XGB>  prob-detect for m= 0.1   :     0.09821428571428571             prob-miss:     0.022266204849084613
-------------------------------------매매여부-------------------------------------
Y3 benchmark: 0.008908685968819599
<RF>  prob-detect for m= 0.5   :     0.012477718360071301             prob-miss:     0.005342831700801425
<XGB>  prob-detect for m= 0.5   :     0.013368983957219251             prob-miss:     0.004452359750667854
<RF>  prob-detect for m= 0.25   :     0.0213903743315508             prob-miss:     0.004750593824228029
<XGB>  prob-detect for m= 0.25   :     0.023172905525846704             prob-miss:     0.004156769596199525
<RF>  prob-detect for m= 0.1   :     0.044642857142857144             prob-miss:     0.004948045522018802
<XGB>  prob-detect for m= 0.1   :   

<XGB>  prob-detect for m= 0.25   :     0.11190053285968028             prob-miss:     0.02014218009478673
<RF>  prob-detect for m= 0.1   :     0.2088888888888889             prob-miss:     0.024679170779861797
<XGB>  prob-detect for m= 0.1   :     0.18666666666666668             prob-miss:     0.027147087857847977
-------------------------------------위장전입여부----------------------------------
Y2 benchmark: 0.027099067081297203
<RF>  prob-detect for m= 0.5   :     0.043516873889875664             prob-miss:     0.010666666666666666
<XGB>  prob-detect for m= 0.5   :     0.04262877442273535             prob-miss:     0.011555555555555555
<RF>  prob-detect for m= 0.25   :     0.07460035523978685             prob-miss:     0.011255924170616114
<XGB>  prob-detect for m= 0.25   :     0.07104795737122557             prob-miss:     0.012440758293838863
<RF>  prob-detect for m= 0.1   :     0.1288888888888889             prob-miss:     0.01579466929911155
<XGB>  prob-detect for m= 0.1   :     0.088

## 튜닝 된 파라메터 안까먹게 저장

In [6]:
# <RF> snapshot 1 of tuned values.
# One row per hyperparameter; the three values are for Y1, Y2, Y3 in that order.

max_depth_Y1, max_depth_Y2, max_depth_Y3 = 10, 7, 14

min_samples_leaf_Y1, min_samples_leaf_Y2, min_samples_leaf_Y3 = 5, 3, 3

max_leaf_nodes_Y1, max_leaf_nodes_Y2, max_leaf_nodes_Y3 = None, None, None

max_features_Y1, max_features_Y2, max_features_Y3 = 10, 10, 6

min_samples_split_Y1, min_samples_split_Y2, min_samples_split_Y3 = 4, 4, 4

bootstrap_Y1, bootstrap_Y2, bootstrap_Y3 = True, True, True

warm_start_Y1, warm_start_Y2, warm_start_Y3 = False, False, False


In [7]:
# <XGB> snapshot 1 of tuned values.
# One row per hyperparameter; the three values are for Y1, Y2, Y3 in that order.

reg_alpha_Y1, reg_alpha_Y2, reg_alpha_Y3 = 0.75, 0.75, 0.75

reg_lambda_Y1, reg_lambda_Y2, reg_lambda_Y3 = 0.5, 0.5, 0.5

gamma_Y1, gamma_Y2, gamma_Y3 = 0, 0, 0

# booster options: 'gbtree', 'gblinear', 'dart'
booster_Y1, booster_Y2, booster_Y3 = 'gbtree', 'gbtree', 'gbtree'

# NOTE: these are the same max_depth_Y* names the <RF> snapshot cell assigns,
# so running both cells leaves the values below in effect.
max_depth_Y1, max_depth_Y2, max_depth_Y3 = 10, 10, 10

# objective options: 'binary:logistic', 'binary:logitraw', 'binary:hinge'
objective_Y1, objective_Y2, objective_Y3 = 'binary:logistic', 'binary:logistic', 'binary:logistic'

learning_rate_Y1, learning_rate_Y2, learning_rate_Y3 = 0.75, 0.75, 0.75

min_child_weight_Y1, min_child_weight_Y2, min_child_weight_Y3 = 1, 1, 1

colsample_bytree_Y1, colsample_bytree_Y2, colsample_bytree_Y3 = 1, 1, 1

scale_pos_weight_Y1, scale_pos_weight_Y2, scale_pos_weight_Y3 = 1, 1, 1

subsample_Y1, subsample_Y2, subsample_Y3 = 1, 1, 1