## 목차
#### 1. 라이브러리 설치 및 불러오기
#### 2. 데이터 전처리
#### 3. 모델링
#### 4. 스태킹
---

## 1. 라이브러리 설치 및 불러오기 (Install and Load the Libraries)

In [1]:
# 시스템 
import os, sys
import timeit

# 기본 라이브러리
import numpy as np
import pandas as pd
import datetime
import math 

# 문자열 처리
import string

# 시각화 라이브러리
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# 구글 드라이브 
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# 모델링 라이브러리
import tensorflow as tf
from tensorflow import keras

from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold , KFold
from sklearn.metrics import *
from sklearn.utils import class_weight

from xgboost.sklearn import XGBClassifier
!pip install -q catboost 
from catboost import CatBoostClassifier 

import pickle, joblib

import warnings
warnings.filterwarnings(action='ignore')

Mounted at /content/drive


## 2. 데이터 전처리 (Data Preprocessing)

### 3종류의 데이터프레임을 만들기
- **[1타입]**:  train1  
- **[2타입]**: train2  
- **[3타입]**: train3  

### **2-1. 1타입 데이터 만들기**
#### 2-1-1. 파생변수 생성
1. D / H / L 코드의 매칭 여부
  - D & L은 세분류, 대분류 기준 / H는 중분류, 대분류 기준 매칭시키기
  - d_l_match_123, d_m_match_123, d_s_match_123, h_l_match_123, h_m_match_123, h_s_match_123
    - ex) d_l_match_123: 1,2,3 중 하나라도 대분류가 match 된다면 True 아니라면 False

#### 2-1-2. 변수형 변환

#### 2-1-3.  id 리스트 저장

#### 2-1-4. 불필요한 컬럼 제거
- EDA 결과 `person_prefer_f` 열과 `person_prefer_g` 열은 모든 샘플에 대해 동일한 값을 가진 상수라는 것을 파악할 수 있었다. 따라서 해당 열은 제거하고 분석을 진행하기로 하였다.
- `contents_open_dt`를 제외할 때 모델의 성능이 더 좋았기 때문에 해당 열을 제외했다.

#### `preprocessing_1` 전처리 함수 정의

In [2]:
def preprocessing_1(df):
  """Build the type-1 frame: join the D/H/L code tables and flag preference matches.

  Relies on the module-level code tables ``D_code``, ``H_code`` and ``L_code``.

  Args:
    df: Raw frame; it must contain ``id``, the ``person_prefer_d_*`` /
      ``person_prefer_h_*`` columns, ``contents_attribute_d`` / ``_h`` / ``_l``
      and every column named in the drop lists below.

  Returns:
    ``(id_list, df)`` where ``id_list`` holds the ids after sorting by ``id``
    and ``df`` is the processed frame indexed by ``id``.
  """

  # 1) Derived features from the D / H / L code tables
  def _d_lookup(key):
    # D table keyed by `key`, keeping only the two hierarchy levels used for matching.
    renamed = D_code.rename(columns={'속성 D 코드': key,
                                     '속성 D 세분류코드': key + '_m',
                                     '속성 D 대분류코드': key + '_l'}, inplace=False)
    return renamed.drop(['속성 D 소분류코드','속성 D 중분류코드'], axis="columns")

  def _h_lookup(key):
    # H table keyed by `key`; nothing is dropped from it.
    return H_code.rename(columns={'속성 H 코드': key,
                                  '속성 H 중분류코드': key + '_m',
                                  '속성 H 대분류코드': key + '_l'}, inplace=False)

  d_keys = ["person_prefer_d_1","person_prefer_d_2","person_prefer_d_3","contents_attribute_d"]
  h_keys = ["person_prefer_h_1","person_prefer_h_2","person_prefer_h_3","contents_attribute_h"]

  l_lookup = L_code.rename(columns={'속성 L 코드': 'contents_attribute_l',
                                    '속성 L 세분류코드': 'contents_attribute_l_m',
                                    '속성 L 대분류코드': 'contents_attribute_l_l'}, inplace=False)
  l_lookup = l_lookup.drop(['속성 L 소분류코드','속성 L 중분류코드'], axis="columns")

  lookups = [(key, _d_lookup(key)) for key in d_keys]
  lookups += [(key, _h_lookup(key)) for key in h_keys]
  lookups.append(("contents_attribute_l", l_lookup))

  # pd.merge defaults to an inner join, so rows whose code is missing from a
  # table do not survive this loop.
  for key, table in lookups:
    df = pd.merge(df, table, on=key)

  # (flag column, attribute letter, level suffix); '' compares the raw codes.
  match_specs = [
      ('d_m_match_123', 'd', '_m'),
      ('d_l_match_123', 'd', '_l'),
      ('d_s_match_123', 'd', ''),
      ('h_l_match_123', 'h', '_l'),
      ('h_m_match_123', 'h', '_m'),
      ('h_s_match_123', 'h', ''),
  ]
  for flag, attr, level in match_specs:
    contents_side = df['contents_attribute_{}{}'.format(attr, level)]
    hit = ((df['person_prefer_{}_1{}'.format(attr, level)] == contents_side)
           | (df['person_prefer_{}_2{}'.format(attr, level)] == contents_side)
           | (df['person_prefer_{}_3{}'.format(attr, level)] == contents_side))
    # True where any of the three preferences matches; other rows stay NaN for now.
    df.loc[hit, flag] = True

  # Remaining NaNs (the non-matching rows) become False.
  df = df.fillna(False).sort_values(by=['id'], axis=0)


  # 2) Type conversion
  not_open_dt = df.columns != 'contents_open_dt'
  df.loc[:, not_open_dt] = df.loc[:, not_open_dt].astype('int')  # boolean -> int
  numeric_columns = ['contents_rn', 'person_rn', 'contents_attribute_j']
  categorical_columns = list(df.columns.drop(numeric_columns))
  # Only the non-numeric columns are carried forward, all cast to category.
  df = df[categorical_columns].astype('category')

  # 3) Save the id list
  id_list = list(df['id'])
  df = df.sort_values(by=['id'], axis=0).set_index('id')


  # 4) Drop unneeded columns
  df = df.drop(['person_prefer_f','person_prefer_g','contents_open_dt'],
               axis="columns")

  return id_list, df

#### `preprocessing_1` 전처리 함수 적용

In [3]:
# Set the working directory: move into the "[DACON] 잡케어 추천 알고리즘 경진대회" data folder
# so that the relative 'Jobcare_data/...' paths below resolve.
path = "/content/drive/MyDrive/[DACON] 잡케어 추천 알고리즘 경진대회/data/"
os.chdir(path)

# Load the files
train = pd.read_csv('Jobcare_data/train.csv')
test = pd.read_csv('Jobcare_data/test.csv')
# Code tables read by preprocessing_1 as module-level globals.
D_code=pd.read_csv('Jobcare_data/속성_D_코드.csv')
L_code=pd.read_csv('Jobcare_data/속성_L_코드.csv')
H_code=pd.read_csv('Jobcare_data/속성_H_코드.csv')

# Apply the type-1 preprocessing function
train1_idx, train1 = preprocessing_1(train)
#train1.to_csv('train1.csv', header=True,index=False)

test1_idx, test1 = preprocessing_1(test)  
#test1.to_csv('test1.csv', header=True,index=False)

In [4]:
#train1 = pd.read_csv('train1.csv')  #train_new_week.csv
# NOTE(review): preprocessing_1 keeps only the columns outside its own
# numeric_columns list, so these three names are presumably absent from
# train1 — confirm before using feature_names1 / f_labels1 to index the frame.
numeric_columns1 = ['contents_rn','person_rn','contents_attribute_j']
# Every column of train1 except the label.
categorical_columns1 = list(train1.columns.drop(['target']))

# feature_names1 and f_labels1 are built from the same two lists.
feature_names1 = numeric_columns1 + categorical_columns1
f_labels1 = numeric_columns1 + categorical_columns1
confusion_lbs1 = ['Not Used', 'Used']

f1_scores1=[]
models1=[]

# Split into X and y (train only)
y_train1, X_train1 = train1['target'], train1.drop(['target'],axis=1)
X_train1 = X_train1[categorical_columns1].astype('category')
primary_eval_metric1 = f1_score

### **2-2. 2타입 데이터 만들기**

#### 2-2-1. 파생변수 만들기
1. 코드 매칭 관련
2. person-contents 일치 여부 관련 

#### 2-2-2. id 리스트 저장

#### 2-2-3. 불필요한 컬럼 제거
- 'id','contents_rn','person_rn','contents_open_dt', 'person_prefer_f','person_prefer_g', 'd_l_match_yn','d_m_match_yn','d_s_match_yn','h_l_match_yn','h_m_match_yn','h_s_match_yn'

#### `preprocessing_2` 전처리 함수 정의

In [5]:
def preprocessing_2(df):
  """Build the type-2 frame: add code-hierarchy columns and person/contents match flags.

  Reads the module-level lookups ``attr_D_code``, ``attr_H_code`` and
  ``attr_L_code``; each is indexed as ``lookup[code][level column name]``.

  Args:
    df: Raw frame with ``id``, the ``person_prefer_*`` / ``person_attribute_a``
      columns and the ``contents_attribute_*`` columns used below. It is not
      modified; all work happens on a copy.

  Returns:
    ``(id_list, df)``: the ids in input order and the processed frame indexed
    by ``id``.

  Raises:
    KeyError: if a D/H/L code in ``df`` is missing from its lookup.
  """

  # Work on a copy so the derived columns are not written into the caller's frame.
  df = df.copy()

  # 1-1) Code-hierarchy features (add_code)
  D_attr_types = ['세', '소', '중', '대']
  H_attr_types = ['중', '대']
  L_attr_types = ['세', '소', '중', '대']
  prefer_rank = ['1', '2', '3']

  def _level_of(lookup, level_column):
    # Mapper from a code to one hierarchy level; the column name is resolved once.
    return lambda code: lookup[code][level_column]

  ## D attribute codes
  for D_type in D_attr_types:
    to_level = _level_of(attr_D_code, '속성 D {}분류코드'.format(D_type))
    # person attribute D
    for rank in prefer_rank:
      df['person_prefer_d_{}_{}'.format(rank, D_type)] = df['person_prefer_d_{}'.format(rank)].apply(to_level)
    # contents attribute D
    df['contents_attribute_d_{}'.format(D_type)] = df['contents_attribute_d'].apply(to_level)

  ## H attribute codes
  for H_type in H_attr_types:
    to_level = _level_of(attr_H_code, '속성 H {}분류코드'.format(H_type))
    # person attribute H
    for rank in prefer_rank:
      df['person_prefer_h_{}_{}'.format(rank, H_type)] = df['person_prefer_h_{}'.format(rank)].apply(to_level)
    # contents attribute H
    df['contents_attribute_h_{}'.format(H_type)] = df['contents_attribute_h'].apply(to_level)

  ## L attribute codes (contents side only)
  for L_type in L_attr_types:
    to_level = _level_of(attr_L_code, '속성 L {}분류코드'.format(L_type))
    df['contents_attribute_l_{}'.format(L_type)] = df['contents_attribute_l'].apply(to_level)


  # 1-2) person/contents equality flags
  cols_equal = [
          ('person_prefer_c','contents_attribute_c'),

          ('person_prefer_d_1_대','contents_attribute_d_대'),
          ('person_prefer_d_1_중','contents_attribute_d_중'),
          ('person_prefer_d_1_소','contents_attribute_d_소'),
          ('person_prefer_d_1_세','contents_attribute_d_세'),
          ('person_prefer_d_1','contents_attribute_d'),

          ('person_prefer_d_2_대','contents_attribute_d_대'),
          ('person_prefer_d_2_중','contents_attribute_d_중'),
          ('person_prefer_d_2_소','contents_attribute_d_소'),

          ('person_prefer_d_3_대','contents_attribute_d_대'),
          ('person_prefer_d_3_중','contents_attribute_d_중'),
          ('person_prefer_d_3_소','contents_attribute_d_소'),

          ('person_prefer_h_1_대','contents_attribute_h_대'),
          ('person_prefer_h_1_중','contents_attribute_h_중'),
          ('person_prefer_h_1','contents_attribute_h'),

          ('person_prefer_h_2_대','contents_attribute_h_대'),
          ('person_prefer_h_2_중','contents_attribute_h_중'),

          ('person_prefer_h_3_대','contents_attribute_h_대'),
          ('person_prefer_h_3_중','contents_attribute_h_중'),
  ]

  for col1, col2 in cols_equal:
    df[f"{col1}_{col2}"] = (df[col1] == df[col2]).astype(int)

  # Per the EDA, contents_attribute_a == 3 is treated as "don't care".
  df['person_attribute_a_contents_attribute_a'] = np.where(((df['person_attribute_a'] == df['contents_attribute_a'])|(df['contents_attribute_a']==3)),1,0)
  # Per the EDA, contents_attribute_e == 0 is treated as "don't care".
  # The column keeps its historical name although it compares person_prefer_e.
  df['person_attribute_e_contents_attribute_e'] = np.where(((df['person_prefer_e'] == df['contents_attribute_e'])|(df['contents_attribute_e']==0)),1,0)

  # 2) Save the ids
  id_list = list(df['id'])
  df = df.set_index('id')

  # 3) Drop unneeded columns (only those actually present)
  unwanted = ['id','contents_rn','person_rn','contents_open_dt',
              'person_prefer_f','person_prefer_g',
              'd_l_match_yn','d_m_match_yn','d_s_match_yn',
              'h_l_match_yn','h_m_match_yn','h_s_match_yn']
  del_cols_list = [c for c in df.columns if c in unwanted]
  df = df.drop(del_cols_list, axis="columns")

  return id_list, df


#### `preprocessing_2` 전처리 함수 적용

In [6]:
# Load the files
train = pd.read_csv('Jobcare_data/train.csv')
test = pd.read_csv('Jobcare_data/test.csv')
# Code tables as nested dicts (first CSV column -> {remaining column name -> value}),
# read as globals by the preprocessing_2 / preprocessing_3_* functions.
attr_D_code = pd.read_csv("Jobcare_data/속성_D_코드.csv", index_col=0).T.to_dict()
attr_H_code = pd.read_csv("Jobcare_data/속성_H_코드.csv", index_col=0).T.to_dict()
attr_L_code = pd.read_csv("Jobcare_data/속성_L_코드.csv", index_col=0).T.to_dict()

# Apply the type-2 preprocessing function
train2_idx, train2 = preprocessing_2(train) 
#train2.to_csv('train2.csv', header=True,index=False)

test2_idx, test2 = preprocessing_2(test)
#test2.to_csv('test2.csv', header=True,index=False)

In [7]:
# Choose the categorical variables: every test2 column with more than two distinct values
categorical_columns2 = test2.columns[test2.nunique() > 2].tolist()

# Split into X and y (train only)
y_train2 = train2['target']
X_train2 = train2.loc[:,train2.columns !='target'] 

### 2-3. 3타입 데이터 만들기 

#### 2-3-1. id 리스트 저장

#### 2-3-2. 파생변수 만들기
1. 코드 매칭 관련
2. person-contents 일치 여부 관련 
  - person_prefer과 contents_attribute이 일치하는지의 여부로 1/0 값을 부여했다.
  - 회원이 선호하는 속성과 컨텐츠 속성, 이 두 가지를 비교하고자 하였는데, 주어진 데이터가 명목형 변수이므로 단순 사칙연산이 불가능할 뿐만 아니라 코드값이 나열된 형태를 고려했을 때 빼기 연산으로 두 속성 간의 차이가 반영되기 어렵다고 판단하여, 두 변수의 일치 여부만을 살펴보았다.
  - 일치여부를 살펴본 쌍의 개수는 21개로, 21개의 새로운 변수를 생성했다.
  - EDA 결과 `contents_attribute_a`가 3인 경우와 `contents_attribute_e`가 0인 경우는 `상관없음`으로 판단하여 전처리를 진행했다. 
3. cumct, diff 관련
  - test 데이터에는 train 데이터에서 얻은 수치까지만을 활용하여 data leakage가 발생하지 않도록 하였다.
    - ex) person_rn이 1인 샘플이 train 데이터에 4건 있었다면, test 데이터에서 person_rn이 1인 샘플에 모두 4라는 값이 들어가도록 한다.
    - ex) person_rn이 train 데이터에는 없지만 test 데이터에는 있는 경우, 0으로 값을 채운다.
  - 3-1) 누적도수 관련 열 생성: contents_open_D_cumct, contents_open_W_cumct, contents_open_M_cumct, person_open_D_cumct, person_open_W_cumct, person_open_M_cumct
    - 각 콘텐츠가 일자별, 주차별, 월별로 얼마나 클릭되었는지(how many times the contents was opened)
    - 각 이용자가 일자별, 주차별, 월별로 얼마나 클릭했는지(how many times the person opened the contents)
  - 3-2) 시간차 관련 열 생성: contents_open_D_timediff, contents_open_W_timediff, contents_open_M_timediff, person_open_D_timediff, person_open_W_timediff, person_open_M_timediff
    - 각 콘텐츠가 일자별, 주차별, 월별 구간 안에서 직전 열람 시점과 갖는 시간 차이(interval)
    - 각 이용자가 일자별, 주차별, 월별 구간 안에서 직전 열람 시점과 갖는 시간 차이(interval)

#### 2-3-3. 불필요한 컬럼 제거
- 'id','contents_rn','person_rn','contents_open_dt', 'person_prefer_f','person_prefer_g', 'd_l_match_yn','d_m_match_yn','d_s_match_yn','h_l_match_yn','h_m_match_yn','h_s_match_yn'

#### `preprocessing_3` 전처리 함수 정의

In [8]:
def preprocessing_3_train(df=train):
  """Build the type-3 training frame: code levels, match flags, open counts and time gaps.

  Reads the module-level lookups ``attr_D_code``, ``attr_H_code`` and
  ``attr_L_code``; each is indexed as ``lookup[code][level column name]``.

  Args:
    df: Raw training frame. The default is whatever ``train`` referred to when
      this ``def`` was executed. The frame is not modified; all work happens
      on a copy.

  Returns:
    ``(id_list, df)``: the ids in input order and the processed frame.
    ``contents_rn`` and ``person_rn`` are intentionally kept in the result.

  Raises:
    KeyError: if a D/H/L code in ``df`` is missing from its lookup.
  """

  # Work on a copy so the derived columns are not written into the caller's frame.
  df = df.copy()

  # 1) Save the ids
  id_list = list(df['id'])

  # 2-1) Code-hierarchy features (add_code)
  D_attr_types = ['세', '소', '중', '대']
  H_attr_types = ['중', '대']
  L_attr_types = ['세', '소', '중', '대']
  prefer_rank = ['1', '2', '3']

  def _level_of(lookup, level_column):
    # Mapper from a code to one hierarchy level; the column name is resolved once.
    return lambda code: lookup[code][level_column]

  ## D attribute codes
  for D_type in D_attr_types:
    to_level = _level_of(attr_D_code, '속성 D {}분류코드'.format(D_type))
    # person attribute D
    for rank in prefer_rank:
      df['person_prefer_d_{}_{}'.format(rank, D_type)] = df['person_prefer_d_{}'.format(rank)].apply(to_level)
    # contents attribute D
    df['contents_attribute_d_{}'.format(D_type)] = df['contents_attribute_d'].apply(to_level)

  ## H attribute codes
  for H_type in H_attr_types:
    to_level = _level_of(attr_H_code, '속성 H {}분류코드'.format(H_type))
    # person attribute H
    for rank in prefer_rank:
      df['person_prefer_h_{}_{}'.format(rank, H_type)] = df['person_prefer_h_{}'.format(rank)].apply(to_level)
    # contents attribute H
    df['contents_attribute_h_{}'.format(H_type)] = df['contents_attribute_h'].apply(to_level)

  ## L attribute codes (contents side only)
  for L_type in L_attr_types:
    to_level = _level_of(attr_L_code, '속성 L {}분류코드'.format(L_type))
    df['contents_attribute_l_{}'.format(L_type)] = df['contents_attribute_l'].apply(to_level)


  # 2-2) person/contents equality flags
  cols_equal = [
          ('person_prefer_c','contents_attribute_c'),

          ('person_prefer_d_1_대','contents_attribute_d_대'),
          ('person_prefer_d_1_중','contents_attribute_d_중'),
          ('person_prefer_d_1_소','contents_attribute_d_소'),
          ('person_prefer_d_1_세','contents_attribute_d_세'),
          ('person_prefer_d_1','contents_attribute_d'),

          ('person_prefer_d_2_대','contents_attribute_d_대'),
          ('person_prefer_d_2_중','contents_attribute_d_중'),
          ('person_prefer_d_2_소','contents_attribute_d_소'),

          ('person_prefer_d_3_대','contents_attribute_d_대'),
          ('person_prefer_d_3_중','contents_attribute_d_중'),
          ('person_prefer_d_3_소','contents_attribute_d_소'),

          ('person_prefer_h_1_대','contents_attribute_h_대'),
          ('person_prefer_h_1_중','contents_attribute_h_중'),
          ('person_prefer_h_1','contents_attribute_h'),

          ('person_prefer_h_2_대','contents_attribute_h_대'),
          ('person_prefer_h_2_중','contents_attribute_h_중'),

          ('person_prefer_h_3_대','contents_attribute_h_대'),
          ('person_prefer_h_3_중','contents_attribute_h_중'),
  ]

  for col1, col2 in cols_equal:
    df[f"{col1}_{col2}"] = (df[col1] == df[col2]).astype(int)

  # Per the EDA, contents_attribute_a == 3 is treated as "don't care".
  df['person_attribute_a_contents_attribute_a'] = np.where(((df['person_attribute_a'] == df['contents_attribute_a'])|(df['contents_attribute_a']==3)),1,0)
  # Per the EDA, contents_attribute_e == 0 is treated as "don't care".
  # The column keeps its historical name although it compares person_prefer_e.
  df['person_attribute_e_contents_attribute_e'] = np.where(((df['person_prefer_e'] == df['contents_attribute_e'])|(df['contents_attribute_e']==0)),1,0)

  # 2-3) Cumulative-count / time-difference features

  # dtype conversion: object -> datetime
  df['contents_open_dt'] = pd.to_datetime(df['contents_open_dt'])

  intervals = ['D', 'W', 'M']
  entities = ['contents', 'person']

  def _groupers(entity, freq):
    # Fresh groupers per call: one entity key plus one calendar bucket.
    return [pd.Grouper(key='{}_rn'.format(entity)),
            pd.Grouper(freq=freq, key='contents_open_dt')]

  # 2-3-1) cumct: running count of opens per entity within each day/week/month
  for entity in entities:
    for freq in intervals:
      df['{}_open_{}_cumct'.format(entity, freq)] = (
          df.sort_values('contents_open_dt')
            .groupby(_groupers(entity, freq))
            .cumcount() + 1)

  # 2-3-2) timediff: minutes since the previous open of the same entity inside
  # the same day/week/month bucket. total_seconds() keeps the day component,
  # which .dt.seconds would discard for gaps of 24 hours or more. The earliest
  # open of a bucket has no predecessor and gets 0 (a datetime NaT cannot be
  # filled with 0 directly, hence the numeric conversion first).
  for entity in entities:
    for freq in intervals:
      gap = (df.sort_values('contents_open_dt', ascending=False)
               .groupby(_groupers(entity, freq))['contents_open_dt']
               .diff(-1))
      df['{}_open_{}_timediff'.format(entity, freq)] = \
          gap.dt.total_seconds().div(60).fillna(0)


  # 3) Drop unneeded columns (contents_rn / person_rn are kept on purpose)
  unwanted = ['id','contents_open_dt',
              'person_prefer_f','person_prefer_g',
              'd_l_match_yn','d_m_match_yn','d_s_match_yn',
              'h_l_match_yn','h_m_match_yn','h_s_match_yn']
  del_cols_list = [c for c in df.columns if c in unwanted]
  df = df.drop(del_cols_list, axis="columns")

  return id_list, df


In [9]:
def preprocessing_3_test(df=test):
  """Build the type-3 test frame; count/time-gap features come from ``train3``.

  Reads the module-level lookups ``attr_D_code``, ``attr_H_code`` and
  ``attr_L_code``, and the module-level frame ``train3``, which must still
  contain ``contents_rn``, ``person_rn`` and the ``*_cumct`` / ``*_timediff``
  columns when this function is called.

  Args:
    df: Raw test frame. The default is whatever ``test`` referred to when this
      ``def`` was executed. The frame is not modified; all work happens on a
      copy.

  Returns:
    ``(id_list, df)``: the ids in input order and the processed frame.

  Raises:
    KeyError: if a D/H/L code in ``df`` is missing from its lookup.
  """

  # Work on a copy so the derived columns are not written into the caller's frame.
  df = df.copy()

  # 1) Save the ids
  id_list = list(df['id'])

  # 2-1) Code-hierarchy features (add_code)
  D_attr_types = ['세', '소', '중', '대']
  H_attr_types = ['중', '대']
  L_attr_types = ['세', '소', '중', '대']
  prefer_rank = ['1', '2', '3']

  def _level_of(lookup, level_column):
    # Mapper from a code to one hierarchy level; the column name is resolved once.
    return lambda code: lookup[code][level_column]

  ## D attribute codes
  for D_type in D_attr_types:
    to_level = _level_of(attr_D_code, '속성 D {}분류코드'.format(D_type))
    # person attribute D
    for rank in prefer_rank:
      df['person_prefer_d_{}_{}'.format(rank, D_type)] = df['person_prefer_d_{}'.format(rank)].apply(to_level)
    # contents attribute D
    df['contents_attribute_d_{}'.format(D_type)] = df['contents_attribute_d'].apply(to_level)

  ## H attribute codes
  for H_type in H_attr_types:
    to_level = _level_of(attr_H_code, '속성 H {}분류코드'.format(H_type))
    # person attribute H
    for rank in prefer_rank:
      df['person_prefer_h_{}_{}'.format(rank, H_type)] = df['person_prefer_h_{}'.format(rank)].apply(to_level)
    # contents attribute H
    df['contents_attribute_h_{}'.format(H_type)] = df['contents_attribute_h'].apply(to_level)

  ## L attribute codes (contents side only)
  for L_type in L_attr_types:
    to_level = _level_of(attr_L_code, '속성 L {}분류코드'.format(L_type))
    df['contents_attribute_l_{}'.format(L_type)] = df['contents_attribute_l'].apply(to_level)


  # 2-2) person/contents equality flags
  cols_equal = [
          ('person_prefer_c','contents_attribute_c'),

          ('person_prefer_d_1_대','contents_attribute_d_대'),
          ('person_prefer_d_1_중','contents_attribute_d_중'),
          ('person_prefer_d_1_소','contents_attribute_d_소'),
          ('person_prefer_d_1_세','contents_attribute_d_세'),
          ('person_prefer_d_1','contents_attribute_d'),

          ('person_prefer_d_2_대','contents_attribute_d_대'),
          ('person_prefer_d_2_중','contents_attribute_d_중'),
          ('person_prefer_d_2_소','contents_attribute_d_소'),

          ('person_prefer_d_3_대','contents_attribute_d_대'),
          ('person_prefer_d_3_중','contents_attribute_d_중'),
          ('person_prefer_d_3_소','contents_attribute_d_소'),

          ('person_prefer_h_1_대','contents_attribute_h_대'),
          ('person_prefer_h_1_중','contents_attribute_h_중'),
          ('person_prefer_h_1','contents_attribute_h'),

          ('person_prefer_h_2_대','contents_attribute_h_대'),
          ('person_prefer_h_2_중','contents_attribute_h_중'),

          ('person_prefer_h_3_대','contents_attribute_h_대'),
          ('person_prefer_h_3_중','contents_attribute_h_중'),
  ]

  for col1, col2 in cols_equal:
    df[f"{col1}_{col2}"] = (df[col1] == df[col2]).astype(int)

  # Per the EDA, contents_attribute_a == 3 is treated as "don't care".
  df['person_attribute_a_contents_attribute_a'] = np.where(((df['person_attribute_a'] == df['contents_attribute_a'])|(df['contents_attribute_a']==3)),1,0)
  # Per the EDA, contents_attribute_e == 0 is treated as "don't care".
  # The column keeps its historical name although it compares person_prefer_e.
  df['person_attribute_e_contents_attribute_e'] = np.where(((df['person_prefer_e'] == df['contents_attribute_e'])|(df['contents_attribute_e']==0)),1,0)

  # 2-3) Cumulative-count / time-difference features

  # dtype conversion: object -> datetime
  df['contents_open_dt'] = pd.to_datetime(df['contents_open_dt'])

  intervals = ['D', 'W', 'M']
  vartypes = ['cumct', 'timediff']

  # Column order: all cumct columns first, then all timediff columns.
  contents_cols = ['contents_open_{}_{}'.format(j, i) for i in vartypes for j in intervals]
  person_cols = ['person_open_{}_{}'.format(j, i) for i in vartypes for j in intervals]

  # Per-key maxima observed in the training frame; the group key becomes a
  # regular column again so it can serve as the merge key.
  contents_df = train3.groupby('contents_rn')[contents_cols].max().reset_index()
  person_df = train3.groupby('person_rn')[person_cols].max().reset_index()

  # Keys that never occur in train3 produce NaN after the left join and are set to 0.
  df = df.merge(contents_df, how='left', on='contents_rn').fillna(0) \
              .merge(person_df, how='left', on='person_rn').fillna(0)

  # 3) Drop unneeded columns (only those actually present)
  unwanted = ['id','contents_rn','person_rn','contents_open_dt',
              'person_prefer_f','person_prefer_g',
              'd_l_match_yn','d_m_match_yn','d_s_match_yn',
              'h_l_match_yn','h_m_match_yn','h_s_match_yn']
  del_cols_list = [c for c in df.columns if c in unwanted]
  df = df.drop(del_cols_list, axis="columns")

  return id_list, df


#### `preprocessing_3` 전처리 함수 적용

In [10]:
# Load the files
train = pd.read_csv('Jobcare_data/train.csv')
test = pd.read_csv('Jobcare_data/test.csv')

# Apply the type-3 preprocessing functions.
# preprocessing_3_test groups the global train3 by contents_rn / person_rn,
# so it has to run before those two columns are dropped below.
train3_idx, train3 = preprocessing_3_train(train)
test3_idx, test3 = preprocessing_3_test(test)

train3 = train3.drop(['contents_rn','person_rn'], axis=1)
#train3.to_csv('train3.csv', header=True,index=False)
#test3.to_csv('test3.csv', header=True,index=False)

In [11]:
# Columns treated as numeric for the type-3 data: two raw contents attributes,
# the count / time-difference features, and the 0/1 match flags.
numeric_columns3 = [
  'contents_attribute_j','contents_attribute_k',

  'contents_open_D_cumct','contents_open_W_cumct', 'contents_open_M_cumct',
  'contents_open_D_timediff', 'contents_open_W_timediff','contents_open_M_timediff', 
  'person_open_D_cumct', 'person_open_W_cumct', 'person_open_M_cumct', 
  'person_open_D_timediff','person_open_W_timediff', 'person_open_M_timediff',

  'person_prefer_c_contents_attribute_c',
  'person_prefer_d_1_대_contents_attribute_d_대',
  'person_prefer_d_1_중_contents_attribute_d_중',
  'person_prefer_d_1_소_contents_attribute_d_소',
  'person_prefer_d_1_세_contents_attribute_d_세',
  'person_prefer_d_1_contents_attribute_d',
  'person_prefer_d_2_대_contents_attribute_d_대',
  'person_prefer_d_2_중_contents_attribute_d_중',
  'person_prefer_d_2_소_contents_attribute_d_소',
  'person_prefer_d_3_대_contents_attribute_d_대',
  'person_prefer_d_3_중_contents_attribute_d_중',
  'person_prefer_d_3_소_contents_attribute_d_소',
  'person_prefer_h_1_대_contents_attribute_h_대',
  'person_prefer_h_1_중_contents_attribute_h_중',
  'person_prefer_h_1_contents_attribute_h',
  'person_prefer_h_2_대_contents_attribute_h_대',
  'person_prefer_h_2_중_contents_attribute_h_중',
  'person_prefer_h_3_대_contents_attribute_h_대',
  'person_prefer_h_3_중_contents_attribute_h_중',
  'person_attribute_a_contents_attribute_a',
  'person_attribute_e_contents_attribute_e'
]

# Everything in train3 that is neither numeric nor the label is categorical.
categorical_columns3 = list(train3.columns.drop(numeric_columns3 + ['target']))
feature_names3 = numeric_columns3 + categorical_columns3

# Apply the same dtypes to train and test; the int cast truncates any
# fractional part of the time-difference features.
train3[categorical_columns3]=train3[categorical_columns3].astype('category')
test3[categorical_columns3]=test3[categorical_columns3].astype('category')
train3[numeric_columns3]=train3[numeric_columns3].astype('int')
test3[numeric_columns3]=test3[numeric_columns3].astype('int')
f_labels3 = categorical_columns3+numeric_columns3

# Split into X and y (train only)
y_train3 = train3['target']
X_train3 = train3.loc[:,train3.columns != 'target']

## 3. 모델링 (Modeling)

### **3-1. CatBoost**

In [12]:
# 4-fold cross validation; the fixed random_state makes every later
# cv.split(...) call on equally sized data yield the same folds.
cv = KFold(n_splits=4, shuffle=True, random_state=2022)    # cross validation with 4 K-Folds

# Fitted CatBoost models, one list per data type (one entry per fold).
cat_train1_model=[]
cat_train2_model=[]
cat_train3_model=[]
# Validation row positions and validation targets per fold (used for stacking).
index_number_validation=[]  
validation_target=[] 


각각의 train data(`X_train1`, `X_train2`, `X_train3`)에 대한 catboost model을 적합시키고 저장한다.

In [13]:
# Model fitting & saving

def _fit_catboost_fold(X, y, cat_features, train_idx, valid_idx):
  """Fit one CatBoost classifier on a single CV fold and return it.

  ``train_idx`` / ``valid_idx`` are the positional row indices produced by
  ``cv.split``, so both ``X`` and ``y`` are sliced with ``.iloc``; plain
  ``y[idx]`` would be a label lookup on an integer-like index.
  """
  model = CatBoostClassifier(cat_features = cat_features, l2_leaf_reg=120,
                             depth=6, auto_class_weights='Balanced',iterations=3000,
                             learning_rate=0.2, use_best_model=True,
                             early_stopping_rounds=250, eval_metric='F1',
                             random_state=2022,one_hot_max_size=5,
                             task_type="GPU")
  model.fit(X.iloc[train_idx], y.iloc[train_idx],
            eval_set=[(X.iloc[valid_idx], y.iloc[valid_idx])],
            early_stopping_rounds=250,
            verbose=100)  # print the score every 100 iterations
  return model

## X_train1
for i, j in cv.split(X_train1):
  index_number_validation.append(j)       # cv shuffles, so keep the validation row positions
  catboost_clf = _fit_catboost_fold(X_train1, y_train1, categorical_columns1, i, j)
  cat_train1_model.append(catboost_clf)   # model fitted on train1

## X_train2
for i, j in cv.split(X_train2):
  catboost_clf = _fit_catboost_fold(X_train2, y_train2, categorical_columns2, i, j)
  cat_train2_model.append(catboost_clf)   # model fitted on train2

## X_train3
for i, j in cv.split(X_train3):
  catboost_clf = _fit_catboost_fold(X_train3, y_train3, categorical_columns3, i, j)
  cat_train3_model.append(catboost_clf)   # model fitted on train3
  # Validation targets for stacking, taken from y_train1 as before.
  # NOTE(review): this presumes train1 rows are in the same order as train3 — confirm.
  validation_target.append(y_train1.iloc[j])

0:	learn: 0.6252440	test: 0.6238603	best: 0.6238603 (0)	total: 114ms	remaining: 5m 43s
100:	learn: 0.6633477	test: 0.6737210	best: 0.6742372 (94)	total: 8.18s	remaining: 3m 54s
200:	learn: 0.6702190	test: 0.6734295	best: 0.6753422 (120)	total: 15.7s	remaining: 3m 38s
300:	learn: 0.6735839	test: 0.6722207	best: 0.6753422 (120)	total: 23.1s	remaining: 3m 26s
bestTest = 0.6753421918
bestIteration = 120
Shrink model to first 121 iterations.
0:	learn: 0.6282091	test: 0.6236108	best: 0.6236108 (0)	total: 103ms	remaining: 5m 9s
100:	learn: 0.6649003	test: 0.6731015	best: 0.6734552 (99)	total: 8.13s	remaining: 3m 53s
200:	learn: 0.6706141	test: 0.6748834	best: 0.6758384 (176)	total: 15.7s	remaining: 3m 38s
300:	learn: 0.6747654	test: 0.6747210	best: 0.6758384 (176)	total: 23.1s	remaining: 3m 27s
400:	learn: 0.6779339	test: 0.6741175	best: 0.6758384 (176)	total: 30.6s	remaining: 3m 18s
bestTest = 0.6758384111
bestIteration = 176
Shrink model to first 177 iterations.
0:	learn: 0.5931651	test: 0.

각각의 train data에 대하여 저장한 catboost model로 각 fold별 validation probability & test probability 값을 저장한다.
stacking 단계에서 사용하는 모델을 위하여 validation probability도 모두 저장하였다.

In [14]:
# Per-fold CatBoost outputs: column 1 of predict_proba on the fold's
# validation rows and on the full test frame, for each of the three data types.
cat_train1_validation_prob=[]
cat_train2_validation_prob=[]
cat_train3_validation_prob=[]
cat_train1_test_prob=[]
cat_train2_test_prob=[]
cat_train3_test_prob=[]

for fold, (_, valid_idx) in enumerate(cv.split(X_train1)):
  fold_model = cat_train1_model[fold]
  cat_train1_validation_prob.append(fold_model.predict_proba(X_train1.iloc[valid_idx])[:,1])
  cat_train1_test_prob.append(fold_model.predict_proba(test1)[:,1])

for fold, (_, valid_idx) in enumerate(cv.split(X_train2)):
  fold_model = cat_train2_model[fold]
  cat_train2_validation_prob.append(fold_model.predict_proba(X_train2.iloc[valid_idx])[:,1])
  cat_train2_test_prob.append(fold_model.predict_proba(test2)[:,1])

for fold, (_, valid_idx) in enumerate(cv.split(X_train3)):
  fold_model = cat_train3_model[fold]
  cat_train3_validation_prob.append(fold_model.predict_proba(X_train3.iloc[valid_idx])[:,1])
  cat_train3_test_prob.append(fold_model.predict_proba(test3)[:,1])

### **3-2. Ridge**

위의 CatBoost에서와 동일한 방식으로, 각각의 train data에 대하여 ridge model을 적합시키고 저장한다.

In [15]:
from sklearn.linear_model import Ridge
# Fitted Ridge models, one list per data type (one entry per fold).
ridge_train1_model=[]
ridge_train2_model=[]
ridge_train3_model=[]

# cv.split yields positional row indices, so X and y are both sliced with
# .iloc; plain y[idx] would be a label lookup on an integer-like index.

## X_train1
for i, j in cv.split(X_train1):
  ridge=Ridge()
  ridge.fit(X_train1.iloc[i], y_train1.iloc[i])
  ridge_train1_model.append(ridge)

## X_train2
for i, j in cv.split(X_train2):
  ridge=Ridge()
  ridge.fit(X_train2.iloc[i], y_train2.iloc[i])
  ridge_train2_model.append(ridge)

## X_train3
for i, j in cv.split(X_train3):
  ridge=Ridge()
  ridge.fit(X_train3.iloc[i], y_train3.iloc[i])
  ridge_train3_model.append(ridge)

각각의 train data에 대하여 저장한 ridge model로 각 fold별 validation probability & test probability 값 저장 (stacking을 위한 모델 학습을 위하여 validation probability도 모두 저장함)

In [16]:
# Per-fold Ridge outputs: raw predictions on the fold's validation rows and on
# the full test frame, for each of the three data types.
ridge_train1_validation_prob=[]
ridge_train2_validation_prob=[]
ridge_train3_validation_prob=[]
ridge_train1_test_prob=[]
ridge_train2_test_prob=[]
ridge_train3_test_prob=[]

## X_train1
for fold, (_, valid_idx) in enumerate(cv.split(X_train1)):
  fold_model = ridge_train1_model[fold]
  ridge_train1_validation_prob.append(fold_model.predict(X_train1.iloc[valid_idx]))
  ridge_train1_test_prob.append(fold_model.predict(test1))

## X_train2
for fold, (_, valid_idx) in enumerate(cv.split(X_train2)):
  fold_model = ridge_train2_model[fold]
  ridge_train2_validation_prob.append(fold_model.predict(X_train2.iloc[valid_idx]))
  ridge_train2_test_prob.append(fold_model.predict(test2))

## X_train3
for fold, (_, valid_idx) in enumerate(cv.split(X_train3)):
  fold_model = ridge_train3_model[fold]
  ridge_train3_validation_prob.append(fold_model.predict(X_train3.iloc[valid_idx]))
  ridge_train3_test_prob.append(fold_model.predict(test3))

### **3-3. Validation & Test DF 형성**

In [17]:
# validation proba DF
def _fold_frame(k):
  # One frame per fold: validation row positions, the six base-model outputs
  # and the validation targets, in that column order.
  return pd.DataFrame({'index': index_number_validation[k],
                       'cat1': cat_train1_validation_prob[k],
                       'cat2': cat_train2_validation_prob[k],
                       'cat3': cat_train3_validation_prob[k],
                       'ridge1': ridge_train1_validation_prob[k],
                       'ridge2': ridge_train2_validation_prob[k],
                       'ridge3': ridge_train3_validation_prob[k],
                       'target': validation_target[k]})

validation1, validation2, validation3, validation4 = [_fold_frame(k) for k in range(4)]
# Stack the folds and order the rows by their original position.
validation_proba = pd.concat([validation1, validation2, validation3, validation4], axis=0)
validation_proba = validation_proba.sort_values(by=['index'])

# test proba DF: average each base model's test output over the folds
fold_outputs = [('cat1', cat_train1_test_prob),
                ('cat2', cat_train2_test_prob),
                ('cat3', cat_train3_test_prob),
                ('ridge1', ridge_train1_test_prob),
                ('ridge2', ridge_train2_test_prob),
                ('ridge3', ridge_train3_test_prob)]
test_proba = pd.DataFrame({name: np.mean(per_fold, axis=0) for name, per_fold in fold_outputs})

## 4. 스태킹 (Stacking)

Bayes Classifier를 메타 모델로 사용하였다.


In [18]:
from scipy.stats import norm
import math
import scipy.stats

# Class-conditional density used by the Bayes classifier
def Bayes_optimal_classifier(data,mu,sigma_metrics):
  """Return the multivariate-normal density of ``data``.

  Args:
    data: Point(s) at which to evaluate the density.
    mu: Mean of the normal distribution.
    sigma_metrics: Covariance matrix of the normal distribution.

  Returns:
    The value returned by ``scipy.stats.multivariate_normal.pdf``.
  """
  # The base-model outputs are assumed to be jointly normal.
  return scipy.stats.multivariate_normal.pdf(data, mean=mu, cov=sigma_metrics)

# Split the out-of-fold predictions by true class and keep only the base-model
# columns fed to the meta model ('ridge3' is left out). drop() returns a new
# frame, so the filtered slices are never modified in place.
meta_drop = ['index','ridge3','target']
validation_0 = validation_proba[validation_proba['target']==0].drop(meta_drop, axis=1)
validation_1 = validation_proba[validation_proba['target']==1].drop(meta_drop, axis=1)
columns0 = validation_0.columns
columns1 = validation_1.columns

def _gaussian_params(frame):
  """Return (mean vector, covariance matrix) of the columns of ``frame``.

  np.cov uses one normalisation for every entry, so the diagonal and the
  off-diagonal terms stay consistent with each other.
  """
  values = frame.to_numpy(dtype=float)
  return values.mean(axis=0), np.cov(values, rowvar=False)

# Class-conditional normal parameters for target == 0 and target == 1.
xm0, cov0 = _gaussian_params(validation_0)
xm1, cov1 = _gaussian_params(validation_1)

# Full test matrix (all base models) and the subset used by the meta model,
# selected by column name so it always lines up with xm0 / cov0.
test_proba_data = test_proba.to_numpy()
meta_inputs = test_proba[list(columns0)].to_numpy(dtype=float)

# Evaluate both class densities for all test rows in one call each.
y_prob0 = np.atleast_1d(Bayes_optimal_classifier(meta_inputs, xm0, cov0))
y_prob1 = np.atleast_1d(Bayes_optimal_classifier(meta_inputs, xm1, cov1))
# Relative likelihood of class 1; no class prior is applied here.
prob_for_1=y_prob1/(y_prob0+y_prob1)

# Decision threshold on prob_for_1 (variable name kept as spelled in the notebook).
treshold=0.27
test_pred=np.where(prob_for_1>=treshold,1,0).flatten()

# Final prediction
final_prediction=pd.read_csv('Jobcare_data/sample_submission.csv')
final_prediction['target'] = test_pred
#final_prediction.to_csv('final_prediction.csv')  # save the result

---
끝.