<a href="https://colab.research.google.com/github/SanGyuk-Raccoon/DACON_1/blob/main/ver_7_find_cv_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd   
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder, LabelEncoder

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from lightgbm import LGBMClassifier
import xgboost

from sklearn.metrics import log_loss

In [3]:
# Load DATA
train = pd.read_csv("/content/drive/MyDrive/DACON/[월간 데이콘] 14. 신용카드 사용자 연체 예측 AI/train_update.csv")

test = pd.read_csv("/content/drive/MyDrive/DACON/[월간 데이콘] 14. 신용카드 사용자 연체 예측 AI/test_update.csv")


In [39]:
### 연속형 변수를 구간화하는 함수
###############################################################
def continuous_to_quantile(var, k, DATA) :
  t = 1 / k
  q = 0
  Q = [np.quantile( DATA[var] , q)]

  for i in range(k) :
    q += t
    Q1 = np.quantile( DATA[var] , q)
    Q.append(Q1)

  for i in range(k-1) :
    DATA.loc[(DATA[var] >= Q[i]) & (DATA[var] <= Q[i+1]), f'{var}_Quantile_{k}'] = i
#####################################################################

## 그룹에 대해서 평균을 해주는 함수
#######################################################
def conti_by_cate(GROUP, CONTI, DATA, method) :
  group = GROUP
  var = CONTI
  vec = []
  if method == 'mean' :
    vec = DATA.groupby( group )[ var ].mean()
  elif method == 'min' :
    vec = DATA.groupby( group )[ var ].min()
  elif method == 'max' :
    vec = DATA.groupby( group )[ var ].max()
  elif method == 'skew' :
    vec = DATA.groupby( group )[ var ].skew()
  elif method == 'med' :
    vec = DATA.groupby( group )[ var ].median()
  idx = vec.index

  for i in range(len(vec)) :
    DATA.loc[DATA[group] == idx[i], f'{var}_by_{group}_{method}'] = vec[i]

#######################################################


In [84]:
def pre_processing(DATA) :
  data =  DATA

  ###########################################################################################
  #### (+) 개인 식별 변수를 이용하여
  #        그룹을 나누고 각각의 begin_month를 이용해 신용카드 시작 순서를 구함
  var_identi = ['gender', 'car', 'reality', 'income_total', 'DAYS_BIRTH', 'DAYS_EMPLOYED'] # 개인을 식별할 변수로 사용

  data = data.sort_values('begin_month')

  data['ord_avg'] = data.groupby( var_identi )['begin_month'].rank(method = 'average')
  data['ord_min'] = data.groupby( var_identi )['begin_month'].rank(method = 'min')
  data['ord_max'] = data.groupby( var_identi )['begin_month'].rank(method = 'max')
  data['ord_first'] = data.groupby( var_identi )['begin_month'].rank(method = 'first')
  data['ord_dense'] = data.groupby( var_identi )['begin_month'].rank(method = 'dense')

  #data = data.sort_values('index')

  ###### 개인 기준, 신용카드를 새로 발급받기까지의 기간 계산
  # 처음은 0으로

  data = data.sort_values('begin_month')
  data['begin_month_diff'] = data.groupby( var_identi )['begin_month'].diff()
  data['begin_month_diff'] = data['begin_month_diff'].fillna(0)

  data = data.sort_values('index')

  ############################################################################################
  

  #############################################################################################
  data = data.drop(['index', 'FLAG_MOBIL'], axis = 1)

  #### 1. gender
  mapping_gender = {
      'F' : 0, # Female = 0
      'M' : 1  # Male = 1
  }
  data['gender_mapping'] = data.gender.map(mapping_gender)


  # 2. car
  mapping_car = {
      'N' : 0,
      'Y' : 1
  }
  data['car_mapping'] = data.car.map(mapping_car)

  # 3. reality
  mapping_reality = {
      'N' : 0,
      'Y' : 1
  }
  data['reality_mapping'] = data.reality.map(mapping_reality)

  # 4. edu_type : Ordinal variable
#  mapping_edu_type = {
#    'Lower secondary' : 0, # 중학교 미만
#    'Secondary / secondary special' : 1, #중학교
#    'Incomplete higher' : 2, # 고등학교 중퇴
#    'Higher education' : 3, # 고등학교 졸업
#    'Academic degree' : 4 # 학사 이상
#  }
#  data['edu_type_ordinal_mapping'] = data.edu_type.map(mapping_edu_type)

  ## child num. 5를 최대로 설정
  data['child_num_max5'] = data['child_num'].apply( lambda x : 5 if x > 5 else x)

  ## family_size 7을 최대로 설정
  data['family_size_max7'] = data['family_size'].apply( lambda x : 7 if x > 7 else x)

  ## DAYS_BIRTH -> 나이
  data['age'] = np.floor(-data['DAYS_BIRTH']/ 365)
  data['age_year'] = -data['DAYS_BIRTH'] / 365 
  data['age_month'] = -data['DAYS_BIRTH'] / (365 / 12)
  data['age_day'] = -data['DAYS_BIRTH']

  continuous_to_quantile('age', 4, data)
  continuous_to_quantile('age', 5, data)


  ## DAYS_EMPLOYES -> 경력
  data['career_year'] = data['DAYS_EMPLOYED'].apply( lambda x : - x / 365  if x < 0 else 0)
  data['career_month'] = data['DAYS_EMPLOYED'].apply( lambda x : - x / (365 / 12) if x < 0 else 0)
  data['career_day'] = data['DAYS_EMPLOYED'].apply( lambda x : - x if x < 0 else 0)
  data['career'] = data['DAYS_EMPLOYED'].apply( lambda x : 1 if x < 0 else 0)

  continuous_to_quantile('career', 4, data)
  continuous_to_quantile('career', 5, data)

  ## Begin_month -> 양수로
  data['begin_month_minus'] = - data['begin_month']
  data['log_begin_month_minus'] = np.log(data['begin_month_minus'])


  ###### 범주별 연속형 변수의 특성으로 대체 ##################################################
  METHOD = ['mean', 'max', 'median', 'skew', 'min'] 
  CONTI = ['income_total', 'age_day', 'career_day']
  GROUP = ['occyp_type', 'income_type', 'family_type', 'house_type', 'edu_type']
  for g in GROUP :
    for c in CONTI :
      for m in METHOD :
        conti_by_cate(g, c, data, method = m)

  ############################################################################################
  #### one-hot encoding
#  var = ['occyp_type', 'income_type',
#        'family_type', 'house_type'      ]

#  onehot_encoder = OneHotEncoder()
#  onehot_encoder.fit(data.loc[:, var])
#  onehot_df = pd.DataFrame(onehot_encoder.transform(data.loc[:,var]).toarray(), 
#              columns=onehot_encoder.get_feature_names(var))
#
#  data = pd.concat([data, onehot_df], axis = 1)

  ######################################################
# data['begin_month_diff*ord_dense'] = data['begin_month_diff'] * data['ord_dense']


  ######################################################
  data = data.drop(['gender', 'car', 'reality', 'edu_type', 'occyp_type', 'income_type',
        'family_type', 'house_type', 'DAYS_EMPLOYED', 'DAYS_BIRTH'], axis = 1)
  

  return data

In [43]:
#train = pd.read_csv("/content/drive/MyDrive/DACON/[월간 데이콘] 14. 신용카드 사용자 연체 예측 AI/train_update.csv")
train.head()

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month,credit
0,0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,-13899,-4709,1,0,0,0,Laborers,2.0,-6.0,1.0
1,1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,-11380,-1540,1,0,0,1,Laborers,3.0,-5.0,1.0
2,2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,-19087,-4434,1,0,1,0,Managers,2.0,-22.0,2.0
3,3,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,-15088,-2092,1,0,1,0,Sales staff,2.0,-37.0,0.0
4,4,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,-15037,-2105,1,0,0,0,Managers,2.0,-26.0,2.0


In [90]:
#############카테고리 곱하기######################################
#for i in range(len(var)) :
#  for j in range(i+1, len(var)) :
#    data[f'{var[i]}_{var[j]}'] = data[var[i]] + '_' + data[var[j]]
#    var.append(f'{var[i]}_{var[j]}')
##################################################################
train = pd.read_csv("/content/drive/MyDrive/DACON/[월간 데이콘] 14. 신용카드 사용자 연체 예측 AI/train_update.csv")
train = pre_processing(train)
train.head()

Unnamed: 0,child_num,income_total,work_phone,phone,email,family_size,begin_month,credit,ord_avg,ord_min,ord_max,ord_first,ord_dense,begin_month_diff,gender_mapping,car_mapping,reality_mapping,child_num_max5,family_size_max7,age,age_year,age_month,age_day,age_Quantile_4,age_Quantile_5,career_year,career_month,career_day,career,career_Quantile_4,career_Quantile_5,begin_month_minus,log_begin_month_minus,income_total_by_occyp_type_mean,income_total_by_occyp_type_max,income_total_by_occyp_type_skew,income_total_by_occyp_type_min,age_day_by_occyp_type_mean,age_day_by_occyp_type_max,age_day_by_occyp_type_skew,...,career_day_by_income_type_mean,career_day_by_income_type_max,career_day_by_income_type_skew,career_day_by_income_type_min,income_total_by_family_type_mean,income_total_by_family_type_max,income_total_by_family_type_skew,income_total_by_family_type_min,age_day_by_family_type_mean,age_day_by_family_type_max,age_day_by_family_type_skew,age_day_by_family_type_min,career_day_by_family_type_mean,career_day_by_family_type_max,career_day_by_family_type_skew,career_day_by_family_type_min,income_total_by_house_type_mean,income_total_by_house_type_max,income_total_by_house_type_skew,income_total_by_house_type_min,age_day_by_house_type_mean,age_day_by_house_type_max,age_day_by_house_type_skew,age_day_by_house_type_min,career_day_by_house_type_mean,career_day_by_house_type_max,career_day_by_house_type_skew,career_day_by_house_type_min,income_total_by_edu_type_mean,income_total_by_edu_type_max,income_total_by_edu_type_skew,income_total_by_edu_type_min,age_day_by_edu_type_mean,age_day_by_edu_type_max,age_day_by_edu_type_skew,age_day_by_edu_type_min,career_day_by_edu_type_mean,career_day_by_edu_type_max,career_day_by_edu_type_skew,career_day_by_edu_type_min
0,0,202500.0,0,0,0,2.0,-6.0,1.0,5.0,5.0,5.0,5.0,5.0,19.0,0,0,0,0,2.0,38.0,38.079452,456.953425,13899,1.0,1.0,12.90137,154.816438,4709,1,2.0,3.0,6.0,1.791759,176212.935484,900000.0,1.403336,36000.0,14620.331968,24064.0,0.217646,...,2291.959207,15713.0,1.738268,73.0,187458.950319,1350000.0,2.360132,27000.0,15911.834139,25152.0,0.237466,8041.0,2279.274401,15713.0,1.645373,0.0,181021.026895,1350000.0,5.223406,36000.0,16012.215159,23952.0,0.059352,8466.0,2381.045232,12253.0,1.513213,0.0,224088.974309,1575000.0,2.812313,27000.0,15070.916923,24932.0,0.513804,8156.0,2288.897934,15038.0,1.803654,0.0
1,1,247500.0,0,0,1,3.0,-5.0,1.0,6.5,6.0,7.0,7.0,5.0,0.0,0,0,1,1,3.0,31.0,31.178082,374.136986,11380,0.0,0.0,4.219178,50.630137,1540,1,2.0,3.0,5.0,1.609438,176212.935484,900000.0,1.403336,36000.0,14620.331968,24064.0,0.217646,...,2291.959207,15713.0,1.738268,73.0,183419.722327,900000.0,2.750718,27000.0,15048.977862,24932.0,0.409646,7705.0,2055.921338,10821.0,1.595188,0.0,187205.414155,1575000.0,2.545753,27000.0,16208.908426,25140.0,0.132156,7959.0,2208.543102,15713.0,1.756384,0.0,172305.923006,1125000.0,2.031336,27000.0,16460.427508,25152.0,0.032008,7723.0,2184.444068,15713.0,1.744483,0.0
2,0,450000.0,0,1,0,2.0,-22.0,2.0,4.5,4.0,5.0,5.0,4.0,7.0,1,1,1,0,2.0,52.0,52.293151,627.517808,19087,2.0,3.0,12.147945,145.775342,4434,1,2.0,3.0,22.0,3.091042,288683.729222,1575000.0,2.490469,27000.0,15295.935314,23734.0,0.251948,...,2636.497105,15072.0,1.758738,17.0,187458.950319,1350000.0,2.360132,27000.0,15911.834139,25152.0,0.237466,8041.0,2279.274401,15713.0,1.645373,0.0,187205.414155,1575000.0,2.545753,27000.0,16208.908426,25140.0,0.132156,7959.0,2208.543102,15713.0,1.756384,0.0,224088.974309,1575000.0,2.812313,27000.0,15070.916923,24932.0,0.513804,8156.0,2288.897934,15038.0,1.803654,0.0
3,0,202500.0,0,1,0,2.0,-37.0,0.0,2.0,2.0,2.0,2.0,2.0,15.0,0,0,1,0,2.0,41.0,41.336986,496.043836,15088,1.0,2.0,5.731507,68.778082,2092,1,2.0,3.0,37.0,3.610918,172728.033437,697500.0,2.134482,45000.0,14079.767183,23122.0,0.29431,...,2291.959207,15713.0,1.738268,73.0,187458.950319,1350000.0,2.360132,27000.0,15911.834139,25152.0,0.237466,8041.0,2279.274401,15713.0,1.645373,0.0,187205.414155,1575000.0,2.545753,27000.0,16208.908426,25140.0,0.132156,7959.0,2208.543102,15713.0,1.756384,0.0,172305.923006,1125000.0,2.031336,27000.0,16460.427508,25152.0,0.032008,7723.0,2184.444068,15713.0,1.744483,0.0
4,0,157500.0,0,0,0,2.0,-26.0,2.0,1.0,1.0,1.0,1.0,1.0,0.0,0,1,1,0,2.0,41.0,41.19726,494.367123,15037,1.0,2.0,5.767123,69.205479,2105,1,2.0,3.0,26.0,3.258097,288683.729222,1575000.0,2.490469,27000.0,15295.935314,23734.0,0.251948,...,3686.306407,15038.0,1.2658,88.0,187458.950319,1350000.0,2.360132,27000.0,15911.834139,25152.0,0.237466,8041.0,2279.274401,15713.0,1.645373,0.0,187205.414155,1575000.0,2.545753,27000.0,16208.908426,25140.0,0.132156,7959.0,2208.543102,15713.0,1.756384,0.0,224088.974309,1575000.0,2.812313,27000.0,15070.916923,24932.0,0.513804,8156.0,2288.897934,15038.0,1.803654,0.0


In [98]:
seed = 50
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
folds=[]
for train_idx, valid_idx in skf.split(train, train['credit']):
    folds.append((train_idx, valid_idx))

lgb_models={}
lgb_logloss = []


for fold in range(5):
    print(f'===================================={fold+1}============================================')
    train_idx, valid_idx = folds[fold]
    X_train, X_valid, y_train, y_valid = train.drop(['credit'],axis=1).iloc[train_idx].values, train.drop(['credit'],axis=1).iloc[valid_idx].values,\
                                         train['credit'][train_idx].values, train['credit'][valid_idx].values 
    lgb = LGBMClassifier(n_estimators=1000)
    lgb.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_valid, y_valid)], 
            early_stopping_rounds=30,
           verbose=100)
    lgb_logloss.append(lgb.best_score_['valid_1']['multi_logloss'])
    lgb_models[fold]=lgb
    print(f'================================================================================\n\n')

Training until validation scores don't improve for 30 rounds.
[100]	training's multi_logloss: 0.668803	valid_1's multi_logloss: 0.761182
[200]	training's multi_logloss: 0.590115	valid_1's multi_logloss: 0.749551
[300]	training's multi_logloss: 0.527287	valid_1's multi_logloss: 0.74412
Early stopping, best iteration is:
[317]	training's multi_logloss: 0.517414	valid_1's multi_logloss: 0.74278


Training until validation scores don't improve for 30 rounds.
[100]	training's multi_logloss: 0.663841	valid_1's multi_logloss: 0.774151
[200]	training's multi_logloss: 0.580112	valid_1's multi_logloss: 0.762804
Early stopping, best iteration is:
[264]	training's multi_logloss: 0.536712	valid_1's multi_logloss: 0.759408


Training until validation scores don't improve for 30 rounds.
[100]	training's multi_logloss: 0.665297	valid_1's multi_logloss: 0.766486
[200]	training's multi_logloss: 0.583118	valid_1's multi_logloss: 0.757277
Early stopping, best iteration is:
[252]	training's multi_logloss: 

In [99]:
X_test = pre_processing(test)
dtest = xgboost.DMatrix(data = X_test)

Submission = pd.read_csv("/content/drive/MyDrive/DACON/[월간 데이콘] 14. 신용카드 사용자 연체 예측 AI/sample_submission.csv")
Submission.iloc[:,1:]=0
for fold in range(5):
    Submission.iloc[:,1:] += lgb_models[fold].predict_proba(X_test)/5

Submission.to_csv(f"/content/drive/MyDrive/DACON/[월간 데이콘] 14. 신용카드 사용자 연체 예측 AI/use_update_data_{seed}_{np.mean(lgb_logloss)}.csv", index = False)

In [83]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
folds=[]
for train_idx, valid_idx in skf.split(train, train['credit']):
    folds.append((train_idx, valid_idx))

params = {
    'max_depth' : 9,
    'eta' : 0.1,
    'objective' : 'multi:softprob',
    'eval_metric' : 'mlogloss',
    'early_stoppings' : 100,
    'num_class' : 3,
    'sub_sample' : 0.8,
    'alpha' : 0.5
}

xgb_models={}
xgb_logloss = []

for fold in range(5):
    print(f'===================================={fold+1}============================================')
    train_idx, valid_idx = folds[fold]
    X_train, X_valid, y_train, y_valid = train.drop(['credit'],axis=1).iloc[train_idx].values, train.drop(['credit'],axis=1).iloc[valid_idx].values,\
                                         train['credit'][train_idx].values, train['credit'][valid_idx].values 
    dtrain = xgboost.DMatrix(data = X_train, label = y_train)
    dvalid = xgboost.DMatrix(data = X_valid, label = y_valid)                                     
    
    eval_list = [(dtrain, 'train'),
             (dvalid, 'valid')]

    xgb_model = xgboost.train(params = params,
                      dtrain = dtrain,
                      evals = eval_list,
                      num_boost_round = 400,
                      verbose_eval = 100
                      )
    

    xgb_models[fold]=xgb_model
    print(f'================================================================================\n\n')

[0]	train-mlogloss:1.05005	valid-mlogloss:1.05382
[100]	train-mlogloss:0.529872	valid-mlogloss:0.752013
[200]	train-mlogloss:0.385552	valid-mlogloss:0.746703
[300]	train-mlogloss:0.290905	valid-mlogloss:0.752978
[399]	train-mlogloss:0.219938	valid-mlogloss:0.765881


[0]	train-mlogloss:1.05035	valid-mlogloss:1.05453
[100]	train-mlogloss:0.513804	valid-mlogloss:0.760454
[200]	train-mlogloss:0.367111	valid-mlogloss:0.756264
[300]	train-mlogloss:0.270827	valid-mlogloss:0.765018
[399]	train-mlogloss:0.203522	valid-mlogloss:0.781423


[0]	train-mlogloss:1.05042	valid-mlogloss:1.05393


KeyboardInterrupt: ignored

# 해야하는 것
### 적절한 cv seed 찾기

# 해볼만한 것.
### 개인 식별 변수 바꿔보기.
### 카테고리 변수 곱해서 세분화 시키기
### 적은 카테고리는 하나로 묶어보기