<a href="https://colab.research.google.com/github/SanGyuk-Raccoon/DACON_1/blob/main/ver_10_repeat_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
## Set Random Seed
# Shared seed; the StratifiedKFold splitters defined further down use it as random_state.
seed = 81288

## Import Library
import warnings
# Silences every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd   
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder, LabelEncoder

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from lightgbm import LGBMClassifier
import xgboost

from sklearn.metrics import log_loss
from google.colab import drive
# Mount Google Drive (Colab only); the training CSV is read from under /content/drive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
##############################################################################
########################### Setting User Functions ###########################
##############################################################################

##############################################################################
########## Normalization
##############################################################################
def norm_scale(data, var) :
  """Min-max scale the selected column(s) of ``data``.

  Args:
    data: pandas DataFrame holding the values.
    var: a single column label, or a list-like of column labels.

  Returns:
    ``(x - min) / (max - min)`` evaluated column-wise on ``data[var]``,
    with the same index. ``data`` itself is not modified.
  """
  selected = data[ var ]
  lowest = selected.min()
  spread = selected.max() - lowest

  return (selected - lowest) / spread

##############################################################################
########## Standardization
##############################################################################
def stand_scale(data, var) :
  """Standardize the selected column(s) of ``data`` to z-scores.

  Args:
    data: pandas DataFrame holding the values.
    var: a single column label, or a list-like of column labels.

  Returns:
    ``(x - mean) / std`` evaluated column-wise on ``data[var]`` (pandas'
    default sample standard deviation), with the same index. ``data``
    itself is not modified.
  """
  selected = data[ var ]
  # Center on the mean. The previous version subtracted the column maximum,
  # which shifted every value to <= 0 instead of standardizing it.
  center = selected.mean()
  sd = selected.std()

  return ( selected - center ) / sd

##############################################################################
########## Calculate by Group
##############################################################################
def conti_by_cate(GROUP, CONTI, DATA, method) :
  """Broadcast a per-group statistic of a continuous column onto every row.

  Args:
    GROUP: label of the (single) categorical column to group by.
    CONTI: label of the continuous column to summarize.
    DATA: pandas DataFrame containing both columns. It is not modified.
    method: one of 'mean', 'min', 'max', 'skew', 'med', 'median', 'std'.
      'median' is accepted as an alias of 'med'.

  Returns:
    A Series aligned with ``DATA.index`` and named
    ``f'{CONTI}_by_{GROUP}_{method}'``. Each row holds the statistic of the
    group that row belongs to; rows whose group key is missing get NaN.

  Raises:
    ValueError: if ``method`` is not one of the supported names.
  """
  # Public method name -> pandas aggregation name.
  aggregations = {
      'mean' : 'mean',
      'min' : 'min',
      'max' : 'max',
      'skew' : 'skew',
      'med' : 'median',
      'median' : 'median',
      'std' : 'std'
  }
  if method not in aggregations :
    raise ValueError(f"unsupported method {method!r}; expected one of {sorted(aggregations)}")

  # One value per group, indexed by group key.
  per_group = DATA.groupby( GROUP )[ CONTI ].agg( aggregations[method] )

  # Look each row's group key up in the per-group table. This replaces the
  # previous row-filter loop, which wrote a scratch column into DATA and
  # indexed the statistics with vec[i] (label-based for integer group keys).
  result = DATA[ GROUP ].map( per_group )

  return result.rename( f'{CONTI}_by_{GROUP}_{method}' )

In [3]:
# Load DATA
# NOTE(review): absolute Google Drive path; it only resolves after the drive.mount call in the first cell.
train_raw = pd.read_csv("/content/drive/MyDrive/DACON/[월간 데이콘] 14. 신용카드 사용자 연체 예측 AI/train_update.csv")
# Preview the first rows of the raw frame.
train_raw.head()

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month,credit
0,0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,-13899,-4709,1,0,0,0,Laborers,2.0,-6.0,1.0
1,1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,-11380,-1540,1,0,0,1,Laborers,3.0,-5.0,1.0
2,2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,-19087,-4434,1,0,1,0,Managers,2.0,-22.0,2.0
3,3,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,-15088,-2092,1,0,1,0,Sales staff,2.0,-37.0,0.0
4,4,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,-15037,-2105,1,0,0,0,Managers,2.0,-26.0,2.0


In [14]:
def pre_processing(DATA, GROUP, VAR, METHOD) :
  """Build the feature frame, including one group-statistic feature pair.

  Args:
    DATA: raw frame as loaded from the training CSV. It is copied first, so
      the caller's frame is left untouched.
    GROUP: categorical column to group by for the extra features. May be a
      raw column or one of the combined / flag columns created in this
      function (e.g. 'P_W_E', 'car_reality_mapping').
    VAR: continuous column that is summarized per group.
    METHOD: statistic name forwarded to ``conti_by_cate``.

  Returns:
    A new DataFrame with the encoded flags, the derived duration / ratio
    features, the two min-max scaled group features
    (``{VAR}_by_{GROUP}_{METHOD}`` and ``{VAR}_divide_{METHOD}_by_{GROUP}``),
    and without the raw categorical columns. Any column that still contains
    a missing value is dropped.
  """
  # Work on a copy. The previous version aliased DATA and therefore rewrote
  # the caller's income_type column in place.
  data = DATA.copy()

  ############################################################################################
  # 'Student' is merged into 'Pensioner' (author's note: their incomes are the most similar).
  data['income_type'] = data['income_type'].apply(lambda x : 'Pensioner' if x == 'Student' else x)

  #############################################################################################
  data = data.drop(['index', 'FLAG_MOBIL'], axis = 1)

  #### 1. gender
  mapping_gender = {
      'F' : 0, # Female = 0
      'M' : 1  # Male = 1
  }
  data['gender_mapping'] = data.gender.map(mapping_gender)

  # 2. car
  mapping_car = {
      'N' : 0,
      'Y' : 1
  }
  data['car_mapping'] = data.car.map(mapping_car)

  # 3. reality
  mapping_reality = {
      'N' : 0,
      'Y' : 1
  }
  data['reality_mapping'] = data.reality.map(mapping_reality)

  ## Currently disabled: merging of family_type categories.
  #data['family_type'] =data['family_type'].apply(lambda x : 'Married' if x == 'Civil marriage' else x) 
  #data['family_type'] =data['family_type'].apply(lambda x : 'Single / not married' if x == 'Separated' else x)
  #data['family_type'] =data['family_type'].apply(lambda x : 'Single / not married' if x == 'Widow' else x)

  ##########################################################################
  # Flip the sign of the day / month counters; non-negative DAYS_EMPLOYED
  # values are replaced by 0.
  data['DAYS_BIRTH'] = - data['DAYS_BIRTH']
  data['DAYS_EMPLOYED'] = data['DAYS_EMPLOYED'].apply( lambda x : - x if x < 0 else 0)
  data['begin_month'] = - data['begin_month']
  ##########################################################################
  # Differences between the counters; begin_month is converted to days.
  data['birth_employed'] = data['DAYS_BIRTH'] - data['DAYS_EMPLOYED']
  data['employed_begin'] = data['DAYS_EMPLOYED'] - data['begin_month'] * (365 / 12)
  data['birth_begin'] = data['DAYS_BIRTH'] - data['begin_month'] * (365 / 12)
  ##########################################################################
  data['income_by_birth'] = data['income_total'] / data['DAYS_BIRTH']
  ## Currently disabled: further income ratios.
#  data['income_by_employ'] = data['income_total'] / (data['DAYS_EMPLOYED'] + 1)
#  data['income_by_month'] = data['income_total'] / (data['begin_month'] + 1)
  ########################################################################
  # Cap the long tails: child_num at 2, family_size at 4.
  data['child_num'] = data['child_num'].apply(lambda x : 2 if x > 2 else x)
  data['family_size'] = data['family_size'].apply(lambda x : 4 if x > 4 else x)

  ########################################################################
  # Combined categorical keys, each with a 0/1 flag for one specific combination.
  data['P_W_E'] = data['phone'].astype(str) + "_" + data['work_phone'].astype(str) + "_" + data['email'].astype(str)
  data['P_W_E_mapping'] = data['P_W_E'].apply(lambda x : 1 if x == '0_0_0' else 0)

  data['gender_family'] = data['gender'] + "_" + data['family_type']
  data['gender_family_mapping'] = data['gender_family'].apply(lambda x : 1 if x == 'F_Married' else 0)

  data['car_reality'] = data['car'] + "_" + data['reality']
  data['car_reality_mapping'] = data['car_reality'].apply(lambda x : 1 if x == 'N_Y' else 0)

  data['house_type_mapping'] = data['house_type'].apply(lambda x : 1 if x == 'House / apartment' else 0)

  ## Group-statistic features: the per-group statistic of VAR and the ratio of
  ## each row's VAR to it. The statistic is computed once and reused for both
  ## columns. It is taken before VAR is rescaled below.
  group_stat = conti_by_cate(GROUP, VAR, data, METHOD)
  ratio_to_stat = ( data[VAR] / group_stat ).rename( f'{VAR}_divide_{METHOD}_by_{GROUP}' )
  new_conti_var = pd.concat( [ group_stat, ratio_to_stat ], axis = 1 )

  data = pd.concat([data, norm_scale(new_conti_var, new_conti_var.columns)], axis = 1 )

  ########################################################################## scaling
  scale_var = ['income_total', 'income_by_birth']
  data[ scale_var ] = stand_scale(data, scale_var )
  norm_var = ['DAYS_BIRTH', 'DAYS_EMPLOYED', 'begin_month', 'birth_employed', 'employed_begin', 'birth_begin']
  data[ norm_var ] = norm_scale(data, norm_var )

  ########################################################################
  # Raw / helper categorical columns that are removed from the result.
  var = ['occyp_type', 'income_type',
        'family_type', 'house_type', 'edu_type',
         'phone', 'work_phone', 'email', 'P_W_E_mapping'
         ]

  var2 = ['P_W_E', 'car_reality', 'gender_family']

  ######################################################
  data = data.drop(['gender', 'car', 'reality'] + var + var2, axis = 1)

  # Keep only the columns that have no missing values.
  col = data.columns[data.isnull().sum(axis = 0) == 0]
  data = data[col]

  return data

In [5]:
def TRAINER(TRAIN) :
  """Cross-validate an XGBoost classifier and print the mean validation log loss.

  NOTE: the class ``TRAINER`` defined in a later cell re-binds this name, so
  once the whole notebook has run this function is no longer reachable as
  ``TRAINER``.

  Args:
    TRAIN: feature frame that also contains the target column 'credit'.

  Returns:
    None. The mean log loss over the 5 stratified folds is printed.
  """
  skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

  features = TRAIN.drop(['credit'], axis=1)
  target = TRAIN['credit']

  xgb_logloss = []
  for train_idx, valid_idx in skf.split(features, target):
    # skf.split yields positional indices, so rows are selected with .iloc.
    # The previous label-based .loc / [] lookup was only correct for a
    # default 0..n-1 index.
    X_train, X_valid = features.iloc[train_idx], features.iloc[valid_idx]
    y_train, y_valid = target.iloc[train_idx], target.iloc[valid_idx]

    eval_list = [(X_train, y_train),
                (X_valid, y_valid)]

    # The previous version also passed `early_stoppings = 100`, a misspelled
    # keyword that XGBoost does not recognise, so early stopping was never
    # active. It is omitted here; all 100 rounds are trained, as before.
    xgb_model = xgboost.XGBClassifier(objective = 'multi:softprob',
                eval_metric = 'mlogloss',
                learning_rate = 0.1,
                num_class = 3,
                max_depth = 8,
                n_estimators = 100
                )

    xgb_model.fit(X_train,
                  y_train,
                  eval_set = eval_list,
                  verbose = False
                  )

    loss = log_loss(y_valid,
                    xgb_model.predict_proba(X_valid))
    xgb_logloss.append(loss)

  print(f'Mean of log_loss : {np.mean(xgb_logloss):.5}')
  

In [28]:
class TRAINER(object) :
  """Cross-validated XGBoost evaluation of one prepared feature frame.

  Attributes are set in ``__init__``:
    DATA: feature frame that also contains the target column 'credit'.
    GROUP, VAR, METHOD: stored as given and never read by ``train``.
    logloss: list of per-fold validation log losses filled by ``train``.

  NOTE(review): ``train`` fits on ``DATA`` exactly as passed in; it does not
  apply any preprocessing for GROUP / VAR / METHOD. Presumably the caller is
  expected to pass an already preprocessed frame - confirm.
  """

  def __init__(self, DATA, GROUP, VAR, METHOD) :
    self.DATA = DATA
    self.GROUP = GROUP
    self.VAR = VAR
    self.METHOD = METHOD
    self.logloss = []


  def train(self) :
    """Run 5-fold stratified CV and store each fold's log loss in ``self.logloss``.

    Returns:
      None. ``self.logloss`` holds exactly one value per fold afterwards.
    """
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

    features = self.DATA.drop(['credit'], axis=1)
    target = self.DATA['credit']

    # Start from a clean list so that calling train() again does not mix the
    # scores of two runs.
    self.logloss = []

    for train_idx, valid_idx in skf.split(features, target):
      # skf.split yields positional indices, so rows are selected with .iloc.
      # The previous label-based .loc / [] lookup was only correct for a
      # default 0..n-1 index.
      X_train, X_valid = features.iloc[train_idx], features.iloc[valid_idx]
      y_train, y_valid = target.iloc[train_idx], target.iloc[valid_idx]

      eval_list = [(X_train, y_train),
                  (X_valid, y_valid)]

      # The previous version also passed `early_stoppings = 100`, a misspelled
      # keyword that XGBoost does not recognise, so early stopping was never
      # active. It is omitted here; all 100 rounds are trained, as before.
      xgb_model = xgboost.XGBClassifier(objective = 'multi:softprob',
                  eval_metric = 'mlogloss',
                  learning_rate = 0.1,
                  num_class = 3,
                  max_depth = 8,
                  n_estimators = 100
                  )

      xgb_model.fit(X_train,
                    y_train,
                    eval_set = eval_list,
                    verbose = False
                    )

      loss = log_loss(y_valid,
                      xgb_model.predict_proba(X_valid))
      self.logloss.append(loss)
  

In [None]:
# Search grid: every (GROUP, VAR, METHOD) combination yields one group-statistic
# feature pair that is scored with 5-fold CV.
# conti_by_cate selects the median with the key 'med'.
METHOD = ['mean', 'max', 'min', 'med', 'skew', 'std']
VAR = ['income_total', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'begin_month']
GROUP = ['P_W_E', 'P_W_E_mapping',
          'gender_family', 'gender_family_mapping',
          'car_reality', 'car_reality_mapping',
          'income_type', 
          'house_type', 'house_type_mapping', 
          'occyp_type']

records = []     # one [GROUP, VAR, METHOD, mean logloss] row per successful run
failures = []    # (group, var, method, error) for every combination that raised
n = 1
total = len(GROUP) * len(VAR) * len(METHOD)

for group in GROUP :
  for var in VAR :
    for method in METHOD :
      try : 
        # NOTE(review): the previous version passed an undefined name `train`,
        # and pre_processing was never called anywhere, so every iteration
        # raised and was silently skipped. Assumed intent: build the features
        # for this combination from train_raw and score them - confirm.
        features = pre_processing(train_raw, group, var, method)
        X = TRAINER(features, group, var, method)
        X.train()
        records.append([X.GROUP, X.VAR, X.METHOD, np.mean(X.logloss)])
      except Exception as err :
        # Keep the sweep best-effort, but report the failure instead of hiding it.
        failures.append((group, var, method, repr(err)))
        print(f'[skipped] {group} / {var} / {method} : {err!r}')

      print(f'============ {n/total * 100:.2f}% 완료 ===========')
      n = n + 1

# Build the result table once instead of concatenating inside the loop.
result = pd.DataFrame(records, columns = ['GROUP', 'VAR', 'METHOD', 'logloss'])



In [1]:
# Display the table of per-combination scores collected by the search loop.
result

NameError: ignored

NameError: ignored