<a href="https://colab.research.google.com/github/SanGyuk-Raccoon/lunch_box/blob/master/ver_4_occyp_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd   
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder, LabelEncoder

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from lightgbm import LGBMClassifier
import xgboost

from sklearn.metrics import log_loss

In [None]:
# Load DATA
train = pd.read_csv("/content/drive/MyDrive/DACON/[월간 데이콘] 14. 신용카드 사용자 연체 예측 AI/train.csv")

train['occyp_type'] = train['occyp_type'].fillna('NONE')
train.loc[train['DAYS_EMPLOYED'] > 0, 'occyp_type'] = 'NO_WORK'
train.loc[(train['income_type'] == 'Student') & (train['occyp_type'] == 'NONE'), 'occyp_type'] = 'Laborers'

In [None]:
# 1. 경력이 없는 사람은 직업군이 결측치임
train.loc[train['DAYS_EMPLOYED'] > 0]['occyp_type'].value_counts()

Series([], Name: occyp_type, dtype: int64)

In [None]:
# 2. 경력이 없는 사람은 소득이 Pensioner임
train.loc[train['DAYS_EMPLOYED'] > 0]['income_type'].value_counts()

Pensioner    4438
Name: income_type, dtype: int64

In [None]:
train.head()

Unnamed: 0,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month,credit
0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,-13899,-4709,1,0,0,0,,2.0,-6.0,1.0
1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,-11380,-1540,1,0,0,1,Laborers,3.0,-5.0,1.0
2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,-19087,-4434,1,0,1,0,Managers,2.0,-22.0,2.0
3,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,-15088,-2092,1,0,1,0,Sales staff,2.0,-37.0,0.0
4,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,-15037,-2105,1,0,0,0,Managers,2.0,-26.0,2.0


In [None]:
def pre_processing(DATA) :

  data = DATA
  



  data = data.drop(['index', 'credit'], axis = 1)

  ## Delete FLAG_MOBIL : train + test에서 값이 1밖에 없음 -> 무의미함
  data = data.drop('FLAG_MOBIL', axis = 1)


  ## Mapping Categorical variable
  # 1. gender
  mapping_gender = {
      'F' : 0, # Female = 0
      'M' : 1  # Male = 1
  }
  data.gender = data.gender.map(mapping_gender)

  # 2. car
  mapping_car = {
      'N' : 0,
      'Y' : 1
  }
  data.car = data.car.map(mapping_car)

  # 3. reality
  mapping_reality = {
      'N' : 0,
      'Y' : 1
  }
  data.reality = data.reality.map(mapping_reality)

  # 4. edu_type : Ordinal variable
  mapping_edu_type = {
    'Lower secondary' : 0, # 중학교 미만
    'Secondary / secondary special' : 1, #중학교
    'Incomplete higher' : 2, # 고등학교 중퇴
    'Higher education' : 3, # 고등학교 졸업
    'Academic degree' : 4 # 학사 이상
  }
  data.edu_type_ord = data.edu_type.map(mapping_edu_type)

  ## one-hot encoding
  var = ['income_type',
        'family_type',
          'edu_type',
        'house_type']

  onehot_encoder = OneHotEncoder()
  onehot_encoder.fit(data.loc[:, var])
  onehot_df = pd.DataFrame(onehot_encoder.transform(data.loc[:,var]).toarray(), 
              columns=onehot_encoder.get_feature_names(var))

  data.drop(var, inplace = True, axis = 1)
  data = pd.concat([data, onehot_df], axis = 1)

  ## child num. 5를 최대로 설정
  data.loc[data['child_num'] > 5,'child_num'] = 5

  ## family_size 7을 최대로 설정
  data.loc[data['family_size'] > 7,'family_size'] = 7

  ## DAYS_BIRTH -> 나이(월)
  data['age'] = -data['DAYS_BIRTH'] / (365 / 12)
  #data.drop('DAYS_BIRTH', axis = 1, inplace = True)

  ## DAYS_EMPLOYES -> 경력(월)
  data['career'] = data['DAYS_EMPLOYED'].apply( lambda x : - x / (365 / 12) if x < 0 else 0)
  #data.drop('DAYS_EMPLOYED', axis = 1, inplace = True)

  ## Begin_month -> 양수로
  data['begin_month'] = - data['begin_month']

  ##########################################################################################
  ## baseline

  data['log_income_total'] = np.log(data['income_total'])



  return data

In [None]:

data = pre_processing(train)


14       NO_WORK
18       NO_WORK
21       NO_WORK
24       NO_WORK
46       NO_WORK
          ...   
26431    NO_WORK
26432    NO_WORK
26439    NO_WORK
26441    NO_WORK
26443    NO_WORK
Name: occyp_type, Length: 4438, dtype: object

In [None]:
test = data.loc[data['occyp_type'] == 'NONE']
train = data.loc[data['occyp_type'] != 'NONE']

In [None]:
Label_encoder = LabelEncoder()
Label_encoder.fit(train['occyp_type'])
y_train_occp = Label_encoder.transform(train['occyp_type'])

X_train_occp = train.drop('occyp_type', axis = 1)


X_train, X_valid, y_train, y_valid = train_test_split(X_train_occp,
                                                      y_train_occp,
                                                      test_size=0.2,
                                                      shuffle = True,
                                                      stratify=y_train_occp,
                                                      random_state=2)

In [None]:
# xgb model
dtrain = xgboost.DMatrix(data = X_train, label = y_train)
dvalid = xgboost.DMatrix(data = X_valid, label = y_valid)

eval_list = [(dtrain, 'train'),
             (dvalid, 'valid')]


params = {
    'max_depth' : 8,
    'eta' : 0.1,
    'objective' : 'multi:softprob',
    'eval_metric' : 'mlogloss',
    'early_stoppings' : 100,
    'num_class' : 19,
    'sub_sample' : 0.8
}
xgb_model_occp = xgboost.train(params = params,
                      dtrain = dtrain,
                      evals = eval_list,
                      num_boost_round = 400,
                      verbose_eval = 10
                      )

[0]	train-mlogloss:2.56644	valid-mlogloss:2.58073
[10]	train-mlogloss:1.5081	valid-mlogloss:1.59576
[20]	train-mlogloss:1.13974	valid-mlogloss:1.26347
[30]	train-mlogloss:0.942541	valid-mlogloss:1.09127
[40]	train-mlogloss:0.815137	valid-mlogloss:0.983227
[50]	train-mlogloss:0.725974	valid-mlogloss:0.910322
[60]	train-mlogloss:0.65249	valid-mlogloss:0.851077
[70]	train-mlogloss:0.595583	valid-mlogloss:0.806199
[80]	train-mlogloss:0.546516	valid-mlogloss:0.768317
[90]	train-mlogloss:0.503555	valid-mlogloss:0.734723
[100]	train-mlogloss:0.46673	valid-mlogloss:0.70731
[110]	train-mlogloss:0.431851	valid-mlogloss:0.679396
[120]	train-mlogloss:0.400765	valid-mlogloss:0.656341
[130]	train-mlogloss:0.368263	valid-mlogloss:0.631092
[140]	train-mlogloss:0.343639	valid-mlogloss:0.613222
[150]	train-mlogloss:0.32105	valid-mlogloss:0.595902
[160]	train-mlogloss:0.300391	valid-mlogloss:0.579325
[170]	train-mlogloss:0.280488	valid-mlogloss:0.564026
[180]	train-mlogloss:0.260972	valid-mlogloss:0.5489

In [None]:
data['occyp_type'].value_counts()

NONE                     8170
Laborers                 4513
Core staff               2646
Sales staff              2539
Managers                 2167
Drivers                  1575
High skill tech staff    1040
Accountants               902
Medicine staff            864
Cooking staff             457
Security staff            424
Cleaning staff            403
Private service staff     243
Low-skill Laborers        127
Waiters/barmen staff      124
Secretaries                97
Realty agents              63
HR staff                   62
IT staff                   41
Name: occyp_type, dtype: int64

In [None]:
X_test_occp = test.drop(['occyp_type'], axis = 1)
dtest = xgboost.DMatrix(data = X_test_occp)
preds = xgb_model_occp.predict(dtest)
best_preds = np.asarray([np.argmax(line) for line in preds])


array([ 8, 10, 15, ...,  3,  8, 17])

In [None]:
occyp_pred = Label_encoder.inverse_transform(best_preds)

In [None]:
# Load DATA
train = pd.read_csv("/content/drive/MyDrive/DACON/[월간 데이콘] 14. 신용카드 사용자 연체 예측 AI/train.csv")

train['occyp_type'] = train['occyp_type'].fillna('NONE')
train.loc[train['DAYS_EMPLOYED'] > 0, 'occyp_type'] = 'NO_WORK'
train.loc[(train['income_type'] == 'Student') & (train['occyp_type'] == 'NONE'), 'occyp_type'] = 'Laborers'

In [None]:
train.loc[train['occyp_type'] == 'NONE', 'occyp_type'] = occyp_pred

In [None]:
train.isnull().sum(axis = 0)

index            0
gender           0
car              0
reality          0
child_num        0
income_total     0
income_type      0
edu_type         0
family_type      0
house_type       0
DAYS_BIRTH       0
DAYS_EMPLOYED    0
FLAG_MOBIL       0
work_phone       0
phone            0
email            0
occyp_type       0
family_size      0
begin_month      0
credit           0
dtype: int64

In [None]:
train.head()

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month,credit
0,0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,-13899,-4709,1,0,0,0,Laborers,2.0,-6.0,1.0
1,1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,-11380,-1540,1,0,0,1,Laborers,3.0,-5.0,1.0
2,2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,-19087,-4434,1,0,1,0,Managers,2.0,-22.0,2.0
3,3,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,-15088,-2092,1,0,1,0,Sales staff,2.0,-37.0,0.0
4,4,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,-15037,-2105,1,0,0,0,Managers,2.0,-26.0,2.0


In [None]:
train.to_csv("/content/drive/MyDrive/DACON/[월간 데이콘] 14. 신용카드 사용자 연체 예측 AI/train_update.csv", index = False)

In [188]:
xgb_model_occp.eval(dvalid)

'[0]\teval-mlogloss:0.411999'