In [2]:
# 데이터 시각화에 사용할 라이브러리
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

# Render figures inline in the notebook output
%matplotlib inline

# Draw negative numbers with the ASCII hyphen instead of the Unicode minus glyph
mpl.rc('axes', unicode_minus=False)

# Global Variables

In [3]:
import os
import numpy as np
import random
import torch

def reset_seeds(seed=42):
  """Seed every random number generator this notebook relies on.

  Seeds Python's ``random`` module, NumPy's global generator and PyTorch
  (CPU and CUDA), and configures cuDNN for deterministic execution.

  Args:
    seed: Integer seed applied to all generators. Defaults to 42.
  """
  random.seed(seed)
  # Recorded for Python processes started after this point; the running
  # interpreter reads PYTHONHASHSEED only at start-up.
  os.environ['PYTHONHASHSEED'] = str(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)            # CPU generator
  torch.cuda.manual_seed(seed)       # current CUDA device
  torch.cuda.manual_seed_all(seed)   # every CUDA device (multi-GPU runs)
  # Deterministic cuDNN kernels, and no auto-tuner: with benchmark mode on,
  # cuDNN may select a different algorithm from run to run.
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False


In [5]:
import easydict
args = easydict.EasyDict()

# Input locations: both CSV files live under the default data directory
args.default_path = './Data/'
args.train_csv = f'{args.default_path}aug_train.csv'
args.test_csv = f'{args.default_path}aug_test.csv'

# Values shared by the analysis cells
args.random_state = 21
args.results = []

# Load Data

> - enrollee_id : 후보자 식별 코드  
> - city : 도시 코드  
> - city_development_index : 도시 개발 지수  
> - gender : 성별    
> - relevent_experience : 관련 분야 경험 유무  
> - enrolled_university : 대학 과정 등록 여부 및 유형  
> - education_level : 학력 수준   
> - major_discipline : 전공 분야  
> - experience : 전체 경력 년수  
> - company_size : 현재 재직중인 회사의 직원 수 규모  
> - company_type : 현재 회사의 형태  
> - last_new_job : 이전 직장과 현재 직장의 이직 간 년차  
> - training_hours : 교육 시간 총합  
> - target : 이직 의향 여부  

In [6]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# Apply the "fivethirtyeight" matplotlib style sheet to subsequent figures.
plt.style.use('fivethirtyeight')
# Turn on pyplot's interactive mode.
plt.ion()

import warnings
# NOTE(review): this suppresses every warning category for the rest of the
# session, including deprecation notices from pandas / scikit-learn.
warnings.filterwarnings('ignore')

In [7]:
# Read the two CSV files named in the config cell.
ori_train, ori_test = (
    pd.read_csv(csv_path) for csv_path in (args.train_csv, args.test_csv)
)

ori_train.shape, ori_test.shape

((19158, 14), (2129, 13))

In [8]:
# Remove the identifier column from the training frame.
# Re-binding with errors='ignore' (instead of an in-place drop) keeps this
# cell re-runnable: a second execution no longer raises KeyError.
ori_train = ori_train.drop(columns='enrollee_id', errors='ignore')
ori_train.head()

Unnamed: 0,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0


In [9]:
# Move the identifier into the index so it stays attached to each test row
# without being treated as a feature column.
# Guarded so that re-running the cell (when the column has already been moved
# into the index) does not raise KeyError.
if 'enrollee_id' in ori_test.columns:
    ori_test = ori_test.set_index(['enrollee_id'])
print(f'{ori_test.shape}')
ori_test.head()

(2129, 12)


Unnamed: 0_level_0,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours
enrollee_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
32403,city_41,0.827,Male,Has relevent experience,Full time course,Graduate,STEM,9,<10,,1,21
9858,city_103,0.92,Female,Has relevent experience,no_enrollment,Graduate,STEM,5,,Pvt Ltd,1,98
31806,city_21,0.624,Male,No relevent experience,no_enrollment,High School,,<1,,Pvt Ltd,never,15
27385,city_13,0.827,Male,Has relevent experience,no_enrollment,Masters,STEM,11,10/49,Pvt Ltd,1,39
27724,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,10000+,Pvt Ltd,>4,72


In [10]:
# List the distinct values (including NaN) present in education_level.
ori_train["education_level"].unique()

array(['Graduate', 'Masters', 'High School', nan, 'Phd', 'Primary School'],
      dtype=object)

# train_test_split

In [11]:
# View the target as a labelled categorical to inspect the class balance.
new_target = pd.Categorical(ori_train["target"]).rename_categories(
    ["target_0", "target_1"]  # alternatively Korean labels: no intent / intent to change jobs
)

new_target.describe()


Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
target_0,14381,0.750652
target_1,4777,0.249348


In [12]:
from sklearn.model_selection import train_test_split

In [13]:
# Separate the feature matrix from the label column.
X, y = ori_train.drop(columns=['target']), ori_train['target']

In [14]:
reset_seeds()
# Stratified 70/30 hold-out split on the target.
# random_state is passed explicitly so the split does not depend on the
# hidden state of NumPy's global generator (i.e. on which cells ran before).
# 42 mirrors the seed reset_seeds() installs, so the split should be unchanged.
# NOTE(review): args.random_state is defined in the config cell but unused;
# switch to it if that is the intended project-wide seed.
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

X_tr.shape, X_te.shape, y_tr.shape, y_te.shape

((13410, 12), (5748, 12), (13410,), (5748,))

# Model1

In [15]:
# Work on copies so the original split frames and the raw test frame are
# not modified by the preprocessing below.
train, test, ori_te = (frame.copy() for frame in (X_tr, X_te, ori_test))

train.shape, test.shape, ori_te.shape

((13410, 12), (5748, 12), (2129, 12))

## Data Preprocessing

In [16]:
# Column dtypes and non-null counts of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13410 entries, 9719 to 3085
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   city                    13410 non-null  object 
 1   city_development_index  13410 non-null  float64
 2   gender                  10287 non-null  object 
 3   relevent_experience     13410 non-null  object 
 4   enrolled_university     13145 non-null  object 
 5   education_level         13093 non-null  object 
 6   major_discipline        11420 non-null  object 
 7   experience              13364 non-null  object 
 8   company_size            9297 non-null   object 
 9   company_type            9130 non-null   object 
 10  last_new_job            13118 non-null  object 
 11  training_hours          13410 non-null  int64  
dtypes: float64(1), int64(1), object(10)
memory usage: 1.3+ MB


> drop columns

In [17]:
print(f'before: {train.shape} / {test.shape}')
drop_cols = ['city']

# Re-bind instead of dropping in place and ignore already-missing columns,
# so re-running this cell does not raise KeyError.
train = train.drop(columns=drop_cols, errors='ignore')
test = test.drop(columns=drop_cols, errors='ignore')
ori_te = ori_te.drop(columns=drop_cols, errors='ignore')

print(f'after: {train.shape} / {test.shape}')
train.info()

before: (13410, 12) / (5748, 12)
after: (13410, 11) / (5748, 11)
<class 'pandas.core.frame.DataFrame'>
Index: 13410 entries, 9719 to 3085
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   city_development_index  13410 non-null  float64
 1   gender                  10287 non-null  object 
 2   relevent_experience     13410 non-null  object 
 3   enrolled_university     13145 non-null  object 
 4   education_level         13093 non-null  object 
 5   major_discipline        11420 non-null  object 
 6   experience              13364 non-null  object 
 7   company_size            9297 non-null   object 
 8   company_type            9130 non-null   object 
 9   last_new_job            13118 non-null  object 
 10  training_hours          13410 non-null  int64  
dtypes: float64(1), int64(1), object(9)
memory usage: 1.2+ MB


> missing value

In [18]:
# Missing-value handling: the most frequent value of each of these columns,
# computed on the training frame.
mode_values = {
    col: train[col].mode()[0]
    for col in ("education_level", "experience", "last_new_job")
}

def fill_missing(df, mode_values):
    """Fill missing values in ``df`` in place and return the same frame.

    Args:
        df: DataFrame containing the columns ``gender``,
            ``enrolled_university``, ``major_discipline``, ``company_size``
            and ``company_type``, plus every key of ``mode_values``.
        mode_values: Mapping of column name -> replacement value used for
            that column's missing entries.

    Returns:
        The same ``df`` object, with the listed columns filled.
    """
    # These columns get an explicit "unknown" category.
    unknown_cols = (
        "gender",
        "enrolled_university",
        "major_discipline",
        "company_size",
        "company_type",
    )
    for name in unknown_cols:
        df[name] = df[name].fillna("unknown")

    # The remaining columns receive the value supplied by the caller.
    for name in mode_values:
        df[name] = df[name].fillna(mode_values[name])

    return df

# Apply the same fill values to all three frames.
train, test, ori_te = (
    fill_missing(frame, mode_values) for frame in (train, test, ori_te)
)

print("결측치 처리 후")
print(train.isnull().sum())


결측치 처리 후
city_development_index    0
gender                    0
relevent_experience       0
enrolled_university       0
education_level           0
major_discipline          0
experience                0
company_size              0
company_type              0
last_new_job              0
training_hours            0
dtype: int64


In [19]:
# Bucket the raw experience labels into coarse ranges
def simplify_experience(x):
    """Map a raw ``experience`` value to a coarse bucket label.

    The value is converted with ``str`` and stripped, then read as a number
    of years; ``'<1'`` counts as 0 years and ``'>20'`` as more than 20.

    Args:
        x: Raw experience value, e.g. ``'<1'``, ``'7'``, ``'>20'`` or a number.

    Returns:
        One of ``'0'``, ``'1-4'``, ``'5-9'``, ``'10-14'``, ``'15+'``, or
        ``'unknown'`` when the value is missing or cannot be read as a
        non-negative number. (Previously any such value fell through to
        ``'15+'``, as did float-formatted values like ``'5.0'``.)
    """
    label = str(x).strip()

    if label == '<1':
        years = 0.0
    elif label == '>20':
        years = 21.0
    else:
        try:
            years = float(label)
        except ValueError:
            return 'unknown'

    # `not years >= 0` is true for negatives and also for NaN, which
    # compares false against everything.
    if not years >= 0:
        return 'unknown'
    if years < 1:
        return '0'
    if years < 5:
        return '1-4'
    if years < 10:
        return '5-9'
    if years < 15:
        return '10-14'
    return '15+'

# Add the bucketed experience column to each of the three frames.
for _df in (train, test, ori_te):
    _df["experience_simplified"] = _df["experience"].apply(simplify_experience)

# Compare the raw label counts with the bucketed counts on the training frame.
print(train["experience"].value_counts().sort_index())
print(train["experience_simplified"].value_counts())



experience
1       381
10      700
11      462
12      345
13      279
14      428
15      466
16      337
17      240
18      197
19      212
2       782
20      113
3       952
4       976
5       965
6       867
7       754
8       532
9       679
<1      369
>20    2374
Name: count, dtype: int64
experience_simplified
15+      3939
5-9      3797
1-4      3091
10-14    2214
0         369
Name: count, dtype: int64


In [20]:
# The raw column is replaced by experience_simplified.
# Re-bind with errors="ignore" (instead of an in-place drop) so re-running
# this cell does not raise KeyError once the column is already gone.
train = train.drop(columns=["experience"], errors="ignore")
test = test.drop(columns=["experience"], errors="ignore")
ori_te = ori_te.drop(columns=["experience"], errors="ignore")


In [21]:
import numpy as np

def create_features(df):
    """Add derived feature columns to ``df`` in place and return it.

    Currently adds ``log_training_hours``, the ``log(1 + x)`` transform of
    the ``training_hours`` column.

    Args:
        df: DataFrame with a numeric ``training_hours`` column.

    Returns:
        The same ``df`` object with the new column attached.
    """
    hours = df["training_hours"]
    df["log_training_hours"] = np.log1p(hours)
    return df

# Derive the same extra columns on all three frames.
train, test, ori_te = (create_features(frame) for frame in (train, test, ori_te))

print(train[["log_training_hours"]].head())


       log_training_hours
9719             4.976734
17774            2.564949
16608            3.931826
9343             2.944439
12394            4.110874


In [22]:
# Combined feature: "<company_size>_<company_type>" for each row.
for _df in (train, test, ori_te):
    _df["company_combo"] = _df["company_size"] + "_" + _df["company_type"]


> data encoding

In [23]:
# Every object-typed column of the training frame is one-hot encoded.
cat_cols = list(train.select_dtypes(include=["object"]).columns)

train_enc, test_enc, ori_te_enc = (
    pd.get_dummies(frame, columns=cat_cols) for frame in (train, test, ori_te)
)

# Put both evaluation frames on the training column layout: join="left"
# keeps exactly the training columns, and columns missing from the other
# frame are created and filled with 0.
train_enc, test_enc = train_enc.align(test_enc, join="left", axis=1, fill_value=0)
train_enc, ori_te_enc = train_enc.align(ori_te_enc, join="left", axis=1, fill_value=0)

print("Train:", train_enc.shape)
print("Test:", test_enc.shape)
print("Ori_te:", ori_te_enc.shape)


Train: (13410, 108)
Test: (5748, 108)
Ori_te: (2129, 108)


In [24]:
# Total number of missing cells in each encoded frame.
train_enc.isnull().sum().sum(), test_enc.isnull().sum().sum(), ori_te_enc.isnull().sum().sum()

(np.int64(0), np.int64(0), np.int64(0))

In [25]:
# Shapes of the three encoded frames, as (rows, columns).
train_enc.shape, test_enc.shape, ori_te_enc.shape

((13410, 108), (5748, 108), (2129, 108))

In [26]:
from sklearn.tree import DecisionTreeClassifier

In [27]:
reset_seeds()

print(f'{train_enc.shape} / {y_tr.shape}')

# Depth-limited decision tree with minimum split / leaf sizes.
model = DecisionTreeClassifier(
    random_state=42,
    max_depth=6,
    min_samples_leaf=20,
    min_samples_split=50,
)
model.fit(train_enc, y_tr)

(13410, 108) / (13410,)


0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,6
,min_samples_split,50
,min_samples_leaf,20
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [28]:
reset_seeds()

# model.score on the training split and on the hold-out split.
score_tr, score_te = (
    model.score(features, labels)
    for features, labels in ((train_enc, y_tr), (test_enc, y_te))
)

score_tr, score_te

(0.7978374347501864, 0.7971468336812805)

In [29]:
from sklearn.metrics import roc_curve, auc

# Despite its name, y_pred holds the second column of predict_proba
# (presumably the probability of target == 1), not hard class labels.
y_pred = model.predict_proba(test_enc)[:,1]
fpr, tpr, thresholds = roc_curve(y_te, y_pred)

# Area under the ROC curve for the hold-out split.
auc_te = auc(fpr, tpr)
print(f'model: {auc_te}')

model: 0.8040109842570302


In [30]:
# Second predict_proba column for every row of the encoded ori_te frame.
ori_te_pred = model.predict_proba(ori_te_enc)[:,1]
ori_te_pred.shape

(2129,)

In [31]:
# Raw importance array of the fitted tree.
model.feature_importances_

array([0.58337245, 0.01295111, 0.00559165, 0.        , 0.        ,
       0.        , 0.        , 0.00465456, 0.        , 0.00404091,
       0.        , 0.        , 0.        , 0.02243535, 0.00323101,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.08209596,
       0.        , 0.        , 0.00175861, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.00468399, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.00147403, 0.        , 0.00260757, 0.        , 0.00206658,
       0.        , 0.00869785, 0.        , 0.00452291, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.00674151, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [32]:
# Pair each importance with its column name and rank from most to least
# important; reset_index turns the column names into a regular "index" column.
df_feature_importances = (
    pd.DataFrame(model.feature_importances_, index=train_enc.columns)
    .sort_values(by=[0], ascending=False)
    .reset_index()
)

print(f'{df_feature_importances.shape}')
df_feature_importances

(108, 2)


Unnamed: 0,index,0
0,city_development_index,0.583372
1,company_combo_unknown_unknown,0.240323
2,major_discipline_unknown,0.082096
3,education_level_Graduate,0.022435
4,training_hours,0.012951
...,...,...
103,company_combo_unknown_Early Stage Startup,0.000000
104,company_combo_unknown_NGO,0.000000
105,company_combo_unknown_Funded Startup,0.000000
106,company_combo_unknown_Other,0.000000


In [33]:
import pandas as pd

# First 20 rows of the importance table (already sorted in descending order).
df_feature_importances.head(20)

Unnamed: 0,index,0
0,city_development_index,0.583372
1,company_combo_unknown_unknown,0.240323
2,major_discipline_unknown,0.082096
3,education_level_Graduate,0.022435
4,training_hours,0.012951
5,company_combo_unknown_Public Sector,0.008751
6,last_new_job_never,0.008698
7,company_combo_10/49_Pvt Ltd,0.006742
8,log_training_hours,0.005592
9,company_size_unknown,0.004684
