In [135]:
import numpy as np
import pandas as pd

In [136]:
# Load the raw training set from a path relative to the notebook's working directory.
train = pd.read_csv('hr_data/aug_train.csv')
# Trailing expression: displays (n_rows, n_columns) as the cell output.
train.shape

(19158, 14)

# **컬럼 type 변경**
### category to numeric feature
- ~~education_level~~
- ~~last_new_job~~
- ~~experience~~
- ~~enrolled_university~~
    
- `category_feature = ['city','gender','relevent_experience','major_discipline','company_size','company_type']`
- `numeric_feature = ['city_development_index','training_hours','education_level','last_new_job','experience','enrolled_university']`

## education_level type change to integer
- 4 : Phd
- 3 : masters
- 2 : graduate
- 1 : high school
- 0 : primary school

In [137]:
# Frequency of each education_level label; dropna=False also counts NaN as its own row.
train['education_level'].value_counts(dropna=False)

Graduate          11598
Masters            4361
High School        2017
NaN                 460
Phd                 414
Primary School      308
Name: education_level, dtype: int64

In [138]:
# Ordinal encoding: a label's position in this list is its integer code
# (Primary School = 0 ... Phd = 4). NaN and any label not listed are left as-is.
education_level = ['Primary School', 'High School', 'Graduate', 'Masters', 'Phd']
for rank, label in enumerate(education_level):
    train.loc[train['education_level'] == label, 'education_level'] = rank

# The masked assignments above do not convert the column's dtype, so cast it
# explicitly, as the last_new_job / experience cells do. NaN forces a float
# dtype, and a label missing from the list raises here instead of silently
# surviving as a string.
train['education_level'] = pd.to_numeric(train['education_level'])

In [139]:
# Same frequency table (NaN included), re-displayed after the encoding cell to check the codes.
train['education_level'].value_counts(dropna=False)

2.0    11598
3.0     4361
1.0     2017
NaN      460
4.0      414
0.0      308
Name: education_level, dtype: int64

## last_new_job type change to integer
- `'>4' = 5`, `'never' = 0`

In [140]:
# Frequency of each raw last_new_job value; dropna=False also counts NaN.
train['last_new_job'].value_counts(dropna=False)

1        8040
>4       3290
2        2900
never    2452
4        1029
3        1024
NaN       423
Name: last_new_job, dtype: int64

In [141]:
# Replace the two non-numeric tokens with numbers: '>4' -> 5, 'never' -> 0.
for token, code in {'>4': 5, 'never': 0}.items():
    train.loc[train['last_new_job'] == token, 'last_new_job'] = code

# Parse the remaining values as numbers. downcast='integer' only takes effect
# when the column can be represented as integers; while NaN is present the
# result stays a float dtype.
train['last_new_job'] = pd.to_numeric(train['last_new_job'], downcast='integer')

In [142]:
# Same frequency table (NaN included), re-displayed after the numeric conversion.
train['last_new_job'].value_counts(dropna=False)

1.0    8040
5.0    3290
2.0    2900
0.0    2452
4.0    1029
3.0    1024
NaN     423
Name: last_new_job, dtype: int64

## experience type change to integer
- `'>20' = 21`, `'<1' = 0`

In [143]:
# Frequency of each raw experience value; dropna=False also counts NaN.
train['experience'].value_counts(dropna=False)

>20    3286
5      1430
4      1403
3      1354
6      1216
2      1127
7      1028
10      985
9       980
8       802
15      686
11      664
14      586
1       549
<1      522
16      508
12      494
13      399
17      342
19      304
18      280
20      148
NaN      65
Name: experience, dtype: int64

In [144]:
# Open-ended tokens and their numeric stand-ins: '>20' becomes 21 so it stays
# distinct from the literal 20, and '<1' becomes 0.
experience_tokens = (('>20', 21), ('<1', 0))
for raw_value, encoded in experience_tokens:
    matches_token = train['experience'].eq(raw_value)
    train.loc[matches_token, 'experience'] = encoded

# Parse the remaining values as numbers; downcast='integer' has no effect while
# NaN is present (the result stays a float dtype).
train['experience'] = pd.to_numeric(train['experience'], downcast='integer')

In [145]:
# Same frequency table (NaN included), re-displayed after the numeric conversion.
train['experience'].value_counts(dropna=False)

21.0    3286
5.0     1430
4.0     1403
3.0     1354
6.0     1216
2.0     1127
7.0     1028
10.0     985
9.0      980
8.0      802
15.0     686
11.0     664
14.0     586
1.0      549
0.0      522
16.0     508
12.0     494
13.0     399
17.0     342
19.0     304
18.0     280
20.0     148
NaN       65
Name: experience, dtype: int64

## enrolled_university type change to integer
- 2 : Full time course
- 1 : Part time course
- 0 : no_enrollment
-------
- no_enrollment       13817
- Full time course     3757
- Part time course     1198
- NaN                   386

In [146]:
# Frequency of each raw enrolled_university label; dropna=False also counts NaN.
train['enrolled_university'].value_counts(dropna=False)

no_enrollment       13817
Full time course     3757
Part time course     1198
NaN                   386
Name: enrolled_university, dtype: int64

In [147]:
# Ordinal encoding of enrollment intensity; NaN is left untouched.
enrollment_codes = {'Full time course': 2, 'Part time course': 1, 'no_enrollment': 0}
for label, code in enrollment_codes.items():
    train.loc[train['enrolled_university'] == label, 'enrolled_university'] = code

# The masked assignments do not convert the column's dtype, so cast it
# explicitly, as the last_new_job / experience cells do. NaN forces a float
# dtype, and an unexpected label raises here instead of surviving as a string.
train['enrolled_university'] = pd.to_numeric(train['enrolled_university'])

In [148]:
# Same frequency table (NaN included), re-displayed after the encoding cell.
train['enrolled_university'].value_counts(dropna=False)

0.0    13817
2.0     3757
1.0     1198
NaN      386
Name: enrolled_university, dtype: int64

# **컬럼별 결측치 처리**
- ~~**null**값이 500 이하인 행 KNN 처리~~
- ~~**gender** : 결측치를 반반 남, 녀 나눠주기~~
- ~~**major_discipline** : 결측치를 최빈값에 합친다.~~
- ~~**company_size, company_type** : 결측치를 unknown이라는 새로운 범주(값)로 정의~~

In [77]:
# Display the column labels of `train`.
train.columns

Index(['enrollee_id', 'city', 'city_development_index', 'gender',
       'relevent_experience', 'enrolled_university', 'education_level',
       'major_discipline', 'experience', 'company_size', 'company_type',
       'last_new_job', 'training_hours', 'target'],
      dtype='object')

In [92]:
# Number of missing values per column.
# NOTE(review): this cell's execution count (In [92]) is out of sequence with the
# cells around it, so the recorded output may not reflect the data at this point
# in the notebook — re-run top to bottom to confirm.
train.isna().sum()

enrollee_id                 0
city                        0
city_development_index      0
gender                      0
relevent_experience         0
enrolled_university       386
education_level           460
major_discipline            0
experience                 65
company_size                0
company_type                0
last_new_job              423
training_hours              0
target                      0
dtype: int64

# 결측치 비율이 전체 데이터의 3% 이하인 컬럼의 결측치 처리
- knn으로 대체

In [149]:
# Missing-value threshold: 3% of the number of rows in `train`.
limit = 0.03 * len(train)

In [150]:
# Columns that have missing values, but no more than `limit` of them. These are
# the ones imputed with KNN.
# - The lower bound is 0 (not 1), so a column with exactly one NaN is not skipped.
# - The upper bound is inclusive, matching the stated rule of "3% or less".
# The per-column NaN counts are computed once instead of once per column.
na_counts = train.isna().sum()
under_list = [col for col in train.columns if 0 < na_counts[col] <= limit]
under_list

['enrolled_university', 'education_level', 'experience', 'last_new_job']

In [154]:
from sklearn.impute import KNNImputer

# Nearest-neighbour imputation restricted to the columns in `under_list`.
# With a single neighbour, each missing entry is filled from one donor row, so
# no averaging across several rows takes place.
imputer = KNNImputer(n_neighbors=1)
imputed_values = imputer.fit_transform(train[under_list])
train[under_list] = imputed_values

## gender 결측치 처리
- 결측치를 반으로 나누어 반은 Male, 반은 Female로 분배

In [158]:
# Half of the number of missing gender values (floor division); used as the
# fillna `limit` in the next cell.
gender_limit = train['gender'].isna().sum()//2

In [159]:
# Split the missing genders: the first `gender_limit` NaN rows (in row order)
# become 'Female', every remaining NaN becomes 'Male'.
#
# The result is assigned back to the column instead of calling
# `train['gender'].fillna(..., inplace=True)`. That form is chained in-place
# assignment, which warns on recent pandas and no longer updates `train` once
# Copy-on-Write is active.
gender_filled = train['gender']
if gender_limit > 0:
    # fillna rejects limit=0, which happens when fewer than two values are missing.
    gender_filled = gender_filled.fillna('Female', limit=gender_limit)
train['gender'] = gender_filled.fillna('Male')

## major_discipline 결측치 처리
- 결측치는 최빈값에 넣어준다.(최빈값 = 'STEM' == 데이터과학자과?)
- 데이터 관련 학과의 value는 STEM 한 개이고(major), 나머지는 비전공이므로 no_major로 명시

In [160]:
# Frequency of each major_discipline label (NaN is excluded by value_counts' default).
train['major_discipline'].value_counts()

STEM               14492
Humanities           669
Other                381
Business Degree      327
Arts                 253
No Major             223
Name: major_discipline, dtype: int64

In [161]:
# Most frequent major_discipline label: idxmax() returns the index label (the
# category) with the highest count in the value_counts() table.
major = train['major_discipline'].value_counts().idxmax()
# Trailing expression: display the chosen label.
major

'STEM'

In [162]:
# Fill missing majors with the most frequent label (`major`).
# The result is assigned back to the column rather than using
# `train['major_discipline'].fillna(..., inplace=True)`. That form is chained
# in-place assignment, which warns on recent pandas and does not update `train`
# under Copy-on-Write.
train['major_discipline'] = train['major_discipline'].fillna(major)

In [163]:
# Collapse the column to two labels: 'STEM' becomes 'major', everything else
# becomes 'no_major'. A value that is already 'major' is kept, so re-running the
# cell does not flip previously converted rows.
counts_as_major = train['major_discipline'].isin(['STEM', 'major'])
train['major_discipline'] = counts_as_major.map({True: 'major', False: 'no_major'})

In [164]:
# Display the column after the major / no_major relabelling.
train['major_discipline']

0           major
1           major
2           major
3        no_major
4           major
           ...   
19153    no_major
19154       major
19155       major
19156       major
19157       major
Name: major_discipline, Length: 19158, dtype: object

In [165]:
# Frequency of the relabelled values; dropna=False would expose any NaN still present.
train['major_discipline'].value_counts(dropna=False)

major       17305
no_major     1853
Name: major_discipline, dtype: int64

# company_size, type 결측치 처리
- 결측치의 양이 많으므로 unknown으로 정의

In [166]:
# Treat missing company information as its own category, 'unknown'.
# Each result is assigned back to its column rather than using
# `train[col].fillna(..., inplace=True)`. That form is chained in-place
# assignment, which warns on recent pandas and does not update `train` under
# Copy-on-Write.
for company_col in ('company_size', 'company_type'):
    train[company_col] = train[company_col].fillna('unknown')

In [169]:
# Write the processed frame to CSV; index=False omits the row index from the file.
train.to_csv('hr_data/processing_missing_value.csv', index=False)