In [62]:
import numpy as np
import pandas as pd

In [63]:
# Load the training split from a project-relative CSV path. The trailing
# expression displays (rows, columns) as a quick sanity check on the load.
train = pd.read_csv('hr_data/aug_train.csv')
train.shape

(19158, 14)

# **컬럼 type 변경**
### category to numeric feature
- ~~education_level~~
- ~~last_new_job~~
- ~~experience~~
- ~~enrolled_university~~
- ~~company_size~~
- ~~company_type~~
    
- `category_feature = ['city','gender','relevent_experience','major_discipline']`
- `numeric_feature = ['city_development_index','training_hours','education_level','last_new_job','experience','enrolled_university','company_size','company_type']`

## education_level type change to integer
- 4 : Phd
- 3 : masters
- 2 : graduate
- 1 : high school
- 0 : primary school

In [64]:
# Raw label distribution, NaN included, before the ordinal encoding is applied.
train['education_level'].value_counts(dropna=False)

Graduate          11598
Masters            4361
High School        2017
NaN                 460
Phd                 414
Primary School      308
Name: education_level, dtype: int64

In [65]:
# Labels ordered from lowest to highest level; each label's position in the
# list is the integer code written back into the column. Rows whose value
# matches none of the labels (including NaN) are left untouched.
education_level = ['Primary School', 'High School', 'Graduate', 'Masters', 'Phd']
for code, label in enumerate(education_level):
    has_label = train['education_level'].eq(label)
    train.loc[has_label, 'education_level'] = code

In [66]:
# Re-check the distribution after encoding: the counts per code should match
# the counts per label seen before, and NaN rows should be unchanged.
train['education_level'].value_counts(dropna=False)

2.0    11598
3.0     4361
1.0     2017
NaN      460
4.0      414
0.0      308
Name: education_level, dtype: int64

## last_new_job type change to integer
- `'>4' = 5`, `'never' = 0`

In [67]:
# Raw distribution, NaN included; note the two non-numeric buckets '>4' and 'never'.
train['last_new_job'].value_counts(dropna=False)

1        8040
>4       3290
2        2900
never    2452
4        1029
3        1024
NaN       423
Name: last_new_job, dtype: int64

In [68]:
# Replace the two non-numeric buckets with integer stand-ins
# ('>4' -> 5, 'never' -> 0); every other value is left as read from the CSV.
for label, code in (('>4', 5), ('never', 0)):
    is_bucket = train['last_new_job'].eq(label)
    train.loc[is_bucket, 'last_new_job'] = code

# Parse the remaining digit strings. The column still holds NaN at this point,
# and the result shown in the following cell is float-valued (1.0, 5.0, ...).
train['last_new_job'] = pd.to_numeric(train['last_new_job'], downcast='integer')

In [69]:
# Verify the conversion: '>4' should now appear as 5 and 'never' as 0, with
# the NaN count unchanged.
train['last_new_job'].value_counts(dropna=False)

1.0    8040
5.0    3290
2.0    2900
0.0    2452
4.0    1029
3.0    1024
NaN     423
Name: last_new_job, dtype: int64

## experience type change to integer
- `'>20' = 21`, `'<1' = 0`

In [70]:
# Raw distribution, NaN included; note the two non-numeric buckets '>20' and '<1'.
train['experience'].value_counts(dropna=False)

>20    3286
5      1430
4      1403
3      1354
6      1216
2      1127
7      1028
10      985
9       980
8       802
15      686
11      664
14      586
1       549
<1      522
16      508
12      494
13      399
17      342
19      304
18      280
20      148
NaN      65
Name: experience, dtype: int64

In [71]:
# Replace the open-ended buckets with integer stand-ins
# ('>20' -> 21, '<1' -> 0); every other value is left as read from the CSV.
for label, code in (('>20', 21), ('<1', 0)):
    is_bucket = train['experience'].eq(label)
    train.loc[is_bucket, 'experience'] = code

# Parse the remaining digit strings. The column still holds NaN at this point,
# and the result shown in the following cell is float-valued (21.0, 5.0, ...).
train['experience'] = pd.to_numeric(train['experience'], downcast='integer')

In [72]:
# Verify the conversion: '>20' should now appear as 21 and '<1' as 0, with
# the NaN count unchanged.
train['experience'].value_counts(dropna=False)

21.0    3286
5.0     1430
4.0     1403
3.0     1354
6.0     1216
2.0     1127
7.0     1028
10.0     985
9.0      980
8.0      802
15.0     686
11.0     664
14.0     586
1.0      549
0.0      522
16.0     508
12.0     494
13.0     399
17.0     342
19.0     304
18.0     280
20.0     148
NaN       65
Name: experience, dtype: int64

## enrolled_university type change to integer
- 2 : Full time course
- 1 : Part time course
- 0 : no_enrollment
-------
- no_enrollment       13817
- Full time course     3757
- Part time course     1198
- NaN                   386

In [73]:
# Raw label distribution, NaN included, before the ordinal encoding is applied.
train['enrolled_university'].value_counts(dropna=False)

no_enrollment       13817
Full time course     3757
Part time course     1198
NaN                   386
Name: enrolled_university, dtype: int64

In [74]:
# Ordinal codes by enrollment intensity: none (0) < part time (1) < full time (2).
# Rows matching none of the labels (including NaN) are left untouched.
enrollment_codes = {
    'Full time course': 2,
    'Part time course': 1,
    'no_enrollment': 0,
}
for label, code in enrollment_codes.items():
    is_label = train['enrolled_university'].eq(label)
    train.loc[is_label, 'enrolled_university'] = code

In [75]:
# Re-check the distribution after encoding; counts per code should match the
# counts per label seen before, and the NaN count should be unchanged.
train['enrolled_university'].value_counts(dropna=False)

0.0    13817
2.0     3757
1.0     1198
NaN      386
Name: enrolled_university, dtype: int64

## company_size type change to integer
- 회사 규모가 작은 순서대로 0~7 부여: `'<10' = 0`, `'10/49' = 1`, ..., `'10000+' = 7`

In [76]:
# Raw bucket distribution. value_counts() drops NaN by default, so missing
# rows are not shown here.
train['company_size'].value_counts()

50-99        3083
100-500      2571
10000+       2019
10/49        1471
1000-4999    1328
<10          1308
500-999       877
5000-9999     563
Name: company_size, dtype: int64

In [77]:
# Company-size buckets ordered from smallest to largest; the position in this
# list serves as the ordinal code. '10/49' is spelled exactly as it appears in
# the raw data (slash, not dash).
company_size = [
    '<10',
    '10/49',
    '50-99',
    '100-500',
    '500-999',
    '1000-4999',
    '5000-9999',
    '10000+',
]

In [78]:
# Overwrite each size bucket with its position in `company_size`; rows that
# match no bucket (including NaN) are left untouched.
for code, label in enumerate(company_size):
    in_bucket = train['company_size'].eq(label)
    train.loc[in_bucket, 'company_size'] = code

In [79]:
# Re-check the distribution after encoding (NaN rows are excluded by default).
train['company_size'].value_counts()

2    3083
3    2571
7    2019
1    1471
5    1328
0    1308
4     877
6     563
Name: company_size, dtype: int64

In [101]:
# Company types listed from least to most frequent (per the counts displayed
# after encoding); the position in this list serves as the integer code.
# NOTE(review): company type looks nominal, so these codes impose an order
# that the distance-based imputation later treats as meaningful -- confirm
# this is intended.
company_type = [
    'Other',
    'NGO',
    'Early Stage Startup',
    'Public Sector',
    'Funded Startup',
    'Pvt Ltd',
]

In [102]:
# Overwrite each company type with its position in `company_type`; rows that
# match no entry (including NaN) are left untouched.
for code, label in enumerate(company_type):
    is_type = train['company_type'].eq(label)
    train.loc[is_type, 'company_type'] = code

In [103]:
# Distribution after encoding (NaN rows are excluded by default).
train['company_type'].value_counts()

5    9817
4    1001
3     955
2     603
1     521
0     121
Name: company_type, dtype: int64

# **컬럼별 결측치 처리**

- ~~**gender** : 결측치를 최빈값~~
- ~~**major_discipline** : 결측치를 최빈값에 합친다.~~
- ~~남은 결측치 KNN 처리~~

## gender 결측치 처리
- 결측치는 모두 'Male'로 채운다. (결측치를 반으로 나누어 반은 Male, 반은 Female로 분배하는 방식은 주석 처리되어 현재 사용하지 않음)

In [80]:
# gender_limit = train['gender'].isna().sum()//2

In [81]:
# Fill every missing gender with 'Male'. An earlier variant that first gave
# part of the missing rows 'Female' (using a `gender_limit` row cap) is
# disabled and intentionally not applied here.
#
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on `train['gender']`: an in-place method on a column
# selection is chained assignment, which pandas deprecates and which does not
# modify `train` once Copy-on-Write is active.
train['gender'] = train['gender'].fillna('Male')

## major_discipline 결측치 처리
- 결측치는 최빈값으로 채운다. (최빈값 = 'STEM' ≈ 데이터 과학 관련 학과?)
- 데이터 관련 학과에 해당하는 value는 STEM 한 개이고(major), 나머지는 비전공이므로 no_major로 명시

In [82]:
# Raw label distribution (NaN rows are excluded by default).
train['major_discipline'].value_counts()

STEM               14492
Humanities           669
Other                381
Business Degree      327
Arts                 253
No Major             223
Name: major_discipline, dtype: int64

In [83]:
# Most frequent major_discipline label: idxmax() on the value counts returns
# the label (index entry) with the highest count. Displayed for inspection.
major = train['major_discipline'].value_counts().idxmax()
major

'STEM'

In [84]:
# Fill missing majors with the most frequent label. `major` is the mode
# computed in the previous cell ('STEM' for this dataset), so the fill value
# follows the data instead of being hard-coded.
#
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on `train['major_discipline']`: an in-place method on a
# column selection is chained assignment, which pandas deprecates and which
# does not modify `train` once Copy-on-Write is active.
train['major_discipline'] = train['major_discipline'].fillna(major)

In [85]:
# Collapse the column to two labels: 'STEM' becomes 'major' and everything
# else becomes 'no_major'. A value that is already 'major' stays 'major', so
# re-running the cell does not flip earlier results.
is_major = train['major_discipline'].isin(['STEM', 'major'])
train['major_discipline'] = is_major.map({True: 'major', False: 'no_major'})

In [86]:
# train['major_discipline']

In [87]:
# Confirm only the two labels remain and that no NaN is left in the column.
train['major_discipline'].value_counts(dropna=False)

major       17305
no_major     1853
Name: major_discipline, dtype: int64

# 나머지 결측치 처리
- knn으로 처리

In [89]:
# Per-column count of missing values still to be handled by the imputation step.
train.isna().sum()

enrollee_id                  0
city                         0
city_development_index       0
gender                       0
relevent_experience          0
enrolled_university        386
education_level            460
major_discipline             0
experience                  65
company_size              5938
company_type              6140
last_new_job               423
training_hours               0
target                       0
dtype: int64

In [92]:
# Boolean mask, one entry per row: True where company_size is still missing.
train['company_size'].isna()

0         True
1        False
2         True
3         True
4        False
         ...  
19153     True
19154     True
19155    False
19156    False
19157     True
Name: company_size, Length: 19158, dtype: bool

In [94]:
# Names of the columns that still contain at least one missing value, in the
# frame's column order; these are the columns handed to the imputer.
knn_list = [name for name in train.columns if train[name].isna().any()]
knn_list

['enrolled_university',
 'education_level',
 'experience',
 'company_size',
 'company_type',
 'last_new_job']

In [104]:
from sklearn.impute import KNNImputer

# Impute the remaining gaps by copying from the single nearest row
# (n_neighbors=1). Only the columns in `knn_list` are passed in, so neighbour
# distances are computed from those columns alone.
# NOTE(review): the columns are on different scales (e.g. experience 0-21 vs
# enrolled_university 0-2) and are not standardised first -- confirm that the
# resulting distance weighting is intended.
incomplete_columns = train[knn_list]
imputer = KNNImputer(n_neighbors=1)
train[knn_list] = imputer.fit_transform(incomplete_columns)

In [105]:
# Preview the first rows after imputation; the imputed columns are float-valued.
train.head()

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.92,Male,Has relevent experience,0.0,2.0,major,21.0,3.0,5.0,1.0,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,0.0,2.0,major,15.0,2.0,5.0,5.0,47,0.0
2,11561,city_21,0.624,Male,No relevent experience,2.0,2.0,major,5.0,6.0,5.0,0.0,83,0.0
3,33241,city_115,0.789,Male,No relevent experience,2.0,2.0,no_major,0.0,7.0,5.0,0.0,52,1.0
4,666,city_162,0.767,Male,Has relevent experience,0.0,3.0,major,21.0,2.0,4.0,4.0,8,0.0


In [108]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# 라벨인코딩

In [109]:
category_feature = ['city','gender','relevent_experience','major_discipline']

# Fitted encoders keyed by column name. Keeping them lets the same
# label -> integer mapping be applied to another split via
# `label_encoders[col].transform(...)` or reversed via `inverse_transform`.
# Previously each encoder was discarded as soon as its column was encoded.
label_encoders = {}

def encoing_label(x):
    """Label-encode one column if it is listed in ``category_feature``.

    Intended for ``DataFrame.apply``, which calls it once per column with the
    column as a Series. (The function name is kept as-is so existing calls
    keep working.)

    Parameters
    ----------
    x : pandas.Series
        A single column; ``x.name`` decides whether it is encoded.

    Returns
    -------
    numpy.ndarray or pandas.Series
        Integer codes from ``LabelEncoder.fit_transform`` for categorical
        columns; ``x`` itself, unchanged, for every other column.

    Side effects
    ------------
    Stores the fitted encoder in ``label_encoders[x.name]``.
    """
    # Pass non-categorical columns through without building an encoder.
    if x.name not in category_feature:
        return x
    le = LabelEncoder()
    codes = le.fit_transform(x)
    label_encoders[x.name] = le
    return codes

train = train.apply(encoing_label)

In [112]:
# Display the fully encoded frame (pandas truncates the middle rows).
train

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,5,0.920,1,0,0.0,2.0,0,21.0,3.0,5.0,1.0,36,1.0
1,29725,77,0.776,1,1,0.0,2.0,0,15.0,2.0,5.0,5.0,47,0.0
2,11561,64,0.624,1,1,2.0,2.0,0,5.0,6.0,5.0,0.0,83,0.0
3,33241,14,0.789,1,1,2.0,2.0,1,0.0,7.0,5.0,0.0,52,1.0
4,666,50,0.767,1,0,0.0,3.0,0,21.0,2.0,4.0,4.0,8,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19153,7386,55,0.878,1,1,0.0,2.0,1,14.0,5.0,5.0,1.0,42,1.0
19154,31398,5,0.920,1,0,0.0,2.0,0,14.0,1.0,5.0,4.0,52,1.0
19155,24576,5,0.920,1,0,0.0,2.0,0,21.0,2.0,5.0,4.0,44,0.0
19156,5756,94,0.802,1,0,0.0,1.0,0,0.0,4.0,5.0,2.0,97,0.0


# 파일 저장

In [113]:
# Persist the preprocessed frame; index=False keeps the row index out of the file.
train.to_csv('hr_data/final.csv', index=False)