# 전처리

In [189]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

mpl.style.use('seaborn')
mpl.rcParams["font.family"] = 'Malgun Gothic'
mpl.rcParams["axes.unicode_minus"] = False

In [190]:
import time
def my_time(func):
    def wrapper():
        str_time = time.time()
        func()
        end_time = time.time()
        print(end_time-str_time, '초 소요')
    return wrapper

In [191]:
X_train = pd.read_csv('../data/aug_train.csv')
X_test = pd.read_csv('../data/aug_test.csv')

In [192]:
X_test['target'] = -1

In [193]:
X_train.shape, X_train.columns, X_test.shape, X_test.columns

((19158, 14),
 Index(['enrollee_id', 'city', 'city_development_index', 'gender',
        'relevent_experience', 'enrolled_university', 'education_level',
        'major_discipline', 'experience', 'company_size', 'company_type',
        'last_new_job', 'training_hours', 'target'],
       dtype='object'),
 (2129, 14),
 Index(['enrollee_id', 'city', 'city_development_index', 'gender',
        'relevent_experience', 'enrolled_university', 'education_level',
        'major_discipline', 'experience', 'company_size', 'company_type',
        'last_new_job', 'training_hours', 'target'],
       dtype='object'))

In [194]:
X = pd.concat([X_train, X_test])
# X.drop(columns=['enrollee_id','target'],inplace=True)

In [195]:
X.shape

(21287, 14)

#### 해당 컬럼부터 숫자형으로 변경

In [196]:
numeric_feature = ['city_development_index', 'training_hours',
                   'education_level', 'last_new_job',  'experience', 'enrolled_university']

In [197]:
# 각 컬럼 데이터 개수부터 확인
for col in numeric_feature:
    print(X[col].unique().size)
    print(X[col].unique())

93
[0.92  0.776 0.624 0.789 0.767 0.764 0.762 0.913 0.926 0.827 0.843 0.804
 0.855 0.887 0.91  0.884 0.924 0.666 0.558 0.923 0.794 0.754 0.939 0.55
 0.865 0.698 0.893 0.796 0.866 0.682 0.802 0.579 0.878 0.897 0.949 0.925
 0.896 0.836 0.693 0.769 0.775 0.903 0.555 0.727 0.64  0.516 0.743 0.899
 0.915 0.689 0.895 0.89  0.847 0.527 0.766 0.738 0.647 0.795 0.74  0.701
 0.493 0.84  0.691 0.735 0.742 0.479 0.722 0.921 0.848 0.856 0.898 0.83
 0.73  0.68  0.725 0.556 0.448 0.763 0.745 0.645 0.788 0.78  0.512 0.739
 0.563 0.518 0.824 0.487 0.649 0.781 0.625 0.807 0.664]
241
[ 36  47  83  52   8  24  18  46 123  32 108  23  26 106   7 132  68  50
  48  65  13  22 148  72  40 141  82 145 206 152  42  14 112  87  20  21
  92 102  43  45  19  90  25  15  98 142  28 228  29  12  17  35   4 136
  27  74  86  75 332 140 182 172  33  34 150 160   3   2 210 101  59 260
 131 109  70  51  60 164 290 133  76 156 120 100  39  55  49   6 125 326
 198  11  41 114 246  81  31  84 105  38 178 104 202  88 218  6

#### edu_lvl

In [198]:
education_level = ['Primary School', 'High School', 'Graduate', 'Masters', 'Phd']
for i, v in enumerate(education_level):
    X.loc[X['education_level']==v, 'education_level'] = i

In [199]:
X['education_level'].value_counts(dropna=False)

2.0    12867
3.0     4857
1.0     2239
NaN      512
4.0      468
0.0      344
Name: education_level, dtype: int64

#### last_new_job

In [200]:
X.loc[X['last_new_job'] == '>4', 'last_new_job'] = 5
X.loc[X['last_new_job'] == 'never', 'last_new_job'] = 0

X['last_new_job'] = pd.to_numeric(X['last_new_job'], downcast='integer')

In [201]:
X['last_new_job'].value_counts()

1.0    8924
5.0    3643
2.0    3242
0.0    2710
3.0    1157
4.0    1148
Name: last_new_job, dtype: int64

#### exp

In [202]:
X.loc[X['experience'] == '>20', 'experience'] = 21
X.loc[X['experience'] == '<1', 'experience'] = 0

X['experience'] = pd.to_numeric(X['experience'], downcast='integer')

In [203]:
X['experience'].value_counts(dropna=False)

21.0    3669
5.0     1593
4.0     1548
3.0     1508
6.0     1346
2.0     1255
7.0     1144
9.0     1093
10.0    1081
8.0      884
11.0     750
15.0     745
14.0     641
1.0      605
0.0      596
16.0     576
12.0     546
13.0     453
17.0     378
19.0     333
18.0     306
20.0     167
NaN       70
Name: experience, dtype: int64

#### enroll

In [204]:
enrolls = ['no_enrollment','Part time course','Full time course']
for i,v in enumerate(enrolls):
    X.loc[X['enrolled_university'] == v, 'enrolled_university'] = i

In [205]:
X['enrolled_university'].value_counts(dropna=False)

0.0    15336
2.0     4192
1.0     1342
NaN      417
Name: enrolled_university, dtype: int64

In [206]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21287 entries, 0 to 2128
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             21287 non-null  int64  
 1   city                    21287 non-null  object 
 2   city_development_index  21287 non-null  float64
 3   gender                  16271 non-null  object 
 4   relevent_experience     21287 non-null  object 
 5   enrolled_university     20870 non-null  object 
 6   education_level         20775 non-null  object 
 7   major_discipline        18162 non-null  object 
 8   experience              21217 non-null  float64
 9   company_size            14727 non-null  object 
 10  company_type            14513 non-null  object 
 11  last_new_job            20824 non-null  float64
 12  training_hours          21287 non-null  int64  
 13  target                  21287 non-null  float64
dtypes: float64(4), int64(2), object(8)
memo

In [207]:
X.isna().sum().sort_values(ascending=False)

company_type              6774
company_size              6560
gender                    5016
major_discipline          3125
education_level            512
last_new_job               463
enrolled_university        417
experience                  70
enrollee_id                  0
city                         0
city_development_index       0
relevent_experience          0
training_hours               0
target                       0
dtype: int64

#### 결측치 3% 이하 컬럼의 결측치는 KNN으로 처리

In [208]:
to_mode = ['education_level', 'enrolled_university',
           'experience', 'last_new_job']

In [209]:
limit = X.shape[0] * 0.03
under_list = []
for col in X.columns:
    if 1 < X[col].isna().sum() < limit:
        under_list.append(col)
under_list.append('company_size')
under_list

['enrolled_university',
 'education_level',
 'experience',
 'last_new_job',
 'company_size']

In [210]:
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors = 1)
X[under_list] = imputer.fit_transform(X[under_list])

ValueError: could not convert string to float: '50-99'

In [None]:
# 최빈값으로 결측치 처리
# for col in to_mode:
#     X.loc[X[col].isna(),col] = X[col].mode().values[0]   

#### gender: nan->위에서부터 반: Male / 나머지 반: Female

In [None]:
sns.countplot(x='gender', data=X)

In [None]:
X['gender'].value_counts(dropna=False)

### 1차 전처리: 결측치를 반반 남녀로 배분

In [None]:
# gender_nan_cnt = X['gender'].isna().sum()//2
# # 내 방법
# # idx = 0
# # for idx in range(X.shape[0]):
# #     if gender_nan_cnt <= 0:
# #         break
# #     if X.loc[idx,'gender'] == np.nan:
# #         X.loc[idx,'gender'] = 'Male'
# #         gender_nan_cnt -= 1
# # X.loc[X['gender'].isna(), 'gender'] = 'Female'

# # 우상님 방법
# X['gender'].fillna('Female', limit=gender_nan_cnt, inplace=True)
# X['gender'].fillna('Male', inplace=True)

#### company_size, type

In [None]:
# X.loc[X['company_size'].isna(), 'company_size'] = 'unknown'
# X.loc[X['company_size']=='<10', 'company_size'] = '~10'
# X.loc[X['company_type'].isna(), 'company_type'] = 'unknown'

In [None]:
#### major: 비율 맞춰 major nomajor로 나눠 결측치 처리x
#### -> 최빈값과 나머지의 합과의 비율 차이가 크므로 최빈값 major로 처리

In [None]:
# # 비율맞춰 처리
# stem = X['major_discipline'].value_counts(dropna=False)['STEM']
# nonstem = X.shape[0] - X['major_discipline'].isna().sum() - stem
# stem, nonstem

In [None]:
# non_major_list = []
# for val in X['major_discipline'].unique():
#     if val != 'STEM':
#         non_major_list.append(val)
        
# non_major = X['major_discipline'].isin(non_major_list)
# major = X['major_discipline'] == 'STEM'

# X.loc[non_major, 'major_discipline'] = 'non_major'
# X.loc[major, 'major_discipline'] = 'major'

In [None]:
# nomajor_lst = list(X['major_discipline'].value_counts(dropna=False).index)
# nomajor_lst.remove(np.nan)
# nomajor_lst.remove('STEM')
# print(nomajor_lst)

In [None]:
# X.loc[X['major_discipline'] == 'STEM', 'major_discipline'] = 'major'
# X.loc[X['major_discipline'].isin(nomajor_lst), 'major_discipline'] = 'no_major'

In [None]:
# major_limit = int(X['major_discipline'].isna().sum() / (stem+nonstem) * stem)
# X['major_discipline'].fillna('major', limit=major_limit, inplace=True)
# X['major_discipline'].fillna('no_major', inplace=True)

In [None]:
# X.loc[(X['major_discipline'] == 'STEM')|(X['major_discipline'].isna()) , 'major_discipline'] = 'major'
# X.loc[X['major_discipline'] != 'major', 'major_discipline'] = 'no_major'

### 2차: 결측치를 큰 남자에게

#### gender

In [None]:
X['gender'].fillna('Male', inplace=True)

#### company_size, type

In [None]:
X.loc[X['company_size']=='<10', 'company_size'] = '~10'
X.loc[X['company_size']=='10/49', 'company_size'] = '10~49'

company_siz = ['~10', '10~49', '50-99', '100-500', '500-999','1000-4999','5000-9999','10000+']
for i, v in enumerate(company_siz):
    X.loc[X['company_size']==v, 'company_size'] = i

#### major: 비율 맞춰 major nomajor로 나눠 결측치 처리x
#### -> 최빈값과 나머지의 합과의 비율 차이가 크므로 최빈값 major로 처리

In [None]:
nomajor_lst = list(X['major_discipline'].value_counts(dropna=False).index)
nomajor_lst.remove(np.nan)
nomajor_lst.remove('STEM')
print(nomajor_lst)

In [None]:
X.loc[(X['major_discipline'] == 'STEM')|(X['major_discipline'].isna()) , 'major_discipline'] = 'major'
X.loc[X['major_discipline'] != 'major', 'major_discipline'] = 'no_major'

### 인코딩 전 현재 위아래로 concat되있어 
### 중복되있으므로 인덱스 순번대로 재지정

In [None]:
X.reset_index(drop=True, inplace=True)

## 1차 전처리:  city-lbe, elseobject-ohe

#### 라벨 인코딩

In [None]:
# X['city'] = LabelEncoder().fit_transform(X['city'])

#### 원핫인코딩

In [None]:
# X['company_size'].unique()

In [None]:
# to_ohe = list(X.columns[X[X.columns].dtypes == 'object'])
# to_ohe

# # 원핫 인코딩x 컬럼들
# X_not_ohe = X.loc[:,~X.columns.isin(to_ohe)]
# X_not_ohe.shape, X_not_ohe.columns

# # 원핫 인코딩 컬럼들
# X_ohe = X[to_ohe]
# X_ohe

# # 원핫인코딩 실행
# ohe = OneHotEncoder(sparse=False)
# X_df = ohe.fit_transform(X_ohe)
# X_ohe = pd.DataFrame(X_df, columns=ohe.get_feature_names())
# X_ohe.shape, X_ohe.columns

# # 합치기
# X_new = pd.concat([X_ohe, X_not_ohe], axis=1)
# X_new

# X_new.info()

## 2차 전처리: object 전체 lbe

In [None]:
to_lb = list(X.columns[X[X.columns].dtypes == 'object'])
to_lb

In [None]:
def lbe(x):
    if x.name in to_lb:
        return LabelEncoder().fit_transform(x)
    else:
        return x
    
X_new = X.apply(lbe)
X_new

### StandardScaled

In [None]:
from sklearn.preprocessing import StandardScaler

y_tmp = X_new['target']
X_scaled = StandardScaler().fit_transform(X_new)
X_scaled_df = pd.DataFrame(X_scaled, columns=X_new.columns)
X_scaled_df.drop(columns='target',inplace=True)
X_scaled_df = pd.concat([X_scaled_df, y_tmp], axis=1)

#### enrollee_id 제거, 다시 train,test 분리 후 target 분리

In [None]:
X_scaled_df.loc[X_scaled_df.index,'target']

In [None]:
X_train = X_scaled_df.loc[X_scaled_df.loc[X_scaled_df.index, 'target'] != -1]
X_test = X_scaled_df.loc[X_scaled_df.loc[X_scaled_df.index, 'target'] == -1]
X_test.drop(columns='target',inplace=True)

#### 테이블 저장

### 1차

In [None]:
# X_train.to_csv('../data/prepcd_train.csv', index=False)
# X_test.to_csv('../data/prepcd_test.csv', index=False)

### 2차

In [None]:
X_train.to_csv('../data/prepcd_lbe_train.csv', index=False)
X_test.to_csv('../data/prepcd_lbe_test.csv', index=False)