In [7]:
# 1. Problem definition
# Metric: ROC-AUC
# Target column: target

# Final deliverable: result.csv (a single column `pred` holding the
# predicted probability of class 1)

# 2. Load libraries and data
import pandas as pd
train = pd.read_csv("hr_train.csv")
test = pd.read_csv("hr_test.csv")

# 3. Exploratory data analysis (EDA)
print("===== 데이터 정보(자료형) =====")
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
train.info()

# Missing-value counts per column, for train and test separately.
print("\n===== train 결측치 수 =====")
print(train.isnull().sum())

print("\n===== test 결측치 수 =====")
print(test.isnull().sum())

# Class balance of the label.
print("\n===== target 빈도 =====")
print(train['target'].value_counts())

===== 데이터 정보(자료형) =====
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15326 entries, 0 to 15325
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             15326 non-null  int64  
 1   city                    15326 non-null  object 
 2   city_development_index  15326 non-null  float64
 3   gender                  11750 non-null  object 
 4   relevent_experience     15326 non-null  object 
 5   enrolled_university     15012 non-null  object 
 6   education_level         14961 non-null  object 
 7   major_discipline        13045 non-null  object 
 8   experience              15272 non-null  object 
 9   company_size            10539 non-null  object 
 10  company_type            10383 non-null  object 
 11  last_new_job            14984 non-null  object 
 12  training_hours          15326 non-null  int64  
 13  target                  15326 non-null  float64
dtypes: float64(2),

In [9]:
# Baseline: one-hot encoding + default random forest.
# Relies on `pd`, `train` and `test` created by the loading cell.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# 4. Preprocessing
# pop() returns the label column and removes it from `train` in one step.
target = train.pop('target')

# Missing values: every NaN is replaced by the placeholder "X".
train, test = train.fillna("X"), test.fillna("X")

# One-hot encode train and test together so both end up with the same
# dummy columns, then split them back apart by position.
n_train = len(train)
combined = pd.concat([train, test])
combined_dummies = pd.get_dummies(combined)
train = combined_dummies.iloc[:n_train]
test = combined_dummies.iloc[n_train:]

# 5. Hold out 20% of the training rows for validation
X_tr, X_val, y_tr, y_val = train_test_split(
    train, target, test_size=0.2, random_state=0
)

# 6. Train the model and evaluate it on the validation split
rf = RandomForestClassifier(random_state=0)
rf.fit(X_tr, y_tr)
pred = rf.predict_proba(X_val)

# Column 1 of predict_proba is used as the score for ROC-AUC.
roc_auc = roc_auc_score(y_val, pred[:, 1])
print('roc_auc:', roc_auc)

# 7. Predict on the test set and write the result file
pred = rf.predict_proba(test)
submit = pd.DataFrame({'pred': pred[:, 1]})
submit.to_csv("result.csv", index=False)

roc_auc: 0.7730742036233207


In [11]:
# Performance improvement: label encoding + robust scaling + tuned forest.
# This cell reloads the data, so it does not depend on earlier cells.

# 2. Load libraries and data
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, RobustScaler

train = pd.read_csv("hr_train.csv")
test = pd.read_csv("hr_test.csv")

# 4. Preprocessing
# pop() returns the label column and removes it from `train`.
target = train.pop('target')

# Missing values: every NaN is replaced by the placeholder "X".
train = train.fillna("X")
test = test.fillna("X")

# Record which columns are categorical and which are numeric BEFORE
# encoding. After label encoding no object columns remain, so a later
# select_dtypes(exclude='object') would match every column.
cols = train.select_dtypes(include='object').columns
n_cols = train.select_dtypes(exclude='object').columns
# NOTE(review): n_cols includes enrollee_id, which looks like an identifier
# rather than a real feature - confirm it should stay in the model.

# Label encoding, fitted on train + test together so that categories that
# appear in only one of the two files still receive a code.
combined = pd.concat([train, test])
for col in cols:
    le = LabelEncoder()
    combined[col] = le.fit_transform(combined[col])
n_train = len(train)
# .copy() gives independent frames so the column assignments below do not
# write into a slice of `combined`.
train = combined[:n_train].copy()
test = combined[n_train:].copy()

# Scaling: only the originally numeric columns are scaled. The integer
# codes produced by LabelEncoder are arbitrary labels and are left as-is.
# The scaler is fitted on all training rows (before the validation split)
# and then applied to the test rows.
scaler = RobustScaler()
train[n_cols] = scaler.fit_transform(train[n_cols])
test[n_cols] = scaler.transform(test[n_cols])

# 5. Hold out 20% of the training rows for validation
X_tr, X_val, y_tr, y_val = train_test_split(train, target, test_size=0.2, random_state=0)

# 6. Train the model and evaluate it on the validation split
rf = RandomForestClassifier(max_depth=7, n_estimators=200, random_state=0)
rf.fit(X_tr, y_tr)
pred = rf.predict_proba(X_val)

# Column 1 of predict_proba is used as the score for ROC-AUC.
roc_auc = roc_auc_score(y_val, pred[:, 1])
print('roc_auc:', roc_auc)

# 7. Predict on the test set and write the result file
pred = rf.predict_proba(test)
submit = pd.DataFrame({'pred':pred[:, 1]})
submit.to_csv("result.csv", index=False)

roc_auc: 0.7825363713412095
