# 코드

In [1]:
import pandas as pd
import numpy as np
import optuna
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler

In [2]:
# 데이터 불러오기
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')

In [3]:
# info, 결측치 확인
print(train.info())
print(train.isnull().sum())
print(test.isnull().sum())

print(train['근로기간'].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96294 entries, 0 to 96293
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   ID            96294 non-null  object 
 1   대출금액          96294 non-null  int64  
 2   대출기간          96294 non-null  object 
 3   근로기간          96294 non-null  object 
 4   주택소유상태        96294 non-null  object 
 5   연간소득          96294 non-null  int64  
 6   부채_대비_소득_비율   96294 non-null  float64
 7   총계좌수          96294 non-null  int64  
 8   대출목적          96294 non-null  object 
 9   최근_2년간_연체_횟수  96294 non-null  int64  
 10  총상환원금         96294 non-null  int64  
 11  총상환이자         96294 non-null  float64
 12  총연체금액         96294 non-null  float64
 13  연체계좌수         96294 non-null  float64
 14  대출등급          96294 non-null  object 
dtypes: float64(4), int64(5), object(6)
memory usage: 11.0+ MB
None
ID              0
대출금액            0
대출기간            0
근로기간            0
주택소유상태          0
연간

In [4]:
# 근로기간 데이터 변경
train['근로기간'] = train['근로기간'].replace({
    '10+ years': '10+ years',
    '10+years': '10+ years',
    '<1 year': '< 1 year',
    '1 years': '< 1 year',
    '3': '3 years'
})

test['근로기간'] = test['근로기간'].replace({
    '10+ years': '10+ years',
    '10+years': '10+ years',
    '<1 year': '< 1 year',
    '1 years': '< 1 year',
    '3': '3 years'
})

In [5]:
# 근로기간 Unknown -> 10+ years로 변경(최빈값)
train['근로기간'].replace({'Unknown' : '10+ years'}, inplace = True)
test['근로기간'].replace({'Unknown' : '10+ years'}, inplace = True)

# '근로기간' 열의 값 변환 함수 정의
def convert_experience(experience):
    if '+' in experience:
        return 10
    elif '< 1' in experience:
        return 0
    else:
        return int(''.join(filter(str.isdigit, experience)))

# '근로기간' 열 변환
train['근로기간'] = train['근로기간'].map(convert_experience)
test['근로기간'] = test['근로기간'].map(convert_experience)

# 변환 후의 값 확인
print(train['근로기간'].value_counts())

# 데이터 변경
test['대출목적'] = test['대출목적'].replace({'결혼' : '기타'})

10    38152
2      8450
0      8200
3      7670
1      6249
5      5665
4      5588
8      4888
6      3874
7      3814
9      3744
Name: 근로기간, dtype: int64


In [6]:
# 파생변수 생성
train['대출기간(month)'] = train['대출기간'].str.extract('(\d+)').astype(int)
test['대출기간(month)'] = test['대출기간'].str.extract('(\d+)').astype(int)

train['월상환액'] = train['대출금액'] / train['대출기간(month)']
test['월상환액'] = test['대출금액'] / test['대출기간(month)']

train['총상환원금 비율'] = train['총상환원금'] / train['대출금액']
test['총상환원금 비율'] = test['총상환원금'] / test['대출금액']
train['총상환이자 비율'] = train['총상환이자'] / train['대출금액']
test['총상환이자 비율'] = test['총상환이자'] / test['대출금액']

train['연간소득_log'] = np.log1p(train['연간소득'])
test['연간소득_log'] = np.log1p(test['연간소득'])

train = train.drop(columns = ['대출기간', '연간소득'], axis= 1)
test = test.drop(columns = ['대출기간', '연간소득'], axis= 1)

In [7]:
# 스케일링
scaler = MinMaxScaler()
numeric = train.select_dtypes(exclude= 'object').columns
train[numeric] = scaler.fit_transform(train[numeric])
test[numeric] = scaler.transform(test[numeric])

# 테스트 데이터 ID 추출
test_id = test['ID']

# ID 제거
train = train.drop('ID', axis = 1)
test = test.drop('ID', axis = 1)

# 라벨인코딩
le = LabelEncoder()
object = train.select_dtypes(include = 'object').columns
for i in object:
    if i != '대출등급':
        train[i] = le.fit_transform(train[i])
        test[i] = le.transform(test[i])

# 데이터 분할
X = train.drop('대출등급', axis = 1)
y = train['대출등급']

In [None]:
# Optuna의 목적 함수를 정의합니다.
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'max_features': trial.suggest_float('max_features', 0.1, 1),
        'max_depth': trial.suggest_int('max_depth', 10, 50),
    }

    # class_weight 파라미터를 Optuna로 최적화
    class_weights = trial.suggest_categorical('class_weights', [None, 'balanced'])

    model = RandomForestClassifier(
        n_estimators=params['n_estimators'],
        min_samples_split=params['min_samples_split'],
        max_features=min(params['max_features'], 0.999),  # float type
        max_depth=params['max_depth'],
        random_state=2024,
        n_jobs=-1,
        class_weight=class_weights
    )

    # Define the StratifiedKFold
    stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True)

    # Use StratifiedKFold in cross_val_score
    score = cross_val_score(model, X, y, cv=stratified_kfold, scoring='f1_macro').mean()

    return score

# Optuna 스터디를 생성합니다.
study = optuna.create_study(direction='maximize', study_name='RandomForestOptimization')

# 목적 함수를 최적화합니다.
study.optimize(objective, n_trials=50, n_jobs=-1)

[I 2024-02-08 09:48:12,021] A new study created in memory with name: RandomForestOptimization
[I 2024-02-08 09:58:58,333] Trial 0 finished with value: 0.9289108068412751 and parameters: {'n_estimators': 134, 'min_samples_split': 3, 'max_features': 0.3105374083990755, 'max_depth': 49, 'class_weights': 'balanced'}. Best is trial 0 with value: 0.9289108068412751.
[I 2024-02-08 09:59:09,632] Trial 2 finished with value: 0.881475105520674 and parameters: {'n_estimators': 174, 'min_samples_split': 2, 'max_features': 0.1480439381179584, 'max_depth': 28, 'class_weights': 'balanced'}. Best is trial 0 with value: 0.9289108068412751.
[I 2024-02-08 10:04:32,409] Trial 9 finished with value: 0.6784777813061811 and parameters: {'n_estimators': 111, 'min_samples_split': 7, 'max_features': 0.11387502923330106, 'max_depth': 40, 'class_weights': None}. Best is trial 0 with value: 0.9289108068412751.
[I 2024-02-08 10:06:20,573] Trial 8 finished with value: 0.9315284055114867 and parameters: {'n_estimator

In [None]:
# 최적의 하이퍼파라미터를 가져옵니다.
best_params = study.best_params

# 최적의 하이퍼파라미터로 모델을 생성합니다.
best_model = RandomForestClassifier(
    n_estimators=best_params['n_estimators'],
    min_samples_split=best_params['min_samples_split'],
    max_features=min(best_params['max_features'], 0.999),  # float type
    max_depth=best_params['max_depth'],
    random_state=2024,
    n_jobs=-1,
    class_weight=best_params['class_weights']
)

# 전체 학습 데이터로 모델을 학습합니다.
best_model.fit(X, y)

# 최종 테스트 세트에 대한 예측을 수행합니다.
y_test_pred = best_model.predict(test)

# 제출용 데이터프레임을 생성합니다.
submission = pd.DataFrame({'ID': test_id, '대출등급': y_test_pred})

# CSV
submission.to_csv('submission_RandomForestClassifier_optuna_cv5_StratifiedKFold_non_split_class_weight.csv', index=False, encoding='utf-8-sig')