In [None]:
pip install optuna==4.3.0

Collecting optuna==4.3.0
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna==4.3.0)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna==4.3.0)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.2-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.9/231.9 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.15.2 colorlog-6.9.0 optuna-4.3.0


In [None]:
import numpy as np
import pandas as pd
import optuna
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score

In [None]:
# Load the training set, the test set and the sample submission file
train, test, submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
# Missing-value handling
# Every fill value is computed on the training set and reused for the test set,
# so both frames are imputed with the same value per column and no test-set
# statistic enters the preprocessing.
categorical_cols = ['분야', '기업가치(백억원)']
for col in categorical_cols:
    fill_value = train[col].mode()[0]  # most frequent training value
    train[col] = train[col].fillna(fill_value)
    test[col] = test[col].fillna(fill_value)

numerical_cols = ['직원 수', '고객수(백만명)']
for col in numerical_cols:
    fill_value = train[col].median()  # training median
    train[col] = train[col].fillna(fill_value)
    test[col] = test[col].fillna(fill_value)

# Categorical encoding
# Each encoder is fitted on the training column and applied to the test column.
# Values are cast to str first, so the integer codes follow the sorted order of
# the string representations.
# NOTE(review): LabelEncoder.transform raises ValueError if the test column
# contains a label that never occurs in train — confirm this cannot happen.
categorical_features = ['국가', '분야', '투자단계', '인수여부', '상장여부', '기업가치(백억원)']
label_encoders = {}  # feature name -> fitted encoder, kept so the codes can be mapped back to labels
for feature in categorical_features:
    le = LabelEncoder()
    train[feature] = le.fit_transform(train[feature].astype(str))
    test[feature] = le.transform(test[feature].astype(str))
    label_encoders[feature] = le

In [None]:
### Sanity check: print the first rows with every column visible
pd.options.display.max_columns = None  # lift pandas' column-count display limit
print(train.head(n=5))

           ID  설립연도  국가  분야  투자단계    직원 수  인수여부  상장여부  고객수(백만명)  총 투자금(억원)  \
0  TRAIN_0000  2009   4   6     2  4126.0     0     0      56.0     3365.0   
1  TRAIN_0001  2023   5   8     1  4167.0     1     0      80.0     4069.0   
2  TRAIN_0002  2018   6   2     2  3132.0     1     1      54.0     6453.0   
3  TRAIN_0003  2016   5   4     1  3245.0     1     1      49.0      665.0   
4  TRAIN_0004  2020   1   5     1  1969.0     0     1      94.0      829.0   

   연매출(억원)  SNS 팔로워 수(백만명)  기업가치(백억원)  성공확률  
0   4764.0            4.71          3   0.3  
1    279.0            1.00          1   0.8  
2  12141.0            4.00          2   0.5  
3  10547.0            2.97          3   0.7  
4   9810.0            1.00          0   0.1  


In [None]:
# Feature engineering, applied identically to the train and test frames so that
# both end up with the same derived columns.

# Field names flagged by the '핫분야' feature (healthcare / fintech).
HOT_FIELDS = ['헬스케어', '핀테크']


def add_engineered_features(frame, raw_field):
    """Add the derived feature columns to `frame` in place and return it.

    Parameters
    ----------
    frame : pandas.DataFrame
        Preprocessed data holding the raw numeric columns referenced below.
    raw_field : pandas.Series
        The original string values of '분야' for the same rows, in the same
        order. frame['분야'] is already label-encoded to integers at this
        point, so comparing it with field names would never match and the
        flag would be 0 for every row.

    Returns
    -------
    pandas.DataFrame
        The same `frame` object with the new columns added.

    Raises
    ------
    ValueError
        If `raw_field` and `frame` have different lengths.
    """
    if len(raw_field) != len(frame):
        raise ValueError('raw_field must have exactly one value per row of frame')

    revenue = frame['연매출(억원)']
    investment = frame['총 투자금(억원)']
    headcount = frame['직원 수']
    followers = frame['SNS 팔로워 수(백만명)']

    # 1. Efficiency ratios; the +1 keeps the denominator non-zero when the raw value is 0
    frame['직원당연매출'] = revenue / (headcount + 1)
    frame['팔로워당매출'] = revenue / (followers + 1)
    frame['투자금대비매출'] = revenue / (investment + 1)
    # NOTE(review): '기업가치(백억원)' holds label codes here, not valuations — confirm this ratio is intended
    frame['기업가치대비투자'] = investment / (frame['기업가치(백억원)'] + 1)

    # 2. log1p transforms of the skewed numeric columns
    frame['log_연매출'] = np.log1p(revenue)
    frame['log_투자금'] = np.log1p(investment)
    frame['log_직원수'] = np.log1p(headcount)
    frame['log_팔로워'] = np.log1p(followers)

    # 3. Percentile ranks, computed within the frame that is passed in
    frame['연매출순위'] = revenue.rank(pct=True)
    frame['직원수순위'] = headcount.rank(pct=True)

    # 4. Company age relative to 2025
    frame['기업연차'] = 2025 - frame['설립연도']

    # 5. Hot-field flag, taken from the raw field names; assigned by position, not by index
    frame['핫분야'] = raw_field.isin(HOT_FIELDS).astype(int).to_numpy()
    return frame


# Re-read only the raw field column. This assumes `train` / `test` still have the
# same rows in the same order as their CSV files.
raw_train_field = pd.read_csv('train.csv', usecols=['분야'])['분야']
raw_test_field = pd.read_csv('test.csv', usecols=['분야'])['분야']
# Missing field names take the most frequent training value before flagging.
field_fill = raw_train_field.mode()[0]

train = add_engineered_features(train, raw_train_field.fillna(field_fill))
# NOTE(review): assumes test.csv has the same feature columns as train.csv — confirm
test = add_engineered_features(test, raw_test_field.fillna(field_fill))

In [None]:
# Correlation of every numeric column with the target, strongest positive first.
# numeric_only=True skips the string 'ID' column, which makes corr() raise in
# pandas >= 2.0, so the frame no longer has to be modified before this cell runs.
train.corr(numeric_only=True)['성공확률'].sort_values(ascending=False)


Unnamed: 0,성공확률
성공확률,1.0
기업가치(백억원),0.033876
상장여부,0.031413
국가,0.02957
log_팔로워,0.020756
SNS 팔로워 수(백만명),0.017927
직원당연매출,0.009359
설립연도,0.002147
기업연차,-0.002147
투자단계,-0.007835


In [None]:
### Sanity check: print the first rows again, now including the engineered columns
pd.options.display.max_columns = None  # lift pandas' column-count display limit
print(train.head(n=5))

   설립연도  국가  분야  투자단계    직원 수  인수여부  상장여부  고객수(백만명)  총 투자금(억원)  연매출(억원)  \
0  2009   4   6     2  4126.0     0     0      56.0     3365.0   4764.0   
1  2023   5   8     1  4167.0     1     0      80.0     4069.0    279.0   
2  2018   6   2     2  3132.0     1     1      54.0     6453.0  12141.0   
3  2016   5   4     1  3245.0     1     1      49.0      665.0  10547.0   
4  2020   1   5     1  1969.0     0     1      94.0      829.0   9810.0   

   SNS 팔로워 수(백만명)  기업가치(백억원)  성공확률    직원당연매출       팔로워당매출    투자금대비매출  \
0            4.71          3   0.3  1.154349   834.325744   1.415330   
1            1.00          1   0.8  0.066939   139.500000   0.068550   
2            4.00          2   0.5  3.875199  2428.200000   1.881159   
3            2.97          3   0.7  3.249230  2656.675063  15.836336   
4            1.00          0   0.1  4.979695  4905.000000  11.819277   

   기업가치대비투자   log_연매출   log_투자금   log_직원수   log_팔로워     연매출순위     직원수순위  기업연차  \
0    841.25  8.469053  8.121480  8.

In [None]:
# Split features and target
# errors='ignore' keeps this cell runnable when 'ID' has already been removed
# from `train`; a plain drop raises KeyError on the missing label.
X_train = train.drop(columns=['ID', '성공확률'], errors='ignore')
y_train = train['성공확률']

# Optuna objective for XGBoost (n_estimators is held fixed)
def objective_xgb(trial):
    """Score one XGBoost configuration sampled from `trial`.

    Builds an XGBRegressor with 500 trees and the sampled hyperparameters, then
    returns the mean of the 5-fold cross-validated negative MAE on
    X_train / y_train (higher is better).
    """
    # Sampled in this fixed order; each name is the key under which the value
    # later appears in study.best_params.
    sampled = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 30),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0),
    }
    regressor = XGBRegressor(n_estimators=500, random_state=42, n_jobs=-1, **sampled)
    fold_scores = cross_val_score(
        regressor,
        X_train,
        y_train,
        cv=5,
        scoring='neg_mean_absolute_error',
        n_jobs=-1,
    )
    return fold_scores.mean()

# Tune XGBoost. The objective returns negative MAE, so the study maximizes it.
# TPESampler is Optuna's default sampler; seeding it makes the sequence of
# sampled trials repeatable across kernel restarts.
study_xgb = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_xgb.optimize(objective_xgb, n_trials=30)

# Unfitted model rebuilt from the best trial; n_estimators repeats the value fixed in the objective.
xgb_model = XGBRegressor(**study_xgb.best_params, n_estimators=500, random_state=42, n_jobs=-1)

# Optuna objective for RandomForest (n_estimators is held fixed)
def objective_rf(trial):
    """Score one random-forest configuration sampled from `trial`.

    Builds a RandomForestRegressor with 500 trees and the sampled
    hyperparameters, then returns the mean of the 5-fold cross-validated
    negative MAE on X_train / y_train (higher is better).
    """
    # Sampled in this fixed order; each name is the key under which the value
    # later appears in study.best_params.
    sampled = {
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 4),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
    }
    forest = RandomForestRegressor(n_estimators=500, random_state=42, n_jobs=-1, **sampled)
    fold_scores = cross_val_score(
        forest,
        X_train,
        y_train,
        cv=5,
        scoring='neg_mean_absolute_error',
        n_jobs=-1,
    )
    return fold_scores.mean()

# Tune RandomForest. The objective returns negative MAE, so the study maximizes it.
# TPESampler is Optuna's default sampler; seeding it makes the sequence of
# sampled trials repeatable across kernel restarts.
study_rf = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_rf.optimize(objective_rf, n_trials=30)

# Unfitted model rebuilt from the best trial; n_estimators repeats the value fixed in the objective.
rf_model = RandomForestRegressor(**study_rf.best_params, n_estimators=500, random_state=42, n_jobs=-1)

[I 2025-05-16 04:45:08,204] A new study created in memory with name: no-name-aeaf8d6d-144e-402c-8b4b-61a0b722f58c
[I 2025-05-16 04:45:18,307] Trial 0 finished with value: -0.21244158070723915 and parameters: {'learning_rate': 0.22833003012112513, 'max_depth': 23, 'min_child_weight': 7, 'subsample': 0.7912800674185398, 'colsample_bytree': 0.7569578098431622, 'reg_alpha': 0.051428405453881165, 'reg_lambda': 0.1302315442349513}. Best is trial 0 with value: -0.21244158070723915.
[I 2025-05-16 04:45:22,089] Trial 1 finished with value: -0.20936766939523554 and parameters: {'learning_rate': 0.09920879900558476, 'max_depth': 6, 'min_child_weight': 1, 'subsample': 0.9251151198342584, 'colsample_bytree': 0.7213062894076061, 'reg_alpha': 0.036217932714581846, 'reg_lambda': 0.7352391244133477}. Best is trial 1 with value: -0.20936766939523554.
[I 2025-05-16 04:45:26,354] Trial 2 finished with value: -0.21071597657640698 and parameters: {'learning_rate': 0.1821874339543086, 'max_depth': 15, 'min_c