In [5]:
# automate_feature_selection.py (수정본)

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score
from sklearn.impute import SimpleImputer
import warnings
from tqdm.auto import tqdm
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.linear_model import RidgeCV
import pickle
import torch

# ✅✅✅ 누락되었던 import 구문 추가 ✅✅✅
from transformers import AutoTokenizer, AutoModel

warnings.filterwarnings('ignore')

# ------------------------------------------------------------------
# 설정 변수
# ------------------------------------------------------------------
CANDIDATE_FEATURES_TO_REMOVE = [
    '일반분양', '특별분양', '세대수', 
    '분양가(만원)', '공급면적(㎡)', '전용면적(㎡)',
    '대형마트 - 1.5km 이내', '대형쇼핑 - 3km 이내', '편의점 - 500m 이내',
    '은행 - 1km 이내', '공원 - 1.5km 이내', '관공서 - 1.5km 이내',
    '상급병원 - 1.5km 이내', '상권 - 3km 이내', '초등학교(2km 이내)',
    '중학교(2km 이내)', '고등학교(2km 이내)', '지하철 - 반경 1.5km 이내',
    '버스 - 반경 500m 이내', '고속철도 - 10km 이내', '고속도로IC - 10km 이내'
]
PERFORMANCE_THRESHOLD = 0.66

# ------------------------------------------------------------------

# 헬퍼 함수
def extract_brand(apt_name, brand_list):
    for brand in brand_list:
        if brand in apt_name: return brand
    return '기타 브랜드'

def get_embeddings(data, model, tokenizer):
    embeddings = []
    for text in data:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=50)
        with torch.no_grad(): outputs = model(**inputs)
        cls_embedding = outputs.last_hidden_state[:, 0, :].numpy()
        embeddings.append(cls_embedding)
    return np.vstack(embeddings)

def train_and_evaluate_model(dataframe):
    """주어진 데이터프레임으로 모델을 학습하고 R² 점수를 반환하는 함수"""
    df = dataframe.copy()
    
    # --- 특성 공학 (수정된 부분) ---
    df.dropna(subset=['분양률'], inplace=True)
    
    # ✅ 각 컬럼이 존재하는지 확인하는 조건문 추가
    if '세대수' in df.columns:
        df = df[df['세대수'] > 3].reset_index(drop=True)
        df['세대수'] = pd.to_numeric(df.get('세대수'), errors='coerce')
        df['소규모단지여부'] = (df['세대수'] < 10).astype(int)
    else:
        df['소규모단지여부'] = 0 # 세대수 정보가 없으면 0으로 처리

    if '일반분양' in df.columns:
        df['일반분양'] = pd.to_numeric(df.get('일반분양'), errors='coerce').fillna(0)

    if '특별분양' in df.columns:
        df['특별분양'] = pd.to_numeric(df.get('특별분양'), errors='coerce').fillna(0)
        df['특별분양유무'] = df['특별분양'].apply(lambda x: 1 if x > 0 else 0)
    else:
        df['특별분양유무'] = 0

    df['기준년월'] = pd.to_datetime(df['기준년월'], format='%Y%m')
    df['년'] = df['기준년월'].dt.year
    df['월'] = df['기준년월'].dt.month
    df['월_sin'] = np.sin(2 * np.pi * df['월']/12)
    df['월_cos'] = np.cos(2 * np.pi * df['월']/12)
    df['분기'] = df['월'].apply(lambda x: (x-1)//3 + 1)
    df.drop(columns=['기준년월', '미분양수', '주변시세 평균', '월'], inplace=True, errors='ignore')
    
    with open('brand_priority_list.txt', 'r', encoding='utf-8') as f: brand_priority_list = [line.strip() for line in f.readlines()]
    df['브랜드'] = df['아파트'].apply(lambda x: extract_brand(str(x), brand_priority_list))
    
    top_10_builders = ['삼성물산', '현대건설', '대우건설', '현대엔지니어링', '지에스건설', '디엘이앤씨', '포스코이앤씨', '롯데건설', '에스케이에코플랜트', '호반건설']
    df['건설사_등급'] = df['건설사'].apply(lambda x: 'Top10' if any(builder in str(x) for builder in top_10_builders) else 'Other')
    df['지역_브랜드'] = df['지역'] + '_' + df['브랜드']
    
    infra_cols = [col for col in df.columns if 'km' in col or '500m' in col]
    if infra_cols:
        df['인프라_점수'] = df[infra_cols].sum(axis=1)

    if '분양가(만원)' in df.columns and '공급면적(㎡)' in df.columns:
        df['평당분양가'] = (df['분양가(만원)'] / df['공급면적(㎡)'] / 3.3).replace([np.inf, -np.inf], 0).fillna(0)
        
    if '전용면적(㎡)' in df.columns and '공급면적(㎡)' in df.columns:
        df['전용률'] = ((df['전용면적(㎡)'] / df['공급면적(㎡)']) * 100).replace([np.inf, -np.inf], 0).fillna(0)

    # --- 모델링 데이터 준비 ---
    target = '분양률'
    # 제거할 컬럼 목록에서 infra_cols만 확실히 제거하고, 나머지는 이미 없으므로 errors='ignore'로 처리
    drop_cols = infra_cols + ['아파트', '브랜드', '건설사', '분양률']
    X = df.drop(columns=drop_cols, errors='ignore')
    y = df[target]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1004)
    numerical_cols = X.select_dtypes(include=np.number).columns.tolist()
    categorical_cols = X.select_dtypes(exclude=np.number).columns.tolist()
    
    preprocessor = ColumnTransformer(transformers=[
        ('num', Pipeline(steps=[('imputer', SimpleImputer(strategy='mean')), ('scaler', StandardScaler())]), numerical_cols),
        ('cat', Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value='missing')), ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))]), categorical_cols)
    ])
    
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    # --- 모델 학습 및 평가 ---
    estimators = [('lgbm', lgb.LGBMRegressor(random_state=42)), ('xgb', xgb.XGBRegressor(random_state=42)), ('rf', RandomForestRegressor(random_state=42, n_jobs=-1))]
    stacking_model = StackingRegressor(estimators=estimators, final_estimator=RidgeCV(), cv=5, n_jobs=-1)
    stacking_model.fit(X_train_processed, y_train)
    
    y_pred = stacking_model.predict(X_test_processed)
    return r2_score(y_test, y_pred)

# --- 메인 실행 로직 ---
if __name__ == "__main__":
    df_original = pd.read_csv("final_data.csv")
    
    print("최초 BERT 임베딩을 생성합니다. 시간이 다소 걸릴 수 있습니다...")
    MODEL_NAME = "kykim/bert-kor-base"
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModel.from_pretrained(MODEL_NAME)
    
    temp_df_for_embedding = df_original[['아파트', '건설사']].copy()
    temp_df_for_embedding['브랜드'] = temp_df_for_embedding['아파트'].apply(lambda x: extract_brand(str(x), [b.strip() for b in open('brand_priority_list.txt', 'r', encoding='utf-8').readlines()]))
    
    brand_embeddings = get_embeddings(temp_df_for_embedding['브랜드'], model, tokenizer)
    co_embeddings = get_embeddings(temp_df_for_embedding['건설사'], model, tokenizer)
    brand_embed_df = pd.DataFrame(brand_embeddings, columns=[f'brand_embed_{i}' for i in range(brand_embeddings.shape[1])])
    co_embed_df = pd.DataFrame(co_embeddings, columns=[f'co_embed_{i}' for i in range(co_embeddings.shape[1])])
    print("✅ BERT 임베딩 생성 완료.")

    features_to_drop_permanently = []
    
    print("\n--- 기준 성능 측정 (모든 피처 사용) ---")
    base_df = pd.concat([df_original.reset_index(drop=True), brand_embed_df, co_embed_df], axis=1)
    best_score = train_and_evaluate_model(base_df)
    print(f"▶ 기준 R² Score: {best_score:.4f}\n")

    remaining_features = CANDIDATE_FEATURES_TO_REMOVE.copy()

    for i in range(len(CANDIDATE_FEATURES_TO_REMOVE)):
        print(f"\n--- [ 반복 {i+1} ] 가장 덜 중요한 피처를 찾습니다 ---")
        iteration_results = {}
        
        # 이번 라운드에 테스트할 후보군이 없으면 종료
        if not remaining_features:
            print("▶ 테스트할 피처가 더 이상 없습니다.")
            break

        for feature_to_test in tqdm(remaining_features, desc=f"피처 중요도 평가 중"):
            current_drop_list = features_to_drop_permanently + [feature_to_test]
            
            temp_df = df_original.drop(columns=current_drop_list, errors='ignore')
            temp_df_processed = pd.concat([temp_df.reset_index(drop=True), brand_embed_df, co_embed_df], axis=1)

            score = train_and_evaluate_model(temp_df_processed)
            iteration_results[feature_to_test] = score
            
        best_feature_to_remove = max(iteration_results, key=iteration_results.get)
        score_after_removal = iteration_results[best_feature_to_remove]
        
        print(f"▶ 이번 라운드 최적 제거 후보: '{best_feature_to_remove}' (제거 시 성능: {score_after_removal:.4f})")
        
        if score_after_removal >= PERFORMANCE_THRESHOLD:
            best_score = score_after_removal
            features_to_drop_permanently.append(best_feature_to_remove)
            remaining_features.remove(best_feature_to_remove)
            print(f"✅ 성능이 {PERFORMANCE_THRESHOLD} 이상이므로 '{best_feature_to_remove}'를 최종 제거합니다.\n")
        else:
            print(f"❌ 더 이상 피처를 제거하면 성능이 {PERFORMANCE_THRESHOLD} 아래로 떨어집니다. 과정을 종료합니다.\n")
            break

    # 최종 결과 출력
    print("\n" + "="*50)
    print("        ✨ 피처 선택 자동화 완료 ✨")
    print("="*50)
    print(f"최종 성능 (R² Score): {best_score:.4f}")
    print("\n✅ 최종적으로 제외된 피처 목록:")
    if features_to_drop_permanently:
        for f in features_to_drop_permanently: print(f"  - {f}")
    else:
        print("  (제외된 피처 없음)")
        
    print("\n✅ 최종 모델에 사용될 필수 입력 피처 목록:")
    final_input_features = [f for f in CANDIDATE_FEATURES_TO_REMOVE if f not in features_to_drop_permanently]
    for f in final_input_features: print(f"  - {f}")
    print("\n(위 목록 외 '아파트', '건설사', '지역', '기준년월'은 필수입니다.)")
    print("="*50)

최초 BERT 임베딩을 생성합니다. 시간이 다소 걸릴 수 있습니다...
✅ BERT 임베딩 생성 완료.

--- 기준 성능 측정 (모든 피처 사용) ---


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.152588 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 85065
[LightGBM] [Info] Number of data points in the train set: 1585, number of used features: 1594
[LightGBM] [Info] Start training from score 0.826072
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.051590 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 79186
[LightGBM] [Info] Number of data points in the train set: 1268, number of used features: 1592
[LightGBM] [Info] Start training from score 0.824270
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.113309 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Auto

피처 중요도 평가 중:   0%|          | 0/21 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.098239 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 84902
[LightGBM] [Info] Number of data points in the train set: 1585, number of used features: 1593
[LightGBM] [Info] Start training from score 0.826072
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.073397 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 79039
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.349371 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Number of data points in the train set: 1268, number of used features: 1591
[LightGBM] [Info] Total Bins 80051
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.279396 seconds.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.023902 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 84555
[LightGBM] [Info] Number of data points in the train set: 1585, number of used features: 1592
[LightGBM] [Info] Start training from score 0.826072
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.024272 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 77623
[LightGBM] [Info] Number of data points in the train set: 1268, number of used features: 1589
[LightGBM] [Info] Start training from score 0.829997
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.256071 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 79228
[LightGBM] [Info] Auto-choosing row-wise multi-thre

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.082453 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 85062
[LightGBM] [Info] Number of data points in the train set: 1585, number of used features: 1594
[LightGBM] [Info] Start training from score 0.826072
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.036990 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 79736
[LightGBM] [Info] Number of data points in the train set: 1268, number of used features: 1591
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.049815 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.187489 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 76350
[LightGBM] [Info] Number of data points in the train set: 1268, number of used features: 1590
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.346620 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 79181
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.349959 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.282988 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, th

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.051436 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 85064
[LightGBM] [Info] Number of data points in the train set: 1585, number of used features: 1594
[LightGBM] [Info] Start training from score 0.826072
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.068295 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.046746 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 76357
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.076651 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 79186
[LightGBM] [Info] Auto-choosing row-

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.088578 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 85058
[LightGBM] [Info] Number of data points in the train set: 1585, number of used features: 1594
[LightGBM] [Info] Start training from score 0.826072
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.037893 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 79180
[LightGBM] [Info] Number of data points in the train set: 1268, number of used features: 1592
[LightGBM] [Info] Start training from score 0.824270
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.075390 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Auto

피처 중요도 평가 중:   0%|          | 0/20 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.144306 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 84392
[LightGBM] [Info] Number of data points in the train set: 1585, number of used features: 1591
[LightGBM] [Info] Start training from score 0.826072
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.096680 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 77469
[LightGBM] [Info] Number of data points in the train set: 1268, number of used features: 1588
[LightGBM] [Info] Start training from score 0.829997
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.197413 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Tota

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.030204 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 83299
[LightGBM] [Info] Number of data points in the train set: 1771, number of used features: 1595
[LightGBM] [Info] Start training from score 0.821503
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.043723 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 78380
[LightGBM] [Info] Number of data points in the train set: 1417, number of used features: 1587
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.150237 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 78561
[LightGBM] [Info] Start training from sc

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.097645 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 75842
[LightGBM] [Info] Number of data points in the train set: 1268, number of used features: 1588
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.277879 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 77619
[LightGBM] [Info] Number of data points in the train set: 1268, number of used features: 1589
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.141624 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 78673
[LightGBM] [Info] Number of d

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.026696 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84552
[LightGBM] [Info] Number of data points in the train set: 1585, number of used features: 1592
[LightGBM] [Info] Start training from score 0.826072
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.306854 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 79688
[LightGBM] [Info] Number of data points in the train set: 1268, number of used features: 1588
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.223665 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 78673
[LightGBM] [Info] Number of data points 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.060055 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84549
[LightGBM] [Info] Number of data points in the train set: 1585, number of used features: 1592
[LightGBM] [Info] Start training from score 0.826072
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.108415 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 75840
[LightGBM] [Info] Number of data points in the train set: 1268, number of used features: 1588
[LightGBM] [Info] Start training from score 0.822896
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.136624 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Tota

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.123352 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 84551
[LightGBM] [Info] Number of data points in the train set: 1585, number of used features: 1592
[LightGBM] [Info] Start training from score 0.826072
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.167034 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.056283 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.214913 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 75843
[LightG

피처 중요도 평가 중:   0%|          | 0/19 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.044375 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84388
[LightGBM] [Info] Number of data points in the train set: 1585, number of used features: 1591
[LightGBM] [Info] Start training from score 0.826072
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.228429 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 79078
[LightGBM] [Info] Number of data points in the train set: 1268, number of used features: 1588
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.223634 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 78526
[LightGBM] [Info] Number of data points 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.045424 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 84550
[LightGBM] [Info] Number of data points in the train set: 1585, number of used features: 1592
[LightGBM] [Info] Start training from score 0.826072
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.073977 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 75842
[LightGBM] [Info] Number of data points in the train set: 1268, number of used features: 1588
[LightGBM] [Info] Start training from score 0.822896
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.102806 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.103691 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 75838
[LightGBM] [Info] Number of data points in the train set: 1268, number of used features: 1588
[LightGBM] [Info] Start training from score 0.822896
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.108087 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 79222
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.185925 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Number of data points in the train set: 1268, number of used features: 1589
[LightGBM] [Info] Total Bins 77615
[LightGBM] [Info] Number of data points in the train set: 1268, number of used features: 1589
[LightGBM]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.042180 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84548
[LightGBM] [Info] Number of data points in the train set: 1585, number of used features: 1592
[LightGBM] [Info] Start training from score 0.826072
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.069697 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 79224
[LightGBM] [Info] Number of data points in the train set: 1268, number of used features: 1589
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.134362 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 75841
[LightGBM] [Info] Number of data points in the train set: 1268, number of used features: 1588
[LightGBM]

피처 중요도 평가 중:   0%|          | 0/18 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.080916 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84387
[LightGBM] [Info] Number of data points in the train set: 1585, number of used features: 1591
[LightGBM] [Info] Start training from score 0.826072
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.189835 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 79536
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.165023 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Number of data points in the train set: 1268, number of used features: 1587
[LightGBM] 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.121296 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84549
[LightGBM] [Info] Number of data points in the train set: 1585, number of used features: 1592
[LightGBM] [Info] Start training from score 0.826072
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.130263 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 77617
[LightGBM] [Info] Number of data points in the train set: 1268, number of used features: 1589
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.165022 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.091198 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84547
[LightGBM] [Info] Number of data points in the train set: 1585, number of used features: 1592
[LightGBM] [Info] Start training from score 0.826072
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.131194 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 78669
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.086809 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Number of data points in the train set: 1268, number of used features: 1590
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of t

피처 중요도 평가 중:   0%|          | 0/17 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.062058 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 84387
[LightGBM] [Info] Number of data points in the train set: 1585, number of used features: 1591
[LightGBM] [Info] Start training from score 0.826072
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.192640 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.238747 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 79077
[LightGBM] [Info] Number of data points in the train set: 1268, number of used features: 1588
[LightGBM] [Info] Total Bins 75694
[LightGBM] [Info] Number of data points in the train set: 1268, number of used features: 1587
[LightGBM]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.071663 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84550
[LightGBM] [Info] Number of data points in the train set: 1585, number of used features: 1592
[LightGBM] [Info] Start training from score 0.826072
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.145015 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 79226
[LightGBM] [Info] Number of data points in the train set: 1268, number of used features: 1589
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.152028 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.117310 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84545
[LightGBM] [Info] Number of data points in the train set: 1585, number of used features: 1592
[LightGBM] [Info] Start training from score 0.826072
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.088223 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 75838
[LightGBM] [Info] Number of data points in the train set: 1268, number of used features: 1588
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.161467 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 77613
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.228621 seconds.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.068032 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 84551
[LightGBM] [Info] Number of data points in the train set: 1585, number of used features: 1592
[LightGBM] [Info] Start training from score 0.826072
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.187127 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 78673
[LightGBM] [Info] Number of data points in the train set: 1268, number of used features: 1590
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.248523 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Start training from score 0.824270
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 

피처 중요도 평가 중:   0%|          | 0/16 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.047514 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 84384
[LightGBM] [Info] Number of data points in the train set: 1585, number of used features: 1591
[LightGBM] [Info] Start training from score 0.826072
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.255967 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.180564 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 79533
[LightGBM] [Info] Number of data points in the train set: 1268, number of used features: 1587
[LightGBM] [Info] Total Bins 79074
[LightGBM] [Info] Auto-choosing row-wise

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.046727 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 84546
[LightGBM] [Info] Number of data points in the train set: 1585, number of used features: 1592
[LightGBM] [Info] Start training from score 0.826072
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.141245 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 77614
[LightGBM] [Info] Number of data points in the train set: 1268, number of used features: 1589
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.109439 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Start training from score 0.829997
[LightGBM] [Info] Auto

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020013 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84544
[LightGBM] [Info] Number of data points in the train set: 1585, number of used features: 1592
[LightGBM] [Info] Start training from score 0.826072
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.162068 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.102933 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.161987 seconds.
You can set `force_row_wise=true` to

피처 중요도 평가 중:   0%|          | 0/15 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.089516 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 84384
[LightGBM] [Info] Number of data points in the train set: 1585, number of used features: 1591
[LightGBM] [Info] Start training from score 0.826072
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.165180 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.156700 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 77461
[LightGBM] [Info] Total Bins 75691
[LightGBM] [Info] Number of data points in the train set: 1268, number of used features: 1588
[LightGBM] [Info] Number of data points in the train set: 1268, number of used features: 1587
[LightGBM]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.166863 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84402
[LightGBM] [Info] Number of data points in the train set: 1585, number of used features: 1590
[LightGBM] [Info] Start training from score 0.826072
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.105886 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 77483
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.127153 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 79554
[LightGBM] [Info] Number of data points in the train set: 1268, number

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.030599 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 84541
[LightGBM] [Info] Number of data points in the train set: 1585, number of used features: 1592
[LightGBM] [Info] Start training from score 0.826072
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.074144 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 78663
[LightGBM] [Info] Number of data points in the train set: 1268, number of used features: 1590
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.073667 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 77609
[LightGBM] [Info] Start training from score 0.824270
[LightGBM] [Info] Number of data points in the train set: 1268, number of used features: 1589
[LightGBM] [Info] Sta

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.092220 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84545
[LightGBM] [Info] Number of data points in the train set: 1585, number of used features: 1592
[LightGBM] [Info] Start training from score 0.826072
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.044002 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 79220
[LightGBM] [Info] Number of data points in the train set: 1268, number of used features: 1589
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.094934 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 75837
[LightGBM] [Info] Number of data points in the train set: 1268, number of used features: 1588
[LightGBM]

피처 중요도 평가 중:   0%|          | 0/14 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.059678 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 84379
[LightGBM] [Info] Number of data points in the train set: 1585, number of used features: 1591
[LightGBM] [Info] Start training from score 0.826072
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.175986 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 75686
[LightGBM] [Info] Number of data points in the train set: 1268, number of used features: 1587
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.232860 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 79070
[LightGBM] [Info] Start training from score 0.822896
[LightGBM] [Info] Number of data points in the train set: 1268, number of used features: 1588
[LightGBM] [Info] Aut

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.051286 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83288
[LightGBM] [Info] Number of data points in the train set: 1771, number of used features: 1595
[LightGBM] [Info] Start training from score 0.821503
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.106303 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 78550
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.182748 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.226779 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightG

피처 중요도 평가 중:   0%|          | 0/13 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.078454 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 83869
[LightGBM] [Info] Number of data points in the train set: 1585, number of used features: 1589
[LightGBM] [Info] Start training from score 0.826072
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.431447 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 78560
[LightGBM] [Info] Number of data points in the train set: 1268, number of used features: 1586
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.493483 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 79018
[LightGBM] [Info] Number of data points in the train set: 1268, number of used features: 1585
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.174401 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 84028
[LightGBM] [Info] Number of data points in the train set: 1585, number of used features: 1590
[LightGBM] [Info] Start training from score 0.826072
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.301257 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 75321
[LightGBM] [Info] Number of data points in the train set: 1268, number of used features: 1586
[LightGBM] [Info] Start training from score 0.822896
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.368524 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 78150
[LightGBM] [Info] Auto-choosing row-wise multi-thre

피처 중요도 평가 중:   0%|          | 0/12 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.037498 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 83865
[LightGBM] [Info] Number of data points in the train set: 1585, number of used features: 1589
[LightGBM] [Info] Start training from score 0.826072
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.157024 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 75172
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.217219 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 78003
[LightGBM] [Info] Number of data points in the train set: 1268, number of used features: 1585
[LightGBM] [Info] Number of data points in the train set: 1268, number of used features: 1587
[LightGBM] [Info] Start training from score 0.824270
[LightGBM] [Info] Aut

피처 중요도 평가 중:   0%|          | 0/11 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.078223 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83862
[LightGBM] [Info] Number of data points in the train set: 1585, number of used features: 1589
[LightGBM] [Info] Start training from score 0.826072
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.076927 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 75169
[LightGBM] [Info] Number of data points in the train set: 1268, number of used features: 1585
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.115527 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 79011
[LightGBM] [Info] Start training from sc

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.090792 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84022
[LightGBM] [Info] Number of data points in the train set: 1585, number of used features: 1590
[LightGBM] [Info] Start training from score 0.826072
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.072691 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 78698
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.129561 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Number of data points in the train set: 1268, number of used features: 1587
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of t

피처 중요도 평가 중:   0%|          | 0/10 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.062690 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 83856
[LightGBM] [Info] Number of data points in the train set: 1585, number of used features: 1589
[LightGBM] [Info] Start training from score 0.826072
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.058910 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 78547
[LightGBM] [Info] Number of data points in the train set: 1268, number of used features: 1586
[LightGBM] [Info] Start training from score 0.825793
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.176542 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Tota

피처 중요도 평가 중:   0%|          | 0/9 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.180888 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83853
[LightGBM] [Info] Number of data points in the train set: 1585, number of used features: 1589
[LightGBM] [Info] Start training from score 0.826072
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.166039 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.089734 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 75160
[LightGBM] [Info] Total Bins 76930
[LightGBM] [Info] Number of data points in the train set: 1268, number

피처 중요도 평가 중:   0%|          | 0/8 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.093113 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83850
[LightGBM] [Info] Number of data points in the train set: 1585, number of used features: 1589
[LightGBM] [Info] Start training from score 0.826072
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.184785 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 78541
[LightGBM] [Info] Number of data points in the train set: 1268, number of used features: 1586
[LightGBM] [Info] Start training from score 0.825793
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.221077 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Tota

피처 중요도 평가 중:   0%|          | 0/7 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.053471 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 83705
[LightGBM] [Info] Number of data points in the train set: 1585, number of used features: 1587
[LightGBM] [Info] Start training from score 0.826072
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.079486 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 77855
[LightGBM] [Info] Number of data points in the train set: 1268, number of used features: 1585
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.129729 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Start training from score 0.824270
[LightGBM] [Info] Tota

피처 중요도 평가 중:   0%|          | 0/6 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.065590 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83450
[LightGBM] [Info] Number of data points in the train set: 1585, number of used features: 1586
[LightGBM] [Info] Start training from score 0.826072
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.062253 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 76540
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.089963 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Number of data points in the train set: 1268, number of used features: 1583
[LightGBM] [Info] Total Bins 78615
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.114193 seconds.


피처 중요도 평가 중:   0%|          | 0/5 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.023860 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 82188
[LightGBM] [Info] Number of data points in the train set: 1771, number of used features: 1589
[LightGBM] [Info] Start training from score 0.821503
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.075789 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 78014
[LightGBM] [Info] Number of data points in the train set: 1417, number of used features: 1581
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.127839 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 77299
[LightGBM] [Info] Start training from score 0.821996
[LightGBM] [Info] Number of data points in the trai

피처 중요도 평가 중:   0%|          | 0/4 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015922 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 82185
[LightGBM] [Info] Number of data points in the train set: 1771, number of used features: 1589
[LightGBM] [Info] Start training from score 0.821503
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.114910 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.084745 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 77296
[LightGBM] [Info] Total Bins 77479
[LightGBM] [Info] Number of data points in the train set: 1417, number of used features: 1581
[LightGBM] [Info] Number of data points 

피처 중요도 평가 중:   0%|          | 0/3 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.023450 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 82182
[LightGBM] [Info] Number of data points in the train set: 1771, number of used features: 1589
[LightGBM] [Info] Start training from score 0.821503
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.137367 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.191053 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.194340 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `

피처 중요도 평가 중:   0%|          | 0/2 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.075895 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 82179
[LightGBM] [Info] Number of data points in the train set: 1771, number of used features: 1589
[LightGBM] [Info] Start training from score 0.821503
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.048113 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 77290
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.046976 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Number of data points in the train set: 1417, number of used features: 1581
[LightGBM] [Info] Total Bins 79967
[LightGBM] [Info] Start training from sc

피처 중요도 평가 중:   0%|          | 0/1 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.110888 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 82165
[LightGBM] [Info] Number of data points in the train set: 1771, number of used features: 1588
[LightGBM] [Info] Start training from score 0.821503
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.111711 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 79953
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.174259 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 77276
[LightGBM] [Info] Number of data points in the train set: 1417, number of used features: 1582
[LightGBM] [Info] Number of data points 

In [7]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score
from sklearn.impute import SimpleImputer
import warnings
from tqdm.auto import tqdm
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.linear_model import RidgeCV
import pickle
from scipy.stats import randint, uniform

warnings.filterwarnings('ignore')

# 헬퍼 함수 (기존과 동일)
def get_embeddings(data, model, tokenizer):
    """주어진 텍스트 데이터에 대한 BERT 임베딩을 생성합니다."""
    embeddings = []
    for text in tqdm(data, desc="텍스트 임베딩 진행 중"):
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=50)
        with torch.no_grad():
            outputs = model(**inputs)
        cls_embedding = outputs.last_hidden_state[:, 0, :].numpy()
        embeddings.append(cls_embedding)
    return np.vstack(embeddings)

def extract_brand(apt_name, brand_list):
    """아파트 이름에서 브랜드명을 추출합니다."""
    for brand in brand_list:
        if brand in apt_name:
            return brand
    return '기타 브랜드'

# 데이터 로드
print("🚚 데이터를 로드합니다...")
df = pd.read_csv("final_data.csv")

# 자동 피처 선택 결과 반영
features_to_drop = [
    '전용면적(㎡)', '공원 - 1.5km 이내', '고속도로IC - 10km 이내', '초등학교(2km 이내)',
    '편의점 - 500m 이내', '대형마트 - 1.5km 이내', '지하철 - 반경 1.5km 이내',
    '분양가(만원)', '관공서 - 1.5km 이내', '상급병원 - 1.5km 이내', '고등학교(2km 이내)',
    '대형쇼핑 - 3km 이내', '상권 - 3km 이내', '특별분양', '공급면적(㎡)', '일반분양',
    '버스 - 반경 500m 이내', '고속철도 - 10km 이내', '은행 - 1km 이내', '중학교(2km 이내)'
]
print(f"자동 선택된 {len(features_to_drop)}개의 피처를 제외합니다.")
df = df.drop(columns=features_to_drop, errors='ignore')

# 특성 공학
df.dropna(subset=['분양률'], inplace=True)
df = df[df['세대수'] > 3].reset_index(drop=True)
df['기준년월'] = pd.to_datetime(df['기준년월'], format='%Y%m')
df['세대수'] = pd.to_numeric(df.get('세대수'), errors='coerce')
df['소규모단지여부'] = (df['세대수'] < 10).astype(int)
df['년'] = df['기준년월'].dt.year
df['월'] = df['기준년월'].dt.month
df['월_sin'] = np.sin(2 * np.pi * df['월']/12)
df['월_cos'] = np.cos(2 * np.pi * df['월']/12)
df['분기'] = df['월'].apply(lambda x: (x-1)//3 + 1)
df.drop(columns=['기준년월', '미분양수', '주변시세 평균', '월'], inplace=True, errors='ignore')

with open('brand_priority_list.txt', 'r', encoding='utf-8') as f:
    brand_priority_list = [line.strip() for line in f.readlines()]
df['브랜드'] = df['아파트'].apply(lambda x: extract_brand(str(x), brand_priority_list))

top_10_builders = ['삼성물산', '현대건설', '대우건설', '현대엔지니어링', '지에스건설', '디엘이앤씨', '포스코이앤씨', '롯데건설', '에스케이에코플랜트', '호반건설']
df['건설사_등급'] = df['건설사'].apply(lambda x: 'Top10' if any(builder in str(x) for builder in top_10_builders) else 'Other')
df['지역_브랜드'] = df['지역'] + '_' + df['브랜드']
print("✅ 특성 공학 완료")

# 텍스트 임베딩 생성
MODEL_NAME = "kykim/bert-kor-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

brand_embeddings = get_embeddings(df['브랜드'], model, tokenizer)
co_embeddings = get_embeddings(df['건설사'], model, tokenizer)
brand_embed_df = pd.DataFrame(brand_embeddings, columns=[f'brand_embed_{i}' for i in range(brand_embeddings.shape[1])])
co_embed_df = pd.DataFrame(co_embeddings, columns=[f'co_embed_{i}' for i in range(co_embeddings.shape[1])])

df_processed = pd.concat([df.reset_index(drop=True), brand_embed_df, co_embed_df], axis=1)
print("✅ 임베딩 생성 및 결합 완료")

# 데이터 분리 및 전처리 파이프라인 설정
target = '분양률'
drop_cols = ['아파트', '브랜드', '건설사', '분양률']
X = df_processed.drop(columns=drop_cols, errors='ignore')
y = df_processed[target]

numerical_cols = X.select_dtypes(include=np.number).columns.tolist()
categorical_cols = X.select_dtypes(exclude=np.number).columns.tolist()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1004)
print(f"✅ 데이터 준비 완료 (Train: {len(X_train)}, Test: {len(X_test)})")

preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ]), numerical_cols),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
        ]), categorical_cols)
    ]
)

X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)
print("✅ 데이터 전처리 완료")


# ===================================================================
# ✅ 하이퍼파라미터 튜닝 (개별 튜닝 후 결합) ✅
# ===================================================================

# --- 1. 탐색할 파라미터 공간 축소 및 개별 정의 ---
n_iterations = 20 # 튜닝 반복 횟수 (기존 50회 -> 20회)
cv_folds = 3      # 교차 검증 폴드 수

lgbm_params = {
    'n_estimators': randint(100, 700),
    'learning_rate': uniform(0.01, 0.1),
    'num_leaves': randint(20, 50)
}
xgb_params = {
    'n_estimators': randint(100, 700),
    'learning_rate': uniform(0.01, 0.1),
    'max_depth': randint(3, 10)
}
rf_params = {
    'n_estimators': randint(100, 500),
    'max_depth': randint(5, 20),
    'min_samples_leaf': randint(1, 5)
}


# --- 2. 각 기본 모델을 개별적으로 RandomizedSearchCV 튜닝 ---
print(f"\n🔥 개별 모델 하이퍼파라미터 튜닝을 시작합니다... (n_iter={n_iterations}, cv={cv_folds})")

# LightGBM 튜닝 (verbose=-1 추가하여 경고 메시지 미출력)
print("\n[1/3] 🚀 LightGBM 모델 튜닝 중...")
lgbm_search = RandomizedSearchCV(lgb.LGBMRegressor(random_state=42, verbose=-1), lgbm_params, n_iter=n_iterations, cv=cv_folds, scoring='r2', n_jobs=-1, random_state=42, verbose=1)
lgbm_search.fit(X_train_processed, y_train)
best_lgbm = lgbm_search.best_estimator_
print(f"✅ LightGBM 최적 파라미터: {lgbm_search.best_params_}")

# XGBoost 튜닝
print("\n[2/3] 🚀 XGBoost 모델 튜닝 중...")
xgb_search = RandomizedSearchCV(xgb.XGBRegressor(random_state=42), xgb_params, n_iter=n_iterations, cv=cv_folds, scoring='r2', n_jobs=-1, random_state=42, verbose=1)
xgb_search.fit(X_train_processed, y_train)
best_xgb = xgb_search.best_estimator_
print(f"✅ XGBoost 최적 파라미터: {xgb_search.best_params_}")

# RandomForest 튜닝
print("\n[3/3] 🚀 RandomForest 모델 튜닝 중...")
rf_search = RandomizedSearchCV(RandomForestRegressor(random_state=42), rf_params, n_iter=n_iterations, cv=cv_folds, scoring='r2', n_jobs=-1, random_state=42, verbose=1)
rf_search.fit(X_train_processed, y_train)
best_rf = rf_search.best_estimator_
print(f"✅ RandomForest 최적 파라미터: {rf_search.best_params_}")


# --- 3. 튜닝된 최적의 모델들로 StackingRegressor 최종 구성 ---
print("\n✨ 튜닝된 모델들로 최종 Stacking 모델을 구성하고 학습합니다...")
estimators = [
    ('lgbm', best_lgbm),
    ('xgb', best_xgb),
    ('rf', best_rf)
]
final_stacking_model = StackingRegressor(
    estimators=estimators,
    final_estimator=RidgeCV(),
    cv=5, # 최종 모델 학습 시에만 교차검증 적용
    n_jobs=-1
)

# 최종 모델 학습
final_stacking_model.fit(X_train_processed, y_train)


# ===================================================================
# 튜닝된 최종 모델로 성능 평가 및 저장
# ===================================================================
y_pred_tuned = final_stacking_model.predict(X_test_processed)
r2_tuned = r2_score(y_test, y_pred_tuned)

print("\n" + "="*50)
print(f"🎯 튜닝 후 최종 R² Score: {r2_tuned:.4f}")
print("="*50 + "\n")

with open('stacking_model.pkl', 'wb') as f: pickle.dump(final_stacking_model, f)
with open('preprocessor.pkl', 'wb') as f: pickle.dump(preprocessor, f)

print("✅ 튜닝된 최종 모델과 전처리기가 'stacking_model.pkl', 'preprocessor.pkl' 파일로 덮어쓰기 저장되었습니다.")

🚚 데이터를 로드합니다...
자동 선택된 20개의 피처를 제외합니다.
✅ 특성 공학 완료


텍스트 임베딩 진행 중:   0%|          | 0/1982 [00:00<?, ?it/s]

텍스트 임베딩 진행 중:   0%|          | 0/1982 [00:00<?, ?it/s]

✅ 임베딩 생성 및 결합 완료
✅ 데이터 준비 완료 (Train: 1585, Test: 397)
✅ 데이터 전처리 완료

🔥 개별 모델 하이퍼파라미터 튜닝을 시작합니다... (n_iter=20, cv=3)

[1/3] 🚀 LightGBM 모델 튜닝 중...
Fitting 3 folds for each of 20 candidates, totalling 60 fits


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

✅ LightGBM 최적 파라미터: {'learning_rate': 0.01650515929852795, 'n_estimators': 487, 'num_leaves': 44}

[2/3] 🚀 XGBoost 모델 튜닝 중...
Fitting 3 folds for each of 20 candidates, totalling 60 fits
✅ XGBoost 최적 파라미터: {'learning_rate': 0.0831993941811405, 'max_depth': 7, 'n_estimators': 120}

[3/3] 🚀 RandomForest 모델 튜닝 중...
Fitting 3 folds for each of 20 candidates, totalling 60 fits
✅ RandomForest 최적 파라미터: {'max_depth': 17, 'min_samples_leaf': 1, 'n_estimators': 202}

✨ 튜닝된 모델들로 최종 Stacking 모델을 구성하고 학습합니다...

🎯 튜닝 후 최종 R² Score: 0.7145

✅ 튜닝된 최종 모델과 전처리기가 'stacking_model.pkl', 'preprocessor.pkl' 파일로 덮어쓰기 저장되었습니다.
