In [2]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.ensemble import StackingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer
import lightgbm as lgb
import xgboost as xgb
import warnings
from tqdm.auto import tqdm

warnings.filterwarnings('ignore')

# 1. ÌÖçÏä§Ìä∏ ÏûÑÎ≤†Îî© Ìï®Ïàò (Ïù¥Ï†ÑÍ≥º ÎèôÏùº)
def get_embeddings(data, model, tokenizer):
    embeddings = []
    for text in tqdm(data, desc="ÌÖçÏä§Ìä∏ ÏûÑÎ≤†Îî© ÏßÑÌñâ Ï§ë"):
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=50)
        with torch.no_grad():
            outputs = model(**inputs)
        cls_embedding = outputs.last_hidden_state[:, 0, :].numpy()
        embeddings.append(cls_embedding)
    return np.vstack(embeddings)

# ‚≠êÔ∏è 2. Ïö∞ÏÑ†ÏàúÏúÑÎ•º Ï†ÅÏö©Ìïú Î∏åÎûúÎìú Ï∂îÏ∂ú Ìï®Ïàò Î∞è Î¶¨Ïä§Ìä∏ Ï†ïÏùò
def extract_brand(apt_name, brand_list):
    """ÏïÑÌååÌä∏ Ïù¥Î¶ÑÏóêÏÑú Ïö∞ÏÑ†ÏàúÏúÑÍ∞Ä Í∞ÄÏû• ÎÜíÏùÄ Î∏åÎûúÎìúÎ™ÖÏùÑ Ï∂îÏ∂úÌïòÎäî Ìï®Ïàò"""
    for brand in brand_list:
        if brand in apt_name:
            return brand  # Î¶¨Ïä§Ìä∏Ïùò Ïïû ÏàúÏÑú(Îçî ÎÜíÏùÄ Ïö∞ÏÑ†ÏàúÏúÑ)Ïóê ÏûàÎäî Î∏åÎûúÎìúÎ•º Î®ºÏ†Ä Î∞òÌôò
    return 'Í∏∞ÌÉÄ Î∏åÎûúÎìú'

# Íµ≠ÎÇ¥ Ï£ºÏöî ÏïÑÌååÌä∏ Î∏åÎûúÎìú 'Ïö∞ÏÑ†ÏàúÏúÑ' Î¶¨Ïä§Ìä∏
brand_priority_list = [
    # --- ÏµúÏÉÅÏúÑ ÌîÑÎ¶¨ÎØ∏ÏóÑ Î∏åÎûúÎìú ---
    'ÎîîÏóêÏù¥Ïπò', 'ÏïÑÌÅ¨Î°ú', 'Î•¥Ïóò', 'Ïò§Ìã∞ÏóêÎ•¥',
    # --- 1Íµ∞ Î©îÏù¥Ï†Ä Î∏åÎûúÎìú ---
    'ÏûêÏù¥', 'ÌûêÏä§ÌÖåÏù¥Ìä∏', 'ÎûòÎØ∏Ïïà', 'Ìë∏Î•¥ÏßÄÏò§', 'ÎçîÏÉµ', 'Î°ØÎç∞Ï∫êÏä¨', 'ÏïÑÏù¥ÌååÌÅ¨', 'eÌé∏ÌïúÏÑ∏ÏÉÅ', 'SKÎ∑∞', 'Ìè¨Î†àÎÇò',
    # --- Ï§ëÍ≤¨/Í∞ïÏÜå Î∏åÎûúÎìú ---
    'Ïñ¥Ïö∏Î¶º', 'Îç∞ÏãúÏïô', 'Ìò∏Î∞òÏç®Î∞ã', 'Ï§ëÌù•S-ÌÅ¥ÎûòÏä§', 'Ïö∞ÎØ∏Î¶∞', 'ÏÑúÌù¨Ïä§ÌÉÄÌûêÏä§', 'ÌïúÎùºÎπÑÎ∞úÎîî',
    'ÌïòÎäòÏ±Ñ', 'Î≤†Î•¥ÎîîÏõÄ', 'Ïä§ÏúÑÏ≤∏', 'ÍøàÏóêÍ∑∏Î¶∞', 'Î∞òÎèÑÏú†Î≥¥Îùº', 'Ï†úÏùºÌíçÍ≤ΩÏ±Ñ', 'Í∏àÍ∞ïÌéúÌÖåÎ¶¨ÏõÄ',
    'ÎèôÎ¨∏ÍµøÎ™®ÎãùÌûê', 'Ïã†ÎèôÏïÑÌååÎ∞ÄÎ¶¨Ïóê', 'ÏΩîÏïÑÎ£®', 'ÎëêÏÇ∞ÏúÑÎ∏å', 'ÏåçÏö©ÏòàÍ∞Ä', 'Ïù¥ÏàòÎ∏åÎùºÏö¥Ïä§ÌÜ§',
    'ÌÉúÏòÅÎç∞ÏãúÏïô', 'Í≥ÑÎ£°Î¶¨ÏäàÎπå', 'ÌôîÏÑ±ÌååÌÅ¨ÎìúÎ¶º', 'Ïö∞Î∞©ÏïÑÏù¥Ïú†Ïâò', 'ÏÇºÏ†ïÍ∑∏Î¶∞ÏΩîÏïÑ', 'ÏùºÏÑ±Ìä∏Î£®Ïóò',
    'ÎåÄÎ∞©ÎÖ∏Î∏îÎûúÎìú', 'ÎåÄÍ¥ëÎ°úÏ†úÎπÑÏïô', 'ÏñëÏö∞ÎÇ¥ÏïàÏï†', 'Í≤ΩÎÇ®ÏïÑÎÑàÏä§Îπå', 'ÏÇºÎ∂ÄÎ•¥ÎÑ§ÏÉÅÏä§', 'ÌïúÏñëÏàòÏûêÏù∏',
    'Ïã†ÏïàÏù∏Ïä§Îπå', 'ÌååÎùºÍ≥§', 'Í≥®ÎìúÌÅ¥ÎûòÏä§', 'ÏãúÌã∞ÌîÑÎùºÎîîÏõÄ', 'Ìï¥ÎßÅÌÑ¥ÌîåÎ†àÏù¥Ïä§', 'Î™®ÏïÑÏóòÍ∞Ä',
    'ÎπåÎ¶¨Î∏å', 'ÏßÄÏõ∞', 'Ìä∏Î£®Ïóò', 'Ïπ∏ÌÉÄÎπå',
    # --- ÏÑúÎ∏å Î∏åÎûúÎìú Î∞è Îã®ÏßÄÎ™Ö ÌäπÏßï (Í∞ÄÏû• ÎÇÆÏùÄ Ïö∞ÏÑ†ÏàúÏúÑ) ---
    'ÎçîÌçºÏä§Ìä∏', 'ÎçîÏõê', 'ÎçîÎ¶¨Î∏å', 'Î¶¨ÎçîÏä§Ìè¨Î†à', 'Ìä∏Î¶¨Ïö∞Ïä§', 'Í∑∏ÎùºÏãúÏóò',
    'Î¶¨Î≤ÑÌååÌÅ¨', 'ÏÑºÌä∏Îü¥ÌååÌÅ¨', 'ÎîîÏò§ÏÖò', 'ÏóêÎìÄÌè¨Î†à', 'ÌååÌÅ¨Î∑∞', 'Î†àÏù¥ÌÅ¨'
]


# 3. Îç∞Ïù¥ÌÑ∞ Î°úÎìú Î∞è ÌäπÏÑ± Í≥µÌïô
df = pd.read_csv("final_data.csv")
df['ÌäπÎ≥ÑÎ∂ÑÏñë'] = pd.to_numeric(df['ÌäπÎ≥ÑÎ∂ÑÏñë'], errors='coerce').fillna(0)
df['Í∏∞Ï§ÄÎÖÑÏõî'] = pd.to_datetime(df['Í∏∞Ï§ÄÎÖÑÏõî'])
df['ÎÖÑ'] = df['Í∏∞Ï§ÄÎÖÑÏõî'].dt.year
df['Ïõî'] = df['Í∏∞Ï§ÄÎÖÑÏõî'].dt.month
df['ÏùºÎ∞òÎ∂ÑÏñë'] = df['ÏùºÎ∞òÎ∂ÑÏñë'].replace('-', 0).astype('int64')
df['ÌäπÎ≥ÑÎ∂ÑÏñë'] = df['ÌäπÎ≥ÑÎ∂ÑÏñë'].replace('-', 0).astype('int64')
df['ÏùºÎ∞òÎ∂ÑÏñë'] = df['ÏùºÎ∞òÎ∂ÑÏñë'].astype('int64')
df['ÌäπÎ≥ÑÎ∂ÑÏñë'] = df['ÌäπÎ≥ÑÎ∂ÑÏñë'].astype('int64')
df.drop(columns=['Í∏∞Ï§ÄÎÖÑÏõî', 'ÎØ∏Î∂ÑÏñëÏàò', 'Ï£ºÎ≥ÄÏãúÏÑ∏ ÌèâÍ∑†'], inplace=True, errors='ignore')
for col in ['Î∂ÑÏñëÍ∞Ä(ÎßåÏõê)', 'Í≥µÍ∏âÎ©¥Ï†Å(„é°)', 'Ï£ºÎ≥ÄÏãúÏÑ∏ ÌèâÍ∑†(ÎßåÏõê)', 'ÏÑ∏ÎåÄÏàò']:
    if col in df.columns:
        df[col].replace(0, 1, inplace=True)
infra_cols = [col for col in df.columns if 'km' in col or '500m' in col]
df['ÌèâÎãπÎ∂ÑÏñëÍ∞Ä'] = df['Î∂ÑÏñëÍ∞Ä(ÎßåÏõê)'] / (df['Í≥µÍ∏âÎ©¥Ï†Å(„é°)'] / 3.3)
df['Ïù∏ÌîÑÎùº_Ï†êÏàò'] = df[infra_cols].sum(axis=1)
df['ÏãúÏÑ∏Ï¥àÍ≥ºÎπÑÏú®'] = ((df['Î∂ÑÏñëÍ∞Ä(ÎßåÏõê)'] - df['Ï£ºÎ≥ÄÏãúÏÑ∏ ÌèâÍ∑†(ÎßåÏõê)']) / df['Ï£ºÎ≥ÄÏãúÏÑ∏ ÌèâÍ∑†(ÎßåÏõê)']).replace([np.inf, -np.inf], np.nan)
df['ÏãúÏÑ∏Ï∞®ÏùµÎ•†'] = (df['ÏãúÏÑ∏Ï∞®Ïùµ(ÎßåÏõê)'] / df['Î∂ÑÏñëÍ∞Ä(ÎßåÏõê)']).replace([np.inf, -np.inf], np.nan)
df['Ï†ÑÏö©Î•†'] = (df['Ï†ÑÏö©Î©¥Ï†Å(„é°)'] / df['Í≥µÍ∏âÎ©¥Ï†Å(„é°)']) * 100
df['ÏÑ∏ÎåÄÎãπÎ©¥Ï†Å'] = df['Í≥µÍ∏âÎ©¥Ï†Å(„é°)'] / df['ÏÑ∏ÎåÄÏàò']
df['ÌäπÎ≥ÑÎ∂ÑÏñëÏú†Î¨¥'] = df['ÌäπÎ≥ÑÎ∂ÑÏñë'].apply(lambda x: 1 if x > 0 else 0)
if 'Í∏àÎ¶¨' in df.columns:
    df['Í∏àÎ¶¨Íµ¨Í∞Ñ'] = pd.cut(df['Í∏àÎ¶¨'], bins=[1, 2.5, 3.0, 3.5, np.inf],
                            labels=['1~2.5%', '2.5~3.0%', '3.0~3.5%', '3.5%~'], right=False)

# 4. Î∏åÎûúÎìú Ï∂îÏ∂ú Î∞è ÏûÑÎ≤†Îî© ÏÉùÏÑ±
print("üèôÔ∏è ÏïÑÌååÌä∏ Ïù¥Î¶ÑÏóêÏÑú Î∏åÎûúÎìúÎ•º Ï∂îÏ∂úÌï©ÎãàÎã§...")
df['Î∏åÎûúÎìú'] = df['ÏïÑÌååÌä∏'].apply(lambda x: extract_brand(str(x), brand_priority_list))
print("--- Î∏åÎûúÎìú Ï∂îÏ∂ú Í≤∞Í≥º (ÏÉÅÏúÑ 5Í∞ú) ---")
print(df[['ÏïÑÌååÌä∏', 'Î∏åÎûúÎìú']].head())

MODEL_NAME = "kykim/bert-kor-base"
print(f"\nü§ñ '{MODEL_NAME}' Î™®Îç∏Í≥º ÌÜ†ÌÅ¨ÎÇòÏù¥Ï†ÄÎ•º Î°úÎìúÌï©ÎãàÎã§...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

brand_embeddings = get_embeddings(df['Î∏åÎûúÎìú'], model, tokenizer)
co_embeddings = get_embeddings(df['Í±¥ÏÑ§ÏÇ¨'], model, tokenizer)

brand_embed_df = pd.DataFrame(brand_embeddings, columns=[f'brand_embed_{i}' for i in range(brand_embeddings.shape[1])])
co_embed_df = pd.DataFrame(co_embeddings, columns=[f'co_embed_{i}' for i in range(co_embeddings.shape[1])])

df_processed = pd.concat([df.reset_index(drop=True), brand_embed_df, co_embed_df], axis=1)

# 5. Î™®Îç∏ÎßÅ Ï§ÄÎπÑ
target = 'Î∂ÑÏñëÎ•†'
drop_cols = infra_cols + ['ÏïÑÌååÌä∏', 'Î∏åÎûúÎìú', 'Í±¥ÏÑ§ÏÇ¨', 'Ï£ºÎ≥ÄÏãúÏÑ∏ ÌèâÍ∑†(ÎßåÏõê)', 'ÏãúÏÑ∏Ï∞®Ïùµ(ÎßåÏõê)', 'Î∂ÑÏñëÎ•†']
X = df_processed.drop(columns=drop_cols, errors='ignore')
y = df_processed[target]
y.fillna(y.mean(), inplace=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
numerical_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = X.select_dtypes(exclude=np.number).columns.tolist()

# 6. Ï†ÑÏ≤òÎ¶¨ Î∞è Î™®Îç∏ ÌõàÎ†®
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[('imputer', SimpleImputer(strategy='mean')), ('scaler', StandardScaler())]), numerical_features),
        ('cat', Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value='missing')), ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))]), categorical_features)
    ],
    remainder='passthrough'
)
print("\nÎç∞Ïù¥ÌÑ∞ Ï†ÑÏ≤òÎ¶¨Î•º ÏãúÏûëÌï©ÎãàÎã§...")
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)
print("‚úÖ Îç∞Ïù¥ÌÑ∞ Ï†ÑÏ≤òÎ¶¨ ÏôÑÎ£å")

lgbm = lgb.LGBMRegressor(random_state=42, verbose=-1)
xgb_model = xgb.XGBRegressor(random_state=42)
rf = RandomForestRegressor(random_state=42)
param_dist_lgbm = {'n_estimators': [300, 500], 'learning_rate': [0.05, 0.1], 'max_depth': [5, 7], 'num_leaves': [31, 63]}
param_dist_xgb = {'n_estimators': [300, 500], 'learning_rate': [0.05, 0.1], 'max_depth': [5, 7], 'subsample': [0.8, 0.9]}
param_dist_rf = {'n_estimators': [300, 500], 'max_depth': [10, None], 'min_samples_split': [2, 5]}
print("\nüöÄ Î™®Îç∏ ÏµúÏ†ÅÌôî(RandomizedSearchCV)Î•º ÏãúÏûëÌï©ÎãàÎã§...")
search_lgbm = RandomizedSearchCV(lgbm, param_dist_lgbm, n_iter=10, cv=3, n_jobs=-1, random_state=42, scoring='r2')
search_xgb = RandomizedSearchCV(xgb_model, param_dist_xgb, n_iter=10, cv=3, n_jobs=-1, random_state=42, scoring='r2')
search_rf = RandomizedSearchCV(rf, param_dist_rf, n_iter=10, cv=3, n_jobs=-1, random_state=42, scoring='r2')
search_lgbm.fit(X_train_processed, y_train)
print("‚úÖ LightGBM ÏµúÏ†ÅÌôî ÏôÑÎ£å")
search_xgb.fit(X_train_processed, y_train)
print("‚úÖ XGBoost ÏµúÏ†ÅÌôî ÏôÑÎ£å")
search_rf.fit(X_train_processed, y_train)
print("‚úÖ RandomForest ÏµúÏ†ÅÌôî ÏôÑÎ£å")
best_lgbm = search_lgbm.best_estimator_
best_xgb = search_xgb.best_estimator_
best_rf = search_rf.best_estimator_

print("\nüî• Ïä§ÌÉúÌÇπ Î™®Îç∏ ÌõàÎ†®ÏùÑ ÏãúÏûëÌï©ÎãàÎã§...")
stack_model = StackingRegressor(
    estimators=[('lgbm', best_lgbm), ('xgb', best_xgb), ('rf', best_rf)],
    final_estimator=GradientBoostingRegressor(n_estimators=100, learning_rate=0.05, random_state=42),
    cv=3,
    passthrough=True,
    n_jobs=-1
)
stack_model.fit(X_train_processed, y_train)

y_pred = stack_model.predict(X_test_processed)
print("\nüèÅ ÏµúÏ¢Ö ÏÑ±Îä• Í≤∞Í≥º (Î∏åÎûúÎìú Ïö∞ÏÑ†ÏàúÏúÑ ÏûÑÎ≤†Îî© Ï†ÅÏö©):")
print(f"MSE: {mean_squared_error(y_test, y_pred):.4f}")
print(f"MAE: {mean_absolute_error(y_test, y_pred):.4f}")
print(f"R¬≤ Score: {r2_score(y_test, y_pred):.4f}")

üèôÔ∏è ÏïÑÌååÌä∏ Ïù¥Î¶ÑÏóêÏÑú Î∏åÎûúÎìúÎ•º Ï∂îÏ∂úÌï©ÎãàÎã§...
--- Î∏åÎûúÎìú Ï∂îÏ∂ú Í≤∞Í≥º (ÏÉÅÏúÑ 5Í∞ú) ---
         ÏïÑÌååÌä∏ Î∏åÎûúÎìú
0  Í∞ïÎ¶âÏûêÏù¥Î•¥ÎÑ§ÎîîÏò§ÏÖò  ÏûêÏù¥
1  Í∞ïÎ¶âÏûêÏù¥Î•¥ÎÑ§ÎîîÏò§ÏÖò  ÏûêÏù¥
2  Í∞ïÎ¶âÏûêÏù¥Î•¥ÎÑ§ÎîîÏò§ÏÖò  ÏûêÏù¥
3  Í∞ïÎ¶âÏûêÏù¥Î•¥ÎÑ§ÎîîÏò§ÏÖò  ÏûêÏù¥
4  Í∞ïÎ¶âÏûêÏù¥Î•¥ÎÑ§ÎîîÏò§ÏÖò  ÏûêÏù¥

ü§ñ 'kykim/bert-kor-base' Î™®Îç∏Í≥º ÌÜ†ÌÅ¨ÎÇòÏù¥Ï†ÄÎ•º Î°úÎìúÌï©ÎãàÎã§...


tokenizer_config.json:   0%|          | 0.00/80.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/725 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/476M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/476M [00:00<?, ?B/s]

ÌÖçÏä§Ìä∏ ÏûÑÎ≤†Îî© ÏßÑÌñâ Ï§ë:   0%|          | 0/2214 [00:00<?, ?it/s]

ÌÖçÏä§Ìä∏ ÏûÑÎ≤†Îî© ÏßÑÌñâ Ï§ë:   0%|          | 0/2214 [00:00<?, ?it/s]


Îç∞Ïù¥ÌÑ∞ Ï†ÑÏ≤òÎ¶¨Î•º ÏãúÏûëÌï©ÎãàÎã§...
‚úÖ Îç∞Ïù¥ÌÑ∞ Ï†ÑÏ≤òÎ¶¨ ÏôÑÎ£å

üöÄ Î™®Îç∏ ÏµúÏ†ÅÌôî(RandomizedSearchCV)Î•º ÏãúÏûëÌï©ÎãàÎã§...
‚úÖ LightGBM ÏµúÏ†ÅÌôî ÏôÑÎ£å
‚úÖ XGBoost ÏµúÏ†ÅÌôî ÏôÑÎ£å
‚úÖ RandomForest ÏµúÏ†ÅÌôî ÏôÑÎ£å

üî• Ïä§ÌÉúÌÇπ Î™®Îç∏ ÌõàÎ†®ÏùÑ ÏãúÏûëÌï©ÎãàÎã§...

üèÅ ÏµúÏ¢Ö ÏÑ±Îä• Í≤∞Í≥º (Î∏åÎûúÎìú Ïö∞ÏÑ†ÏàúÏúÑ ÏûÑÎ≤†Îî© Ï†ÅÏö©):
MSE: 0.0440
MAE: 0.1406
R¬≤ Score: 0.5325
