In [26]:
#라이브러리 및 파일 불러오기

import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import missingno as msno
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
from catboost import CatBoostClassifier

# Read the training and test tables from CSV files in the working directory.
TRAIN_CSV, TEST_CSV = './train.csv', './test.csv'
train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)

In [27]:
# Use a Hangul-capable font so Korean labels render in plots.
# 'Malgun Gothic' is the Windows font the notebook was written against; when it
# is not installed, try other common Korean fonts instead of configuring a
# family matplotlib cannot find. The first installed candidate wins.
korean_font_candidates = ('Malgun Gothic', 'AppleGothic', 'NanumGothic', 'Noto Sans CJK KR')
installed_font_names = {font.name for font in fm.fontManager.ttflist}
korean_font = next(
    (name for name in korean_font_candidates if name in installed_font_names),
    None,
)
if korean_font is not None:
    plt.rc('font', family=korean_font)
else:
    # Leave matplotlib's default font in place rather than pointing at a missing one.
    print('No Korean font found; Hangul labels may not render in plots.')

# Keeps the minus sign from rendering as a broken glyph once a Korean font is active.
plt.rcParams['axes.unicode_minus'] = False

# Preview the first rows of the training table.
train.head()

Unnamed: 0,ID,매물확인방식,보증금,월세,전용면적,해당층,총층,방향,방수,욕실수,주차가능여부,총주차대수,관리비,중개사무소,제공플랫폼,게재일,허위매물여부
0,TRAIN_0000,현장확인,402500000.0,470000,,,15.0,서향,1.0,1.0,가능,40.0,96,t93Nt6I2I0,B플랫폼,2024-10-09,0
1,TRAIN_0001,현장확인,170500000.0,200000,,3.0,4.0,남동향,2.0,1.0,불가능,,0,q39iV5J4E6,D플랫폼,2024-12-26,0
2,TRAIN_0002,전화확인,114000000.0,380000,,2.0,3.0,동향,1.0,1.0,불가능,,0,b03oE4G3F6,A플랫폼,2024-11-28,0
3,TRAIN_0003,현장확인,163500000.0,30000,36.3,3.0,9.0,남동향,2.0,1.0,가능,13.0,10,G52Iz8V2B9,A플랫폼,2024-11-26,0
4,TRAIN_0004,현장확인,346000000.0,530000,,3.0,3.0,동향,2.0,1.0,불가능,,0,N45gM0M7R0,B플랫폼,2024-06-25,1


In [28]:
# Columns passed to CatBoost as categorical features by the modelling cells.
cat_features = [
    '매물확인방식',
    '방향',
    '주차가능여부',
    '중개사무소',
    '제공플랫폼',
]

# Show the dtype of every column in the training table.
print(train.dtypes)

ID         object
매물확인방식     object
보증금       float64
월세          int64
전용면적      float64
해당층       float64
총층        float64
방향         object
방수        float64
욕실수       float64
주차가능여부     object
총주차대수     float64
관리비         int64
중개사무소      object
제공플랫폼      object
게재일        object
허위매물여부      int64
dtype: object


In [None]:
# Feature engineering, applied identically to the train and test tables.
DATE_COL = '게재일'        # date column, parsed below and dropped afterwards
ELAPSED_COL = '게재_경과일'  # derived: days between the reference date and DATE_COL


def add_listing_features(df, reference_date):
    """Return a copy of ``df`` with the derived features added and the date column dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose ``DATE_COL`` column is already datetime-typed and which has
        the '보증금', '월세', '관리비' and '전용면적' columns.
    reference_date : pandas.Timestamp
        Date from which elapsed days are counted back.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is left unmodified.

    Raises
    ------
    KeyError
        If any of the required columns is missing from ``df``.
    """
    out = df.copy()
    # Whole days between the reference date and each row's date (NaN where the date is NaT).
    out[ELAPSED_COL] = (reference_date - out[DATE_COL]).dt.days
    # '보증금' divided by '월세'; the +1 keeps a zero '월세' from dividing by zero.
    out['보증금_월세비율'] = out['보증금'] / (out['월세'] + 1)
    # '관리비' divided by ('전용면적' + 1).
    # NOTE(review): the original comment said "per 1 m²" while the column name says
    # "per pyeong"; no unit conversion is applied here - confirm the unit of '전용면적'.
    out['평당관리비'] = out['관리비'] / (out['전용면적'] + 1)
    # The raw date column is no longer needed once the elapsed-days feature exists.
    return out.drop(columns=[DATE_COL])


# Re-running this cell after the date column had been dropped used to raise
# KeyError; skip the work when both tables already carry the derived features.
if not (ELAPSED_COL in train.columns and ELAPSED_COL in test.columns):
    # Parse the date strings; values that do not match the format become NaT.
    train[DATE_COL] = pd.to_datetime(train[DATE_COL], format='%Y-%m-%d', errors='coerce')
    test[DATE_COL] = pd.to_datetime(test[DATE_COL], format='%Y-%m-%d', errors='coerce')

    # The most recent date across both tables is the reference point for elapsed days.
    latest_date = max(train[DATE_COL].max(), test[DATE_COL].max())

    train = add_listing_features(train, latest_date)
    test = add_listing_features(test, latest_date)

In [30]:
# Separate the target from the model inputs; the 'ID' column is not used as a feature.
TARGET_COL, ID_COL = '허위매물여부', 'ID'
y_train = train[TARGET_COL]  # target values
X_train = train.drop(columns=[TARGET_COL, ID_COL])  # feature matrix
# Only the identifier column is removed from the test table.
X_test = test.drop(columns=ID_COL)

In [None]:
import optuna
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score

SEARCH_SEED = 42  # seeds the Optuna sampler so repeated runs propose the same trials
N_TRIALS = 30     # number of hyperparameter configurations to evaluate
CV_FOLDS = 5      # folds used to score each configuration


# 1) Objective function for hyperparameter tuning
def objective(trial):
    """Score one sampled CatBoost configuration by mean cross-validated accuracy.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object that supplies the sampled hyperparameter values.

    Returns
    -------
    float
        Mean accuracy over ``CV_FOLDS`` folds on the module-level
        ``X_train`` / ``y_train``; the study maximizes this value.
    """
    params = {
        "iterations": trial.suggest_int("iterations", 500, 3000, step=500),
        "depth": trial.suggest_int("depth", 4, 10),
        # Sampled on a log scale between 0.01 and 0.3.
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10),
        "border_count": trial.suggest_int("border_count", 32, 255),
        # Fixed (not tuned) settings.
        "loss_function": "Logloss",
        "cat_features": cat_features,
    }

    model = CatBoostClassifier(**params, verbose=0)
    fold_scores = cross_val_score(model, X_train, y_train, cv=CV_FOLDS, scoring='accuracy')
    return fold_scores.mean()


# 2) Run the search. TPE is Optuna's default sampler; it is created explicitly
#    here only so that it can be seeded and the search is reproducible.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=SEARCH_SEED),
)
study.optimize(objective, n_trials=N_TRIALS)

# 3) Report the best sampled values (only the parameters drawn via trial.suggest_*).
print("Best params:", study.best_params)

[I 2025-02-13 17:33:25,102] A new study created in memory with name: no-name-a0984be5-89ed-43b4-88ee-3b028b538c76


In [31]:

# Baseline CatBoost configuration.
# NOTE(review): these values are hard-coded; study.best_params from the Optuna
# search is not used here - confirm that is intentional.
catboost_settings = dict(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function='Logloss',    # binary classification objective
    cat_features=cat_features,  # columns treated as categorical
    verbose=100,                # print a progress line every 100 iterations
)
model = CatBoostClassifier(**catboost_settings)

# Fit on the whole training table. No eval set is passed, so the log reports the
# training loss only. fit() returns the estimator, which the cell displays.
model.fit(X_train, y_train)

0:	learn: 0.5659652	total: 44.5ms	remaining: 44.4s
100:	learn: 0.0470011	total: 2.16s	remaining: 19.2s
200:	learn: 0.0248050	total: 4.1s	remaining: 16.3s
300:	learn: 0.0146408	total: 6.07s	remaining: 14.1s
400:	learn: 0.0099358	total: 8.15s	remaining: 12.2s
500:	learn: 0.0071850	total: 10.1s	remaining: 10.1s
600:	learn: 0.0056451	total: 12.1s	remaining: 8.04s
700:	learn: 0.0045117	total: 14.1s	remaining: 6s
800:	learn: 0.0036435	total: 16s	remaining: 3.98s
900:	learn: 0.0032730	total: 18s	remaining: 1.98s
999:	learn: 0.0030315	total: 19.9s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x1d7233ce580>

In [32]:
# Predict with the fitted model on the test features.
predictions = model.predict(X_test)

# Build the submission file: load the sample file, overwrite its target column
# with the predictions, and save it without the index.
submission_template = pd.read_csv('./sample_submission.csv')
submission = submission_template.assign(**{'허위매물여부': predictions})
submission.to_csv('submission.csv', index=False)

print("✅ 제출 파일 (submission.csv) 생성 완료!")



✅ 제출 파일 (submission.csv) 생성 완료!
