In [143]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

In [144]:
# Load the training listings (features plus the target column) from the local CSV.
train_csv_path = r'c:\Users\user\Downloads\real_estate\real.csv'
train = pd.read_csv(train_csv_path)

In [145]:
# A notebook cell only auto-renders its final expression, so a bare
# `train.head()` placed before `train.info()` is evaluated and then discarded.
# Render the preview explicitly (via the notebook's built-in `display`) so that
# both the sample rows and the column summary show up in the output.
display(train.head())
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2452 entries, 0 to 2451
Data columns (total 17 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   ID      2452 non-null   object 
 1   매물확인방식  2452 non-null   object 
 2   보증금     2452 non-null   float64
 3   월세      2452 non-null   int64  
 4   전용면적    1665 non-null   float64
 5   해당층     2223 non-null   float64
 6   총층      2436 non-null   float64
 7   방향      2452 non-null   object 
 8   방수      2436 non-null   float64
 9   욕실수     2434 non-null   float64
 10  주차가능여부  2452 non-null   object 
 11  총주차대수   1756 non-null   float64
 12  관리비     2452 non-null   int64  
 13  중개사무소   2452 non-null   object 
 14  제공플랫폼   2452 non-null   object 
 15  게재일     2452 non-null   object 
 16  허위매물여부  2452 non-null   int64  
dtypes: float64(7), int64(3), object(7)
memory usage: 325.8+ KB


In [146]:
# Frequency of each bathroom count ('욕실수'); missing values are left out of the tally.
train['욕실수'].value_counts(dropna=True)

욕실수
1.0    2350
2.0      84
Name: count, dtype: int64

## 데이터 전처리

In [147]:
# Target: the fake-listing flag column ('허위매물여부').
y = train['허위매물여부']
# Features: every other column except the row identifier 'ID'.
x = train.drop(columns=['ID', '허위매물여부'])

In [148]:
from sklearn.impute import KNNImputer

# KNNImputer replaces each missing value using the K-nearest-neighbours method
# (constructed with the library defaults).
knn_imputer = KNNImputer()

# Numeric columns whose missing values are filled. The variable name mentions
# "mean", but the values come from the KNN imputer above.
columns_fill_mean = ['해당층', '총층', '전용면적', '방수', '욕실수', '총주차대수']

# Fit on the training features and write the completed values back into x.
# NOTE(review): the fit sees every training row, including the rows that are
# later held out for validation, so validation scores may be optimistic.
imputed_values = knn_imputer.fit_transform(x[columns_fill_mean])
x[columns_fill_mean] = imputed_values

In [150]:
# Categorical columns that are converted to integer codes.
label_encode_cols = ['중개사무소','게재일','제공플랫폼','방향']

# One encoder per column, kept in a dict so the same fitted mapping can be
# looked up again by column name.
label_encoders = {col: LabelEncoder() for col in label_encode_cols}
for col, encoder in label_encoders.items():
    # Values are cast to str before fitting, so every category is encoded as text.
    x[col] = encoder.fit_transform(x[col].astype(str))

In [152]:
# Nominal columns that are expanded into indicator columns.
one_hot_cols = ['매물확인방식', '주차가능여부']

# `sparse_output` needs scikit-learn 1.2 or newer. With handle_unknown='ignore',
# a category that was not seen during fit does not raise at transform time.
one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Fit on the training features and produce the dense indicator matrix.
x_encoded = one_hot_encoder.fit_transform(x[one_hot_cols])

# Wrap the matrix in a DataFrame that reuses x's index so the rows line up.
x_encoded_df = pd.DataFrame(
    x_encoded,
    index=x.index,
    columns=one_hot_encoder.get_feature_names_out(one_hot_cols),
)

# Replace the raw categorical columns with their indicator columns.
x = pd.concat([x.drop(columns=one_hot_cols), x_encoded_df], axis=1)


In [153]:
# Preview the first rows of the imputed and encoded feature frame.
print(x.head(n=5))


           보증금      월세    전용면적  해당층    총층  방향   방수  욕실수  총주차대수  관리비  중개사무소  \
0  402500000.0  470000  17.500  9.6  15.0   7  1.0  1.0   40.0   96    158   
1  170500000.0  200000  36.334  3.0   4.0   0  2.0  1.0    2.4    0    144   
2  114000000.0  380000  23.522  2.0   3.0   3  1.0  1.0   16.2    0     58   
3  163500000.0   30000  36.300  3.0   9.0   0  2.0  1.0   13.0   10    204   
4  346000000.0  530000  38.260  3.0   3.0   3  2.0  1.0    1.0    0    249   

   제공플랫폼  게재일  매물확인방식_서류확인  매물확인방식_전화확인  매물확인방식_현장확인  주차가능여부_가능  주차가능여부_불가능  
0      1  328          0.0          0.0          1.0        1.0         0.0  
1      6  415          0.0          0.0          1.0        0.0         1.0  
2      0  384          0.0          1.0          0.0        0.0         1.0  
3      0  382          0.0          0.0          1.0        1.0         0.0  
4      1  210          0.0          0.0          1.0        0.0         1.0  


### test data

In [155]:
# Load the test listings from the local CSV.
test_csv_path = r'c:\Users\user\Downloads\real_estate\estate.csv'
test = pd.read_csv(test_csv_path)

In [156]:
# Fill missing test values with the imputer that was already fitted on the
# training features (transform only; nothing is re-fitted here).
test_imputed_values = knn_imputer.transform(test[columns_fill_mean])
test[columns_fill_mean] = test_imputed_values

In [157]:
# Apply the label encoders fitted on the training features to the test set.
for col in label_encode_cols:
    if col not in test.columns:
        # Column absent from the test frame: nothing to encode.
        continue
    le = label_encoders[col]
    test[col] = test[col].astype(str)
    # Categories that never occurred in the training data have no code yet.
    unseen = set(test[col].unique()) - set(le.classes_)
    if unseen:
        # Append the new categories in sorted order. Iterating a set of strings
        # directly yields an order that can differ between interpreter runs
        # (string hashing is randomized), which would assign different integer
        # codes to the same category from one run to the next.
        # NOTE(review): these codes were never seen by the model during
        # training, so they carry no learned meaning.
        le.classes_ = np.append(le.classes_, sorted(unseen))
    test[col] = le.transform(test[col])

In [158]:
# Encode the nominal test columns with the encoder fitted on the training features.
test_encoded = one_hot_encoder.transform(test[one_hot_cols])
test_encoded_df = pd.DataFrame(
    test_encoded,
    index=test.index,
    columns=one_hot_encoder.get_feature_names_out(one_hot_cols),
)

# Replace the raw categorical columns with their indicator columns.
test = pd.concat([test.drop(columns=one_hot_cols), test_encoded_df], axis=1)

In [159]:
# Remove the 'ID' column, which was also dropped from the training features.
test = test.drop(columns=['ID'])

## 모델 정의

In [160]:
import optuna
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from optuna.samplers import TPESampler

# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split repeatable.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

def objective(trial):
    """Score one random-forest configuration proposed by an Optuna trial.

    Hyperparameters are drawn from ``trial``, a ``RandomForestClassifier`` is
    fitted on ``x_train``/``y_train``, and its accuracy on ``x_val``/``y_val``
    is returned.

    Args:
        trial: Object providing ``suggest_int`` and ``suggest_categorical``.

    Returns:
        Accuracy of the fitted model on the validation split.
    """
    # The dict literal is evaluated top to bottom, so the suggest calls happen
    # in the same order as the parameters are listed.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 5, 100),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 50),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 50),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', None]),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
    }

    # Fixed random_state so the same parameters always give the same forest.
    forest = RandomForestClassifier(random_state=42, **params)
    forest.fit(x_train, y_train)

    # The validation accuracy is the value the study optimizes.
    return accuracy_score(y_val, forest.predict(x_val))

# Run the search: maximize the objective over 150 trials with a seeded TPE sampler.
seed = 42
study = optuna.create_study(
    sampler=TPESampler(seed=seed),
    direction='maximize',
)
study.optimize(objective, n_trials=150)

# Report the best hyperparameters found and the score they achieved.
print(f"Best hyperparameters: {study.best_params}")
print(f"Best score: {study.best_value}")


[I 2025-01-17 13:22:16,494] A new study created in memory with name: no-name-b6a00abb-8a82-4d30-87c1-41cb034187a0
[I 2025-01-17 13:22:17,250] Trial 0 finished with value: 0.8920570264765784 and parameters: {'n_estimators': 218, 'max_depth': 96, 'min_samples_split': 37, 'min_samples_leaf': 30, 'max_features': 'sqrt', 'bootstrap': False}. Best is trial 0 with value: 0.8920570264765784.
[I 2025-01-17 13:22:18,250] Trial 1 finished with value: 0.8859470468431772 and parameters: {'n_estimators': 321, 'max_depth': 72, 'min_samples_split': 3, 'min_samples_leaf': 49, 'max_features': 'sqrt', 'bootstrap': False}. Best is trial 0 with value: 0.8920570264765784.
[I 2025-01-17 13:22:18,922] Trial 2 finished with value: 0.9164969450101833 and parameters: {'n_estimators': 187, 'max_depth': 55, 'min_samples_split': 23, 'min_samples_leaf': 15, 'max_features': 'sqrt', 'bootstrap': False}. Best is trial 2 with value: 0.9164969450101833.
[I 2025-01-17 13:22:19,723] Trial 3 finished with value: 0.887983706

Best hyperparameters: {'n_estimators': 373, 'max_depth': 41, 'min_samples_split': 27, 'min_samples_leaf': 16, 'max_features': None, 'bootstrap': False}
Best score: 0.9592668024439919


In [161]:
import xgboost as xgb

# Final model: a RandomForestClassifier configured with the best
# hyperparameters, written out here as literal values.
best_params = {
    'n_estimators': 373,
    'max_depth': 41,
    'min_samples_split': 27,
    'min_samples_leaf': 16,
    'max_features': None,
    'bootstrap': False,
}
model = RandomForestClassifier(**best_params, random_state=42)

# Train on the full feature frame and target (x, y), not only the train split.
model.fit(x, y)


## 예측 및 제출

In [162]:
# Predict a label for every test row and wrap the result in a Series.
test_predictions = model.predict(test)
pred = pd.Series(test_predictions)

In [163]:
# Load the sample submission file that the predictions are written into.
sample_submission_path = r'c:\Users\user\Downloads\real_estate\sample_submission.csv'
submit = pd.read_csv(sample_submission_path)

In [164]:
# Store the predictions in the label column (aligned on the row index), then
# print the frame summary to check the non-null count and dtype.
submit = submit.assign(**{'허위매물여부': pred})
submit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 613 entries, 0 to 612
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      613 non-null    object
 1   허위매물여부  613 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 9.7+ KB


In [165]:
# Show the first rows of the filled-in submission frame.
submit.head(n=5)

Unnamed: 0,ID,허위매물여부
0,TEST_000,0
1,TEST_001,0
2,TEST_002,0
3,TEST_003,0
4,TEST_004,0


In [166]:
# Save the submission as CSV without writing the row index.
submission_csv_path = r'c:\Users\user\Downloads\real_estate\real_estate_9.csv'
submit.to_csv(submission_csv_path, index=False)