In [13]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import MinMaxScaler

In [2]:
# C드라이브 SEOUL 폴더에 있는 Housing.xlsx 파일 읽기
housing = pd.read_excel("C:/SEOUL/Housing.xlsx", sheet_name=0)

# 데이터 확인 (상위 5행)
print(housing.head())

   id  area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0   1  7420         4          2        3      yes        no       no   
1   2  8960         4          4        4      yes        no       no   
2   3  9960         3          2        2      yes        no      yes   
3   4  7500         4          2        2      yes        no      yes   
4   5  7420         4          1        2      yes       yes      yes   

  hotwaterheating airconditioning  parking prefarea furnishingstatus     price  
0              no             yes        2      yes        furnished  13300000  
1              no             yes        3       no        furnished  12250000  
2              no              no        2      yes   semi-furnished  12250000  
3              no             yes        3      yes        furnished  12215000  
4              no             yes        2       no        furnished  11410000  


In [3]:
# 데이터 정보 보기
print(housing.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
 13  price             545 non-null    int64 
dtypes: int64(7), object(7)
memory usage: 59.7+ KB
None


In [4]:
housing = housing.drop(columns=["id"])

# 확인
print(housing.head())

   area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0  7420         4          2        3      yes        no       no   
1  8960         4          4        4      yes        no       no   
2  9960         3          2        2      yes        no      yes   
3  7500         4          2        2      yes        no      yes   
4  7420         4          1        2      yes       yes      yes   

  hotwaterheating airconditioning  parking prefarea furnishingstatus     price  
0              no             yes        2      yes        furnished  13300000  
1              no             yes        3       no        furnished  12250000  
2              no              no        2      yes   semi-furnished  12250000  
3              no             yes        3      yes        furnished  12215000  
4              no             yes        2       no        furnished  11410000  


In [7]:
# 데이터 전처리 : 범주형 데이터 -> One-Hot Encoding, 더미변수
housing.columns
# mainroad,guestroom, basement, hotwaterheating, airconditioning, prefarea, furnishingstatus
# 범주형 변수 목록
categorical_cols = [
    "mainroad", "guestroom", "basement",
    "hotwaterheating", "airconditioning",
    "prefarea", "furnishingstatus"
]

# One-Hot Encoding 변환
housing = pd.get_dummies(housing, columns=categorical_cols, drop_first=False)

# 변환된 데이터 확인
print(housing.head())
print(housing.shape)

   area  bedrooms  bathrooms  stories  parking     price  mainroad_no  \
0  7420         4          2        3        2  13300000        False   
1  8960         4          4        4        3  12250000        False   
2  9960         3          2        2        2  12250000        False   
3  7500         4          2        2        3  12215000        False   
4  7420         4          1        2        2  11410000        False   

   mainroad_yes  guestroom_no  guestroom_yes  ...  basement_yes  \
0          True          True          False  ...         False   
1          True          True          False  ...         False   
2          True          True          False  ...          True   
3          True          True          False  ...          True   
4          True         False           True  ...          True   

   hotwaterheating_no  hotwaterheating_yes  airconditioning_no  \
0                True                False               False   
1                True     

In [9]:
# 데이터 분할하기
# y : price
# X : price를 제외한 모든 열
# 독립변수(X), 종속변수(y) 정의
X = housing.drop("price", axis=1)
y = housing["price"]

# 훈련 70%, 평가 30% 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# 데이터 크기 출력
print("X_train:", X_train.shape)
print("X_test :", X_test.shape)
print("y_train:", y_train.shape)
print("y_test :", y_test.shape)

X_train: (381, 20)
X_test : (164, 20)
y_train: (381,)
y_test : (164,)


In [12]:
# 모델 정의
models = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(alpha=1.0),
    "Lasso": Lasso(alpha=0.01),
    "DecisionTree": DecisionTreeRegressor(random_state=42),
    "RandomForest": RandomForestRegressor(n_estimators=100, random_state=42),
    "GradientBoosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=200, learning_rate=0.1,
                            max_depth=3, random_state=42, verbosity=0)
}

# RMSE 결과 저장
results = {}

for name, model in models.items():
    # 학습
    model.fit(X_train, y_train)
    # 예측
    y_pred = model.predict(X_test)
    # RMSE 계산
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    results[name] = rmse

# RMSE 결과 출력
results_df = pd.DataFrame.from_dict(results, orient="index", columns=["RMSE"]).sort_values(by="RMSE")
print(results_df)

                          RMSE
LinearRegression  1.234107e+06
Lasso             1.234107e+06
Ridge             1.234149e+06
GradientBoosting  1.269418e+06
XGBoost           1.276035e+06
RandomForest      1.352696e+06
DecisionTree      1.844628e+06


In [14]:
# 예측모형의 성능을 개선하고 싶음
# (1) X 변환 : Min-Max Normalization(정규화) : X가 0~1 사이로 변경
# 1. Min-Max Scaling
scaler   = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# 2. 훈련/평가 데이터 다시 분할
X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(
    X_scaled, y, test_size = 0.3, random_state = 42
)

# 3. 사용할 모델 정의
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=1.0),
    "Lasso Regression": Lasso(alpha=0.01),
    "Random Forest": RandomForestRegressor(n_estimators=200, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=200, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=200, random_state=42)
}

# 4. 성능 저장용 딕셔너리
results = {}

# 5. 각 모델 학습 및 평가
for name, model in models.items():
    model.fit(X_train_scaled, y_train)                       # 학습
    y_pred = model.predict(X_test_scaled)                    # 예측
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))       # RMSE 계산
    results[name] = rmse
    print(f"{name} RMSE: {rmse:.2f}")

# 6. 결과를 DataFrame으로 정리
results_df = pd.DataFrame(list(results.items()), columns=["Model", "RMSE"]).sort_values(by="RMSE")
print("\n▶ RMSE 기준 성능 비교 (Min-Max Scaling 적용 후)")
print(results_df)


Linear Regression RMSE: 1234106.75
Ridge Regression RMSE: 1226824.36
Lasso Regression RMSE: 1234106.75
Random Forest RMSE: 1355183.56
Gradient Boosting RMSE: 1305445.02
XGBoost RMSE: 1435543.28

▶ RMSE 기준 성능 비교 (Min-Max Scaling 적용 후)
               Model          RMSE
1   Ridge Regression  1.226824e+06
0  Linear Regression  1.234107e+06
2   Lasso Regression  1.234107e+06
4  Gradient Boosting  1.305445e+06
3      Random Forest  1.355184e+06
5            XGBoost  1.435543e+06
