# 캘리포니아 집값 데이터

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score, mean_squared_error

#pip install catboost xgboost lightgbm
#pip install numpy<2
#TODO: 만약에 CatBoostRegressor 오류 발생시 numpy 다시 업데이트 후에 계속 진행
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor


## 데이터 불러오기

In [2]:
df = pd.read_csv('data/housing.csv')
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


## 데이터 전처리

### NA값 처리·중위값 추가

In [3]:
df['total_bedrooms'].fillna(df['total_bedrooms'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['total_bedrooms'].fillna(df['total_bedrooms'].median(), inplace=True)


### 특정 속성이 과도한 지표를 형성하고 있어서, 정리

In [4]:
df['bed_per_room'] = df['total_bedrooms'] / df['total_rooms'] # 지표를 나눔
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,bed_per_room
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,0.146591
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,0.155797
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,0.129516
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,0.184458
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,0.172096


### 데이터가 한쪽으로 치우쳐져 있는지 확인 후 정리

In [5]:
df['median_house_value'].skew() # 첨도: 한쪽으로 데이터가 치우침

0.9777632739098341

In [6]:
X = df.drop(['median_house_value'], axis=1)
y = np.log(df['median_house_value'])

In [7]:
from scipy.stats import skew    
skew_df = pd.DataFrame(X.select_dtypes(np.number).columns, columns=["Feature"]) # Features 를 행으로 변환
skew_df['Skew'] = skew_df["Feature"].apply(lambda feature: skew(df[feature])) # Skew(첨도) Feature 추가 
skew_df["ABS_Skew"] = skew_df["Skew"].apply(abs) # 절댓값 Skew Feature 추가
skew_df["Skew"] = skew_df["ABS_Skew"].apply(lambda x: True if x > 0.5 else False) # 첨도가 치우쳐져 있으면 True, 치우쳐져 있지 않으면 False
skew_df

Unnamed: 0,Feature,Skew,ABS_Skew
0,longitude,False,0.29778
1,latitude,False,0.465919
2,housing_median_age,False,0.060326
3,total_rooms,True,4.147042
4,total_bedrooms,True,3.480888
5,population,True,4.9355
6,households,True,3.41019
7,median_income,True,1.646537
8,bed_per_room,True,6.316445


+ median_house_value 값 눌려짐

In [8]:
skew_col = skew_df[skew_df["ABS_Skew"] > 0.5]['Feature'].values
skew_col

array(['total_rooms', 'total_bedrooms', 'population', 'households',
       'median_income', 'bed_per_room'], dtype=object)

In [9]:
for col in skew_col:
    X[col] = np.log(X[col])

In [10]:
X.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,bed_per_room
0,-122.23,37.88,41.0,6.779922,4.859812,5.774552,4.836282,2.119287,NEAR BAY,-1.92011
1,-122.22,37.86,21.0,8.867709,7.008505,7.783641,7.037028,2.116424,NEAR BAY,-1.859204
2,-122.24,37.85,52.0,7.290975,5.247024,6.206576,5.17615,1.982022,NEAR BAY,-2.043951
3,-122.25,37.85,52.0,7.149917,5.459586,6.324359,5.389072,1.730434,NEAR BAY,-1.690331
4,-122.25,37.85,52.0,7.394493,5.63479,6.336826,5.556828,1.347086,NEAR BAY,-1.759704


### 문자열을 숫자로 변경

In [11]:
encoder = LabelEncoder()
X['ocean_proximity'] = encoder.fit_transform(X['ocean_proximity']) # ocean_proximity 행으로 추가
X

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,bed_per_room
0,-122.23,37.88,41.0,6.779922,4.859812,5.774552,4.836282,2.119287,3,-1.920110
1,-122.22,37.86,21.0,8.867709,7.008505,7.783641,7.037028,2.116424,3,-1.859204
2,-122.24,37.85,52.0,7.290975,5.247024,6.206576,5.176150,1.982022,3,-2.043951
3,-122.25,37.85,52.0,7.149917,5.459586,6.324359,5.389072,1.730434,3,-1.690331
4,-122.25,37.85,52.0,7.394493,5.634790,6.336826,5.556828,1.347086,3,-1.759704
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,7.417580,5.924256,6.739337,5.799093,0.444878,1,-1.493325
20636,-121.21,39.49,18.0,6.546785,5.010635,5.874931,4.736198,0.938756,1,-1.536150
20637,-121.22,39.43,17.0,7.720462,6.184149,6.914731,6.070738,0.530628,1,-1.536313
20638,-121.32,39.43,18.0,7.528332,6.013715,6.608001,5.855072,0.624440,1,-1.514617


## 데이터 분리

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## 분류기 모음

In [13]:
lr = LinearRegression() # 선형회귀
lr.fit(X_train, y_train)
predict_lr = lr.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, predict_lr))
r2 = r2_score(y_test, predict_lr)
print(rmse, r2) # baseLine 없이 좋은지 판단 불가

0.3444038157651874 0.6345457124333707


In [14]:
knn = KNeighborsRegressor() # KNN : rmse 감소, 상관계수는 증가
knn.fit(X_train, y_train)
predict_knn = knn.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, predict_knn))
r2 = r2_score(y_test, predict_knn)
print(rmse, r2)

0.3244770405871314 0.6756117632134155


In [16]:
rf = RandomForestRegressor() # 랜덤 포레스트 : 성능 개선됨 or 과대적합이 생길 수 있음
rf.fit(X_train, y_train)
predict_rf = rf.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, predict_rf))
r2 = r2_score(y_test, predict_rf)
print(rmse, r2)

0.2343390041280899 0.8308054723191719


In [18]:
cat = CatBoostRegressor()
cat.fit(X_train, y_train)
predict_cat = cat.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, predict_cat))
r2 = r2_score(y_test, predict_cat)
print(rmse, r2)

Learning rate set to 0.063766
0:	learn: 0.5477066	total: 2.97ms	remaining: 2.96s
1:	learn: 0.5272182	total: 5.56ms	remaining: 2.78s
2:	learn: 0.5082013	total: 8.22ms	remaining: 2.73s
3:	learn: 0.4912890	total: 11ms	remaining: 2.74s
4:	learn: 0.4750285	total: 14.1ms	remaining: 2.8s
5:	learn: 0.4605128	total: 18.3ms	remaining: 3.03s
6:	learn: 0.4475175	total: 24.9ms	remaining: 3.54s
7:	learn: 0.4353580	total: 48.4ms	remaining: 6s
8:	learn: 0.4240331	total: 50.9ms	remaining: 5.61s
9:	learn: 0.4146684	total: 53.1ms	remaining: 5.26s
10:	learn: 0.4052256	total: 57ms	remaining: 5.12s
11:	learn: 0.3969183	total: 61.1ms	remaining: 5.03s
12:	learn: 0.3896859	total: 64.2ms	remaining: 4.87s
13:	learn: 0.3833644	total: 66.4ms	remaining: 4.68s
14:	learn: 0.3761449	total: 68.6ms	remaining: 4.5s
15:	learn: 0.3707153	total: 70.5ms	remaining: 4.33s
16:	learn: 0.3645221	total: 72.3ms	remaining: 4.18s
17:	learn: 0.3598349	total: 74ms	remaining: 4.04s
18:	learn: 0.3544757	total: 75.8ms	remaining: 3.91s
19:

In [19]:
xg = XGBRegressor()  # 데이터셋이 상관관계를 갖고 있음(rmse가 증가, r2 감소)
xg.fit(X_train, y_train)
predict_xg = xg.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, predict_xg))
r2 = r2_score(y_test, predict_xg)
print(rmse, r2)

0.23215147603571726 0.833949552238219


In [20]:
lgb = LGBMRegressor()
lgb.fit(X_train, y_train)
predict_lgb = lgb.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, predict_lgb))
r2 = r2_score(y_test, predict_lgb)
print(rmse, r2)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000423 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2098
[LightGBM] [Info] Number of data points in the train set: 16512, number of used features: 10
[LightGBM] [Info] Start training from score 12.086494
0.23079958378806373 0.8358778509571465


In [21]:
gb = GradientBoostingRegressor()
gb.fit(X_train, y_train)
predict_gb = gb.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, predict_gb))
r2 = r2_score(y_test, predict_gb)
print(rmse, r2)

0.26775969372719793 0.7791041495755577


## 커스텀 앙상블

In [22]:
f_predict = (
    0.25 * predict_cat +
    0.25 * predict_lgb +
    0.25 * predict_rf +
    0.25 * predict_xg
)

In [23]:
rmse = np.sqrt(mean_squared_error(y_test, f_predict)) # 여러 알고리즘으로 상관계수를 구함
r2 = r2_score(y_test, f_predict)
print(rmse, r2)

0.22028359543740592 0.8504930212981799
