# 집값 예측(Linear Regression)

In [37]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import myutils as my

### 데이터 준비

In [38]:
from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing()
type(housing)

sklearn.utils._bunch.Bunch

In [39]:
housing.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'feature_names', 'DESCR'])

In [40]:
df = pd.DataFrame(housing.data,
                 columns = housing.feature_names)
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [41]:
df['target'] = housing.target
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [42]:
df.shape

(20640, 9)

In [43]:
# housing.DESCR

In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   Latitude    20640 non-null  float64
 7   Longitude   20640 non-null  float64
 8   target      20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


In [45]:
df.isna().sum()

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
target        0
dtype: int64

In [46]:
df.duplicated().sum() # 중복치 확인

0

In [47]:
# X, y 

X = df.iloc[:,:3]
y = df.iloc[:,-1]

In [48]:
# 테스트 데이터 분리
# 스케일링 X
# 학습
# RMSE(X_test)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                   test_size = 0.2,
                                                   random_state=2022)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(16512, 3) (4128, 3) (16512,) (4128,)


In [49]:
# 스케일링
# 학습
# RMSE(X_test)

In [50]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train) 
y_train = y_train.values

In [51]:
X_test = scaler.fit_transform(X_test) 
y_test = y_test.values

In [52]:
from sklearn.linear_model import  LinearRegression

lr = LinearRegression()
lr.fit(X_train,y_train)

In [53]:
lr.coef_, lr.intercept_ # 기울기, 절편

(array([ 0.85208536,  0.20838524, -0.085883  ]), 2.076720981710271)

In [54]:
y_pred = lr.predict(X_test)

In [57]:
from sklearn.metrics import mean_squared_error

# 에러는 낮을수록 좋습니다.
# MSE
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
rmse

0.8045075088355537

In [62]:
from sklearn.model_selection import cross_val_score
# 테스트데이터를 여러가지 경우로 바꿔서 평균내는 함수
# 더 정확한 결과를 낼 수 있음

mse = cross_val_score(lr, X_test, y_test, 
                      scoring='neg_mean_squared_error', #neg_ 함수를 음수로 바꾸려고 (높을수록 좋게 만드려고)
                     cv=3)
mse

array([-0.63879692, -0.6664322 , -0.66737045])

In [64]:
np.mean(np.sqrt(-mse))

0.8108427192491202

In [71]:
### 결정 트리

from sklearn.tree import DecisionTreeRegressor

dtr = DecisionTreeRegressor()
dtr.fit(X_train, y_train)
mse = cross_val_score(dtr, X_test, y_test, 
                      scoring='neg_mean_squared_error', #neg_ 함수를 음수로 바꾸려고 (높을수록 좋게 만드려고)
                     cv=3)
np.mean(np.sqrt(-mse))

1.0393997361811171

In [72]:
### RandomForest

from sklearn.ensemble import RandomForestRegressor

rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)
mse = cross_val_score(rfr, X_test, y_test, 
                      scoring='neg_mean_squared_error', #neg_ 함수를 음수로 바꾸려고 (높을수록 좋게 만드려고)
                     cv=3)
np.mean(np.sqrt(-mse))

0.7873127809873827

In [74]:
### 서포트 벡터 머신

from sklearn.svm import SVR

svr = SVR()
svr.fit(X_train, y_train)
mse = cross_val_score(svr, X_test, y_test, 
                      scoring='neg_mean_squared_error', #neg_ 함수를 음수로 바꾸려고 (높을수록 좋게 만드려고)
                     cv=3)
np.mean(np.sqrt(-mse))

0.7458199916008016

In [None]:
# svm.score(X_train, y_train) #정확도
# lr.score(X_train, y_train) #R^2 0~1

In [75]:
from sklearn.linear_model import Lasso, Ridge