<a href="https://colab.research.google.com/github/SeongilHeo/hufs_ai_camp/blob/master/Day4_dnn_california_housingprice_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 캘리포니아 집값 예측하기: 회귀

In [None]:
# 산점도 행렬을 그리기 위해 seaborn 패키지를 설치합니다
!pip install pandas

In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals, unicode_literals

import pathlib

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

try:
  # Colab only
  %tensorflow_version 2.x
except Exception:
    pass
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

print(tf.__version__)

In [None]:
# Allocate GPU memory on demand instead of reserving all of it at start-up.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  try:
    # Memory growth must be configured identically on every visible GPU;
    # enabling it on the first device only fails on multi-GPU machines.
    for gpu in gpus:
      tf.config.experimental.set_memory_growth(gpu, True)
  except RuntimeError as e:
    # Memory growth has to be set at program start, before the GPUs are initialized
    print(e)

### California Housing Prices 데이터셋 로딩

In [None]:
# Path of the CSV file to load (relative path).
dataset_path = "housing.csv"
raw_dataset = pd.read_csv(dataset_path)

# Work on a copy so that raw_dataset keeps the data exactly as loaded.
dataset = raw_dataset.copy()
dataset.tail()

In [None]:
# Print a concise summary of the frame (columns, non-null counts, dtypes).
dataset.info()

### 데이터 분포 분석

In [None]:
# 50-bin histograms of the dataset's columns, drawn on a 20x15 inch figure.
dataset.hist(bins=50, figsize=(20,15))

In [None]:
# Summary statistics, flipped so that each dataset column becomes one row.
dataset_stats = dataset.describe().T
dataset_stats

In [None]:
# Plot every row at its (longitude, latitude) position, coloured by median_house_value.
plt.figure(figsize=(12,8))
sns.scatterplot(x='longitude', y='latitude',data=dataset,hue='median_house_value')

In [None]:
# Pairwise correlation of the numeric columns only. The string column
# 'ocean_proximity' is still part of `dataset` at this point, and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0, so the
# numeric columns are selected explicitly (works on older pandas as well).
dataset.select_dtypes(include='number').corr()

In [None]:
# Annotated heatmap of the correlation matrix of the numeric columns.
# The string column 'ocean_proximity' is still in `dataset` here and would make
# DataFrame.corr() raise on pandas >= 2.0, so it is excluded before correlating.
plt.figure(figsize=(10,8))
sns.heatmap(dataset.select_dtypes(include='number').corr(), annot=True)

### 누락 데이터 삭제

In [None]:
# Number of missing values in each column.
dataset.isna().sum()

In [None]:
# Keep only the rows without any missing value (rebinds `dataset`).
dataset = dataset.dropna()

### 범주형 데이터 변환

In [None]:
# Detach the categorical column; it is replaced below by indicator columns.
ocean_proximity = dataset.pop('ocean_proximity')

# One float column (1.0 / 0.0) per distinct category value, named after the value.
categories = ocean_proximity.unique()
for category in categories:
    is_category = ocean_proximity == category
    dataset[category] = is_category.astype(float)

dataset.tail()

### 데이터셋 분할

In [None]:
# Sample 80% of the rows for training with a fixed random_state; the rows whose
# index was not sampled form the test set.
train_dataset = dataset.sample(frac=0.8,random_state=0)
test_dataset = dataset.drop(train_dataset.index)

### 산점도 행렬로 데이터 조사

In [None]:
# Scatter-plot matrix of the target and three features of the training split,
# with kernel density estimates on the diagonal.
sns.pairplot(train_dataset[['median_house_value', 'total_rooms', 'median_income', 'population']], diag_kind='kde')

### 통계 확인

In [None]:
# Per-feature statistics of the training split, one row per feature.
# The target column is excluded because only the inputs are normalized with these values.
train_stats = (
    train_dataset.describe()
    .drop(columns="median_house_value")
    .transpose()
)
train_stats

### 레이블 분리하기

In [None]:
# Move the target column out of both feature frames; pop() removes it from the
# frame and returns it, so the frames hold only input features afterwards.
train_labels = train_dataset.pop('median_house_value')
test_labels = test_dataset.pop('median_house_value')

### 데이터 정규화
표준화(z-score) 사용: $z = \frac{X - \mu}{\sigma}$ — 각 특성을 평균 0, 표준편차 1로 변환한다 ($X$가 정규분포를 따를 때 $z \sim \mathcal{N}(0,1)$).

#### 입력 데이터 정규화

In [None]:
def norm(x):
  """Standardize `x` with the training-set statistics.

  Subtracts the 'mean' column of the notebook-level `train_stats` from `x`
  and divides the result by its 'std' column.
  """
  mu = train_stats['mean']
  sigma = train_stats['std']
  return (x - mu) / sigma
# Both splits go through the same norm() function, which reads its mean/std
# from train_stats.
normed_train_data = norm(train_dataset)
normed_test_data = norm(test_dataset)

In [None]:
def minmax_norm(x):
  """Rescale `x` with the min/max values stored in `train_stats`.

  Computes (x - min) / (max - min) using the 'min' and 'max' columns of the
  notebook-level `train_stats`.
  """
  lowest = train_stats['min']
  value_range = train_stats['max'] - lowest
  return (x - lowest) / value_range

In [None]:
# Inspect the last rows of the normalized training features.
normed_train_data.tail()

#### 레이블 데이터 스케일링
입력 데이터는 정규화 되어 있는데 레이블 데이터가 너무 크면 훈련이 늦어지므로 일정 비율로 줄여준다.

In [None]:
# Divisor applied to the labels; predictions are multiplied by the same factor
# later in the notebook to return to the original unit.
Y_SCALE = 10000
normed_train_labels = train_labels/Y_SCALE
normed_test_labels = test_labels/Y_SCALE

## 모델

In [None]:
def build_model():
  """Create and compile the regression network.

  Architecture: two 64-unit ReLU hidden layers followed by a single linear
  output unit. The input width is the number of columns of the notebook-level
  `train_dataset`, read when the function is called.

  Returns:
    A compiled `keras.Sequential` model with MSE loss, a default-configured
    Adam optimizer and the metrics 'mae' and 'mse' (in that order).
  """
  n_features = len(train_dataset.keys())

  layer_stack = [
    layers.Dense(64, activation='relu', input_shape=[n_features]),
    layers.Dense(64, activation='relu'),
    layers.Dense(1),
  ]
  network = keras.Sequential(layer_stack)

  network.compile(
      loss='mse',
      optimizer=tf.keras.optimizers.Adam(),
      metrics=['mae', 'mse'])
  return network

In [None]:
# Instantiate the compiled network returned by build_model().
model = build_model()

In [None]:
# Print the model's layer summary.
model.summary()

In [None]:
# Smoke test: run the model (not yet fitted at this point of the notebook) on
# the first 10 normalized training rows and display the raw outputs.
example_batch = normed_train_data[:10]
example_result = model.predict(example_batch)
example_result

### 모델 훈련

In [None]:
class PrintDot(keras.callbacks.Callback):
  """Minimal progress indicator: prints one dot per finished epoch.

  A line break is printed before the dot of every epoch whose index is a
  multiple of 100, so each output row holds at most 100 dots.
  """

  def on_epoch_end(self, epoch, logs):
    starts_new_row = (epoch % 100 == 0)
    if starts_new_row:
      print('')
    print('.', end='')

# Training configuration.
EPOCHS = 1000
batch_size = 128

# Keras' own progress output is silenced (verbose=0); PrintDot prints the
# progress instead. 20% of the training data is held out for validation.
history = model.fit(
    x=normed_train_data,
    y=normed_train_labels,
    batch_size=batch_size,
    epochs=EPOCHS,
    validation_split=0.2,
    verbose=0,
    callbacks=[PrintDot()],
)

### 훈련 과정 시각화

In [None]:
# Put the per-epoch values recorded by fit() into a DataFrame and add the
# epoch number as a column.
hist = pd.DataFrame(history.history)
hist['epoch'] = history.epoch
hist.tail()

In [None]:
import matplotlib.pyplot as plt

def plot_history(history):
  """Plot training vs. validation MAE and MSE against the epoch number.

  Args:
    history: object exposing `.history` (a mapping with the keys 'mae',
      'val_mae', 'mse' and 'val_mse') and `.epoch`, such as the value
      returned by `model.fit` in this notebook.

  Draws one 8x12 inch figure with two stacked panels (MAE on top, MSE below)
  and shows it with `plt.show()`. Returns None.
  """
  # Per-epoch metric table with the epoch index as an extra column.
  records = pd.DataFrame(history.history)
  records['epoch'] = history.epoch

  plt.figure(figsize=(8,12))

  # (training column, validation column, y-axis label) per panel, top to bottom.
  panels = (
      ('mae', 'val_mae', 'Mean Abs Error'),
      ('mse', 'val_mse', 'Mean Square Error'),
  )
  for row, (train_col, val_col, y_label) in enumerate(panels, start=1):
    plt.subplot(2, 1, row)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.plot(records['epoch'], records[train_col], label='Train Error')
    plt.plot(records['epoch'], records[val_col], label='Val Error')
    plt.legend()

  plt.show()

# Draw the training/validation curves of the recorded training run.
plot_history(history)

## 검증

In [None]:
# evaluate() yields the loss followed by the compiled metrics; build_model()
# compiles with metrics ['mae', 'mse'], hence the three values unpacked here.
loss, mae, mse = model.evaluate(normed_test_data, normed_test_labels, verbose=2)

# NOTE: the test labels were divided by Y_SCALE, so the MAE printed below is
# expressed in units of Y_SCALE, not in the original unit of the label.
print("테스트 세트의 평균 절대 오차: {:5.2f} ".format(mae))

## 예측

#### 예측 테스트 결과와 실제 레이블과의 상관 관계 그래프

In [None]:
# Multiply by Y_SCALE to undo the label scaling, so the predictions are in the
# same unit as test_labels.
test_predictions = model.predict(normed_test_data).flatten()*Y_SCALE

plt.scatter(test_labels, test_predictions)
plt.xlabel('True Values')
plt.ylabel('Predictions')
plt.axis('equal')
plt.axis('square')
# Start both axes at 0 while keeping their current upper limits.
plt.xlim([0,plt.xlim()[1]])
plt.ylim([0,plt.ylim()[1]])
# Reference diagonal y = x: points on this line are exact predictions.
_ = plt.plot([-600000, 600000], [-600000, 600000])


#### 오차 분포 히스토그램

In [None]:
# Distribution of the prediction errors (prediction minus true value)
error = test_predictions - test_labels
plt.hist(error, bins = 25)
plt.xlabel("Prediction Error")
_ = plt.ylabel("Count")