<a href="https://colab.research.google.com/github/Shk-tech/my-colab-learning-material/blob/main/2507_korea_pytorch02_housing_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from pathlib import Path
import pandas as pd
import tarfile
import urllib.request

def load_housing_data():
    """Download (if needed), extract (if needed) and load the housing dataset.

    The tarball is fetched into ``datasets/housing.tgz`` and unpacked into
    ``datasets/``. Each step is skipped when its result is already on disk,
    so an existing tarball that was never (or only partially) extracted is
    still unpacked instead of causing ``read_csv`` to fail.

    Returns:
        pandas.DataFrame read from ``datasets/housing/housing.csv``.
    """
    tarball_path = Path("datasets/housing.tgz")
    csv_path = Path("datasets/housing/housing.csv")
    if not csv_path.is_file():
        if not tarball_path.is_file():
            Path("datasets").mkdir(parents=True, exist_ok=True)
            url = "https://github.com/ageron/data/raw/main/housing.tgz"
            urllib.request.urlretrieve(url, tarball_path)
        with tarfile.open(tarball_path) as housing_tarball:
            # Prefer the "data" extraction filter (rejects absolute paths, links
            # escaping the target dir, ...) on interpreters that provide it.
            if hasattr(tarfile, "data_filter"):
                housing_tarball.extractall(path="datasets", filter="data")
            else:
                housing_tarball.extractall(path="datasets")
    return pd.read_csv(csv_path)

# Load the dataset once; the following cells all work from this DataFrame.
housing = load_housing_data()

In [None]:
import torch
from torch import nn
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Preprocessing
# Categorical data -> one-hot encoding
pd.get_dummies(housing)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,False,False,False,True,False
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,False,False,False,True,False
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,False,False,False,True,False
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,False,False,False,True,False
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,False,True,False,False,False
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,False,True,False,False,False
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,False,True,False,False,False
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,False,True,False,False,False


In [None]:
# train -> fitting the model
# valid -> validation
# publish, deploy -> putting the model to use
# inference -> predicting on new data

pd.get_dummies(housing.iloc[:1, :])
# get_dummies on a single row only creates a column for the category present in that row.
# fit - find out which categories exist: NEAR BAY, NEAR OCEAN, INLAND, ISLAND, <1H OCEAN
# transform - the actual conversion

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_NEAR BAY
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,True


In [None]:
from sklearn.preprocessing import OneHotEncoder
# Learn the category list from the whole column, then encode only the first row.
proximity = housing[['ocean_proximity']]
encoder = OneHotEncoder(sparse_output=False).fit(proximity)
encoder.transform(proximity.loc[:0])

array([[0., 0., 0., 1., 0.]])

In [None]:
# train / valid split
# preprocessing
x_data = housing.drop('median_house_value', axis=1)
y_data = housing[['median_house_value']]
# Series.shape is (N,)
# DataFrame.shape is (N, 1) -> the double brackets above keep y two-dimensional
from sklearn.model_selection import train_test_split
# random_state pins the shuffle so the same rows land in train / valid on every run.
x_train, x_valid, y_train, y_valid = train_test_split(
    x_data, y_data, test_size=int(len(x_data)*0.2), shuffle=True, random_state=42)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# Preprocessing - numeric (missing values, scaling), categorical (one-hot encoding)
num = x_train.select_dtypes(np.number)
cat = x_train.select_dtypes(object)

# Learn the preprocessing parameters (mean, std, categories) from the training split.
imputer = SimpleImputer()
imputer.fit(num)
# Fit the scaler on the imputed array, i.e. on exactly what preproc() feeds it,
# so fit and transform see the same kind of input (plain ndarray, no NaNs).
scaler = StandardScaler()
scaler.fit(imputer.transform(num))
# handle_unknown='ignore': a category missing from the training split is encoded
# as an all-zero row instead of raising at transform time.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoder.fit(cat)

# Apply the fitted parameters (practice run, same steps as preproc below)
num2 = scaler.transform(imputer.transform(num)) # pandas -> numpy
cat2 = encoder.transform(cat) # pandas -> numpy
np.concatenate([num2, cat2], axis=1)

def preproc(df):
    """Turn a feature DataFrame into one numeric 2-D array.

    Uses the module-level ``imputer``, ``scaler`` and ``encoder`` fitted above;
    nothing is re-fitted here.

    Args:
        df: pandas DataFrame with the same columns as ``x_train``.

    Returns:
        numpy array whose columns are the imputed + scaled numeric columns
        followed by the one-hot encoded object columns.
    """
    num = df.select_dtypes(np.number)
    cat = df.select_dtypes(object)
    num2 = imputer.transform(num) # fill missing values first
    num2 = scaler.transform(num2) # then standardise
    cat2 = encoder.transform(cat) # pandas -> numpy
    # np.concatenate rather than np.concat: the latter only exists in NumPy >= 2.0
    return np.concatenate([num2, cat2], axis=1)

# The target is standardised too; out_scaler is kept so predictions can be
# mapped back with inverse_transform.
out_scaler = StandardScaler()
out_scaler.fit(y_train)

In [None]:
# Convert both splits to float32 arrays.
x_t, x_v = (preproc(frame).astype(np.float32) for frame in (x_train, x_valid))
y_t, y_v = (out_scaler.transform(frame).astype(np.float32) for frame in (y_train, y_valid))

# A plain list of (features, target) pairs serves as the dataset.
dataset_train = [(features, target) for features, target in zip(x_t, y_t)]
dataset_valid = [(features, target) for features, target in zip(x_v, y_v)]

loader_train = torch.utils.data.DataLoader(
    dataset_train,
    batch_size=32,
    shuffle=True,
    drop_last=True,
)
loader_valid = torch.utils.data.DataLoader(dataset_valid, batch_size=32)

# Pull a single batch to inspect its shapes and dtypes.
x, y = next(iter(loader_train))
x.shape, y.shape, x.dtype, y.dtype



(torch.Size([32, 13]), torch.Size([32, 1]), torch.float32, torch.float32)

In [None]:
# Layer widths are read from the sample batch instead of being hard-coded,
# so the model keeps matching the data if the number of feature columns changes.
n_features = x.shape[1] # input width is set by the preprocessed feature columns
n_targets = y.shape[1]  # output width is set by the target (y_train) columns
model = nn.Sequential(
    nn.Linear(n_features, 128),
    nn.SiLU(),
    nn.Linear(128, 128),
    nn.SiLU(),
    nn.Linear(128, n_targets),
)

opt = torch.optim.Adam(model.parameters())
loss_fn = nn.MSELoss()

# Smoke test: one forward/backward pass on the sample batch to confirm shapes line up.
# The gradients left behind are cleared by opt.zero_grad() in the training loop.
pred = model(x)
loss = loss_fn(pred, y)
loss.backward()

In [None]:
for epoch in range(10):
    # ---- training ----
    model.train()
    train_loss_sum, train_count = 0.0, 0
    for x, y in loader_train:
        # 1. feed-forward (pred)
        # 2. loss (pred, y)
        # 3. backprop (grad)
        # 4. update
        pred = model(x)
        loss = loss_fn(pred, y)
        opt.zero_grad()
        loss.backward()
        opt.step()

        # metrics - running mean of the training loss over the epoch so far
        # (weighted by batch size), not just the latest mini-batch
        train_loss_sum += loss.item() * len(x)
        train_count += len(x)
        print('\r e=%d l=%f' % (epoch, train_loss_sum / train_count), end='')
    print()

    # ---- validation ----
    model.eval() # evaluation mode
    valid_loss_sum, valid_count = 0.0, 0
    with torch.no_grad(): # no gradients are needed for validation
        for x, y in loader_valid:
            # 1. feed-forward (pred)
            # 2. loss (pred, y)
            pred = model(x)
            loss = loss_fn(pred, y)
            # weight by batch size: the last batch may be smaller than the others
            valid_loss_sum += loss.item() * len(x)
            valid_count += len(x)

    # metrics - mean validation loss over the whole validation set
    if valid_count:
        print('\r                                            e=%d l=%f' % (epoch, valid_loss_sum / valid_count), end='')
    print()

 e=0 l=0.259548
                                            e=0 l=0.131365
 e=1 l=0.109614
                                            e=1 l=0.140146
 e=2 l=0.191603
                                            e=2 l=0.130866
 e=3 l=0.207054
                                            e=3 l=0.145811
 e=4 l=0.235055
                                            e=4 l=0.136291
 e=5 l=0.140488
                                            e=5 l=0.117265
 e=6 l=0.156417
                                            e=6 l=0.126242
 e=7 l=0.082855
                                            e=7 l=0.107376
 e=8 l=0.202074
                                            e=8 l=0.116649
 e=9 l=0.196943
                                            e=9 l=0.106499


In [None]:
# Predict on the whole validation split: the RMSE cell compares the predictions
# with every row of y_valid, so both must have the same number of rows.
sample = x_valid

In [None]:
# Inference: preprocess -> forward pass without gradients -> undo the target scaling.
model.eval()
with torch.no_grad():
    xt = torch.as_tensor(preproc(sample), dtype=torch.float32)
    pred_scaled = model(xt)
pred = out_scaler.inverse_transform(pred_scaled)
pred



array([[116823.97713238],
       [205364.90643987],
       [255712.29967189],
       ...,
       [149936.74332678],
       [182902.0791939 ],
       [196592.13987993]])

In [None]:
# RMSE in the original (un-scaled) target units.
# Targets are selected by the index of the rows that were actually predicted,
# so this works whether `sample` is all of x_valid or only a subset of it.
y_true = y_valid.loc[sample.index].values
((pred - y_true) ** 2).mean() ** 0.5 # rmse

np.float64(54214.958306075634)