# Torch Class Model 실습

In [1]:
import pandas as pd
import numpy as np

### Torch 모델을 이용한 아파트 실거래가 예측

- 데이터 로드

In [2]:
# Load the 2020 Seoul apartment transaction records; the file is read as CP949 text.
csv_path = 'data/2020년 서울시 아파트 실거래가.csv'
df = pd.read_csv(csv_path, encoding="cp949")
df

Unnamed: 0,건축년도,거래년도,거래월,거래일자,법정동,거래금액,아파트이름,전용면적,지번,지역코드,층
0,2007,2020,1,3,필운동,100000,신동아블루아광화문의 꿈,111.97,254,11110,6
1,2008,2020,1,2,사직동,162000,광화문풍림스페이스본(106동),163.33,9-1,11110,2
2,1994,2020,1,6,내수동,88000,세종로대우,109.91,167,11110,11
3,2004,2020,1,9,내수동,130000,킹스매너,194.43,110-15,11110,4
4,2003,2020,1,20,내수동,149000,경희궁 파크팰리스,148.09,95,11110,8
...,...,...,...,...,...,...,...,...,...,...,...
80971,2011,2020,12,21,강일동,115000,고덕리엔파크2단지,84.83,717,11740,5
80972,2009,2020,12,26,강일동,109000,강일리버파크3단지,84.53,674,11740,7
80973,2009,2020,12,26,강일동,107000,강일리버파크4단지,84.83,673,11740,10
80974,2009,2020,12,31,강일동,111000,강일리버파크4단지,84.74,673,11740,12


 - 새 열 사용년수: 2020-거래년도

In [3]:
# Years in use, computed as the reference year minus the transaction year.
# NOTE(review): every value in the displayed output is 0, so this column is constant —
# confirm whether the construction year (건축년도) was intended instead.
base_year = 2020
df['사용년수'] = base_year - df['거래년도']
df['사용년수']

0        0
1        0
2        0
3        0
4        0
        ..
80971    0
80972    0
80973    0
80974    0
80975    0
Name: 사용년수, Length: 80976, dtype: int64

 - 거래금액 열 타입 변경
   - float 타입으로 변경
   - 천만원 단위로 변경 (현재 만원 단위)
   - 변환 과정 중 발생하는 오류 해결

In [4]:
# The price column arrives as text: trim surrounding whitespace and remove the
# thousands separators before converting to float.
price_text = df['거래금액'].str.strip()
price_digits = price_text.str.replace(',', '')
# Scale the numeric price down by a factor of 1000.
df['거래금액'] = price_digits.astype('float') / 1000
df['거래금액']

0        100.0
1        162.0
2         88.0
3        130.0
4        149.0
         ...  
80971    115.0
80972    109.0
80973    107.0
80974    111.0
80975    104.0
Name: 거래금액, Length: 80976, dtype: float64

- 거래년도, 지번, 거래일자 열 삭제

In [5]:
# Remove the transaction year, lot number and transaction day columns.
df = df.drop(columns=['거래년도', '지번', '거래일자'])
df

Unnamed: 0,건축년도,거래월,법정동,거래금액,아파트이름,전용면적,지역코드,층,사용년수
0,2007,1,필운동,100.0,신동아블루아광화문의 꿈,111.97,11110,6,0
1,2008,1,사직동,162.0,광화문풍림스페이스본(106동),163.33,11110,2,0
2,1994,1,내수동,88.0,세종로대우,109.91,11110,11,0
3,2004,1,내수동,130.0,킹스매너,194.43,11110,4,0
4,2003,1,내수동,149.0,경희궁 파크팰리스,148.09,11110,8,0
...,...,...,...,...,...,...,...,...,...
80971,2011,12,강일동,115.0,고덕리엔파크2단지,84.83,11740,5,0
80972,2009,12,강일동,109.0,강일리버파크3단지,84.53,11740,7,0
80973,2009,12,강일동,107.0,강일리버파크4단지,84.83,11740,10,0
80974,2009,12,강일동,111.0,강일리버파크4단지,84.74,11740,12,0


- 전체 데이터 중 거래량이 30개 이상인 아파트만 선택해서 df에 저장하세요.

In [6]:
# Count trades per apartment name once and reuse the result.
trade_counts = df['아파트이름'].value_counts()
# Apartment names with fewer than 30 trades are the ones to exclude.
idx = trade_counts[trade_counts < 30].index
# .copy() makes the filtered frame an independent object, so the column
# assignments in later cells do not raise SettingWithCopyWarning.
df = df[~df['아파트이름'].isin(idx)].copy()
df

Unnamed: 0,건축년도,거래월,법정동,거래금액,아파트이름,전용면적,지역코드,층,사용년수
12,1995,1,명륜2가,89.8,아남1,84.9000,11110,5,0
14,1995,1,명륜2가,89.0,아남1,84.9000,11110,12,0
16,1999,1,창신동,64.0,두산,59.9500,11110,4,0
17,1993,1,창신동,51.0,창신쌍용2,79.8700,11110,3,0
18,1993,1,창신동,49.5,창신쌍용2,64.6600,11110,6,0
...,...,...,...,...,...,...,...,...,...
80945,2020,12,천호동,24.5,미사 아름채 아파트,12.5650,11740,12,0
80947,2000,12,천호동,87.0,동아하이빌아파트,84.9700,11740,16,0
80948,2020,12,천호동,26.5,미사 아름채 아파트,13.3272,11740,9,0
80949,2017,12,천호동,136.5,래미안강동팰리스,84.9700,11740,18,0


- 법정동, 아파트 이름을 정수로 인코딩하세요.

In [7]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode each categorical column with its own freshly fitted LabelEncoder.
for col in ('법정동', '아파트이름'):
    df[col] = LabelEncoder().fit_transform(df[col])
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['법정동'] = LabelEncoder().fit_transform(df['법정동'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['아파트이름'] = LabelEncoder().fit_transform(df['아파트이름'])


Unnamed: 0,건축년도,거래월,법정동,거래금액,아파트이름,전용면적,지역코드,층,사용년수
12,1995,1,63,89.8,445,84.9000,11110,5,0
14,1995,1,63,89.0,445,84.9000,11110,12,0
16,1999,1,190,64.0,171,59.9500,11110,4,0
17,1993,1,190,51.0,569,79.8700,11110,3,0
18,1993,1,190,49.5,569,64.6600,11110,6,0
...,...,...,...,...,...,...,...,...,...
80945,2020,12,193,24.5,266,12.5650,11740,12,0
80947,2000,12,193,87.0,166,84.9700,11740,16,0
80948,2020,12,193,26.5,266,13.3272,11740,9,0
80949,2017,12,193,136.5,189,84.9700,11740,18,0


 - 데이터 선택

In [8]:
# Predictor columns, in the order they are passed to the models.
features = [
    '건축년도', '거래월', '법정동', '아파트이름',
    '전용면적', '지역코드', '층', '사용년수',
]
# X holds the predictors; Y is the price column used as the regression target.
X, Y = df[features], df['거래금액']

 - 데이터 분할
   - 분할 비율 7:3

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as the test split, with a fixed random_state.
split = train_test_split(X, Y, test_size=0.3, random_state=0)
x_train, x_test, y_train, y_test = split

### 모델 구성

In [10]:
import torch
from torch import nn, optim

 - GPU 사용이 가능한지 장치 확인 후 device 변수 설정

In [11]:
# code here


'cuda'

 - DNNRegressor 모델을 클래스로 생성하세요.
 - DNNRegressor 모델은 3개의 은닉층을 갖습니다.
   - 각 은닉층은 256, 64, 32개의 퍼셉트론을 갖고 활성화는 relu입니다.

In [12]:
# code here


 - 시드를 777로 설정하세요.

In [13]:
# code here


<torch._C.Generator at 0x17e360565b0>

 - 모델 클래스를 이용하여 모델 인스턴스를 생성하세요.

In [14]:
# code here


 - 모델의 구성을 출력하세요.

In [15]:
# code here


DNNRegressor(
  (fc1): Linear(in_features=8, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=32, bias=True)
  (output): Linear(in_features=32, out_features=1, bias=True)
  (relu): ReLU()
)


 - 모델의 summary를 출력하세요.

In [16]:
# code here


Layer (type:depth-idx)                   Param #
DNNRegressor                             --
├─Linear: 1-1                            2,304
├─Linear: 1-2                            16,448
├─Linear: 1-3                            2,080
├─Linear: 1-4                            33
├─ReLU: 1-5                              --
Total params: 20,865
Trainable params: 20,865
Non-trainable params: 0

- 데이터에 적합한 loss와 optimizer를 선정하여 설정하세요.

In [17]:
# code here


- train 데이터를 학습시키세요.
- epoch=100000회

In [18]:
# code here


  0%|          | 0/100000 [00:00<?, ?it/s]

- 학습시킨 모델로 mse, $r^2$를 출력하세요.

In [19]:
from sklearn.metrics import mean_squared_error

# Inference only: under no_grad() the forward pass records no autograd graph,
# so the outputs can be moved to NumPy directly without detach().
with torch.no_grad():
    train_pred = model(torch.FloatTensor(x_train.values).to(device)).cpu().numpy()
    test_pred = model(torch.FloatTensor(x_test.values).to(device)).cpu().numpy()

# Mean squared error as a (train, test) pair.
mean_squared_error(y_train, train_pred), mean_squared_error(y_test, test_pred)

(562.5936114358478, 577.4957564108436)

In [20]:
from sklearn.metrics import r2_score

# R^2 for the training split and the test split, shown as a (train, test) pair.
tuple(
    r2_score(y_true, y_hat)
    for y_true, y_hat in ((y_train, train_pred), (y_test, test_pred))
)

(0.7994387463700738, 0.7886819574574498)

 - 검증 데이터의 결과를 학습과정에서 확인할 수 있도록 구성한 후 학습을 수행하세요.

- 시드를 777로 설정하세요.

In [26]:
# code here


<torch._C.Generator at 0x17e360565b0>

- 모델 인스턴스를 생성하세요.

In [27]:
# code here


- 데이터에 적합한 loss와 optimizer를 선정하여 설정하세요.

In [28]:
# code here


- train 데이터를 학습시키세요.
- epoch=150000회

In [29]:
# code here


  0%|          | 0/150000 [00:00<?, ?it/s]

 - 학습시킨 모델로 mse, $r^2$를 출력하세요.

In [30]:
from sklearn.metrics import mean_squared_error

# Inference only: under no_grad() the forward pass records no autograd graph,
# so the outputs can be moved to NumPy directly without detach().
with torch.no_grad():
    train_pred = model(torch.FloatTensor(x_train.values).to(device)).cpu().numpy()
    test_pred = model(torch.FloatTensor(x_test.values).to(device)).cpu().numpy()

# Mean squared error as a (train, test) pair.
mean_squared_error(y_train, train_pred), mean_squared_error(y_test, test_pred)

(481.52094597896684, 504.2256765957213)

In [31]:
from sklearn.metrics import r2_score

# R^2 for the training split and the test split, shown as a (train, test) pair.
tuple(
    r2_score(y_true, y_hat)
    for y_true, y_hat in ((y_train, train_pred), (y_test, test_pred))
)

(0.8283406661370846, 0.815493045974701)

- 랜덤포레스트로 모델링한 후 r2를 출력하세요.

In [32]:
from sklearn.ensemble import RandomForestRegressor

# Fit a random forest on the training split (fit() returns the fitted estimator).
rf = RandomForestRegressor(n_jobs=-1, random_state=0).fit(x_train, y_train)
# R^2 as a (train, test) pair.
tuple(rf.score(features_, target_) for features_, target_ in ((x_train, y_train), (x_test, y_test)))

(0.9968821312475078, 0.9799872302595628)

- 그래디언트 부스팅으로 모델링한 후 r2를 출력하세요.

In [33]:
from sklearn.ensemble import GradientBoostingRegressor

# Fit scikit-learn's gradient boosting regressor on the training split.
gb = GradientBoostingRegressor(random_state=0).fit(x_train, y_train)
# R^2 as a (train, test) pair.
train_r2 = gb.score(x_train, y_train)
test_r2 = gb.score(x_test, y_test)
train_r2, test_r2

(0.8795355661645858, 0.8810740250267032)

- XGBoost로 모델링한 후 r2를 출력하세요.

In [40]:
from xgboost import XGBRegressor

# Fit an XGBoost regressor on the training split.
# NOTE(review): this rebinds gb, which the previous cell used for the
# GradientBoostingRegressor — consider a dedicated name.
gb = XGBRegressor(random_state=0).fit(x_train, y_train)
# R^2 as a (train, test) pair.
tuple(gb.score(features_, target_) for features_, target_ in ((x_train, y_train), (x_test, y_test)))

(0.984291038373904, 0.9730610677828229)

- Light GBM으로 모델링한 후 r2를 출력하세요.

In [41]:
from lightgbm import LGBMRegressor

# Fit a LightGBM regressor on the training split.
# NOTE(review): this rebinds gb, which earlier cells used for other models —
# consider a dedicated name.
gb = LGBMRegressor(random_state=0).fit(x_train, y_train)
# R^2 as a (train, test) pair.
tuple(gb.score(features_, target_) for features_, target_ in ((x_train, y_train), (x_test, y_test)))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000159 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 863
[LightGBM] [Info] Number of data points in the train set: 35998, number of used features: 7
[LightGBM] [Info] Start training from score 85.025036


(0.9605868885943858, 0.9557835956325829)

- Catboost로 모델링한 후 r2를 출력하세요.

In [42]:
from catboost import CatBoostRegressor

# Fit a CatBoost regressor on the training split with training logs silenced.
# NOTE(review): this rebinds gb, which earlier cells used for other models —
# consider a dedicated name.
gb = CatBoostRegressor(random_state=0).fit(x_train, y_train, verbose=0)
# R^2 as a (train, test) pair.
tuple(gb.score(features_, target_) for features_, target_ in ((x_train, y_train), (x_test, y_test)))

(0.9788935598361715, 0.9732466083985435)