In [68]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [69]:
df_train = pd.read_csv('./house-prices-advanced-regression-techniques/train.csv')
df_test = pd.read_csv('./house-prices-advanced-regression-techniques/test.csv')

print(df_train.shape, df_test.shape)

(1460, 81) (1459, 80)


In [70]:
df_train.head(3)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500


In [71]:
df_test.head(3) # 예측해야 하는 값이므로 SalePrice가 없다.

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal


In [72]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [73]:
df_train.corr(numeric_only=True)['SalePrice']

Id              -0.021917
MSSubClass      -0.084284
LotFrontage      0.351799
LotArea          0.263843
OverallQual      0.790982
OverallCond     -0.077856
YearBuilt        0.522897
YearRemodAdd     0.507101
MasVnrArea       0.477493
BsmtFinSF1       0.386420
BsmtFinSF2      -0.011378
BsmtUnfSF        0.214479
TotalBsmtSF      0.613581
1stFlrSF         0.605852
2ndFlrSF         0.319334
LowQualFinSF    -0.025606
GrLivArea        0.708624
BsmtFullBath     0.227122
BsmtHalfBath    -0.016844
FullBath         0.560664
HalfBath         0.284108
BedroomAbvGr     0.168213
KitchenAbvGr    -0.135907
TotRmsAbvGrd     0.533723
Fireplaces       0.466929
GarageYrBlt      0.486362
GarageCars       0.640409
GarageArea       0.623431
WoodDeckSF       0.324413
OpenPorchSF      0.315856
EnclosedPorch   -0.128578
3SsnPorch        0.044584
ScreenPorch      0.111447
PoolArea         0.092404
MiscVal         -0.021190
MoSold           0.046432
YrSold          -0.028923
SalePrice        1.000000
Name: SalePr

In [74]:
df_train.corr(numeric_only=True)['SalePrice'][(df_train.corr(numeric_only=True)['SalePrice'] > 0.5) | (df_train.corr(numeric_only=True)['SalePrice'] < -0.5)]

OverallQual     0.790982
YearBuilt       0.522897
YearRemodAdd    0.507101
TotalBsmtSF     0.613581
1stFlrSF        0.605852
GrLivArea       0.708624
FullBath        0.560664
TotRmsAbvGrd    0.533723
GarageCars      0.640409
GarageArea      0.623431
SalePrice       1.000000
Name: SalePrice, dtype: float64

In [75]:
feature_list = df_train.corr(numeric_only=True)['SalePrice'][(df_train.corr(numeric_only=True)['SalePrice'] > 0.5) | (df_train.corr(numeric_only=True)['SalePrice'] < -0.5)].index
print(feature_list)

Index(['OverallQual', 'YearBuilt', 'YearRemodAdd', 'TotalBsmtSF', '1stFlrSF',
       'GrLivArea', 'FullBath', 'TotRmsAbvGrd', 'GarageCars', 'GarageArea',
       'SalePrice'],
      dtype='object')


In [76]:
df_train[feature_list].isnull().sum() # 결측치 확인

OverallQual     0
YearBuilt       0
YearRemodAdd    0
TotalBsmtSF     0
1stFlrSF        0
GrLivArea       0
FullBath        0
TotRmsAbvGrd    0
GarageCars      0
GarageArea      0
SalePrice       0
dtype: int64

In [77]:
df_train[feature_list].dtypes

OverallQual     int64
YearBuilt       int64
YearRemodAdd    int64
TotalBsmtSF     int64
1stFlrSF        int64
GrLivArea       int64
FullBath        int64
TotRmsAbvGrd    int64
GarageCars      int64
GarageArea      int64
SalePrice       int64
dtype: object

In [78]:
features = feature_list[:-1]
features

Index(['OverallQual', 'YearBuilt', 'YearRemodAdd', 'TotalBsmtSF', '1stFlrSF',
       'GrLivArea', 'FullBath', 'TotRmsAbvGrd', 'GarageCars', 'GarageArea'],
      dtype='object')

In [79]:
for feature in features:
    df_test.fillna((df_train[feature].mean()),inplace=True) # test 데이터셋 결측치가 있으면 평균값으로 대체

In [80]:
X_train = df_train[features]
y_train = df_train[['SalePrice']].values
X_test = df_test[features]

In [81]:
type(y_train)

numpy.ndarray

In [82]:
std_scaler = StandardScaler()
std_scaler.fit(X_train)

X_train_tensor = torch.from_numpy(std_scaler.transform(X_train)).float()
X_test_tensor = torch.from_numpy(std_scaler.transform(X_test)).float()
y_train_tensor = torch.from_numpy(y_train).float()


In [83]:
print(X_train_tensor.shape, X_test_tensor.shape, y_train_tensor.shape)

torch.Size([1460, 10]) torch.Size([1459, 10]) torch.Size([1460, 1])


In [84]:
nb_epochs = 10000
minibatch_size = 256

In [85]:
class FunModel(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.linear_layers = nn.Sequential(
            nn.Linear(input_dim, 20),
            nn.LeakyReLU(0.1),
            nn.Linear(20, 10),
            nn.LeakyReLU(0.1),
            nn.Linear(10, 8),
            nn.LeakyReLU(0.1),
            nn.Linear(8, 6),
            nn.LeakyReLU(0.1),
            nn.Linear(6, output_dim)
        )
    def forward(self, x):
        y = self.linear_layers(x)
        return y
        
    

In [86]:
input_dim = X_train_tensor.size(-1)
output_dim = y_train_tensor.size(-1)
print(input_dim, output_dim)
model = FunModel(input_dim, output_dim)
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters()) # Adam 사용

10 1


In [87]:
indies = torch.randperm(X_train_tensor.size(0))
print(indies)
x_batch_list = torch.index_select(X_train_tensor, dim=0, index=indies)
y_batch_list = torch.index_select(y_train_tensor, dim=0, index=indies)
x_batch_list = x_batch_list.split(minibatch_size, dim=0)
y_batch_list = y_batch_list.split(minibatch_size, dim=0)

tensor([ 780,  989,  102,  ..., 1104,  167, 1265])


In [88]:
for index in range(nb_epochs):
    indies = torch.randperm(X_train_tensor.size(0))
    
    x_batch_list = torch.index_select(X_train_tensor, dim=0, index=indies)
    y_batch_list = torch.index_select(y_train_tensor, dim=0, index=indies)
    x_batch_list = x_batch_list.split(minibatch_size, dim=0)
    y_batch_list = y_batch_list.split(minibatch_size, dim=0)
    
    epoch_loss = list()
    for x_minibatch, y_minibatch in zip(x_batch_list, y_batch_list):
        y_minibatch_pred = model(x_minibatch)
        
        loss = torch.sqrt(loss_function(y_minibatch_pred, y_minibatch)) # RMSE
        epoch_loss.append(loss)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    if(index % 100) == 0:
        print(index, sum(epoch_loss), sum(epoch_loss)/len(epoch_loss))

0 tensor(1184862.1250, grad_fn=<AddBackward0>) tensor(197477.0156, grad_fn=<DivBackward0>)
100 tensor(972512.3750, grad_fn=<AddBackward0>) tensor(162085.3906, grad_fn=<DivBackward0>)
200 tensor(415321.2188, grad_fn=<AddBackward0>) tensor(69220.2031, grad_fn=<DivBackward0>)
300 tensor(288772.8438, grad_fn=<AddBackward0>) tensor(48128.8086, grad_fn=<DivBackward0>)
400 tensor(224958.0312, grad_fn=<AddBackward0>) tensor(37493.0039, grad_fn=<DivBackward0>)
500 tensor(208914.4375, grad_fn=<AddBackward0>) tensor(34819.0742, grad_fn=<DivBackward0>)
600 tensor(204181.7812, grad_fn=<AddBackward0>) tensor(34030.2969, grad_fn=<DivBackward0>)
700 tensor(196366.2812, grad_fn=<AddBackward0>) tensor(32727.7129, grad_fn=<DivBackward0>)
800 tensor(198843.4844, grad_fn=<AddBackward0>) tensor(33140.5820, grad_fn=<DivBackward0>)
900 tensor(195320.5938, grad_fn=<AddBackward0>) tensor(32553.4316, grad_fn=<DivBackward0>)
1000 tensor(195821.8906, grad_fn=<AddBackward0>) tensor(32636.9824, grad_fn=<DivBackward0

### 테스트셋 기반 Evalution

#### model.eval()과 torch.no_grad()

- model.eval()
    - evalution과정에서 사용하지 않음 layer들을 수행하지 않음
    - 학습할 때만 필요한 Dropout, BatchNorm layer등을 수행하지 않음

- torch.no_grad()
    - gradient 계산을 수행하지 않음

In [None]:
y_pred_list = list() # 예측값 저장을 하기위해 list 초기화
x_test_batch_list = X_test_tensor.split(minibatch_size, dim=0) # 테스트 데이터를 미니배치로 분할해줍니다.
model.eval() # 모델을 평가(evaluation) 모드로 전환합니다.
with torch.no_grad(): # 해당 with 블록 안에서는 기울기(gradient)를 계산하지 않습니다. 예측만 할 때는 기울기가 필요없기 때문에
    for x_minibatch in x_test_batch_list:
        y_minibatch_pred = model(x_minibatch)
        # 텐서의 차원 중 크기가 1인 차원을 제거합니다 -> 현재 계산 그래프에서 이 텐서를 분리합니다. -> Pytorch 텐서를 리스트로 변환
        y_pred_list.extend(y_minibatch_pred.squeeze().detach().tolist()) 

submission = pd.DataFrame({'Id':df_test['Id'], 'SalePrice':y_pred_list})
submission.to_csv('submission.csv', index=False)
