In [2]:
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
from sklearn.preprocessing import StandardScaler

In [5]:
df_train = pd.read_csv("data/house-prices-advanced-regression-techniques/train.csv")
df_test = pd.read_csv("data/house-prices-advanced-regression-techniques/test.csv")

In [7]:
print(df_train.shape, df_test.shape)

(1460, 81) (1459, 80)


In [8]:
df_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [11]:
df_test.head() # 예측 해야 하는 target은 SalePrice

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [19]:
df_train.corr(numeric_only=True)['SalePrice']

Id              -0.021917
MSSubClass      -0.084284
LotFrontage      0.351799
LotArea          0.263843
OverallQual      0.790982
OverallCond     -0.077856
YearBuilt        0.522897
YearRemodAdd     0.507101
MasVnrArea       0.477493
BsmtFinSF1       0.386420
BsmtFinSF2      -0.011378
BsmtUnfSF        0.214479
TotalBsmtSF      0.613581
1stFlrSF         0.605852
2ndFlrSF         0.319334
LowQualFinSF    -0.025606
GrLivArea        0.708624
BsmtFullBath     0.227122
BsmtHalfBath    -0.016844
FullBath         0.560664
HalfBath         0.284108
BedroomAbvGr     0.168213
KitchenAbvGr    -0.135907
TotRmsAbvGrd     0.533723
Fireplaces       0.466929
GarageYrBlt      0.486362
GarageCars       0.640409
GarageArea       0.623431
WoodDeckSF       0.324413
OpenPorchSF      0.315856
EnclosedPorch   -0.128578
3SsnPorch        0.044584
ScreenPorch      0.111447
PoolArea         0.092404
MiscVal         -0.021190
MoSold           0.046432
YrSold          -0.028923
SalePrice        1.000000
Name: SalePr

In [20]:
# 상관관계가 높은 것만 추출
df_train.corr(numeric_only=True)['SalePrice'][(df_train.corr(numeric_only=True)['SalePrice'] > 0.5) | (df_train.corr(numeric_only=True)['SalePrice'] < -0.5)]

OverallQual     0.790982
YearBuilt       0.522897
YearRemodAdd    0.507101
TotalBsmtSF     0.613581
1stFlrSF        0.605852
GrLivArea       0.708624
FullBath        0.560664
TotRmsAbvGrd    0.533723
GarageCars      0.640409
GarageArea      0.623431
SalePrice       1.000000
Name: SalePrice, dtype: float64

In [21]:
feature_list = df_train.corr(numeric_only=True)['SalePrice'][(df_train.corr(numeric_only=True)['SalePrice'] > 0.5) | (df_train.corr(numeric_only=True)['SalePrice'] < -0.5)].index

In [22]:
feature_list

Index(['OverallQual', 'YearBuilt', 'YearRemodAdd', 'TotalBsmtSF', '1stFlrSF',
       'GrLivArea', 'FullBath', 'TotRmsAbvGrd', 'GarageCars', 'GarageArea',
       'SalePrice'],
      dtype='object')

In [23]:
# feature_list에 결측치 확인
df_train[feature_list].isnull().sum()

OverallQual     0
YearBuilt       0
YearRemodAdd    0
TotalBsmtSF     0
1stFlrSF        0
GrLivArea       0
FullBath        0
TotRmsAbvGrd    0
GarageCars      0
GarageArea      0
SalePrice       0
dtype: int64

In [27]:
# SalePrice는 target이기 때문에 빼주는 것이 좋다.
features = list(feature_list[:-1])
features

['OverallQual',
 'YearBuilt',
 'YearRemodAdd',
 'TotalBsmtSF',
 '1stFlrSF',
 'GrLivArea',
 'FullBath',
 'TotRmsAbvGrd',
 'GarageCars',
 'GarageArea']

In [29]:
x_train = df_train[features]
y_train = df_train[['SalePrice']].values
x_test = df_test[features]

In [32]:
std_scaler = StandardScaler()
std_scaler.fit(x_train)
x_train_tensor = torch.from_numpy(std_scaler.transform(x_train)).float() # 표준화
x_test_tensor = torch.from_numpy(std_scaler.transform(x_test)).float()
y_train_tensor = torch.from_numpy(y_train).float() # target은 스케일링 필요 x 

In [35]:
print(x_train_tensor.shape, x_test_tensor.shape, y_train_tensor.shape)

torch.Size([1460, 10]) torch.Size([1459, 10]) torch.Size([1460, 1])


In [36]:
nb_epochs = 10000
minibatch_size = 256

In [45]:
class FunModel(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()

        self.linear_layers = nn.Sequential(
            nn.Linear(input_dim, 20),
            nn.LeakyReLU(),
            nn.Linear(20,10),
            nn.LeakyReLU(),
            nn.Linear(10,8),
            nn.LeakyReLU(),
            nn.Linear(8,6),
            nn.LeakyReLU(),
            nn.Linear(6, output_dim)
        )

    def forward(self, x):
        y=self.linear_layers(x)
        return y

In [46]:
input_dim = x_train_tensor.size(-1)
output_dim = y_train_tensor.size(-1)
print(input_dim, output_dim)

10 1


In [49]:
model = FunModel(input_dim, output_dim)
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters())

In [50]:
indices = torch.randperm(x_train_tensor.size(0))
print(indices)

tensor([ 998,  370,   30,  ..., 1019,  192,  505])


In [51]:
x_batch_list = torch.index_select(x_train_tensor, 0, index = indices) # 섞고
y_batch_list = torch.index_select(y_train_tensor, 0, index = indices)
x_batch_list = x_batch_list.split(minibatch_size, 0) # 배치크기로 나누기
y_batch_list = y_batch_list.split(minibatch_size, 0)

In [54]:
for index in range(nb_epochs):
    indices = torch.randperm(x_train_tensor.size(0))

    x_batch_list = torch.index_select(x_train_tensor, 0, index = indices) # 섞고
    y_batch_list = torch.index_select(y_train_tensor, 0, index = indices)
    x_batch_list = x_batch_list.split(minibatch_size, 0) # 배치크기로 나누기
    y_batch_list = y_batch_list.split(minibatch_size, 0)

    epoch_loss = list()
    for x_minibatch, y_minibatch in zip(x_batch_list, y_batch_list):
        y_minibatch_pred = model(x_minibatch)

        loss = torch.sqrt(loss_function(y_minibatch_pred, y_minibatch)) # RMSE
        epoch_loss.append(loss)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if (index % 100) == 0:
        print(index, sum(epoch_loss) / len(epoch_loss))

0 tensor(197368.8750, grad_fn=<DivBackward0>)
100 tensor(121533.3984, grad_fn=<DivBackward0>)
200 tensor(69889.4922, grad_fn=<DivBackward0>)
300 tensor(47638.3281, grad_fn=<DivBackward0>)
400 tensor(37836.2656, grad_fn=<DivBackward0>)
500 tensor(35801.9297, grad_fn=<DivBackward0>)
600 tensor(35319.0938, grad_fn=<DivBackward0>)
700 tensor(35244.0938, grad_fn=<DivBackward0>)
800 tensor(33905.2148, grad_fn=<DivBackward0>)
900 tensor(34718.4492, grad_fn=<DivBackward0>)
1000 tensor(34330.2148, grad_fn=<DivBackward0>)
1100 tensor(33970.7539, grad_fn=<DivBackward0>)
1200 tensor(34486.0742, grad_fn=<DivBackward0>)
1300 tensor(34945.1289, grad_fn=<DivBackward0>)
1400 tensor(34929.9023, grad_fn=<DivBackward0>)
1500 tensor(33678.0078, grad_fn=<DivBackward0>)
1600 tensor(33488.7031, grad_fn=<DivBackward0>)
1700 tensor(34495.3828, grad_fn=<DivBackward0>)
1800 tensor(33593.6523, grad_fn=<DivBackward0>)
1900 tensor(33849.4648, grad_fn=<DivBackward0>)
2000 tensor(33545.8086, grad_fn=<DivBackward0>)
21

In [55]:
# model.eval() 
# evaluation과정에서 사용하지 않을 layer들을 수행하지 않음
# 학습할 때만 필요한 dropout, batchnorm layer 등을 수행하지 않는다.

# torch.no_grad()   -> 역전파 쓰지 말아
# gradient 계산을 수행하지 않는다.
# 학습을 mini batch size 를 기반으로 하였다면, 예측도 mini batch size 단위로 진행해야 한다.

# 위의 두 개를 쓰지 않아야 test 데이터를 예측할 때 쓸 수 있어지는 것

In [57]:
y_pred_list = list()
x_test_batch_list = x_test_tensor.split(minibatch_size, 0)
model.eval() # 평가에서는 이거 해야함
with torch.no_grad(): # 평가에서는 이거 해야
    for x_minibatch in x_test_batch_list:
        y_minibatch_pred = model(x_minibatch)
        y_pred_list.extend(y_minibatch_pred.squeeze().detach().tolist()) # extend를 써야 하나의 리스트가 됨, append는 list 안에 list가 와버
submission = pd.DataFrame({"Id": df_test['Id'], 'SalePrice': y_pred_list})
submission.to_csv('submission.csv', index=False)

In [58]:
submission

Unnamed: 0,Id,SalePrice
0,1461,131188.265625
1,1462,155392.890625
2,1463,177552.437500
3,1464,179449.343750
4,1465,207572.375000
...,...,...
1454,2915,93164.843750
1455,2916,98629.125000
1456,2917,153310.328125
1457,2918,123316.710938
