In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset, SubsetRandomSampler
import torch.nn.functional as F
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt


In [2]:
df = pd.read_csv('./data/reg.csv', index_col=[0])

In [3]:
X = df.drop("Price",axis=1).to_numpy()
Y = df["Price"].to_numpy().reshape((-1,1))

In [4]:
class TensorData(Dataset):
    def __init__(self, x_data, y_data) -> None:
        self.x_data = torch.FloatTensor(x_data)
        self.y_data = torch.FloatTensor(y_data)
        self.len = self.y_data.shape[0]

    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        return self.len


X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.7)
trainset = TensorData(X_train, Y_train)
testset = TensorData(X_test, Y_test)
testloader = torch.utils.data.DataLoader(testset, batch_size=32, shuffle=False)


In [5]:
class Regressor(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(13,50,bias=True)
        self.fc2 = nn.Linear(50,30,bias=True)
        self.fc3 = nn.Linear(30,1,bias=True)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.dropout(F.relu(self.fc2(x)))
        x = F.relu(self.fc3(x))

        return x

In [6]:
kfold = KFold(n_splits=3, shuffle=True)
criterion = nn.MSELoss()


In [7]:
def evaluation(dataloader):
    
    predictions = torch.tensor([], dtype=torch.float)
    actual = torch.tensor([], dtype=torch.float)
    with torch.no_grad():
        model.eval()
        for data in dataloader:
            inputs, values = data
            outputs = model(inputs)
            predictions = torch.cat((predictions, outputs), 0)
            actual = torch.cat((actual, values), 0)

    predictions = predictions.numpy()
    actual = actual.numpy()
    rmse = np.sqrt(mean_squared_error(predictions, actual))
    model.train()
    return rmse

In [16]:
validation_loss = []
for fold, (train_idx, val_idx) in enumerate(kfold.split(trainset)):
    train_subsampler = SubsetRandomSampler(train_idx)
    val_subsampler = SubsetRandomSampler(val_idx)
    trainloader = DataLoader(trainset, batch_size=32, sampler=train_subsampler)
    valloader = DataLoader(trainset, batch_size=32, sampler=val_subsampler)

    model = Regressor()
    optimizer = optim.Adam(model.parameters(), lr = 0.001, weight_decay=1e-7)

    for epoch in range(401):
        for data in trainloader:
            inputs, values = data
            optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs, values)
            loss.backward()
            optimizer.step()

    train_rmse = evaluation(trainloader)
    val_rmse = evaluation(valloader)
    print("kfold",fold, "train Loss:%.4f, validation loss: %.4f"%(train_rmse, val_rmse))
    validation_loss.append(val_rmse)


kfold 0 train Loss:0.0763, validation loss: 0.1172
kfold 1 train Loss:0.4664, validation loss: 0.4031
kfold 2 train Loss:0.0757, validation loss: 0.0962


In [15]:
validation_loss = np.array(validation_loss)
mean = np.mean(validation_loss)
std = np.std(validation_loss)
print("vlidation score: %.4f, +- %.4f"%(mean, std))

vlidation score: 0.1095, +- 0.0133


In [19]:
trainloader = torch.utils.data.DataLoader(trainset, batch_size=32, shuffle=False)
train_rmse = evaluation(trainloader) # 학습 데이터의 RMSE
test_rmse = evaluation(testloader) # 시험 데이터의 RMSE

print("Train RMSE: %.4f" %train_rmse)
print("Test RMSE: %.4f" %test_rmse)

Train RMSE: 0.0830
Test RMSE: 0.1139
