In [7]:
!kaggle competitions download - c house-prices-advanced-regression-techniques
!unzip house-prices-advanced-regression-techniques.zip - d dataset


house-prices-advanced-regression-techniques.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  house-prices-advanced-regression-techniques.zip
  inflating: dataset/data_description.txt  
  inflating: dataset/sample_submission.csv  
  inflating: dataset/test.csv        
  inflating: dataset/train.csv       


In [138]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import Dataset, DataLoader, BatchSampler, random_split, WeightedRandomSampler, Subset
import pandas as pd
import os
import re
from collections import Counter
from cosine_annealing_warmup import CosineAnnealingWarmupRestarts
import pickle
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
import gc
from tqdm import tqdm

In [13]:
current_path = os.getcwd()
train_path = os.path.join(current_path, "./dataset/train.csv")
test_path = os.path.join(current_path, "./dataset/test.csv")
submission_path = os.path.join(current_path, "./dataset/sample_submission.csv")


In [14]:
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
submisson_df = pd.read_csv(submission_path)
train_df.head(5)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [30]:
from enum import Enum


class Filter_type(Enum):
    HIGH = 0
    MID = 1
    LOW = 2


def transform_df2input(df, mode: Filter_type, nor_mean=0, nor_std=1):
    if mode == Filter_type.HIGH:
        feature_list = ['LotFrontage', 'OverallQual', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
                        'BsmtFinSF1', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea',
                        'FullBath', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars',
                        'WoodDeckSF', 'OpenPorchSF', 'SalePrice']
    elif mode == Filter_type.MID:
        feature_list = ['OverallQual', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'TotalBsmtSF',
                        '1stFlrSF', 'GrLivArea', 'FullBath', 'TotRmsAbvGrd', 'Fireplaces',
                        'GarageYrBlt', 'GarageCars', 'SalePrice']

    elif mode == Filter_type.LOW:
        feature_list = ['OverallQual', 'TotalBsmtSF', 'GrLivArea', 'GarageCars',
                        'SalePrice']

    df_selected = df[feature_list]]
    print(len(df_selected))
    print(len(df_selected))
    numpy_data= df_selected.values

    mean= np.nanmean(numpy_data, axis=0)
    std= np.nanstd(numpy_data, axis=0)

    # 정규화 수행
    normalized_array= (((numpy_data - mean) / (std) * nor_std)) + nor_mean
    normalized_array= np.nan_to_num(normalized_array)
    return numpy_data, normalized_array


data, nor_data= transform_df2input(train_df, Filter_type.HIGH)
nor_data.shape


1460
1460


(1460, 18)

In [106]:
class Dataset_type(Enum):
    TRAIN = 0
    TEST = 1


class House_dataset(Dataset):
    def __init__(self, df_path, dataset_type: Dataset_type, mode: Filter_type, nor_mean=0, nor_std=1):
        self.df_path = df_path
        self.df = pd.read_csv(df_path)
        self.nor_mean = nor_mean
        self.nor_std = nor_std
        self.dataset_type = dataset_type
        self.mode = mode
        self.input = self._transform_df2input(
            self.df, self.mode, self.nor_mean, self.nor_std)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        src = self.input[0][idx]
        if self.dataset_type == Dataset_type.TRAIN:
            trg = self.input[1][idx]
            return src, trg
        else:
            return src
        return

    def _transform_df2input(self, df, mode: Filter_type, dataset_type: Dataset_type, nor_mean=0, nor_std=1):
        if mode == Filter_type.HIGH:
            feature_list = ['LotFrontage', 'OverallQual', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
                            'BsmtFinSF1', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea',
                            'FullBath', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars',
                            'WoodDeckSF', 'OpenPorchSF']
        elif mode == Filter_type.MID:
            feature_list = ['OverallQual', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'TotalBsmtSF',
                            '1stFlrSF', 'GrLivArea', 'FullBath', 'TotRmsAbvGrd', 'Fireplaces',
                            'GarageYrBlt', 'GarageCars']

        elif mode == Filter_type.LOW:
            feature_list = ['OverallQual',
                            'TotalBsmtSF', 'GrLivArea', 'GarageCars']

        df_selected = df[feature_list]
        if self.dataset_type == Dataset_type.TRAIN:
            df_label = df["SalePrice"]
            numpy_label = df_label.values
        numpy_data = df_selected.values

        mean = np.nanmean(numpy_data, axis=0)
        std = np.nanstd(numpy_data, axis=0)

        # 정규화 수행
        normalized_array = (((numpy_data - mean) / (std) * nor_std)) + nor_mean
        normalized_array = np.nan_to_num(normalized_array)
        if self.dataset_type == Dataset_type.TRAIN:
            return (normalized_array, numpy_label)
        return normalized_array


In [107]:
train_set = House_dataset(
    df_path=train_path, dataset_type=Dataset_type.TRAIN, mode=Filter_type.HIGH, nor_mean=0.5)

In [108]:
train_set.input[1][0]


208500

In [109]:
train_set[0]


(array([0.79196567, 1.65147924, 2.05099379, 1.87866809, 1.51001534,
        1.57542484, 0.54069746, 0.20656621, 2.16185159, 1.37033344,
        1.78974052, 1.91220977, 0.04877351, 1.99242589, 1.31172464,
        0.24782416, 1.21650316]),
 208500)

In [115]:
# device = torch.device("mps")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")


Using device: cpu


In [164]:
class Linear(nn.Module):
    def __init__(self, input_dim):
        super(Linear, self).__init__()
        self.input_dim = input_dim
        print(input_dim)
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, 1)

    def forward(self, x):
        x = nn.ReLU()(self.fc1(x))
        x = nn.ReLU()(self.fc2(x))
        x = nn.ReLU()(self.fc3(x))
        x = self.fc4(x)
        return x

In [186]:
def train(dataset, num_of_epoch, valid_rate, batch_size,
          optimizer, scheduler, criterion, valid_random_seed, patience=3):
    torch.manual_seed(valid_random_seed)

    optimizer = optimizer
    criterion = criterion

    train_set_count = int(len(dataset) * valid_rate)
    val_set_count = len(dataset) - train_set_count

    train_set, val_set = random_split(
        dataset, [train_set_count, val_set_count])

    train_loader = DataLoader(train_set, batch_size=batch_size,
                              shuffle=True)
    val_loader = DataLoader(val_set, batch_size=batch_size,
                            shuffle=True)
    scheduler = scheduler

    train_losses = []
    val_losses = []
    lrs = []
    best_val_loss = 0

    for epoch in tqdm(range(num_of_epoch)):
        model.train()
        correct_train = 0
        for batch_idx, (data, target) in tqdm(enumerate(train_loader)):
            data = data.to(device)
            optimizer.zero_grad()
            # print(data.shape)
            # print(data.dtype)
            data = data.type(torch.float32)
            output = model(data)
            loss = torch.sqrt(criterion(output, target))
            loss.backward()
            optimizer.step()
            lrs.append(optimizer.param_groups[0]["lr"])
            scheduler.step()
            _, pred = torch.max(output.data, 1)
            preds.extend(pred.tolist())
            targets.extend(target.tolist())

            if batch_idx % 100 == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}, lrs: {}'.format(
                    epoch + 1, batch_idx *
                    len(data[0]), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader), loss.item(), lrs[-1]))

        train_losses.append(loss.item())
        print(correct_train)
        print(
            f"Epoch: {epoch + 1} - train loss: {train_losses[-1]}")

        gc.collect()

        model.eval()
        preds = []
        targets = []
        correct_val = 0
        print(len(val_set))

        with torch.no_grad():
            for batch_idx, (data, target) in tqdm(enumerate(val_loader)):
                data = data.to(device)
                data = data.type(torch.float32)
                output = model(data)
                loss = criterion(output, target)
                _, pred = torch.max(output.data, 1)
                preds.extend(pred.tolist())
                targets.extend(target.tolist())
                # gc.collect()

            val_loss = loss.item()
            val_losses.append(val_loss)

            if val_loss < best_val_loss:
                current_patience = patience
                best_val_loss = val_loss
                best_model = model.state_dict()
                torch.save(best_model, 'model.pt')
            else:
                current_patience -= 1
                if current_patience < 0:
                    print("Early Stopping!")
                    is_early_stopping = True

            print('Epoch {} finished: train loss = {}, val loss = {}'.format(epoch + 1,
                                                                             train_losses[-1], val_losses[-1]))
            print(
                f"Epoch: {epoch + 1} - Validation loss: {val_loss}")

            if is_early_stopping:
                break

In [187]:
model = Linear(input_dim=17)


17


In [None]:
model.load_state_dict(torch.load('model.pt'))


In [206]:
valid_rate = 0.9
batch_size = 64
learning_rate = 0.0005
num_of_epoch = 15
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
scheduler = CosineAnnealingWarmupRestarts(
    optimizer, first_cycle_steps=4000, cycle_mult=1.0, max_lr=0.001, min_lr=0.0001, warmup_steps=500, gamma=1)

criterion = nn.MSELoss()


valid_random_seed = 5
train(dataset=train_set, valid_rate=valid_rate, num_of_epoch=num_of_epoch, batch_size=64, optimizer=optimizer,
      scheduler=scheduler, criterion=criterion, valid_random_seed=valid_random_seed)

0it [00:00, ?it/s]/15 [00:00<?, ?it/s]
  0%|          | 0/15 [00:00<?, ?it/s]


RuntimeError: Found dtype Long but expected Float