In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

from sklearn.preprocessing import Imputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel, RFE
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import VarianceThreshold
import seaborn as sns
from sklearn.model_selection import train_test_split

import torch
import torchvision.transforms as transforms
from torch.autograd import Variable
from torch import nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torch import nn, optim
import torch.nn.init as init
import torch.utils.data as Data
import math


import matplotlib.pyplot as plt
import torch.multiprocessing as mp


In [2]:
mp.set_start_method('spawn')

## hyperparameters

In [3]:
batch_size = 512
#epochs = 2000
use_gpu = True
lr = 0.003
weight_decay = 10

# Batch size and learning rate is hyperparameters in deep learning
# suggest batch_size is reduced, lr is also reduced which will reduce concussion

In [4]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [5]:
X = pd.read_csv('./dataset-0510/train.csv')
X_test = pd.read_csv('./dataset-0510/test.csv')

In [6]:
y = X['total_price']
X = X.drop(columns=['building_id', 'total_price'], axis=1)

X_test = X_test.drop(columns=['building_id'], axis=1)

In [7]:
X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=0.3, random_state=42) 

### scale y

In [8]:
y_scaler = StandardScaler()
y_train = y_scaler.fit_transform(y_train.values.reshape(-1, 1))
y_eval = y_scaler.fit_transform(y_eval.values.reshape(-1, 1))

## Imputer, Scaler, Feature selection

In [9]:
# step1. Imputation transformer for completing missing values.
step1 = ('Imputer', Imputer())
# step2. MinMaxScaler
step2 = ('MinMaxScaler', MinMaxScaler())
# step3. feature selection
#step3 = ('FeatureSelection', SelectFromModel(RandomForestRegressor()))
step3 = ('FeatureSelection', VarianceThreshold())

pipeline = Pipeline(steps=[step1, step2])





In [10]:
print(X_train.shape)
X_train = pipeline.fit_transform(X_train)
X_eval = pipeline.fit_transform(X_eval)

print(X_test.shape)
X_test = pipeline.fit_transform(X_test)


(42000, 233)
(10000, 233)


In [11]:
X_train = torch.from_numpy(X_train).float().to(device)
X_eval = torch.from_numpy(X_eval).float().to(device)

y_train = torch.from_numpy(y_train).float().to(device)
y_eval = torch.from_numpy(y_eval).float().to(device)

X_test = torch.from_numpy(X_test).float().to(device)

In [12]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)

torch.Size([42000, 233])
torch.Size([10000, 233])
torch.Size([42000, 1])


In [13]:
train_dataset = Data.TensorDataset(X_train, y_train)
loader = Data.DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=8,
    pin_memory=True,
)

eval_dataset = Data.TensorDataset(X_eval, y_eval)
loader = Data.DataLoader(
    dataset=eval_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=8,
    pin_memory=True,
)

## building model

In [14]:
class DNN(nn.Module):
    def __init__(self):
        super().__init__()
        
        self.fc1 = nn.Linear(233, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, 64)
        self.fc5 = nn.Linear(64, 1)
        
        self.dropout = nn.Dropout(p=0.3)
    def forward(self, x):
        '''
        # fc1
        x = self.dropout(self.fc1(x))
        x = F.relu(x)
        # fc2
        x = self.dropout(self.fc2(x))
        x = F.relu(x)
        # fc3
        x = self.fc3(x)
        '''
        
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        x = self.fc3(x)
        x = F.relu(x)
        x = self.fc4(x)
        x = F.relu(x)
        x = self.fc5(x)
        
        return x

In [15]:
model = DNN().to(device)
criterion = nn.MSELoss()
optim = optim.Adam(model.parameters(), lr= lr)

In [16]:
'''
def train_func(model, loader, epochs=10):
    model.train()
    for e in range(epochs):
        train_loss = []
        for step, (batch_x, batch_y) in enumerate(loader):
            optim.zero_grad()
            pred = model(batch_x)
            loss = criterion(batch_y, pred)
            loss.backward()
            optim.step()

            train_loss.append(loss.item())

        print('training loss', np.array(train_loss).mean())
    return model
'''
def train_func(model, loader, accumlation_steps=512):
    model.train()
    optim.zero_grad()
    train_loss = []
    for step, (batch_x, batch_y) in enumerate(loader):
        pred = model(batch_x)
        loss = criterion(pred, batch_y)
        
        train_loss.append(loss.item())
        
        loss = loss / accumlation_steps
        loss.backward()
        
        if step % accumlation_steps == 0 or step == len(loader)-1:
            optim.step()
            optim.zero_grad()
            

        

    print('training loss', np.array(train_loss).mean())
    return model, np.array(train_loss).mean()

def eval_func(model, loader):
    model.eval()
    with torch.no_grad():
        for step, (batch_x, batch_y) in enumerate(loader):
            pred = model(batch_x)
            loss = criterion(pred, batch_y)
        print('testing loss', loss.item())
    return loss

def test_func(model, X, y_scaler=None):
    model.eval()
    
    with torch.no_grad():
        pred = model(X)
        pred = pred.cpu().numpy()
        
        if y_scaler != None:
            pred = y_scaler.inverse_transform(pred)
    return pred
    

In [None]:
train_losses = []
eval_losses = []
for t in range(1000):
    print('epochs', t)
    model, train_loss = train_func(model, train_dataset)
    if t % 10 == 0:
        eval_loss = eval_func(model, eval_dataset)
        eval_losses.append(eval_loss)
    
    train_losses.append(train_loss)
    

epochs 0
training loss 0.9691696249603504
testing loss 0.05750732868909836
epochs 1
training loss 0.9445983992329468
epochs 2
training loss 0.925549780493229
epochs 3
training loss 0.8930279550233519
epochs 4
training loss 0.9243453584431062
epochs 5
training loss 0.924835166447408
epochs 6
training loss 0.9623051986789349
epochs 7
training loss 0.9648746474474241
epochs 8
training loss 0.9952834410190204
epochs 9
training loss 0.9788775052847041
epochs 10
training loss 0.9667531290702371
testing loss 0.0017082947306334972
epochs 11
training loss 0.9584003555746065
epochs 12
training loss 0.9517859455053481
epochs 13
training loss 0.9467596439074009
epochs 14
training loss 0.9417647072705201
epochs 15
training loss 0.9381951432059862
epochs 16
training loss 0.9337698189878629
epochs 17
training loss 0.9286390800642413
epochs 18
training loss 0.925190449735734
epochs 19
training loss 0.9219114378885979
epochs 20
training loss 0.9213244702248279
testing loss 0.06824004650115967
epochs 21

training loss 0.086657491745641
epochs 175
training loss 0.09254419220840088
epochs 176
training loss 0.170582867488397
epochs 177
training loss 0.11750249858899722
epochs 178
training loss 0.10883479856452526
epochs 179
training loss 0.08485781720875024
epochs 180
training loss 0.08190622857119936
testing loss 0.18690411746501923
epochs 181
training loss 0.08225773635344648
epochs 182
training loss 0.07775954097657405
epochs 183
training loss 0.07658290211617917
epochs 184
training loss 0.07295039864943122
epochs 185
training loss 0.07608282184333208
epochs 186
training loss 0.07379461214630247
epochs 187
training loss 0.0800546998100686
epochs 188
training loss 0.09762246959209057
epochs 189
training loss 0.13728513648628515
epochs 190
training loss 0.3319001826521456
testing loss 0.01434732973575592
epochs 191
training loss 0.622643650427906
epochs 192
training loss 0.3017065221349694
epochs 193
training loss 0.29055997468611644
epochs 194
training loss 0.5447449525384223
epochs 195

training loss 0.09676986021082187
epochs 346
training loss 0.0875282865406394
epochs 347
training loss 0.07639625922718377
epochs 348
training loss 0.07002329106662442
epochs 349
training loss 0.056819988870765496
epochs 350
training loss 0.05127939184788046
testing loss 2.5077290534973145
epochs 351
training loss 0.04826802995179312
epochs 352
training loss 0.047467191730597165
epochs 353
training loss 0.046767113007701616
epochs 354
training loss 0.04531258500384042
epochs 355
training loss 0.046314240592022785
epochs 356
training loss 0.04616946855449495
epochs 357
training loss 0.043408070217087175
epochs 358
training loss 0.043343638871790276
epochs 359
training loss 0.044908844924772186
epochs 360
training loss 0.048258705703517694
testing loss 3.098332166671753
epochs 361
training loss 0.045819563589476954
epochs 362
training loss 0.05750326098192476
epochs 363
training loss 0.052038991002732654
epochs 364
training loss 0.053256826494888
epochs 365
training loss 0.04876560341280

training loss 0.05527164157347013
epochs 517
training loss 0.056403911892654465
epochs 518
training loss 0.05387905652364679
epochs 519
training loss 0.052914594160761136
epochs 520
training loss 0.051731708010525325
testing loss 0.9562615156173706
epochs 521
training loss 0.047597055588165106
epochs 522
training loss 0.05196041768621627
epochs 523
training loss 0.04758243716200472
epochs 524
training loss 0.05331874806764512
epochs 525
training loss 0.055936218432347494
epochs 526
training loss 0.04707012173814088
epochs 527
training loss 0.04735262056470417
epochs 528
training loss 0.05114605835989109
epochs 529
training loss 0.05272050921150015
epochs 530
training loss 0.05412985599366539
testing loss 0.9155073761940002
epochs 531
training loss 0.05000700046132539
epochs 532
training loss 0.05262995184772195
epochs 533
training loss 0.05692684825573909
epochs 534
training loss 0.08075440649683222
epochs 535
training loss 0.08442632658854517
epochs 536
training loss 0.128121784973290

training loss 0.032173010120541426
epochs 687
training loss 0.02846058082282681
epochs 688
training loss 0.026796230902344693
epochs 689
training loss 0.02638486042585817
epochs 690
training loss 0.026339164890045865
testing loss 0.7220249176025391
epochs 691
training loss 0.026817112613109442
epochs 692
training loss 0.02765084723252286
epochs 693
training loss 0.032964961467487404
epochs 694
training loss 0.037671341053174566
epochs 695
training loss 0.10240081300176083
epochs 696
training loss 0.18907045144456272
epochs 697
training loss 0.09732970181512214
epochs 698
training loss 0.06913128671848272
epochs 699
training loss 0.03877151626906723
epochs 700
training loss 0.03225628725596708
testing loss 0.7404713034629822
epochs 701
training loss 0.028090608919383232
epochs 702
training loss 0.02589552243098788
epochs 703
training loss 0.025538552973438555
epochs 704
training loss 0.025240882811925033
epochs 705
training loss 0.025035035046779265
epochs 706
training loss 0.0246866287

training loss 0.03946159041365476
epochs 856
training loss 0.044418291544624566
epochs 857
training loss 0.046497405094802186
epochs 858
training loss 0.03670281540817286
epochs 859
training loss 0.0358226701807831
epochs 860
training loss 0.05680377984360572
testing loss 0.7114660143852234
epochs 861
training loss 0.035564938170928824
epochs 862
training loss 0.03501978496296914
epochs 863
training loss 0.02381335722127742
epochs 864
training loss 0.02383368888399197
epochs 865
training loss 0.020692492082104386
epochs 866
training loss 0.019676876727662173
epochs 867
training loss 0.019393122670467274
epochs 868
training loss 0.019484853317097543
epochs 869
training loss 0.019098451072231765
epochs 870
training loss 0.019811275190132836
testing loss 0.5414528846740723
epochs 871
training loss 0.020395288324707463
epochs 872
training loss 0.02542214722258125
epochs 873
training loss 0.024862191374698965
epochs 874
training loss 0.03637131346059214
epochs 875
training loss 0.0319099504

### plot data 1000

In [None]:
plt.plot(train_losses, label='Training loss')
plt.legend(frameon=False)

In [None]:
plt.plot(eval_losses, label='Validation loss')


## vaildation

In [21]:
eval_func(model, eval_dataset)

testing loss 0.006189074832946062


tensor(0.0062, device='cuda:0')

### Submission

In [22]:
pred = test_func(model, X_test, y_scaler)
pred

array([[12178506. ],
       [ 1966244.4],
       [13700975. ],
       ...,
       [ 1966244.4],
       [ 5267488.5],
       [ 4375748.5]], dtype=float32)

In [23]:

submission = pd.read_csv('./dataset-0510/submit_test.csv')
submission['total_price'] = pred
submission.to_csv('submission/DNN2_result.csv', index=False)

### test result

Batch size use 128 or 32 , learning rate use 0.003 which find loss will stock in 0.6

Result 1 DNN 233->256->128->1, lr=0.001, batch_size=128, predict score : 13
change: 
- replacing Standard to MinMax 
- adding DropOut 0.3 layer
- batch size change to 512