In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

from sklearn.preprocessing import Imputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel, RFE
from sklearn.preprocessing import MinMaxScaler, StandardScaler,scale, MaxAbsScaler # MaxAbs is process sparse data
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import VarianceThreshold
import seaborn as sns
from sklearn.model_selection import train_test_split

import torch
import torchvision.transforms as transforms
from torch.autograd import Variable
from torch import nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torch import nn, optim
import torch.nn.init as init
import torch.utils.data as Data
import math


import matplotlib.pyplot as plt
import torch.multiprocessing as mp
# add tuning 
from skorch import NeuralNetRegressor
from skorch.callbacks import EpochScoring


In [2]:
mp.set_start_method('spawn')

### default hyperparameters

In [3]:
batch_size = 128
use_gpu = True
y_scale = True
lr = 0.001
weight_decay = 0.0005

# Batch size and learning rate is hyperparameters in deep learning
# suggest batch_size is reduced, lr is also reduced which will reduce concussion

## using CUDA

In [4]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

## reading file

In [5]:
X = pd.read_csv('./dataset-0510/train.csv')
X_test = pd.read_csv('./dataset-0510/test.csv')

columns = X.columns

In [6]:
y = X['total_price']
X = X.drop(columns=['building_id', 'total_price'], axis=1)
X_test = X_test.drop(columns=['building_id'], axis=1)

## define nn

In [7]:
class DNN(nn.Module):
    def __init__(self):
        super().__init__()
        
        self.fc1 = nn.Linear(233, 256)
        self.bn1 = nn.BatchNorm1d(num_features=256)
        
        self.fc2 = nn.Linear(256, 512)
        self.bn2 = nn.BatchNorm1d(num_features=512)
        
        self.fc3 = nn.Linear(512, 512)
        self.bn3 = nn.BatchNorm1d(num_features=512)
        
        self.fc4 = nn.Linear(512, 256)
        self.bn4 = nn.BatchNorm1d(num_features=256)
        
        self.fc5 = nn.Linear(256, 128)
        self.bn5 = nn.BatchNorm1d(num_features=128)
        
        self.fc6 = nn.Linear(128, 64)
        self.bn6 = nn.BatchNorm1d(num_features=64)
        
        self.fc7 = nn.Linear(64, 32)
        self.bn7 = nn.BatchNorm1d(num_features=32)
        
        self.fc8 = nn.Linear(32, 1)
        
        
        self.dropout = nn.Dropout(p=0.5)
        
    def forward(self, x):
        #x = x.unsqueeze(0)
        
        x = F.relu(self.bn1(self.fc1(x)))
        x = F.relu(self.bn2(self.fc2(x)))
        x = F.relu(self.bn3(self.fc3(x)))
        x = F.relu(self.bn4(self.fc4(x)))
        x = F.relu(self.bn5(self.fc5(x)))
        x = F.relu(self.bn6(self.fc6(x)))
        x = F.relu(self.bn7(self.fc7(x)))
        x = torch.tanh(self.fc8(x))

        return x

## pipline

In [8]:
# step1. Imputation transformer for completing missing values.
step1 = ('Imputer', Imputer())
# step2. MinMaxScaler
step2 = ('MinMaxScaler', MinMaxScaler())
# step3. feature selection
#step3 = ('FeatureSelection', SelectFromModel(RandomForestRegressor()))
step3 = ('FeatureSelection', VarianceThreshold())

pipeline = Pipeline(steps=[step1, step2])



## preprocess

In [9]:

X = pipeline.fit_transform(X)

X_test = pipeline.transform(X_test)

y_scaler = MinMaxScaler(feature_range=[0, 5])
y = y_scaler.fit_transform(y.values.reshape(-1, 1))
# MaxAbs 0.00012
# MinMax 0.0008

In [10]:
print(X.shape)

print(X_test.shape)

print(y.shape)
y

(60000, 233)
(10000, 233)
(60000, 1)


array([[0.00041416],
       [0.00304174],
       [0.00918302],
       ...,
       [0.01138869],
       [0.01754978],
       [0.00814078]])

In [11]:
X = torch.from_numpy(X).float().to(device)
X_test = torch.from_numpy(X_test).float().to(device)
y = torch.from_numpy(y).float().to(device)

## define loss

In [12]:
class SqrtMSELoss(nn.Module):
    def __init__(self):
        super().__init__()
        
    def forward(self, x, y):
        return torch.sqrt(torch.mean(torch.pow((x - y), 2)))

In [13]:
auc = EpochScoring(scoring='neg_mean_squared_error', lower_is_better=False)

net = NeuralNetRegressor(
    DNN,
    max_epochs=2000,
    lr=0.001,
    # Shuffle training data on each epoch
    iterator_train__shuffle=True,
    callbacks=[auc],
    device='cuda',
    criterion= SqrtMSELoss,
)

#net.fit(X, y)
#y_proba = net.predict_proba(X)

  epoch    neg_mean_squared_error    train_loss    valid_loss     dur
-------  ------------------------  ------------  ------------  ------
      1                   [36m-0.0106[0m        [32m0.1348[0m        [35m0.0947[0m  1.6053
      2                   [36m-0.0070[0m        [32m0.0806[0m        [35m0.0718[0m  1.4657
      3                   [36m-0.0055[0m        [32m0.0633[0m        [35m0.0591[0m  1.4650
      4                   [36m-0.0049[0m        [32m0.0536[0m        [35m0.0521[0m  1.4733
      5                   [36m-0.0045[0m        [32m0.0481[0m        [35m0.0472[0m  1.4742
      6                   [36m-0.0042[0m        [32m0.0439[0m        [35m0.0442[0m  1.4756
      7                   [36m-0.0041[0m        [32m0.0411[0m        [35m0.0417[0m  1.4750
      8                   [36m-0.0040[0m        [32m0.0393[0m        [35m0.0400[0m  1.4738
      9                   [36m-0.0039[0m        [32m0.0377[0m        [35m0.0387

     95                   [36m-0.0035[0m        0.0277        [35m0.0303[0m  1.4659
     96                   -0.0035        0.0276        0.0304  1.4680
     97                   -0.0035        0.0275        0.0304  1.4686
     98                   [36m-0.0034[0m        0.0276        [35m0.0303[0m  1.4693
     99                   -0.0035        0.0276        [35m0.0303[0m  1.4685
    100                   -0.0035        0.0276        0.0303  1.4714
    101                   [36m-0.0034[0m        [32m0.0270[0m        [35m0.0302[0m  1.4673
    102                   [36m-0.0034[0m        [32m0.0269[0m        [35m0.0302[0m  1.4683
    103                   -0.0034        0.0274        [35m0.0301[0m  1.4687
    104                   -0.0034        0.0269        0.0302  1.4667
    105                   -0.0034        [32m0.0266[0m        0.0302  1.4690
    106                   [36m-0.0034[0m        0.0270        [35m0.0301[0m  1.4685
    107                  

    200                   -0.0033        0.0237        0.0289  1.4632
    201                   [36m-0.0033[0m        [32m0.0232[0m        0.0288  1.4638
    202                   -0.0033        0.0235        0.0287  1.4863
    203                   [36m-0.0033[0m        0.0232        0.0289  1.4654
    204                   [36m-0.0033[0m        0.0232        [35m0.0287[0m  1.4660
    205                   [36m-0.0033[0m        [32m0.0228[0m        [35m0.0286[0m  1.4641
    206                   -0.0033        0.0232        0.0286  1.4641
    207                   [36m-0.0033[0m        0.0230        [35m0.0285[0m  1.4662
    208                   -0.0033        [32m0.0228[0m        0.0288  1.4631
    209                   -0.0033        0.0231        0.0288  1.4677
    210                   -0.0033        0.0230        0.0288  1.4636
    211                   -0.0033        0.0228        0.0285  1.4627
    212                   -0.0033        [32m0.0224[0m     

    304                   -0.0029        0.0198        0.0274  1.4720
    305                   [36m-0.0029[0m        0.0197        0.0272  1.4767
    306                   [36m-0.0029[0m        0.0198        0.0275  1.4765
    307                   -0.0029        0.0198        0.0271  1.4763
    308                   [36m-0.0029[0m        0.0198        0.0271  1.4748
    309                   [36m-0.0029[0m        [32m0.0196[0m        [35m0.0269[0m  1.4758
    310                   -0.0029        [32m0.0194[0m        0.0273  1.4761
    311                   -0.0029        0.0196        0.0272  1.4775
    312                   [36m-0.0028[0m        0.0196        0.0273  1.4774
    313                   [36m-0.0028[0m        0.0195        0.0272  1.4717
    314                   [36m-0.0028[0m        0.0194        0.0271  1.4771
    315                   [36m-0.0028[0m        0.0195        [35m0.0268[0m  1.4776
    316                   -0.0028        0.0197     

    410                   -0.0026        0.0172        0.0259  1.4711
    411                   -0.0026        0.0174        0.0258  1.4711
    412                   -0.0026        0.0172        0.0256  1.4742
    413                   -0.0026        0.0172        0.0258  1.4744
    414                   -0.0026        [32m0.0170[0m        0.0257  1.4703
    415                   -0.0026        0.0172        0.0259  1.4707
    416                   -0.0026        0.0172        0.0258  1.4747
    417                   -0.0026        [32m0.0170[0m        0.0257  1.4729
    418                   -0.0026        0.0171        0.0257  1.4706
    419                   [36m-0.0026[0m        0.0174        0.0257  1.4726
    420                   -0.0026        0.0170        0.0259  1.4735
    421                   -0.0026        0.0171        0.0256  1.4719
    422                   -0.0026        0.0171        0.0258  1.4704
    423                   -0.0026        [32m0.0170[0m       

In [15]:
X = X.astype(np.float32)
y = y.astype(np.float32)

params = {
    'lr': [0.0015, 0.003, 0.005],
    #'max_epochs': [2000, 5000],
}
gs = GridSearchCV(net, params, refit=False, cv=3, scoring='neg_mean_squared_error')

gs.fit(X, y)
print(gs.best_score_, gs.best_params_)

AttributeError: 'Tensor' object has no attribute 'astype'

In [14]:
y_proba = net.predict_proba(X_test)
pred = y_scaler.inverse_transform(y_proba)       
print(pred.shape)

(10000, 1)


In [15]:
submission = pd.read_csv('./dataset-0510/submit_test.csv')
submission['total_price'] = pred
submission.to_csv('submission/TuningDNN_result.csv', index=False)
submission

Unnamed: 0,building_id,total_price
0,X5gsdTWGS3W7JJQB,1.910691e+07
1,BTshNOJyKHnT2YIT,2.268285e+06
2,dhdymr0lV8N5kZOT,1.123758e+07
3,VEwyGGMcD56w5BOc,1.969187e+06
4,wmUeMoJZfsqaSX9b,-1.727904e+06
5,EtBjGAHmHCe9t7TZ,2.860874e+06
6,hPNH34vmaZtvBtqc,1.397353e+07
7,wXjeI38bYDMJJwZC,1.533666e+07
8,fxZSGX6aPAFKU8W4,4.654310e+06
9,ewr0Fx6ign87OwaV,-1.102196e+06
