In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

from sklearn.preprocessing import Imputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel, RFE
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import VarianceThreshold
import seaborn as sns
from sklearn.model_selection import train_test_split

import torch
import torchvision.transforms as transforms
from torch.autograd import Variable
from torch import nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torch import nn, optim
import torch.nn.init as init
import torch.utils.data as Data
import math


import matplotlib.pyplot as plt
import torch.multiprocessing as mp


In [2]:
mp.set_start_method('spawn')

## hyperparameters

In [3]:
batch_size = 256
#epochs = 2000
use_gpu = True
lr = 0.001
weight_decay = 10

# Batch size and learning rate is hyperparameters in deep learning
# suggest batch_size is reduced, lr is also reduced which will reduce concussion

In [4]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [5]:
X = pd.read_csv('./dataset-0510/train.csv')
X_test = pd.read_csv('./dataset-0510/test.csv')

In [6]:
y = X['total_price']
X = X.drop(columns=['building_id', 'total_price'], axis=1)

X_test = X_test.drop(columns=['building_id'], axis=1)

In [7]:
X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=0.3, random_state=42) 

### scale y

In [8]:
y_scaler = StandardScaler()
y_train = y_scaler.fit_transform(y_train.values.reshape(-1, 1))
y_eval = y_scaler.fit_transform(y_eval.values.reshape(-1, 1))

## Imputer, Scaler, Feature selection

In [9]:
# step1. Imputation transformer for completing missing values.
step1 = ('Imputer', Imputer())
# step2. MinMaxScaler
step2 = ('MinMaxScaler', MinMaxScaler())
# step3. feature selection
#step3 = ('FeatureSelection', SelectFromModel(RandomForestRegressor()))
step3 = ('FeatureSelection', VarianceThreshold())

pipeline = Pipeline(steps=[step1, step2])





In [10]:
print(X_train.shape)
X_train = pipeline.fit_transform(X_train)
X_eval = pipeline.fit_transform(X_eval)

print(X_test.shape)
X_test = pipeline.fit_transform(X_test)


(42000, 233)
(10000, 233)


In [11]:
X_train = torch.from_numpy(X_train).float().to(device)
X_eval = torch.from_numpy(X_eval).float().to(device)

y_train = torch.from_numpy(y_train).float().to(device)
y_eval = torch.from_numpy(y_eval).float().to(device)

X_test = torch.from_numpy(X_test).float().to(device)

In [12]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)

torch.Size([42000, 233])
torch.Size([10000, 233])
torch.Size([42000, 1])


In [13]:
train_dataset = Data.TensorDataset(X_train, y_train)
loader = Data.DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=1,
)

eval_dataset = Data.TensorDataset(X_eval, y_eval)
loader = Data.DataLoader(
    dataset=eval_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=1,
)

## building model

In [14]:
class DNN(nn.Module):
    def __init__(self):
        super().__init__()
        
        self.fc1 = nn.Linear(233, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 1)
        
        self.dropout = nn.Dropout(p=0.5)
    def forward(self, x):
        '''
        # fc1
        x = self.dropout(self.fc1(x))
        x = F.relu(x)
        # fc2
        x = self.dropout(self.fc2(x))
        x = F.relu(x)
        # fc3
        x = self.fc3(x)
        '''
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        
        return x

In [15]:
model = DNN().to(device)
criterion = nn.MSELoss()
optim = optim.Adam(model.parameters(), lr= lr)

In [25]:
def train_func(model, loader, epochs=10):
    model.train()
    for e in range(epochs):
        train_loss = []
        for step, (batch_x, batch_y) in enumerate(loader):
            optim.zero_grad()
            pred = model(batch_x)
            loss = criterion(batch_y, pred)
            loss.backward()
            optim.step()

            train_loss.append(loss.item())

        print('training loss', np.array(train_loss).mean())
    return model

def eval_func(model, loader):
    model.eval()
    with torch.no_grad():
        for step, (batch_x, batch_y) in enumerate(loader):
            pred = model(batch_x)
            loss = criterion(batch_y, pred)
        print('testing loss', loss.item())

def test_func(model, X, y_scaler=None):
    model.eval()
    
    with torch.no_grad():
        pred = model(X)
        pred = pred.cpu().numpy()
        
        if y_scaler != None:
            pred = y_scaler.inverse_transform(pred)
    return pred
    

In [17]:
for t in range(10):
    print('epochs', t)
    model = train_func(model, train_dataset)
    #eval_func(model, eval_dataset)

epochs 0
training loss 0.9501436736957303
training loss 0.9233199489581488
training loss 0.9064398968114064
training loss 0.8836419139281173
training loss 0.8366680365052468
training loss 0.771222911463093
training loss 0.75759209820114
training loss 0.736790798985828
training loss 0.7432348111860205
training loss 0.7207435961083116
epochs 1
training loss 0.6984301521624928
training loss 0.6777804596303697
training loss 0.6726904134093714
training loss 0.6803147826298908
training loss 0.6530774530291549
training loss 0.6554120853077692
training loss 0.6363395146880658
training loss 0.6279896232595754
training loss 0.6197491216432028
training loss 0.6125582458116302
epochs 2
training loss 0.6098270424652111
training loss 0.5935704218102241
training loss 0.5812477491851915
training loss 0.5897203585360667
training loss 0.5921173570331565
training loss 0.5740297626913574
training loss 0.5815515164149602
training loss 0.5680268438226549
training loss 0.5636804933662125
training loss 0.5589

## vaildation

In [26]:
eval_func(model, eval_dataset)

NameError: name 'e' is not defined

### Submission

In [20]:
pred = test_func(model, X_test, y_scaler)

In [21]:

submission = pd.read_csv('./dataset-0510/submit_test.csv')
submission['total_price'] = pred
submission.to_csv('submission/DNN2_result.csv', index=False)

### test result

Batch size use 128 or 32 , learning rate use 0.003 which find loss will stock in 0.6