In [None]:
# https://dacon.io/competitions/official/235554/codeshare/634?page=1&dtype=recent&ptype=pub

In [1]:
import time
import random
import numpy as np  # 1.18.1
from numpy.random import shuffle
import pandas as pd  # 0.25.3
import torch  # 1.4.0
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset
from torch.utils.data.sampler import Sampler, SequentialSampler
from torch.backends import cudnn

In [2]:
# Reproducibility setup; see https://pytorch.org/docs/stable/notes/randomness.html
# cudnn.deterministic = True asks cuDNN to use deterministic algorithms, so the same
# input gives the same result from run to run.
# cudnn.benchmark = False switches off cuDNN's auto-tuner, which could otherwise pick
# different algorithms between runs (per the notes linked above).
# Together with the fixed seeds below, this is meant to make the whole run reproducible.

# https://hoya012.github.io/blog/reproducible_pytorch/

torch.manual_seed(71)
torch.cuda.manual_seed(71)
torch.cuda.manual_seed_all(71) # if use multi-GPU
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(71)
random.seed(71)

In [3]:
# https://pytorch.org/docs/stable/data.html
# __len__과 __iter__를 선언해야한다. class에선 __init__ 은 당연.

# https://subinium.github.io/pytorch-dataloader/ 
class ContinuousBatchSampler(Sampler):
    """Batch sampler that carries an epoch's leftover indices into the next epoch.

    Every epoch the indices produced by ``sampler`` are shuffled and cut into
    batches of ``batch_size``.  With ``drop_last=False`` the trailing partial
    batch is not emitted; it is remembered in ``from_last_epoch`` and becomes
    the head of the first batch of the next epoch, so full-size batches are
    produced across epoch boundaries.

    Args:
        sampler: iterable of dataset indices that also supports ``len()``.
        batch_size: number of indices per emitted batch.
        drop_last: when True the trailing partial batch is discarded instead
            of being carried over to the next epoch.
    """

    def __init__(self, sampler, batch_size, drop_last):
        self.sampler = sampler
        self.batch_size = batch_size
        self.drop_last = drop_last
        # Indices left over from the previous epoch (always fewer than batch_size).
        self.from_last_epoch = []

    def __iter__(self):
        carried = self.from_last_epoch
        idx_to_exclude = set(carried)

        # Fresh indices of this epoch, i.e. everything that is not carried over.
        # Sorting before shuffling makes the result depend only on the RNG state.
        idx_after_exclusion = sorted(set(self.sampler) - idx_to_exclude)
        shuffle(idx_after_exclusion)

        # First batch: carried-over indices topped up with fresh ones.
        n_fresh = self.batch_size - len(carried)
        first_batch = carried + idx_after_exclusion[:n_fresh]
        # Never emit an empty batch, and honour drop_last for a short first batch
        # (only possible when the dataset is smaller than one batch), so that the
        # emitted batches agree with __len__.
        if first_batch and (len(first_batch) == self.batch_size or not self.drop_last):
            yield first_batch

        # Remaining fresh indices plus the carried-over ones again: the carried
        # indices are deliberately part of this epoch's pool as well.
        idx_of_left = sorted(idx_after_exclusion[n_fresh:] + list(idx_to_exclude))
        shuffle(idx_of_left)
        batch = []
        for idx in idx_of_left:
            batch.append(idx)
            if len(batch) == self.batch_size:
                yield batch
                batch = []

        # Keep the trailing partial batch for the next epoch instead of emitting it.
        if not self.drop_last:
            self.from_last_epoch = batch.copy()

    def __len__(self):
        """Number of batches the next epoch will produce."""
        if self.drop_last:
            return len(self.sampler) // self.batch_size
        # The carried-over indices enlarge the pool of the upcoming epoch.
        return (len(self.sampler) + len(self.from_last_epoch)) // self.batch_size


### 결론

1. 처음 한것은 batch가 끝나고 뒤에 붙인다 -> drop_last=False 이므로
2. yield 때문인지는 몰라도 first_batch 이후 batch가 나오게 되면서, 처음 len(self.sampler) // self.batch_size 이후,
 (len(self.sampler) + len(self.from_last_epoch)) // self.batch_size 게 된다. 
-> 이부분은 설명을 잘 못하겠습니다. 느낌입니다. print 끼고 하려니까 안나와서요. 훈련과정이 눈에 안보입니다.
결국, dynamic batch 느낌입니다. 보통 batch는 2048 2048 순서적이라면
이거 읽어보려구요. https://discuss.pytorch.org/t/dataloader-for-variable-batch-size/13840

In [4]:
# 'cuda:0' selects a CUDA device; the number after the colon is the GPU index
# (0 = first GPU visible to the process).
device = torch.device('cuda:0')
num_epochs = 1000
batch_size = 2048
initial_learning_rate = 0.2
# https://pytorch.org/docs/stable/data.html
# pin_memory enables fast data transfer to CUDA-enabled GPUs.
loader_params = {'num_workers': 8, 'pin_memory': True}

# The first four CSV columns are used as targets, the remaining columns as input features.
train_data = np.array(pd.read_csv('../input/month1/data_mdc01/train.csv'), dtype=np.float32)
X_train = torch.tensor(train_data[:, 4:], dtype=torch.float32)
y_train = torch.tensor(train_data[:, :4], dtype=torch.float32)

dataset = TensorDataset(X_train, y_train)
# Batches come from ContinuousBatchSampler, so batch_size / shuffle are not passed to the loader.
train_loader = torch.utils.data.DataLoader(dataset, batch_sampler=ContinuousBatchSampler(
    sampler=SequentialSampler(range(len(dataset))), batch_size=batch_size, drop_last=False), **loader_params)

# Accumulator for the ensemble prediction (10000 rows x 4 targets).
prediction = np.zeros((10000, 4), dtype=np.float32)
logging_term = 1000  # print the running loss every `logging_term` iterations
# Approximate total number of iterations, derived from the actual dataset size
# instead of a hard-coded row count.
logging_total = int(len(dataset) * num_epochs / batch_size)

In [5]:
# https://wegonnamakeit.tistory.com/47
# Note from the referenced post: batch normalisation keeps the mean and variance used for
# normalisation stable even when the data in a batch changes, so activations stay in a
# bounded range.

# Debug switch. With an integer N every epoch stops after N batches and the run stops
# after the first model *before* inference (this is what the notebook was doing through
# hard-coded `break` statements, to inspect how the sampler behaves across epochs).
# Set to None for the full 20-model training + ensemble prediction.
debug_batches_per_epoch = 2

# Ensemble members 'Model_01' ... 'Model_20', all weighted equally.
model_names = ['Model_{:02d}'.format(i) for i in range(1, 21)]
ensemble_weight = 1.0 / len(model_names)  # 0.05 for 20 models
X_test = None  # test features, read lazily so a debug run never touches the test file

for model_no in model_names:

    # Fully connected regressor: 226 inputs -> 6 hidden layers of 768 units -> 4 outputs.
    net = nn.Sequential(
        nn.BatchNorm1d(226),
        nn.ReLU(),
        nn.Linear(226, 768),
        nn.BatchNorm1d(768),
        nn.ReLU(),
        nn.Linear(768, 768),
        nn.BatchNorm1d(768),
        nn.ReLU(),
        nn.Linear(768, 768),
        nn.BatchNorm1d(768),
        nn.ReLU(),
        nn.Linear(768, 768),
        nn.BatchNorm1d(768),
        nn.ReLU(),
        nn.Linear(768, 768),
        nn.BatchNorm1d(768),
        nn.ReLU(),
        nn.Linear(768, 768),
        nn.BatchNorm1d(768),
        nn.ReLU(),
        nn.Linear(768, 4)
    )

    model = net.to(device)
    running_loss = 0.
    running_counter = 0
    criterion = torch.nn.L1Loss()
    # SGD with momentum 0.9 (momentum accumulates past gradients to steer the update speed).
    optimizer = optim.SGD(model.parameters(), lr=initial_learning_rate, momentum=0.9)

    # https://sanghyu.tistory.com/113
    # Cosine annealing with warm restarts: the learning rate restarts every T_0=250
    # scheduler steps (epochs here, because step() is called once per epoch),
    # T_mult=1 keeps every cycle the same length, eta_min is the lowest learning rate.
    scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=250, T_mult=1, eta_min=0.005,
                                                               last_epoch=-1)

    for epoch in range(num_epochs):

        model.train()
        for batch_no, (xx, yy) in enumerate(train_loader, start=1):
            # Move the batch to the training device.
            xx, yy = xx.to(device), yy.to(device)
            # Add small Gaussian noise to the inputs; no gradient tracking is needed for this.
            with torch.no_grad():
                xx += torch.randn(xx.shape, device=device) * 0.003

            out = model(xx)
            loss = criterion(out, yy)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_counter += 1
            running_loss += loss.item()

            # Report the mean loss of the last `logging_term` iterations.
            if running_counter % logging_term == 0:
                print(model_no + ' (iter {:6d}/{:6d}) {:.4f}'.format(running_counter, logging_total,
                                                                     running_loss / logging_term))
                running_loss = 0.
            if debug_batches_per_epoch is not None and batch_no >= debug_batches_per_epoch:
                break
        # Adjust the learning rate once per epoch.
        scheduler.step()

    if debug_batches_per_epoch is not None:
        # Debug run: only the first model is (partially) trained and nothing is predicted.
        break

    model.eval()
    if X_test is None:
        # The first test column is skipped (row id); the rest are the input features.
        X_test = torch.tensor(
            np.array(pd.read_csv('../input/month1/data_mdc01/test.csv'), dtype=np.float32))[:, 1:]
    with torch.no_grad():
        output = model(X_test.to(device))
    output = np.array(output.to('cpu'), dtype=np.float32)
    # Each model contributes equally to the averaged ensemble prediction.
    prediction += output * ensemble_weight

    # Training a single model takes a very long time.

len(first_batch) : 2048
len(first_batch) : 2048
len(first_batch) : 2048


Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/multiprocessing/queues.py", line 242, in _feed
    send_bytes(obj)
  File "/opt/conda/lib/python3.7/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/opt/conda/lib/python3.7/multiprocessing/connection.py", line 404, in _send_bytes
    self._send(header + buf)
  File "/opt/conda/lib/python3.7/multiprocessing/connection.py", line 368, in _send
    n = write(self._handle, buf)
BrokenPipeError: [Errno 32] Broken pipe


len(first_batch) : 2048
len(first_batch) : 2048
len(first_batch) : 2048
len(first_batch) : 2048
len(first_batch) : 2048
len(first_batch) : 2048
len(first_batch) : 2048
len(first_batch) : 2048
len(first_batch) : 2048
len(first_batch) : 2048
len(first_batch) : 2048
len(first_batch) : 2048
len(first_batch) : 2048
len(first_batch) : 2048
len(first_batch) : 2048
len(first_batch) : 2048
len(first_batch) : 2048
len(first_batch) : 2048
len(first_batch) : 2048
len(first_batch) : 2048
len(first_batch) : 2048
len(first_batch) : 2048
len(first_batch) : 2048
len(first_batch) : 2048
len(first_batch) : 2048
len(first_batch) : 2048
len(first_batch) : 2048
len(first_batch) : 2048
len(first_batch) : 2048
len(first_batch) : 2048
len(first_batch) : 2048
len(first_batch) : 2048
len(first_batch) : 2048
len(first_batch) : 2048
len(first_batch) : 2048
len(first_batch) : 2048
len(first_batch) : 2048
len(first_batch) : 2048
len(first_batch) : 2048
len(first_batch) : 2048
len(first_batch) : 2048
len(first_batch)

Exception in thread Thread-162:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/opt/conda/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.7/site-packages/torch/utils/data/_utils/pin_memory.py", line 25, in _pin_memory_loop
    r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
  File "/opt/conda/lib/python3.7/multiprocessing/queues.py", line 113, in get
    return _ForkingPickler.loads(res)
  File "/opt/conda/lib/python3.7/site-packages/torch/multiprocessing/reductions.py", line 282, in rebuild_storage_fd
    fd = df.detach()
  File "/opt/conda/lib/python3.7/multiprocessing/resource_sharer.py", line 57, in detach
    with _resource_sharer.get_connection(self._id) as conn:
  File "/opt/conda/lib/python3.7/multiprocessing/resource_sharer.py", line 87, in get_connection
    c = Client(address, authkey=process.current_process().

KeyboardInterrupt: 

In [None]:
import math
import time
from itertools import chain
import argparse
import numpy as np
import pandas as pd
import torch 
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from torch.optim import Optimizer
from torch.optim.lr_scheduler import LambdaLR
# Select the first GPU when CUDA is available, otherwise fall back to the CPU.
# `is_available` must be *called*: the bare function object is always truthy.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [None]:
# No separate pre-processing: 10000 rows of the training data are held out for validation.
# The remaining rows form the new training frame `train1`, the held-out rows form `val`.
# The rows are shuffled with DataFrame.sample(frac=1) before splitting.
from IPython.display import display

path_train = '../input/month1/data_mdc01/train.csv'
path_test = '../input/month1/data_mdc01/test.csv'
n_val = 10000  # number of rows held out for validation
split_seed = 71  # same seed as the rest of the notebook, so the split is reproducible

# Column names: four targets followed by the 226 feature columns '0' ... '225'.
layers = ['layer_1', 'layer_2', 'layer_3', 'layer_4'] + [str(i) for i in range(226)]
display(len(layers))

train_raw = pd.read_csv(path_train)
print(train_raw.shape)
# Sampling from the untouched frame with a fixed seed keeps this cell idempotent on re-runs.
train = train_raw.sample(frac=1, random_state=split_seed)
rows, cols = train.shape

# Rebuilding the frames from `.values` resets the index and applies the `layers` names.
train1 = pd.DataFrame(data=train.iloc[:rows - n_val, :].values, columns=layers)
display(train1)

# train1.to_csv('train1.csv', index_label='id')

val = pd.DataFrame(data=train.iloc[rows - n_val:, :].values, columns=layers)
# val.to_csv('val.csv', index_label='id')
display(val)

In [None]:
# Paths of the newly created train / validation files for model training.
# (Commented out: the datasets below are built from the in-memory DataFrames.)
# train_path = 'train1.csv'
# val_path = 'val.csv'

lr = 1e-03  # learning rate passed to AdamW in the training cell
adam_epsilon = 1e-06  # eps passed to AdamW
epochs = 100  # number of passes over train_loader
batch_size = 2048  # rows per batch for the train / validation loaders
warmup_step = 2000  # warm-up steps of the learning-rate schedule
loss_fn = nn.L1Loss()  # mean absolute error, used for training and validation

# Custom dataset (subclass of torch's Dataset) built from an in-memory DataFrame.
class PandasDataset(Dataset):
    """Dataset serving (features, targets) rows of a DataFrame.

    Expected column layout: an optional leading ``id`` column (present when the
    frame was read back from a CSV written with ``index_label='id'``), then the
    four target columns, then the feature columns.

    Each item is a dict ``{'X': feature tensor, 'Y': target tensor}``.
    """

    def __init__(self, df):
        super(PandasDataset, self).__init__()
        # Drop the row-id column only when it is really there.  Blindly dropping the
        # first column would remove 'layer_1' from frames that have no id column,
        # shifting the targets by one and leaving one feature too few.
        train = df.drop(columns=['id'], errors='ignore')
        self.train_X, self.train_Y = train.iloc[:, 4:], train.iloc[:, 0:4]
        self.tmp_x, self.tmp_y = self.train_X.values, self.train_Y.values
        # Build the tensor views once instead of on every __getitem__ call.
        self._x_tensor = torch.from_numpy(self.tmp_x)
        self._y_tensor = torch.from_numpy(self.tmp_y)

    def __len__(self):
        return len(self.train_X)

    def __getitem__(self, idx):
        return {
            'X': self._x_tensor[idx],
            'Y': self._y_tensor[idx]
        }
            
# Training split wrapped as a dataset and served in batches of `batch_size` rows by
# 4 worker processes.  No `shuffle` argument is passed to the loader here.
train_dataset = PandasDataset(train1)
train_loader = DataLoader(train_dataset, batch_size=batch_size,  num_workers=4)

# Same setup for the held-out validation split.
val_dataset = PandasDataset(val)
val_loader = DataLoader(val_dataset, batch_size=batch_size,  num_workers=4) 

sigmoid = $\sigma(z) = {1\over{1+e^{-z}}}$   
tanh = $2\,\sigma(2z) - 1$  
gelu $\approx {1\over2}\, x \left(1 + \tanh\left(\sqrt{2\over\pi}\,\left(x + 0.044715\, x^3\right)\right)\right)$  


https://arxiv.org/abs/1606.08415   
https://medium.com/@shoray.goel/gelu-gaussian-error-linear-unit-4ec59fb2e47c  

Gaussian Error Linear Unit (GELUs) 라 불리고, relu나 elu에 비해 잘 작동한다고 나와있다.  
보통 computer vision 에 많이 쓰이는것같다. 

In [None]:
# activation
class GELU(nn.Module):
    """Tanh approximation of the Gaussian Error Linear Unit.

    Computes ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``
    element-wise; the output has the same shape as the input.
    """

    def forward(self, x):
        cubic_term = 0.044715 * torch.pow(x, 3)
        inner = math.sqrt(2 / math.pi) * (x + cubic_term)
        gate = 1 + torch.tanh(inner)
        return 0.5 * x * gate
    
# layer normalisation
class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension, TF style.

    The epsilon is added to the variance inside the square root.  The result
    is scaled by the learnable ``weight`` and shifted by the learnable
    ``bias``, both of size ``hidden_size``.
    """

    def __init__(self, hidden_size, eps=1e-5):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.bias = nn.Parameter(torch.zeros(hidden_size))
        self.variance_epsilon = eps

        self.init_weights()

    def init_weights(self):
        """Reset the scale to ones and the shift to zeros."""
        with torch.no_grad():
            self.weight.fill_(1.0)
            self.bias.zero_()

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        centered = x - mean
        # Biased variance (mean of squared deviations) along the last dimension.
        variance = centered.pow(2).mean(-1, keepdim=True)
        normalised = centered / torch.sqrt(variance + self.variance_epsilon)
        return self.weight * normalised + self.bias

In [None]:
# model
class skipConnectionModel(nn.Module):
    """Hourglass-shaped MLP with additive skip connections.

    Four "up" blocks widen the 226 input features to 10000 units, four "down"
    blocks narrow them to 300 units, and a final linear layer maps to the
    4 outputs.  The outputs of the first three down blocks are added to the
    up-block output of matching width, and a LayerNorm is applied to the input
    of every down block.
    """

    @staticmethod
    def _dense_block(in_features, out_features):
        # Linear -> GELU -> BatchNorm1d, the building block of both halves.
        return nn.Sequential(nn.Linear(in_features, out_features), GELU(), nn.BatchNorm1d(out_features))

    def __init__(self):
        super().__init__()

        # Layer norms, named by the width they normalise.
        self.ln = LayerNorm(10000)
        self.ln1 = LayerNorm(7000)
        self.ln2 = LayerNorm(4000)
        self.ln3 = LayerNorm(2000)

        # Widening half.
        self.upblock1 = self._dense_block(226, 2000)
        self.upblock2 = self._dense_block(2000, 4000)
        self.upblock3 = self._dense_block(4000, 7000)
        self.upblock4 = self._dense_block(7000, 10000)

        # Narrowing half.
        self.downblock1 = self._dense_block(10000, 7000)
        self.downblock2 = self._dense_block(7000, 4000)
        self.downblock3 = self._dense_block(4000, 2000)
        self.downblock4 = self._dense_block(2000, 300)

        self.fclayer = nn.Sequential(nn.Linear(300, 4))
        # Defined but not applied in forward().
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        up1 = self.upblock1(x)
        up2 = self.upblock2(up1)
        up3 = self.upblock3(up2)
        up4 = self.upblock4(up3)

        # Each down-block output is merged with the up-block output of equal width.
        merged_7000 = self.downblock1(self.ln(up4)) + up3
        merged_4000 = self.downblock2(self.ln1(merged_7000)) + up2
        merged_2000 = self.downblock3(self.ln2(merged_4000)) + up1
        narrowed = self.downblock4(self.ln3(merged_2000))

        return self.fclayer(narrowed)

In [None]:
def get_constant_schedule(optimizer, last_epoch=-1):
    """Build a LambdaLR schedule whose multiplier is 1 at every step.

    The learning rate therefore stays at the optimizer's configured value.
    """

    def lr_lambda(_step):
        return 1

    return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch)

def get_constant_schedule_with_warmup(optimizer, num_warmup_steps, last_epoch=-1):
    """Build a LambdaLR schedule with linear warm-up followed by a constant rate.

    The multiplier rises linearly from 0 to 1 over ``num_warmup_steps`` steps
    and is 1.0 afterwards.
    """

    def lr_lambda(current_step):
        warmed_up = current_step >= num_warmup_steps
        if warmed_up:
            return 1.0
        return float(current_step) / float(max(1.0, num_warmup_steps))

    return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch)

def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1):
    """Build a LambdaLR schedule with linear warm-up and linear decay.

    The multiplier rises linearly from 0 to 1 during the first
    ``num_warmup_steps`` steps, then falls linearly and reaches 0 at
    ``num_training_steps`` (it never goes below 0).
    """

    def lr_lambda(current_step):
        if current_step >= num_warmup_steps:
            steps_left = float(num_training_steps - current_step)
            decay_span = float(max(1, num_training_steps - num_warmup_steps))
            return max(0.0, steps_left / decay_span)
        return float(current_step) / float(max(1, num_warmup_steps))

    return LambdaLR(optimizer, lr_lambda, last_epoch)

def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=0.5, last_epoch=-1):
    """Build a LambdaLR schedule with linear warm-up and cosine decay.

    After rising linearly from 0 to 1 over ``num_warmup_steps`` steps, the
    multiplier follows ``0.5 * (1 + cos(pi * num_cycles * 2 * progress))``,
    where ``progress`` is the fraction of the post-warm-up steps completed.
    The value is clipped at 0.
    """

    def lr_lambda(current_step):
        if current_step < num_warmup_steps:
            return float(current_step) / float(max(1, num_warmup_steps))
        steps_done = float(current_step - num_warmup_steps)
        decay_span = float(max(1, num_training_steps - num_warmup_steps))
        progress = steps_done / decay_span
        cosine = math.cos(math.pi * float(num_cycles) * 2.0 * progress)
        return max(0.0, 0.5 * (1.0 + cosine))

    return LambdaLR(optimizer, lr_lambda, last_epoch)

def get_cosine_with_hard_restarts_schedule_with_warmup(
    optimizer, num_warmup_steps, num_training_steps, num_cycles=1.0, last_epoch=-1
):
    """Build a LambdaLR schedule with linear warm-up and cosine hard restarts.

    After rising linearly from 0 to 1 over ``num_warmup_steps`` steps, the
    multiplier runs through ``num_cycles`` cosine decays from 1 towards 0,
    jumping back to 1 at the start of each cycle.  Once the post-warm-up
    progress reaches 1 the multiplier is 0.
    """

    def lr_lambda(current_step):
        if current_step < num_warmup_steps:
            return float(current_step) / float(max(1, num_warmup_steps))
        steps_done = float(current_step - num_warmup_steps)
        decay_span = float(max(1, num_training_steps - num_warmup_steps))
        progress = steps_done / decay_span
        if progress >= 1.0:
            return 0.0
        # Position inside the current cycle, in [0, 1).
        cycle_position = (float(num_cycles) * progress) % 1.0
        return max(0.0, 0.5 * (1.0 + math.cos(math.pi * cycle_position)))

    return LambdaLR(optimizer, lr_lambda, last_epoch)

In [None]:
class AdamW(Optimizer):
    """ Implements Adam algorithm with weight decay fix.
    Parameters:
        lr (float): learning rate. Default 1e-3.
        betas (tuple of 2 floats): Adams beta parameters (b1, b2). Default: (0.9, 0.999)
        eps (float): Adams epsilon. Default: 1e-6
        weight_decay (float): Weight decay. Default: 0.0
        correct_bias (bool): can be set to False to avoid correcting bias in Adam (e.g. like in Bert TF repository). Default True.
    Raises:
        ValueError: if lr or eps is negative, or a beta lies outside [0.0, 1.0).
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6, weight_decay=0.0, correct_bias=True):
        if lr < 0.0:
            raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[1]))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(eps))
        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, correct_bias=correct_bias)
        super().__init__(params, defaults)

    def step(self, closure=None):
        """Performs a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        Returns:
            The value returned by ``closure``, or None when no closure is given.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group["params"]:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError("Adam does not support sparse gradients, please consider SparseAdam instead")

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state["step"] = 0
                    # Exponential moving average of gradient values
                    state["exp_avg"] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state["exp_avg_sq"] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
                beta1, beta2 = group["betas"]

                state["step"] += 1

                # Decay the first and second moment running average coefficient
                # In-place operations to update the averages at the same time.
                # The scalar factors are passed by keyword (alpha= / value=); the
                # positional-scalar overloads are deprecated in PyTorch.
                exp_avg.mul_(beta1).add_(grad, alpha=1.0 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1.0 - beta2)
                denom = exp_avg_sq.sqrt().add_(group["eps"])

                step_size = group["lr"]
                if group["correct_bias"]:  # No bias correction for Bert
                    bias_correction1 = 1.0 - beta1 ** state["step"]
                    bias_correction2 = 1.0 - beta2 ** state["step"]
                    step_size = step_size * math.sqrt(bias_correction2) / bias_correction1

                p.data.addcdiv_(exp_avg, denom, value=-step_size)

                # Just adding the square of the weights to the loss function is *not*
                # the correct way of using L2 regularization/weight decay with Adam,
                # since that will interact with the m and v parameters in strange ways.
                #
                # Instead we want to decay the weights in a manner that doesn't interact
                # with the m/v parameters. This is equivalent to adding the square
                # of the weights to the loss with plain (non-momentum) SGD.
                # Add weight decay at the end (fixed version)
                if group["weight_decay"] > 0.0:
                    p.data.add_(p.data, alpha=-group["lr"] * group["weight_decay"])

        return loss

In [None]:
# Instantiate the network and move its parameters to `device`.
model = skipConnectionModel()
model = model.to(device) # places the model on the selected device (GPU memory when `device` is CUDA)

In [None]:
"""
모델 학습
"""

total_step = len(train_loader) * epochs
print(f"Total step is....{total_step}") # total number of optimizer steps over all epochs

# Parameter groups for the optimizer and the scheduler.
# NOTE(review): both groups use weight_decay=0.0, so the decay / no-decay split
# currently has no effect.
no_decay = ["bias", "LayerNorm.weight"] # name fragments of parameters excluded from decay
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
    {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=lr, eps=adam_epsilon)
scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
    optimizer, num_warmup_steps=warmup_step, num_training_steps=total_step
)

# Accumulators for the train and validation loss.
total_loss = 0.0
total_val_loss = 0.0

# (hour, minute) of the run start; used in the checkpoint file name.
version = time.localtime()[3:5]
curr_lr = lr

n_val_loss = 10000000. # lowest validation loss seen so far

for epoch in range(epochs):
    total_loss = 0
    total_val_loss = 0
    # Switch back to training mode: the evaluation phase at the end of the previous
    # epoch put the model into eval mode.
    model.train()
    for data in tqdm(train_loader, desc='*********Train mode*******'):
        # forward pass
        pred = model(data['X'].float().to(device))
        loss = loss_fn(pred, data['Y'].float().to(device))

        # backward pass
        optimizer.zero_grad() # clear the gradients of all optimized parameters
        loss.backward()
        optimizer.step() # update the parameters
        scheduler.step() # advance the learning-rate schedule (once per batch)

        total_loss += loss.item()

    train_loss = total_loss / len(train_loader)
    print ("Epoch [{}/{}], Train Loss: {:.4f}".format(epoch+1, epochs, train_loss))

    # Evaluate the model on the validation data after every epoch.
    model.eval()
    with torch.no_grad():
        for data in tqdm(val_loader, desc='*********Evaluation mode*******'):
            pred = model(data['X'].float().to(device))
            loss_val = loss_fn(pred, data['Y'].float().to(device))

            total_val_loss += loss_val.item()
    val_loss = total_val_loss / len(val_loader)
    print ("Epoch [{}/{}], Eval Loss: {:.4f}".format(epoch+1, epochs, val_loss))

    # Save the weights whenever the validation loss improves.
    if val_loss < n_val_loss:
        n_val_loss = val_loss
        torch.save(model.state_dict(), f'test_{version}_{lr}_{epochs}.pth')
        print("Best Model saved......")

In [None]:
"""
모델 테스트
"""

# Fresh model instance; the trained weights are loaded into it further down.
test_model = skipConnectionModel()

# Path of the test file used to build the test dataset.
# NOTE(review): this rebinds `path_test`, which an earlier cell set to
# '../input/month1/data_mdc01/test.csv' - confirm which location is intended.
path_test = 'test.csv'
class TestDataset(Dataset):
    """Dataset serving the feature rows of a test CSV.

    The first CSV column is skipped; every remaining column is treated as an
    input feature.  Each item is a 1-D tensor holding one row of features.

    Args:
        path_test: path of the CSV file to read.
    """

    def __init__(self, path_test):
        super(TestDataset, self).__init__()
        test = pd.read_csv(path_test)
        self.test_X = test.iloc[:, 1:]
        self.tmp_x = self.test_X.values
        # Build the tensor view once instead of on every __getitem__ call.
        self._x_tensor = torch.from_numpy(self.tmp_x)

    def __len__(self):
        return len(self.test_X)

    def __getitem__(self, idx):
        return self._x_tensor[idx]
    
test_data = TestDataset(path_test)
test_loader = DataLoader(test_data, batch_size=10000,  num_workers=4)

# Load the trained weights onto the device selected for this notebook.
weights = torch.load(f'test_{version}_{lr}_{epochs}.pth', map_location=device)
test_model.load_state_dict(weights)
test_model = test_model.to(device)
test_model.eval()

# Collect the predictions of every batch; keeping only the last batch would silently
# drop rows whenever the test set is larger than one batch.
batch_outputs = []
with torch.no_grad():
    for data in test_loader:
        data = data.to(device)
        outputs = test_model(data.float())
        batch_outputs.append(outputs)
pred_test = torch.cat(batch_outputs, dim=0)

sample_sub = pd.read_csv('sample_submission.csv', index_col=0)
layers = ['layer_1','layer_2','layer_3','layer_4']
# The predictions are added to the values of the sample submission.
# NOTE(review): this presumably relies on the sample submission being all zeros - confirm.
submission = sample_sub.values + pred_test.cpu().numpy()

submission = pd.DataFrame(data=submission,columns=layers)
submission.to_csv(f'test_{version}_{lr}_{epochs}.csv', index_label='id')

In [None]:
# + Self evaluation and Ensemble
# The submission with the lowest MAE on the Dacon leaderboard is compared with the output
# of a model trained with different parameters; the MAE between the two files gives a rough
# estimate of the test MAE before spending one of the three daily submissions.
# Models trained with different parameters are merged with `en` below, which averages them.
# (`en` takes two CSV files; averaging more files requires extending it.)
def mae(best_path, my_path):
    """Mean absolute difference between two submission CSV files.

    The first column of each file (the row id) is ignored; all remaining
    columns are compared element-wise.

    Args:
        best_path: path of the reference submission CSV.
        my_path: path of the submission CSV to compare against it.

    Returns:
        The mean of the absolute element-wise differences.

    Raises:
        ValueError: if the two files do not hold the same number of rows and
            value columns (otherwise numpy could broadcast silently).
    """
    best_value = pd.read_csv(best_path).iloc[:, 1:].values
    my_value = pd.read_csv(my_path).iloc[:, 1:].values
    if best_value.shape != my_value.shape:
        raise ValueError(
            "Submission shapes differ: {} vs {}".format(best_value.shape, my_value.shape))
    return np.abs(best_value - my_value).mean()

def en(best_path, my_path):
    """Average two submission CSV files element-wise.

    The first column of each file (the row id) is skipped; the remaining
    values of both files are added and divided by two.

    Returns:
        A numpy array with the averaged values.
    """
    frames = [pd.read_csv(path) for path in (best_path, my_path)]
    best_value, my_value = (frame.iloc[:, 1:].values for frame in frames)
    return (my_value + best_value) / 2