## Import

In [1]:
import copy
import glob
import os
import random
import re

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tqdm.auto import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split

In [2]:
# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Hyperparameter Setting

In [3]:
# Central place for every tunable value used by the notebook.
CFG = dict(
    WINDOW_SIZE=500,      # 500 Step
    EPOCHS=10,            # number of passes over the training loader
    LEARNING_RATE=1e-3,   # learning rate handed to the optimizer
    BATCH_SIZE=128,       # batch size for every DataLoader
    SEED=41,              # seed passed to seed_everything
)

In [4]:
def seed_everything(seed):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random``, NumPy's legacy global RNG and PyTorch (CPU and
    all CUDA devices), and configures cuDNN for deterministic execution.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    # Only affects interpreters started after this point (e.g. subprocesses);
    # the hash seed of the running interpreter is fixed at startup.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick a different kernel on each run, which
    # defeats the deterministic flag above, so it must be disabled.
    torch.backends.cudnn.benchmark = False

seed_everything(CFG['SEED']) # Fix all RNG seeds using the configured value

### 데이터 불러오기

In [5]:
# glob returns matches in arbitrary, filesystem-dependent order. Sorting keeps
# the order of the generated windows (and therefore the seeded train/val split)
# identical across machines and runs.
train_paths = sorted(glob.glob('./train/*.csv'))
# Test file locations are listed in the 'data_path' column of test.csv.
test_paths = pd.read_csv('./test.csv')['data_path'].values

### 데이터 전처리

In [6]:
# Columns that get min-max scaled. The order matters: the windowing functions
# index array columns by position while iterating over this mapping.
feature_columns = [
    'Time[s]',
    'Signal A',
    'Signal B',
    'Signal C',
    'Sensor A',
    'Sensor B',
    'Sensor C',
    'Sensor D',
]
extremes = {
    name: {'min': float('inf'), 'max': float('-inf')}
    for name in feature_columns
}

# Visit every training file and widen each column's running [min, max] range.
for csv_path in train_paths:
    frame = pd.read_csv(csv_path)

    for name, bounds in extremes.items():
        bounds['min'] = min(bounds['min'], frame[name].min())
        bounds['max'] = max(bounds['max'], frame[name].max())

In [7]:
def make_train_data(train_paths, window_size=CFG['WINDOW_SIZE'], stride=100):
    """Cut every training CSV into min-max scaled sliding windows.

    Each file is read, a binary ``driver`` column is appended (0 for driver
    'A', 1 otherwise), the columns listed in ``extremes`` are min-max scaled,
    and windows of ``window_size`` rows are taken every ``stride`` rows.

    Scaling is applied exactly once per file, before windowing. Scaling each
    window in place would re-scale rows shared by overlapping windows, because
    NumPy slices are views into the same buffer.

    Args:
        train_paths: Iterable of CSV paths. The file name is expected to look
            like ``<label>_<driver>...`` (assumption taken from the parsing
            below; confirm against the dataset).
        window_size: Number of rows per window.
        stride: Step between the start rows of consecutive windows.

    Returns:
        Tuple ``(sequences, labels)`` of NumPy arrays: one window per entry and
        the file's label divided by 902 for each window.
    """
    sequences = []
    sequence_labels = []
    for path in tqdm(train_paths):

        driver = str(path.split('/')[-1].split('.')[0].split('_')[1][0])
        data = pd.read_csv(path)
        data['driver'] = 0 if driver == 'A' else 1
        # Work on a private float copy so the in-place scaling below can never
        # truncate integer data or write into pandas-owned memory.
        data = data.to_numpy(dtype=np.float64, copy=True)

        # NOTE(review): every digit in the whole path is concatenated to form
        # the label, so digits in directory names or in the driver token would
        # corrupt it. Confirm the naming scheme of the training files.
        label = float(re.sub(r'[^0-9]', '', path))
        label = label / 902.  # Label normalization

        # Scaling: (value - min) / (max - min), once for the whole file.
        # Assumes the first len(extremes) CSV columns follow the key order of
        # `extremes` -- TODO confirm.
        for col, key in enumerate(extremes.keys()):
            low = extremes[key]['min']
            high = extremes[key]['max']
            data[:, col] = (data[:, col] - low) / (high - low)

        # Extract sequences of window_size rows from the already scaled data.
        for start in range(0, len(data) - window_size + 1, stride):
            sequences.append(data[start:start + window_size])
            sequence_labels.append(label)

    return np.array(sequences), np.array(sequence_labels)

In [8]:
# Build the training windows and their normalized labels from every training CSV.
train_window_data, train_labels = make_train_data(train_paths)

  0%|          | 0/16 [00:00<?, ?it/s]

In [9]:
def make_predict_data(test_paths, window_size=CFG['WINDOW_SIZE']):
    '''
        Build one fixed-length, min-max scaled window per test file.

        This function was written under the assumption that most test samples
        contain 500 time steps, i.e. it is tuned for an inference window size
        of 500. Shorter files are zero-padded at the end; longer files are
        truncated to their first `window_size` rows instead of failing with a
        shape mismatch.

        Args:
            test_paths: Iterable of CSV paths; the driver letter is parsed from
                the file name in the same way as for the training files.
            window_size: Number of rows in every returned window.

        Returns:
            NumPy array holding one (window_size, n_columns) window per path.
    '''
    sequences = []
    for path in tqdm(test_paths):
        driver = str(path.split('/')[-1].split('.')[0].split('_')[1][0])
        data = pd.read_csv(path)
        data['driver'] = 0 if driver == 'A' else 1
        data = data.values

        # Copy at most window_size rows; remaining rows stay zero.
        n_rows = min(len(data), window_size)
        window_data = np.zeros((window_size, data.shape[1]))
        window_data[:n_rows] = data[:n_rows]
        # NOTE(review): padded rows are scaled as well, so padding ends up as
        # (0 - min) / (max - min) rather than 0 -- confirm this is intended.
        for col, key in enumerate(extremes.keys()):
            # Scaling: (value - min) / (max - min)
            low = extremes[key]['min']
            high = extremes[key]['max']
            window_data[:, col] = (window_data[:, col] - low) / (high - low)
        sequences.append(window_data)

    return np.array(sequences)

In [10]:
# Build one padded, scaled window per test file.
test_window_data = make_predict_data(test_paths)

  0%|          | 0/4048 [00:00<?, ?it/s]

In [11]:
# Sanity check: number of training windows and number of test windows.
len(train_window_data), len(test_window_data)

(1853, 4048)

### Custom Dataset

In [12]:
class CustomDataset(Dataset):
    """Index-based dataset over pre-built windows and optional labels.

    Args:
        X: Indexable collection of windows; each item is converted with
            ``torch.Tensor`` when it is fetched.
        Y: Indexable collection of labels aligned with ``X``, or ``None`` for
            unlabeled data.
    """

    def __init__(self, X, Y):
        self.X = X
        self.Y = Y

    def __len__(self):
        """Return the number of windows."""
        return len(self.X)

    def __getitem__(self, index):
        """Return ``(window, label)`` when labels exist, else just ``window``."""
        window = torch.Tensor(self.X[index])
        if self.Y is None:
            return window
        return window, self.Y[index]

In [13]:
dataset = CustomDataset(train_window_data, train_labels)

# Size of the full dataset
total_size = len(dataset)

# Sizes of the two partitions (80% train / 20% validation)
train_size = int(total_size * 0.8)
val_size = total_size - train_size

# Split with a dedicated, explicitly seeded generator so re-running this cell
# always yields the same partition, independent of how far the global RNG has
# advanced.
# NOTE(review): windows taken from one file overlap, so a random split places
# near-identical windows in both partitions; validation loss is likely
# optimistic. Consider splitting by file.
split_generator = torch.Generator().manual_seed(CFG['SEED'])
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size], generator=split_generator
)

# Create the DataLoader instances
train_loader = DataLoader(train_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False)

### 모델 선언

In [14]:
class BaseModel(nn.Module):
    """Single-layer LSTM followed by a small MLP head producing one value.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the LSTM hidden state; the head's hidden layer
            uses half of it.
    """

    def __init__(self, input_size=9, hidden_size=256):
        super().__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        head_width = hidden_size // 2
        self.fc = nn.Sequential(
            nn.Linear(hidden_size, head_width),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(head_width, 1),
        )

    def init_hidden(self, batch_size, device):
        """Return zero-filled ``(hidden, cell)`` states for a one-layer LSTM."""
        state_shape = (1, batch_size, self.hidden_size)
        return tuple(torch.zeros(state_shape, device=device) for _ in range(2))

    def forward(self, x):
        """Map a batch of windows to one prediction per window.

        Args:
            x: Tensor of shape (batch, time steps, input_size).

        Returns:
            Tensor of shape (batch,).
        """
        initial_state = self.init_hidden(x.size(0), x.device)

        # Run the whole sequence through the LSTM; the final state is not needed.
        per_step_output, _ = self.lstm(x, initial_state)

        # Only the output at the last time step feeds the regression head.
        final_step = per_step_output[:, -1, :]
        return self.fc(final_step).squeeze(1)

### 모델 학습

In [15]:
def train(model, optimizer, train_loader, val_loader, device):
    """Train ``model`` with L1 loss and return it with its best weights loaded.

    After every epoch the model is evaluated with ``validation``. Whenever the
    validation loss improves, a deep copy of the weights is stored; keeping
    only a reference to ``model`` would silently track the latest weights
    instead of the best ones.

    Args:
        model: Module to optimize; moved to ``device``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        train_loader: Loader yielding ``(X, Y)`` training batches.
        val_loader: Loader yielding ``(X, Y)`` validation batches.
        device: Device on which batches and the model are placed.

    Returns:
        ``model`` restored to the weights of the epoch with the lowest
        validation loss, or ``None`` if no epoch improved on the initial
        (infinite) loss, e.g. when zero epochs are configured.
    """
    model.to(device)
    criterion = nn.L1Loss().to(device)
    best_loss = float('inf')
    best_state = None

    for epoch in range(1, CFG['EPOCHS']+1):
        model.train()
        train_loss = []
        for X, Y in tqdm(iter(train_loader)):
            # Cast to float32 exactly like validation() does; labels arrive as
            # float64 when they come from a NumPy array.
            X = X.float().to(device)
            Y = Y.float().to(device)

            optimizer.zero_grad()

            output = model(X)
            loss = criterion(output, Y)

            loss.backward()
            optimizer.step()

            train_loss.append(loss.item())

        val_loss = validation(model, val_loader, criterion, device)
        print(f'Epoch : [{epoch}] Train Loss : [{np.mean(train_loss):.5f}] Val Loss : [{val_loss:.5f}]')

        if best_loss > val_loss:
            best_loss = val_loss
            # Snapshot the weights; later epochs keep mutating the live tensors.
            best_state = copy.deepcopy(model.state_dict())
            print('Model Saved')

    if best_state is None:
        return None

    model.load_state_dict(best_state)
    return model

In [16]:
def validation(model, val_loader, criterion, device):
    """Evaluate ``model`` on ``val_loader`` and return the mean batch loss.

    Args:
        model: Module to evaluate; switched to eval mode.
        val_loader: Loader yielding ``(X, Y)`` batches.
        criterion: Loss callable applied to ``(prediction, target)``.
        device: Device on which each batch is placed.

    Returns:
        Mean of the per-batch loss values.
    """
    model.eval()
    batch_losses = []

    # No gradients are needed while evaluating.
    with torch.no_grad():
        for inputs, targets in tqdm(iter(val_loader)):
            inputs = inputs.float().to(device)
            targets = targets.float().to(device)

            batch_loss = criterion(model(inputs), targets)
            batch_losses.append(batch_loss.item())

    return np.mean(batch_losses)

## Run !!

In [17]:
# Build the model with its default sizes, attach an Adam optimizer using the
# configured learning rate, then train; `infer_model` is what train() returns.
model = BaseModel()
optimizer = torch.optim.Adam(model.parameters(), lr=CFG["LEARNING_RATE"])
infer_model = train(model, optimizer, train_loader, val_loader, device)

  0%|          | 0/12 [00:00<?, ?it/s]

KeyboardInterrupt: 

## 모델 추론

In [None]:
# Unlabeled dataset: passing None as labels makes each item a bare window tensor.
test_dataset = CustomDataset(test_window_data, None)
# Keep the original order so predictions line up with the test files.
test_loader = DataLoader(
    test_dataset,
    batch_size=CFG['BATCH_SIZE'],
    shuffle=False,
    num_workers=0,
)

In [None]:
def inference(model, test_loader, device, label_scale=902.):
    """Predict on every batch of ``test_loader`` and undo label normalization.

    The model is explicitly moved to ``device`` and put into eval mode so that
    dropout is disabled regardless of the state the caller left it in.

    Args:
        model: Trained module returning one value per input window.
        test_loader: Loader yielding input batches only (no labels).
        device: Device on which the model and each batch are placed.
        label_scale: Factor the model output is multiplied by; the default
            matches the division by 902 applied to the training labels.

    Returns:
        NumPy array with one rescaled prediction per test sample.
    """
    model.to(device)
    model.eval()
    predictions = []

    with torch.no_grad():
        for X in tqdm(iter(test_loader)):
            X = X.float().to(device)

            output = model(X)
            output = output * label_scale

            # Move the model output to the CPU and convert it to a NumPy array
            output = output.cpu().numpy()

            predictions.extend(output)

    return np.array(predictions)

In [None]:
# Predict on the test windows with the model returned by train().
pred = inference(infer_model, test_loader, device)

In [None]:
# Post-process the results (optional: round predictions to whole numbers)
#pred = np.round(pred, 0).astype(int)

In [None]:
# Sanity check: shape of the prediction array.
pred.shape

## Submission

In [None]:
# Load the sample submission file; its 'weight' column is overwritten with predictions in a later cell.
submit = pd.read_csv('./sample_submission.csv')

In [None]:
# Store the predictions in the 'weight' column of the submission frame.
submit['weight'] = pred

In [None]:
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing the submission file.
os.makedirs('./submission', exist_ok=True)
submit.to_csv('./submission/baseline_submit.csv', index=False)