In [1]:
pip install sklearn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sklearn
  Downloading sklearn-0.0.tar.gz (1.1 kB)
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py) ... [?25l[?25hdone
  Created wheel for sklearn: filename=sklearn-0.0-py2.py3-none-any.whl size=1310 sha256=e91c192058ba61b7d0a32cb31bf1126b86469ba97a92108646b619c95948dee3
  Stored in directory: /root/.cache/pip/wheels/46/ef/c3/157e41f5ee1372d1be90b09f74f82b10e391eaacca8f22d33e
Successfully built sklearn
Installing collected packages: sklearn
Successfully installed sklearn-0.0


In [2]:
import random
import pandas as pd
import numpy as np
import os
import glob
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer
from sklearn.impute import KNNImputer
from datetime import datetime

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from tqdm.auto import tqdm

import warnings
warnings.filterwarnings(action='ignore') 

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

## Hyperparameter Setting

In [5]:
CFG = {
    'EPOCHS':20,
    'LEARNING_RATE':1e-3,
    'BATCH_SIZE':16,
    'SEED':41
}

## Fixed RandomSeed

In [6]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(CFG['SEED']) # Seed 고정

## Data Pre-processing

In [7]:
all_input_list = sorted(glob.glob('/content/drive/MyDrive/TAVE/청경채/data/train_input/*.csv'))
all_target_list = sorted(glob.glob('/content/drive/MyDrive/TAVE/청경채/data/train_target/*.csv'))

In [8]:
train_input_list = all_input_list[:50]
train_target_list = all_target_list[:50]

val_input_list = all_input_list[50:]
val_target_list = all_target_list[50:]

## CustomDataset

In [27]:
class CustomDataset(Dataset):
    def __init__(self, input_paths, target_paths, infer_mode):
        self.input_paths = input_paths
        self.target_paths = target_paths
        self.infer_mode = infer_mode
        
        self.data_list = []
        self.label_list = []
        print('Data Pre-processing..')
        for input_path, target_path in tqdm(zip(self.input_paths, self.target_paths)):
            input_df = pd.read_csv(input_path)
            target_df = pd.read_csv(target_path)
            # minmax = MinMaxScaler()
            standard = StandardScaler()
            # norm = Normalizer()
            # print(input_df.info())
            input_df['시간'] = pd.to_datetime(input_df['시간'])
            input_df['년'] = input_df['시간'].dt.year 
            input_df['월'] = input_df['시간'].dt.month 
            input_df['일'] = input_df['시간'].dt.day 
            input_df['시'] = input_df['시간'].dt.hour 
            input_df['분'] = input_df['시간'].dt.minute 
            input_df = input_df.drop(columns=['시간'])
            try:
              input_df = input_df.drop(columns=['외부온도추정관측치', '외부습도추정관측치'])
            except:
              input_df = input_df.drop(columns=['외부온도관측치', '외부습도관측치'])
            # input_df = input_df.dropna(how='all').reset_index(drop=True)
            # input_df = input_df.fillna(0)
            # try:
            #   input_df['외부온도추정관측치'] = input_df['외부온도추정관측치'].fillna(0)
            #   input_df['외부습도추정관측치'] = input_df['외부습도추정관측치'].fillna(0)
            # except:
            #   input_df['외부습도관측치'] = input_df['외부습도관측치'].fillna(0)
            #   input_df['외부습도관측치'] = input_df['외부습도관측치'].fillna(0)
            imputer = KNNImputer(n_neighbors=5)
            imputed = imputer.fit_transform(input_df)
            input_df = pd.DataFrame(imputed, columns=input_df.columns)
            
            # print(input_df)
            # plt.rc('font', family='NanumBarunGothic')
            # plt.figure(figsize=(10,5))
            # ax = sns.heatmap(input_df)
            # plt.show()
            
            # print(input_df.describe())
            # q3 = input_df.quantile(0.75)
            # q1 = input_df.quantile(0.25)

            # iqr = (q3 - q1)
            # iqr = iqr * 1.5
            # lowest = q1 - iqr
            # highest = q3 + iqr
            # input_1 = input_df[iqr != 0.0]
            # print(input_1)
            # outlier_index = input_df[((input_1 < lowest) | (input_1 > highest))].index
            # print(len(input_1))
            # print(len(outlier_index))
            input_df[input_df.columns] = standard.fit_transform(input_df[input_df.columns])
            input_length = int(len(input_df)/1440)
            target_length = int(len(target_df))
            
            for idx in range(target_length):
                time_series = input_df[1440*idx:1440*(idx+1)].values
                self.data_list.append(torch.Tensor(time_series))
            for label in target_df["rate"]:
                self.label_list.append(label)
        print('Done.')
              
    def __getitem__(self, index):
        data = self.data_list[index]
        label = self.label_list[index]
        if self.infer_mode == False:
            return data, label
        else:
            return data
        
    def __len__(self):
        return len(self.data_list)

In [28]:
train_dataset = CustomDataset(train_input_list, train_target_list, False)
train_loader = DataLoader(train_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=True, num_workers=6)

val_dataset = CustomDataset(val_input_list, val_target_list, False)
val_loader = DataLoader(val_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False, num_workers=6)

Data Pre-processing..


0it [00:00, ?it/s]

Done.
Data Pre-processing..


0it [00:00, ?it/s]

Done.


In [None]:
train_dataset[0]

(tensor([[ 24.8000,  40.5000, 361.0000,  ..., 309.4100,  16.5480,  37.5960],
         [ 20.5000,  44.1000, 355.0000,  ...,   0.0000,   0.0000,   0.0000],
         [ 20.6000,  44.3000, 360.0000,  ...,   0.0000,   0.0000,   0.0000],
         ...,
         [ 19.8000,  55.1000, 428.0000,  ...,   0.0000,   0.0000,   0.0000],
         [ 19.8000,  55.1000, 425.0000,  ...,   0.0000,   0.0000,   0.0000],
         [ 19.8000,  55.1000, 427.0000,  ...,   0.0000,   0.0000,   0.0000]]),
 0.5)

In [None]:
train_dataset[0][0]

tensor([[ 24.8000,  40.5000, 361.0000,  ..., 309.4100,  16.5480,  37.5960],
        [ 20.5000,  44.1000, 355.0000,  ...,   0.0000,   0.0000,   0.0000],
        [ 20.6000,  44.3000, 360.0000,  ...,   0.0000,   0.0000,   0.0000],
        ...,
        [ 19.8000,  55.1000, 428.0000,  ...,   0.0000,   0.0000,   0.0000],
        [ 19.8000,  55.1000, 425.0000,  ...,   0.0000,   0.0000,   0.0000],
        [ 19.8000,  55.1000, 427.0000,  ...,   0.0000,   0.0000,   0.0000]])

In [None]:
len(train_dataset)

1607

In [None]:
len(train_dataset[0][0])

1440

In [None]:
it = iter(train_dataset)

for i in range(10):
    print(i, next(it))

0 (tensor([[ 24.8000,  40.5000, 361.0000,  ..., 309.4100,  16.5480,  37.5960],
        [ 20.5000,  44.1000, 355.0000,  ...,   0.0000,   0.0000,   0.0000],
        [ 20.6000,  44.3000, 360.0000,  ...,   0.0000,   0.0000,   0.0000],
        ...,
        [ 19.8000,  55.1000, 428.0000,  ...,   0.0000,   0.0000,   0.0000],
        [ 19.8000,  55.1000, 425.0000,  ...,   0.0000,   0.0000,   0.0000],
        [ 19.8000,  55.1000, 427.0000,  ...,   0.0000,   0.0000,   0.0000]]), 0.5)
1 (tensor([[ 19.8000,  55.1000, 426.0000,  ...,   0.0000,   0.0000,   0.0000],
        [ 19.8000,  55.1000, 421.0000,  ...,   0.0000,   0.0000,   0.0000],
        [ 19.8000,  55.1000, 422.0000,  ...,   0.0000,   0.0000,   0.0000],
        ...,
        [ 19.1000,  54.1000, 364.0000,  ...,   0.0000,   0.0000,   0.0000],
        [ 19.1000,  54.1000, 361.0000,  ...,   0.0000,   0.0000,   0.0000],
        [ 19.1000,  54.1000, 362.0000,  ...,   0.0000,   0.0000,   0.0000]]), 0.66667)
2 (tensor([[ 19.1000,  54.1000, 361.00

## Model Define

In [29]:
class BaseModel(nn.Module):
    def __init__(self):
        super(BaseModel, self).__init__()
        self.lstm = nn.LSTM(input_size=40, hidden_size=256, batch_first=True, bidirectional=False)
        self.classifier = nn.Sequential(
            nn.Linear(256, 1)
        )
        
    def forward(self, x):
        hidden, _ = self.lstm(x)
        output = self.classifier(hidden[:,-1,:])
        return output

## Train

In [30]:
def train(model, optimizer, train_loader, val_loader, scheduler, device):
    model.to(device)
    criterion = nn.L1Loss().to(device)
    
    best_loss = 9999
    best_model = None
    for epoch in range(1, CFG['EPOCHS']+1):
        model.train()
        train_loss = []
        for X, Y in tqdm(iter(train_loader)):
            X = X.to(device)
            Y = Y.to(device)
            
            optimizer.zero_grad()
            
            output = model(X)
            loss = criterion(output, Y)
            
            loss.backward()
            optimizer.step()
            
            train_loss.append(loss.item())
                    
        val_loss = validation(model, val_loader, criterion, device)
        
        print(f'Train Loss : [{np.mean(train_loss):.5f}] Valid Loss : [{val_loss:.5f}]')
        
        if scheduler is not None:
            scheduler.step()
            
        if best_loss > val_loss:
            best_loss = val_loss
            best_model = model
    return best_model

In [31]:
def validation(model, val_loader, criterion, device):
    model.eval()
    val_loss = []
    with torch.no_grad():
        for X, Y in tqdm(iter(val_loader)):
            X = X.float().to(device)
            Y = Y.float().to(device)
            
            model_pred = model(X)
            loss = criterion(model_pred, Y)
            
            val_loss.append(loss.item())
            
    return np.mean(val_loss)

## Run!!

In [33]:
model = BaseModel()
model.eval()
optimizer = torch.optim.Adam(params = model.parameters(), lr = CFG["LEARNING_RATE"])
# scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

best_model = train(model, optimizer, train_loader, val_loader, scheduler, device)

  0%|          | 0/101 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Train Loss : [0.28356] Valid Loss : [0.23808]


  0%|          | 0/101 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Train Loss : [0.27582] Valid Loss : [0.24325]


  0%|          | 0/101 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Train Loss : [0.27703] Valid Loss : [0.24245]


  0%|          | 0/101 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Train Loss : [0.27542] Valid Loss : [0.23821]


  0%|          | 0/101 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Train Loss : [0.27623] Valid Loss : [0.24070]


  0%|          | 0/101 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Train Loss : [0.27558] Valid Loss : [0.24079]


  0%|          | 0/101 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Train Loss : [0.27556] Valid Loss : [0.24409]


  0%|          | 0/101 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Train Loss : [0.27669] Valid Loss : [0.24195]


  0%|          | 0/101 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Train Loss : [0.27516] Valid Loss : [0.24073]


  0%|          | 0/101 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Train Loss : [0.27519] Valid Loss : [0.23848]


  0%|          | 0/101 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Train Loss : [0.27490] Valid Loss : [0.23973]


  0%|          | 0/101 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Train Loss : [0.27509] Valid Loss : [0.24463]


  0%|          | 0/101 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Train Loss : [0.27584] Valid Loss : [0.23648]


  0%|          | 0/101 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Train Loss : [0.27502] Valid Loss : [0.24081]


  0%|          | 0/101 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Train Loss : [0.27485] Valid Loss : [0.24272]


  0%|          | 0/101 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Train Loss : [0.27395] Valid Loss : [0.23636]


  0%|          | 0/101 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Train Loss : [0.27489] Valid Loss : [0.23944]


  0%|          | 0/101 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Train Loss : [0.27506] Valid Loss : [0.24252]


  0%|          | 0/101 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Train Loss : [0.27461] Valid Loss : [0.23747]


  0%|          | 0/101 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Train Loss : [0.27411] Valid Loss : [0.24261]


## Inference

In [34]:
test_input_list = sorted(glob.glob('/content/drive/MyDrive/TAVE/청경채/data/test_input/*.csv'))
test_target_list = sorted(glob.glob('/content/drive/MyDrive/TAVE/청경채/data/test_target/*.csv'))

In [35]:
def inference_per_case(model, test_loader, test_path, device):
    model.to(device)
    model.eval()
    pred_list = []
    with torch.no_grad():
        for X in iter(test_loader):
            X = X.float().to(device)
            
            model_pred = model(X)
            
            model_pred = model_pred.cpu().numpy().reshape(-1).tolist()
            
            pred_list += model_pred
    
    submit_df = pd.read_csv(test_path)
    submit_df['rate'] = pred_list
    submit_df.to_csv(test_path, index=False)

In [36]:
for test_input_path, test_target_path in zip(test_input_list, test_target_list):
    test_dataset = CustomDataset([test_input_path], [test_target_path], True)
    test_loader = DataLoader(test_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)
    inference_per_case(best_model, test_loader, test_target_path, device)

Data Pre-processing..


0it [00:00, ?it/s]

Done.
Data Pre-processing..


0it [00:00, ?it/s]

Done.
Data Pre-processing..


0it [00:00, ?it/s]

Done.
Data Pre-processing..


0it [00:00, ?it/s]

Done.
Data Pre-processing..


0it [00:00, ?it/s]

Done.
Data Pre-processing..


0it [00:00, ?it/s]

Done.


In [37]:
import zipfile
os.chdir("/content/drive/MyDrive/TAVE/청경채/data/test_target/")
submission = zipfile.ZipFile("../submission.zip", 'w')
for path in test_target_list:
    path = path.split('/')[-1]
    submission.write(path)
submission.close()