In [1]:
import random
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [2]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [3]:
CFG = {
    'TRAIN_WINDOW_SIZE':90, # 90일치로 학습
    'PREDICT_SIZE':21, # 21일치 예측
    'EPOCHS':10,
    'LEARNING_RATE':1e-4,
    'BATCH_SIZE':4096,
    'SEED':41
}

In [4]:

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(CFG['SEED']) # Seed 고정

In [5]:
brand_keyword = pd.read_csv('data/brand_keyword_cnt.csv')
product_info = pd.read_csv('data/product_info.csv')
sales = pd.read_csv('data/sales.csv')
sample_submission = pd.read_csv('data/sample_submission.csv')

In [6]:
train_data = pd.read_csv('data/train.csv').drop(columns=['제품'])

In [6]:
train_data.head()

Unnamed: 0,ID,대분류,중분류,소분류,브랜드,2022-01-01,2022-01-02,2022-01-03,2022-01-04,2022-01-05,...,2023-03-26,2023-03-27,2023-03-28,2023-03-29,2023-03-30,2023-03-31,2023-04-01,2023-04-02,2023-04-03,2023-04-04
0,0,B002-C001-0002,B002-C002-0007,B002-C003-0038,B002-00001,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002,0,0,0,0,0,...,0,0,0,1,3,2,0,0,2,0
2,2,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,B002-C001-0001,B002-C002-0001,B002-C003-0003,B002-00003,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
nullrow = brand_keyword['2022-01-01'][brand_keyword['2022-01-01'].isnull()]

nullrow = list(nullrow.index)
nullrow

[95,
 246,
 250,
 303,
 385,
 440,
 444,
 466,
 515,
 647,
 765,
 811,
 1105,
 1162,
 1398,
 1486,
 1518,
 1588,
 1706,
 1893,
 1980,
 1999,
 2117,
 2125,
 2298,
 2328,
 2349,
 2430,
 2471,
 2495,
 2529,
 2711,
 2855,
 3142,
 3149]

In [10]:
brand_keyword.drop('브랜드', axis=1, inplace = True)

In [11]:
brand_t = brand_keyword.T
brand_t

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3160,3161,3162,3163,3164,3165,3166,3167,3168,3169
2022-01-01,0.84131,12.64868,0.33362,1.07339,0.000000,0.884820,0.0,0.232064,0.33362,4.33710,...,0.0,14.07020,2.77052,1.07339,0.0,2.32085,0.14505,0.00000,0.14505,4.55468
2022-01-02,0.91383,20.27850,0.43516,1.71163,0.000000,1.624588,0.0,0.246574,0.44966,6.38236,...,0.0,17.44995,3.64084,1.34899,0.0,2.98810,0.00000,0.00000,0.00000,5.54105
2022-01-03,1.45053,15.33217,0.36263,2.01624,0.188558,1.914691,0.0,0.464151,0.55120,6.61444,...,0.0,19.64026,4.90281,1.53756,0.0,3.61183,0.08703,0.00000,0.11604,6.15027
2022-01-04,2.42239,12.75021,0.17406,1.91470,0.246574,1.697114,0.0,0.377139,0.52219,6.29532,...,0.0,17.11633,6.45488,1.18944,0.0,4.06150,0.07252,0.00000,0.07252,6.39686
2022-01-05,1.87119,13.56251,0.21758,1.98723,0.246574,1.595591,0.0,0.580207,0.47867,6.19379,...,0.0,17.60951,5.74412,1.40702,0.0,3.66985,0.08703,0.00000,0.11604,7.00609
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-03-31,0.29010,9.48651,0.49318,1.53756,0.000000,0.928326,0.0,0.667242,2.61096,6.16478,...,0.0,3.03162,3.23469,2.04525,0.0,1.98723,0.00000,0.00000,0.11604,5.51203
2023-04-01,0.31911,9.28343,0.91383,1.34899,0.000000,0.928326,0.0,0.261084,2.50942,6.62895,...,0.0,3.72787,2.65448,1.87119,0.0,1.07339,0.00000,0.00000,0.11604,3.52480
2023-04-02,0.23208,10.42935,0.79779,1.26196,0.072526,0.884820,0.0,0.348119,0.94284,7.25268,...,0.0,3.77139,2.93008,3.20568,0.0,1.92921,0.00000,0.00000,0.08703,4.03249
2023-04-03,0.33362,11.15462,1.01537,2.32085,0.217577,1.392500,0.0,0.812294,0.92834,7.74586,...,0.0,3.51029,4.33710,3.22019,0.0,2.50942,0.00000,0.07252,0.17406,5.88917


In [12]:
# 브랜드 별로 keyword cnt trend확인
#for i in brand_t.columns:
#    brand_t[i].plot(figsize = (10,6))
#    plt.show()

In [13]:
null_brand = brand_t[nullrow]

In [14]:
# 결측이 있는 브랜드는 아예 측정이 안됨 모든 시점이 결측치임. 
null_brand.isnull().sum()

95      459
246     459
250     459
303     459
385     459
440     459
444     459
466     459
515     459
647     459
765     459
811     459
1105    459
1162    459
1398    459
1486    459
1518    459
1588    459
1706    459
1893    459
1980    459
1999    459
2117    459
2125    459
2298    459
2328    459
2349    459
2430    459
2471    459
2495    459
2529    459
2711    459
2855    459
3142    459
3149    459
dtype: int64

In [15]:
brand_keyword = pd.read_csv('data/brand_keyword_cnt.csv')
brand_keyword = brand_keyword.fillna(0)
brand_keyword.isnull().sum().sum()

0

In [16]:
# 0으로 대체
brand_keyword.iloc[nullrow,:].head()

Unnamed: 0,브랜드,2022-01-01,2022-01-02,2022-01-03,2022-01-04,2022-01-05,2022-01-06,2022-01-07,2022-01-08,2022-01-09,...,2023-03-26,2023-03-27,2023-03-28,2023-03-29,2023-03-30,2023-03-31,2023-04-01,2023-04-02,2023-04-03,2023-04-04
95,B002-00117,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
246,B002-00296,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
250,B002-00302,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
303,B002-00366,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
385,B002-00460,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Set the '브랜드' column as the index in both DataFrames
train_data.set_index('브랜드', inplace=True)
brand_keyword.set_index('브랜드', inplace=True)

# Update column names of 'brand_keyword' DataFrame to match the 'train_data' DataFrame
brand_keyword.columns = [col + '_keyword' for col in brand_keyword.columns]

# Merge the two DataFrames on the index ('브랜드')
combined_df = pd.merge(train_data, brand_keyword, left_index=True, right_index=True)

# Make a copy of the merged DataFrame and drop the 'ID' column
train_data = combined_df.copy().drop('ID', axis=1)


In [18]:
# combined_df = pd.merge(train_data, brand_keyword, on='브랜드', suffixes=('_sales', '_keyword'))

In [19]:
# combined_df.head()

In [20]:
# train_data = combined_df.copy().drop('ID', axis = 1)

In [22]:
train_data

Unnamed: 0_level_0,대분류,중분류,소분류,2022-01-01,2022-01-02,2022-01-03,2022-01-04,2022-01-05,2022-01-06,2022-01-07,...,2023-03-26_keyword,2023-03-27_keyword,2023-03-28_keyword,2023-03-29_keyword,2023-03-30_keyword,2023-03-31_keyword,2023-04-01_keyword,2023-04-02_keyword,2023-04-03_keyword,2023-04-04_keyword
브랜드,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B002-00001,B002-C001-0002,B002-C002-0007,B002-C003-0038,0,0,0,0,0,0,0,...,0.31911,0.39164,0.37713,0.49318,0.07252,0.29010,0.31911,0.23208,0.33362,0.44966
B002-00002,B002-C001-0003,B002-C002-0008,B002-C003-0044,0,0,0,0,0,0,0,...,10.26979,11.96692,10.64693,10.41485,10.48738,9.48651,9.28343,10.42935,11.15462,11.38671
B002-00002,B002-C001-0003,B002-C002-0008,B002-C003-0044,0,0,0,0,0,0,0,...,10.26979,11.96692,10.64693,10.41485,10.48738,9.48651,9.28343,10.42935,11.15462,11.38671
B002-00002,B002-C001-0003,B002-C002-0008,B002-C003-0044,0,0,0,0,0,0,0,...,10.26979,11.96692,10.64693,10.41485,10.48738,9.48651,9.28343,10.42935,11.15462,11.38671
B002-00003,B002-C001-0001,B002-C002-0001,B002-C003-0003,0,0,0,0,0,0,0,...,0.53669,0.69625,0.44966,0.39164,1.02988,0.49318,0.91383,0.79779,1.01537,0.88482
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
B002-03799,B002-C001-0003,B002-C002-0008,B002-C003-0042,0,0,0,0,0,0,0,...,5.10588,6.67246,6.44038,5.90368,4.93182,5.51203,3.52480,4.03249,5.88917,5.07687
B002-03799,B002-C001-0003,B002-C002-0008,B002-C003-0044,0,0,0,0,0,0,0,...,5.10588,6.67246,6.44038,5.90368,4.93182,5.51203,3.52480,4.03249,5.88917,5.07687
B002-03799,B002-C001-0003,B002-C002-0008,B002-C003-0044,0,0,0,0,0,0,0,...,5.10588,6.67246,6.44038,5.90368,4.93182,5.51203,3.52480,4.03249,5.88917,5.07687
B002-03799,B002-C001-0003,B002-C002-0008,B002-C003-0044,0,0,0,0,0,0,0,...,5.10588,6.67246,6.44038,5.90368,4.93182,5.51203,3.52480,4.03249,5.88917,5.07687


In [23]:
# Data Scaling
scale_max_dict = {}
scale_min_dict = {}

for idx in tqdm(range(len(train_data))):
    maxi = np.max(train_data.iloc[idx,4:])
    mini = np.min(train_data.iloc[idx,4:])
    
    if maxi == mini :
        train_data.iloc[idx,4:] = 0
    else:
        train_data.iloc[idx,4:] = (train_data.iloc[idx,4:] - mini) / (maxi - mini)
    
    scale_max_dict[idx] = maxi
    scale_min_dict[idx] = mini

  0%|          | 0/15890 [00:00<?, ?it/s]

In [24]:
# Label Encoding
label_encoder = LabelEncoder()
categorical_columns = ['대분류', '중분류', '소분류', '브랜드']

for col in categorical_columns:
    label_encoder.fit(train_data[col])
    train_data[col] = label_encoder.transform(train_data[col])

KeyError: '브랜드'

In [22]:
train_data.columns

Index(['대분류', '중분류', '소분류', '브랜드', '2022-01-01_sales', '2022-01-02_sales',
       '2022-01-03_sales', '2022-01-04_sales', '2022-01-05_sales',
       '2022-01-06_sales',
       ...
       '2023-03-26_keyword', '2023-03-27_keyword', '2023-03-28_keyword',
       '2023-03-29_keyword', '2023-03-30_keyword', '2023-03-31_keyword',
       '2023-04-01_keyword', '2023-04-02_keyword', '2023-04-03_keyword',
       '2023-04-04_keyword'],
      dtype='object', length=922)

In [32]:
def make_train_data(data, train_size=CFG['TRAIN_WINDOW_SIZE'], predict_size=CFG['PREDICT_SIZE']):
    '''
    학습 기간 블럭, 예측 기간 블럭의 세트로 데이터를 생성
    data : 일별 판매량, 브랜드, 그리고 브랜드 키워드 카운트 정보가 있는 데이터 프레임
    train_size : 학습에 활용할 기간
    predict_size : 추론할 기간
    '''
    num_rows = len(data)
    keyword_start_index = data.columns.get_loc('2022-01-01_keyword')  # Find the index of the first '2022-01-01_keyword' column
    window_size = train_size + predict_size
    
    input_data = np.empty((num_rows * (len(data.columns) - window_size + 1), train_size, 6)) # 6 features: 대분류, 중분류, 소분류, 브랜드, sales, keyword count
    target_data = np.empty((num_rows * (len(data.columns) - window_size + 1), predict_size))
    
    for i in tqdm(range(num_rows)):
        encode_info = np.array(data.iloc[i, :4])  # Extract encoding info ('대분류', '중분류', '소분류', '브랜드')
        sales_data = np.array(data.iloc[i, 4:])
        keyword_data = np.array(data.iloc[i, keyword_start_index:])
        
        for j in range(len(sales_data) - window_size + 1):
            window = sales_data[j : j + window_size]
            temp_data = np.column_stack((encode_info, window[:train_size], keyword_data[j : j + window_size][:train_size]))
            input_data[i * (len(data.columns) - window_size + 1) + j] = temp_data
            target_data[i * (len(data.columns) - window_size + 1) + j] = window[train_size:]
    
    return input_data, target_data

In [33]:
# 확인 완료

def make_predict_data(data, train_size=CFG['TRAIN_WINDOW_SIZE']):
    '''
    평가 데이터(Test Dataset)를 추론하기 위한 Input 데이터를 생성
    data : 일별 판매량, 브랜드, 그리고 브랜드 키워드 카운트 정보가 있는 데이터 프레임
    train_size : 추론을 위해 필요한 일별 판매량 기간 (= 학습에 활용할 기간)
    '''
    num_rows = len(data)
    keyword_start_index = data.columns.get_loc('2022-01-01_keyword')  # Find the index of the first '2022-01-01_keyword' column
    
    input_data = np.empty((num_rows, train_size, 6))
    
    for i in tqdm(range(num_rows)):
        encode_info = np.array(data.iloc[i, :4])  # Extract encoding info ('대분류', '중분류', '소분류', '브랜드')
        sales_data = np.array(data.iloc[i, 4:4+train_size])  # Extract daily sales data for the past 'train_size' days
        keyword_data = np.array(data.iloc[i, keyword_start_index:keyword_start_index+train_size])  # Extract keyword count data for the past 'train_size' days
        
        temp_data = np.column_stack((encode_info, sales_data, keyword_data))
        input_data[i] = temp_data
    
    return input_data


In [34]:
train_input, train_target = make_train_data(train_data)
test_input = make_predict_data(train_data)

  0%|          | 0/15890 [00:00<?, ?it/s]

ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 90 and the array at index 2 has size 89

In [27]:
# Train / Validation Split
data_len = len(train_input)
val_input = train_input[-int(data_len*0.2):]
val_target = train_target[-int(data_len*0.2):]
train_input = train_input[:-int(data_len*0.2)]
train_target = train_target[:-int(data_len*0.2)]

NameError: name 'train_input' is not defined

In [25]:
train_input.shape, train_target.shape, val_input.shape, val_target.shape, test_input.shape

((4487336, 90, 5),
 (4487336, 21),
 (1121834, 90, 5),
 (1121834, 21),
 (15890, 90, 5))

In [26]:
class CustomDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y
        
    def __getitem__(self, index):
        if self.Y is not None:
            return torch.Tensor(self.X[index]), torch.Tensor(self.Y[index])
        return torch.Tensor(self.X[index])
    
    def __len__(self):
        return len(self.X)

In [27]:
train_dataset = CustomDataset(train_input, train_target)
train_loader = DataLoader(train_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=True, num_workers=0)

val_dataset = CustomDataset(val_input, val_target)
val_loader = DataLoader(val_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

In [29]:
class BaseModel(nn.Module):
    def __init__(self, input_size=5, hidden_size=512, output_size=CFG['PREDICT_SIZE']):
        super(BaseModel, self).__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Sequential(
            nn.Linear(hidden_size, hidden_size//2),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(hidden_size//2, output_size)
        )
            
        self.actv = nn.ReLU()
    
    def forward(self, x):
        # x shape: (B, TRAIN_WINDOW_SIZE, 5)
        batch_size = x.size(0)
        hidden = self.init_hidden(batch_size, x.device)
        
        # LSTM layer
        lstm_out, hidden = self.lstm(x, hidden)
        
        # Only use the last output sequence
        last_output = lstm_out[:, -1, :]
        
        # Fully connected layer
        output = self.actv(self.fc(last_output))
        
        return output.squeeze(1)
    
    def init_hidden(self, batch_size, device):
        # Initialize hidden state and cell state
        return (torch.zeros(1, batch_size, self.hidden_size, device=device),
                torch.zeros(1, batch_size, self.hidden_size, device=device))

In [30]:
def train(model, optimizer, train_loader, val_loader, device):
    model.to(device)
    criterion = nn.MSELoss().to(device)
    best_loss = 9999999
    best_model = None
    
    for epoch in range(1, CFG['EPOCHS']+1):
        model.train()
        train_loss = []
        train_mae = []
        for X, Y in tqdm(iter(train_loader)):
            X = X.to(device)
            Y = Y.to(device)
            
            optimizer.zero_grad()
            
            output = model(X)
            loss = criterion(output, Y)
            
            loss.backward()
            optimizer.step()
            
            train_loss.append(loss.item())
        
        val_loss = validation(model, val_loader, criterion, device)
        print(f'Epoch : [{epoch}] Train Loss : [{np.mean(train_loss):.5f}] Val Loss : [{val_loss:.5f}]')
        
        if best_loss > val_loss:
            best_loss = val_loss
            best_model = model
            print('Model Saved')
    return best_model

In [31]:
def validation(model, val_loader, criterion, device):
    model.eval()
    val_loss = []
    
    with torch.no_grad():
        for X, Y in tqdm(iter(val_loader)):
            X = X.to(device)
            Y = Y.to(device)
            
            output = model(X)
            loss = criterion(output, Y)
            
            val_loss.append(loss.item())
    return np.mean(val_loss)

In [36]:
model = BaseModel()
optimizer = torch.optim.Adam(params = model.parameters(), lr = CFG["LEARNING_RATE"])
infer_model = train(model, optimizer, train_loader, val_loader, device)

  0%|          | 0/1096 [00:00<?, ?it/s]  0%|          | 0/1096 [00:03<?, ?it/s]


RuntimeError: [enforce fail at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\c10\core\impl\alloc_cpu.cpp:72] data. DefaultCPUAllocator: not enough memory: you tried to allocate 14747697152 bytes.