In [19]:
file_path = r"C:\Users\james\Documents\GitHub\Traffic_Volume_Prediction"

In [20]:
import pandas as pd

train = pd.read_csv(file_path + r"\Data\train.csv")
test = pd.read_csv(file_path + r"\Data\test.csv")

In [21]:
# X_train과 y_train 설정
X_train = train.drop(columns=['1005004000_velocity'])
y_train = train[['1005004000_velocity']]

# X_test와 y_test 설정
X_test = test.drop(columns=['1005004000_velocity'])
y_test = test[['1005004000_velocity']]

In [22]:
# 타겟 컬럼과 시간 시차 설정
TARGET = '1005004000_velocity'
HORIZON = 24  # 24시간 시차

# 24시간 시차 적용하여 데이터 생성
train['y_shifted'] = train[TARGET].shift(-HORIZON)  # 24시간 이후 값을 타겟으로 설정
test['y_shifted'] = test[TARGET].shift(-HORIZON)

# 비어 있는 데이터 제거
train = train.dropna(subset=['y_shifted'])
test = test.dropna(subset=['y_shifted'])

# X_train, y_train 설정
X_train = train.drop(columns=[TARGET, 'y_shifted'])
y_train = train[['y_shifted']]

# X_test, y_test 설정
X_test = test.drop(columns=[TARGET, 'y_shifted'])
y_test = test[['y_shifted']]

# 결과 출력 (확인용)
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (19261, 68)
y_train shape: (19261, 1)
X_test shape: (696, 68)
y_test shape: (696, 1)


In [23]:
# 넷 다 dataframe
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(19261, 68) (19261, 1) (696, 68) (696, 1)


In [24]:
# 처리 방식 정의
fill_methods = {
    '강수량(mm)': 0,
    '풍속(m/s)': 'average',
    '적설(cm)': 0
}

# 확장 가능한 결측값 처리 함수
def fill_missing_expanding(df, column, max_offset=2400):
    """
    결측값을 24, 48, 72, ..., max_offset 시간 뒤/전의 값으로 채우는 함수
    """
    step = 24  # 24시간 간격
    for offset in range(step, max_offset + step, step):  # 24, 48, 72, ..., max_offset
        # 24시간 뒤와 전 값으로 채우기
        df[column] = df[column].fillna(df[column].shift(offset))
        df[column] = df[column].fillna(df[column].shift(-offset))
        # 결측값이 모두 채워졌으면 반복 종료
        if df[column].isna().sum() == 0:
            break
    # 남은 결측값이 있으면 ffill로 채우기
    if df[column].isna().sum() != 0:
        df[column] = df[column].ffill()
    return df

# 결측값 처리
for df in [X_train, X_test]:
    for column in df.columns:  # 모든 컬럼에 대해 처리
        method = fill_methods.get(column, '24-48hour')  # 명시되지 않은 경우 기본값 '24-48hour'
        
        if method == '24-48hour':  # 24시간 뒤/전 방식으로 채우기
            df = fill_missing_expanding(df, column)
        elif method == 'average':  # 평균으로 채우기
            df[column] = df[column].fillna(df[column].mean())
        elif method == 'ffill':  # 이전 값으로 채우기
            df[column] = df[column].ffill()
        else:  # 특정 값으로 채우기
            df[column] = df[column].fillna(method)


In [25]:
# 모든 열을 float32로 변환
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

In [26]:
from sklearn.preprocessing import MinMaxScaler

# X_train과 X_test를 하나로 합침
X_combined = pd.concat([X_train, X_test], axis=0)

# MinMaxScaler 적용
scaler = MinMaxScaler()
X_combined_scaled = pd.DataFrame(scaler.fit_transform(X_combined), columns=X_combined.columns)

# 다시 분리
X_train = X_combined_scaled.iloc[:len(X_train), :].reset_index(drop=True)
X_test = X_combined_scaled.iloc[len(X_train):, :].reset_index(drop=True)

In [27]:
import torch
import torch.nn as nn
from sklearn.metrics import mean_absolute_percentage_error
import numpy as np
from sklearn.model_selection import TimeSeriesSplit

def set_seed(seed):
    # PyTorch 시드 고정 (CPU)
    torch.manual_seed(seed)
    
    # PyTorch 시드 고정 (GPU)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)  # 모든 GPU에 시드 설정

    # PyTorch 재현성 설정 (optional)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# 시드 설정
set_seed(42)

# 설정 값
N_TEST = 12
N_SPLIT = 27
INPUT_SIZE = 68  # X_train의 feature 수
HIDDEN_SIZE = 128
OUTPUT_SIZE = 1  # y_train의 feature 수
EPOCHS = 300
BATCH_SIZE = 32
LEARNING_RATE = 0.001

# CUDA 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# TimeSeriesSplit 설정
tscv = TimeSeriesSplit(n_splits=N_SPLIT)

# MAPE 점수 리스트
mape_scores = []

# LSTM 모델 정의
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        _, (hidden, _) = self.lstm(x)  # LSTM의 hidden state와 cell state 반환
        out = self.fc(hidden[-1])     # 마지막 hidden state 사용
        return out

# TimeSeriesSplit을 이용한 검증
for fold, (train_idx, val_idx) in enumerate(tscv.split(X_train)):
    # LSTM 모델 초기화 및 CUDA로 이동
    model = LSTMModel(INPUT_SIZE, HIDDEN_SIZE, OUTPUT_SIZE).to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

    # 마지막 N_TEST를 제외한 Split만 사용
    if fold < N_SPLIT - N_TEST:
        continue

    # Train/Validation 데이터 분리
    X_tr, X_val = X_train.iloc[train_idx].values, X_train.iloc[val_idx].values
    y_tr, y_val = y_train.iloc[train_idx].values, y_train.iloc[val_idx].values

    # Tensor 변환 및 차원 조정 (LSTM 입력 형식: [Batch, Sequence, Features])
    X_tr_tensor = torch.tensor(X_tr, dtype=torch.float32).unsqueeze(1).to(device)
    y_tr_tensor = torch.tensor(y_tr, dtype=torch.float32).to(device)
    X_val_tensor = torch.tensor(X_val, dtype=torch.float32).unsqueeze(1).to(device)
    y_val_tensor = torch.tensor(y_val, dtype=torch.float32).to(device)

    # LSTM 학습
    for epoch in range(EPOCHS):
        model.train()
        optimizer.zero_grad()
        y_pred = model(X_tr_tensor)
        loss = criterion(y_pred, y_tr_tensor)
        loss.backward()
        optimizer.step()

    # 검증
    model.eval()
    with torch.no_grad():
        y_val_pred = model(X_val_tensor).cpu().squeeze().numpy()  # CPU로 이동 후 Numpy 변환
        mape = mean_absolute_percentage_error(y_val, y_val_pred)
        mape_scores.append(mape)
        print(f"Fold {fold + 1}, Validation MAPE: {mape:.4f}")

# 테스트 세트에 대한 최종 예측
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32).unsqueeze(1).to(device)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).to(device)

model.eval()
with torch.no_grad():
    y_test_pred = model(X_test_tensor).cpu().squeeze().numpy()  # CPU로 이동 후 Numpy 변환
    test_mape = mean_absolute_percentage_error(y_test, y_test_pred)

# 결과 출력
print("")
print("Results: ")
print(f"Average Validation MAPE: {np.mean(mape_scores):.4f}")
print(f"Validation MAPE Std Dev: {np.std(mape_scores):.4f}")
print(f"Final Test MAPE: {test_mape:.4f}")

# GPU 메모리 비우기
torch.cuda.empty_cache()
print("CUDA memory cleared.")


Using device: cuda
Fold 16, Validation MAPE: 0.1808
Fold 17, Validation MAPE: 0.2050
Fold 18, Validation MAPE: 0.2286
Fold 19, Validation MAPE: 0.2643
Fold 20, Validation MAPE: 0.1961
Fold 21, Validation MAPE: 0.1683
Fold 22, Validation MAPE: 0.1555
Fold 23, Validation MAPE: 0.1569
Fold 24, Validation MAPE: 0.1664
Fold 25, Validation MAPE: 0.1792
Fold 26, Validation MAPE: 0.2024
Fold 27, Validation MAPE: 0.1921

Results: 
Average Validation MAPE: 0.1913
Validation MAPE Std Dev: 0.0302
Final Test MAPE: 0.1806
CUDA memory cleared.


Using device: cuda
Fold 16, Validation MAPE: 0.1544
Fold 17, Validation MAPE: 0.1765
Fold 18, Validation MAPE: 0.1840
Fold 19, Validation MAPE: 0.2096
Fold 20, Validation MAPE: 0.1178
Fold 21, Validation MAPE: 0.0919
Fold 22, Validation MAPE: 0.0835
Fold 23, Validation MAPE: 0.0853
Fold 24, Validation MAPE: 0.0995
Fold 25, Validation MAPE: 0.1039
Fold 26, Validation MAPE: 0.1248
Fold 27, Validation MAPE: 0.0994

Results: 
Average Validation MAPE: 0.1276
Validation MAPE Std Dev: 0.0411
Final Test MAPE: 0.0912
CUDA memory cleared.