In [1]:
file_path = r"C:\Users\james\Documents\GitHub\Traffic_Volume_Prediction"

In [2]:
import pandas as pd

train = pd.read_csv(file_path + r"\Data\train.csv")
test = pd.read_csv(file_path + r"\Data\test.csv")

In [3]:
# X_train과 y_train 설정
X_train = train.drop(columns=['1005004000_velocity'])
y_train = train[['1005004000_velocity']]

# X_test와 y_test 설정
X_test = test.drop(columns=['1005004000_velocity'])
y_test = test[['1005004000_velocity']]

In [4]:
# 타겟 컬럼과 시간 시차 설정
TARGET = '1005004000_velocity'
HORIZON = 1  # 1시간 시차

# 1시간 시차 적용하여 데이터 생성
train['y_shifted'] = train[TARGET].shift(-HORIZON)  # 1시간 이후 값을 타겟으로 설정
test['y_shifted'] = test[TARGET].shift(-HORIZON)

# 비어 있는 데이터 제거
train = train.dropna(subset=['y_shifted'])
test = test.dropna(subset=['y_shifted'])

# X_train, y_train 설정
X_train = train.drop(columns=[TARGET, 'y_shifted'])
y_train = train[['y_shifted']]

# X_test, y_test 설정
X_test = test.drop(columns=[TARGET, 'y_shifted'])
y_test = test[['y_shifted']]

# 결과 출력 (확인용)
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (19284, 68)
y_train shape: (19284, 1)
X_test shape: (719, 68)
y_test shape: (719, 1)


In [5]:
# 넷 다 dataframe
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(19284, 68) (19284, 1) (719, 68) (719, 1)


In [6]:
# 처리 방식 정의
fill_methods = {
    '강수량(mm)': 0,
    '풍속(m/s)': 'average',
    '적설(cm)': 0
}

# 확장 가능한 결측값 처리 함수
def fill_missing_expanding(df, column, max_offset=2400):
    """
    결측값을 24, 48, 72, ..., max_offset 시간 뒤/전의 값으로 채우는 함수
    """
    step = 24  # 24시간 간격
    for offset in range(step, max_offset + step, step):  # 24, 48, 72, ..., max_offset
        # 24시간 뒤와 전 값으로 채우기
        df[column] = df[column].fillna(df[column].shift(offset))
        df[column] = df[column].fillna(df[column].shift(-offset))
        # 결측값이 모두 채워졌으면 반복 종료
        if df[column].isna().sum() == 0:
            break
    # 남은 결측값이 있으면 ffill로 채우기
    if df[column].isna().sum() != 0:
        df[column] = df[column].ffill()
    return df

# 결측값 처리
for df in [X_train, X_test]:
    for column in df.columns:  # 모든 컬럼에 대해 처리
        method = fill_methods.get(column, '24-48hour')  # 명시되지 않은 경우 기본값 '24-48hour'
        
        if method == '24-48hour':  # 24시간 뒤/전 방식으로 채우기
            df = fill_missing_expanding(df, column)
        elif method == 'average':  # 평균으로 채우기
            df[column] = df[column].fillna(df[column].mean())
        elif method == 'ffill':  # 이전 값으로 채우기
            df[column] = df[column].ffill()
        else:  # 특정 값으로 채우기
            df[column] = df[column].fillna(method)


In [7]:
# 모든 열을 float32로 변환
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

In [8]:
from sklearn.preprocessing import MinMaxScaler

# X_train과 X_test를 하나로 합침
X_combined = pd.concat([X_train, X_test], axis=0)

# MinMaxScaler 적용
scaler = MinMaxScaler()
X_combined_scaled = pd.DataFrame(scaler.fit_transform(X_combined), columns=X_combined.columns)

# 다시 분리
X_train = X_combined_scaled.iloc[:len(X_train), :].reset_index(drop=True)
X_test = X_combined_scaled.iloc[len(X_train):, :].reset_index(drop=True)

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import mean_absolute_percentage_error
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from tqdm import tqdm

def set_seed(seed):
    # PyTorch 시드 고정 (CPU)
    torch.manual_seed(seed)
    
    # PyTorch 시드 고정 (GPU)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)  # 모든 GPU에 시드 설정

    # PyTorch 재현성 설정 (optional)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# 시드 설정
set_seed(42)

# 설정 값
N_TEST = 12
N_SPLIT = 27
INPUT_SIZE = 21  # X_train의 feature 수
HIDDEN_SIZE = 128
OUTPUT_SIZE = 1  # y_train의 feature 수
EPOCHS = 100
LEARNING_RATE = 0.001

# CUDA 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# TimeSeriesSplit 설정
tscv = TimeSeriesSplit(n_splits=N_SPLIT)

# MAPE 점수 리스트
mape_scores = []

class CNNLSTM(nn.Module):
    def __init__(self, input_size, output_size, hidden_size, num_layers):
        super(CNNLSTM, self).__init__()
        self.conv1 = nn.Conv1d(1, 64, kernel_size=3, stride=1)  # [batch_size, 1, 21] -> [batch_size, 64, 19]
        self.conv2 = nn.Conv1d(64, 32, kernel_size=3, stride=1, padding=1)  # [batch_size, 64, 19] -> [batch_size, 32, 19]
        self.batch1 = nn.BatchNorm1d(32)
        self.conv3 = nn.Conv1d(32, 16, kernel_size=3, stride=1, padding=1)  # [batch_size, 32, 19] -> [batch_size, 16, 19]
        self.batch2 = nn.BatchNorm1d(16)
        self.LSTM = nn.LSTM(input_size=16, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        self.fc1 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = F.relu(self.conv1(x))  # [batch_size, 1, 21] -> [batch_size, 64, 19]
        x = F.relu(self.conv2(x))  # [batch_size, 64, 19] -> [batch_size, 32, 19]
        x = self.batch1(x)
        x = F.relu(self.conv3(x))  # [batch_size, 32, 19] -> [batch_size, 16, 19]
        x = self.batch2(x)
        x = x.permute(0, 2, 1)     # [batch_size, 16, 19] -> [batch_size, 19, 16]
        x, _ = self.LSTM(x)        # [batch_size, 19, 16] -> [batch_size, 19, hidden_size]
        x = x[:, -1, :]            # [batch_size, 19, hidden_size] -> [batch_size, hidden_size]
        x = self.fc1(x)            # [batch_size, hidden_size] -> [batch_size, output_size]
        return x


# 학습 및 검증 루프
for fold, (train_idx, val_idx) in enumerate(tscv.split(X_train)):
    if fold < N_SPLIT - N_TEST:
        continue

    model = CNNLSTM(INPUT_SIZE, OUTPUT_SIZE, HIDDEN_SIZE, num_layers=1).to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

    # 데이터 분리
    X_tr, X_val = X_train.iloc[train_idx].values, X_train.iloc[val_idx].values
    y_tr, y_val = y_train.iloc[train_idx].values, y_train.iloc[val_idx].values

    # 배치 데이터 준비
    batch_size = 32
    train_dataset = torch.utils.data.TensorDataset(
        torch.tensor(X_tr, dtype=torch.float32).unsqueeze(1),
        torch.tensor(y_tr, dtype=torch.float32).unsqueeze(1)
    )
    val_dataset = torch.utils.data.TensorDataset(
        torch.tensor(X_val, dtype=torch.float32).unsqueeze(1),
        torch.tensor(y_val, dtype=torch.float32).unsqueeze(1)
    )
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # 학습
    for epoch in tqdm(range(EPOCHS), desc=f"Fold {fold + 1} Training"):
        model.train()
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            y_pred = model(X_batch)
            # print(y_pred.shape, y_batch.shape)
            loss = criterion(y_pred, y_batch.squeeze(1))  # 크기 맞춤
            loss.backward()
            optimizer.step()

    # 검증
    model.eval()
    val_preds, val_targets = [], []
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            y_pred = model(X_batch)
            val_preds.append(y_pred.cpu().numpy())
            val_targets.append(y_batch.cpu().numpy())

    val_preds = np.concatenate(val_preds).squeeze()
    val_targets = np.concatenate(val_targets).squeeze()
    mape = mean_absolute_percentage_error(val_targets, val_preds)
    mape_scores.append(mape)
    print(f"Fold {fold + 1}, Validation MAPE: {mape:.4f}")

Using device: cuda


Fold 16 Training: 100%|██████████| 100/100 [03:24<00:00,  2.04s/it]


Fold 16, Validation MAPE: 0.0787


Fold 17 Training: 100%|██████████| 100/100 [03:36<00:00,  2.17s/it]


Fold 17, Validation MAPE: 0.0854


Fold 18 Training: 100%|██████████| 100/100 [03:49<00:00,  2.29s/it]


Fold 18, Validation MAPE: 0.1337


Fold 19 Training: 100%|██████████| 100/100 [04:01<00:00,  2.41s/it]


Fold 19, Validation MAPE: 0.1482


Fold 20 Training: 100%|██████████| 100/100 [02:14<00:00,  1.35s/it]


Fold 20, Validation MAPE: 0.0879


Fold 21 Training: 100%|██████████| 100/100 [02:20<00:00,  1.41s/it]


Fold 21, Validation MAPE: 0.0685


Fold 22 Training: 100%|██████████| 100/100 [02:27<00:00,  1.47s/it]


Fold 22, Validation MAPE: 0.0667


Fold 23 Training: 100%|██████████| 100/100 [02:35<00:00,  1.55s/it]


Fold 23, Validation MAPE: 0.0698


Fold 24 Training: 100%|██████████| 100/100 [02:40<00:00,  1.61s/it]


Fold 24, Validation MAPE: 0.0766


Fold 25 Training: 100%|██████████| 100/100 [02:48<00:00,  1.69s/it]


Fold 25, Validation MAPE: 0.0741


Fold 26 Training: 100%|██████████| 100/100 [03:00<00:00,  1.81s/it]


Fold 26, Validation MAPE: 0.1046


Fold 27 Training: 100%|██████████| 100/100 [03:09<00:00,  1.90s/it]

Fold 27, Validation MAPE: 0.0902





In [10]:
# 모델 초기화
model = CNNLSTM(INPUT_SIZE, OUTPUT_SIZE, HIDDEN_SIZE, num_layers=1).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# 데이터 분리
X_tr = X_train
y_tr = y_train

# 배치 데이터 준비
batch_size = 32
train_dataset = torch.utils.data.TensorDataset(
    torch.tensor(X_tr.values, dtype=torch.float32).unsqueeze(1),
    torch.tensor(y_tr.values, dtype=torch.float32).unsqueeze(1)
)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# 학습
for epoch in tqdm(range(EPOCHS), desc=f"Fold {fold + 1} Training"):
    model.train()
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        y_pred = model(X_batch)
        # print(y_pred.shape, y_batch.shape)
        loss = criterion(y_pred, y_batch.squeeze(1))  # 크기 맞춤
        loss.backward()
        optimizer.step()

# 테스트 세트에 대한 최종 예측
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32).unsqueeze(1).to(device)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).to(device)

model.eval()
with torch.no_grad():
    y_test_pred = model(X_test_tensor).cpu().squeeze().numpy()  # CPU로 이동 후 Numpy 변환
    test_mape = mean_absolute_percentage_error(y_test, y_test_pred)

# 결과 출력
print("Results: ")
print(f"Average Validation MAPE: {np.mean(mape_scores):.4f}")
print(f"Validation MAPE Std Dev: {np.std(mape_scores):.4f}")
print(f"Final Test MAPE: {test_mape:.4f}")

# GPU 메모리 비우기
torch.cuda.empty_cache()
print("CUDA memory cleared.")

Fold 27 Training: 100%|██████████| 100/100 [03:14<00:00,  1.94s/it]

Results: 
Average Validation MAPE: 0.0904
Validation MAPE Std Dev: 0.0250
Final Test MAPE: 0.0801
CUDA memory cleared.





Using device: cuda
Fold 16, Validation MAPE: 0.1759
Fold 17, Validation MAPE: 0.1968
Fold 18, Validation MAPE: 0.1657
Fold 19, Validation MAPE: 0.1805
Fold 20, Validation MAPE: 0.1039
Fold 21, Validation MAPE: 0.0849
Fold 22, Validation MAPE: 0.0733
Fold 23, Validation MAPE: 0.0804
Fold 24, Validation MAPE: 0.0830
Fold 25, Validation MAPE: 0.0852
Fold 26, Validation MAPE: 0.1051
Fold 27, Validation MAPE: 0.0894
Results: 
Average Validation MAPE: 0.1187
Validation MAPE Std Dev: 0.0445
Final Test MAPE: 0.0855
CUDA memory cleared.