In [16]:
import pandas as pd

train = pd.read_csv("/kaggle/input/2036traffic/train.csv")
test = pd.read_csv("/kaggle/input/2036traffic/test.csv")

In [17]:
# X_train과 y_train 설정
X_train = train.drop(columns=['1005004000_velocity'])
y_train = train[['1005004000_velocity']]

# X_test와 y_test 설정
X_test = test.drop(columns=['1005004000_velocity'])
y_test = test[['1005004000_velocity']]

In [18]:
# 타겟 컬럼과 시간 시차 설정
TARGET = '1005004000_velocity'
HORIZON = 1  # 1시간 시차

# 1시간 시차 적용하여 데이터 생성
train['y_shifted'] = train[TARGET].shift(-HORIZON)  # 1시간 이후 값을 타겟으로 설정
test['y_shifted'] = test[TARGET].shift(-HORIZON)

# 비어 있는 데이터 제거
train = train.dropna(subset=['y_shifted'])
test = test.dropna(subset=['y_shifted'])

# X_train, y_train 설정
X_train = train.drop(columns=[TARGET, 'y_shifted'])
y_train = train[['y_shifted']]

# X_test, y_test 설정
X_test = test.drop(columns=[TARGET, 'y_shifted'])
y_test = test[['y_shifted']]

# 결과 출력 (확인용)
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (19284, 68)
y_train shape: (19284, 1)
X_test shape: (719, 68)
y_test shape: (719, 1)


In [19]:
# 넷 다 dataframe
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(19284, 68) (19284, 1) (719, 68) (719, 1)


In [20]:
# 처리 방식 정의
fill_methods = {
    '강수량(mm)': 0,
    '풍속(m/s)': 'average',
    '적설(cm)': 0
}

# 확장 가능한 결측값 처리 함수
def fill_missing_expanding(df, column, max_offset=2400):
    """
    결측값을 24, 48, 72, ..., max_offset 시간 뒤/전의 값으로 채우는 함수
    """
    step = 24  # 24시간 간격
    for offset in range(step, max_offset + step, step):  # 24, 48, 72, ..., max_offset
        # 24시간 뒤와 전 값으로 채우기
        df[column] = df[column].fillna(df[column].shift(offset))
        df[column] = df[column].fillna(df[column].shift(-offset))
        # 결측값이 모두 채워졌으면 반복 종료
        if df[column].isna().sum() == 0:
            break
    # 남은 결측값이 있으면 ffill로 채우기
    if df[column].isna().sum() != 0:
        df[column] = df[column].ffill()
    return df

# 결측값 처리
for df in [X_train, X_test]:
    for column in df.columns:  # 모든 컬럼에 대해 처리
        method = fill_methods.get(column, '24-48hour')  # 명시되지 않은 경우 기본값 '24-48hour'
        
        if method == '24-48hour':  # 24시간 뒤/전 방식으로 채우기
            df = fill_missing_expanding(df, column)
        elif method == 'average':  # 평균으로 채우기
            df[column] = df[column].fillna(df[column].mean())
        elif method == 'ffill':  # 이전 값으로 채우기
            df[column] = df[column].ffill()
        else:  # 특정 값으로 채우기
            df[column] = df[column].fillna(method)


In [21]:
# 모든 열을 float32로 변환
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

In [22]:
X_train.isnull().sum()

1050003300_velocity    0
1050020400_velocity    0
1070000200_velocity    0
1070000500_velocity    0
1070001600_velocity    0
                      ..
is_evening_rush        0
month_sin              0
month_cos              0
hour_sin               0
hour_cos               0
Length: 68, dtype: int64

In [23]:
from sklearn.preprocessing import MinMaxScaler

# X_train과 X_test를 하나로 합침
X_combined = pd.concat([X_train, X_test], axis=0)

# MinMaxScaler 적용
scaler = MinMaxScaler()
X_combined_scaled = pd.DataFrame(scaler.fit_transform(X_combined), columns=X_combined.columns)

# 다시 분리
X_train = X_combined_scaled.iloc[:len(X_train), :].reset_index(drop=True)
X_test = X_combined_scaled.iloc[len(X_train):, :].reset_index(drop=True)

X_train.drop(["F-03 유입_traffic_x", "F-04 유출_traffic_x"], axis=1, inplace=True)
# 컬럼 이름 변경
X_train.rename(columns={"F-03 유입_traffic_y": "F-03 유입_traffic"}, inplace=True)
X_train.rename(columns={"F-04 유출_traffic_y": "F-04 유출_traffic"}, inplace=True)

X_test.drop(["F-03 유입_traffic_x", "F-04 유출_traffic_x"], axis=1, inplace=True)
# 컬럼 이름 변경
X_test.rename(columns={"F-03 유입_traffic_y": "F-03 유입_traffic"}, inplace=True)
X_test.rename(columns={"F-04 유출_traffic_y": "F-04 유출_traffic"}, inplace=True)

X_train.columns.to_list()

['1050003300_velocity',
 '1050020400_velocity',
 '1070000200_velocity',
 '1070000500_velocity',
 '1070001600_velocity',
 '기온(°C)',
 '강수량(mm)',
 '풍속(m/s)',
 '습도(%)',
 '적설(cm)',
 'A-12 유입_traffic',
 'A-19 유입_traffic',
 'A-20 유입_traffic',
 'A-22 유입_traffic',
 'A-22 유출_traffic',
 'B-01 유입_traffic',
 'B-06 유입_traffic',
 'B-14 유입_traffic',
 'B-14 유출_traffic',
 'B-22 유입_traffic',
 'B-22 유출_traffic',
 'B-36 유출_traffic',
 'C-02 유입_traffic',
 'C-06 유입_traffic',
 'C-07 유입_traffic',
 'C-09 유출_traffic',
 'C-17 유입_traffic',
 'C-17 유출_traffic',
 'C-20 유입_traffic',
 'C-21 유출_traffic',
 'D-04 유입_traffic',
 'D-12 유출_traffic',
 'D-16 유출_traffic',
 'D-17 유입_traffic',
 'D-17 유출_traffic',
 'D-21 유출_traffic',
 'D-28 유입_traffic',
 'D-28 유출_traffic',
 'D-31 유입_traffic',
 'D-35 유입_traffic',
 'D-35 유출_traffic',
 'D-44 유출_traffic',
 'F-02 유출_traffic',
 'F-03 유입_traffic',
 'F-03 유출_traffic',
 'F-04 유출_traffic',
 'F-05 유입_traffic',
 'F-05 유출_traffic',
 'F-06 유입_traffic',
 'F-07 유입_traffic',
 'F-07 유출_traffic',
 'F-

In [24]:
# !pip install pytorch_forecasting
# !pip install pytorch_lightning

In [25]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import TimeSeriesSplit
from pytorch_forecasting import TemporalFusionTransformer, TimeSeriesDataSet
# Define the temporal and static features

Train = pd.concat([X_train, y_train], axis=1)
Target = "y_shifted"
Train['time_idx'] = range(len(Train))
Train['group_id'] = 0  # Single group for all Train
Train

Test = pd.concat([X_test, y_test], axis=1)
Target = "y_shifted"
Test['time_idx'] = range(len(Test))
Test['group_id'] = 0  # Single group for all Train

In [26]:
Train.isnull().sum()

1050003300_velocity    0
1050020400_velocity    0
1070000200_velocity    0
1070000500_velocity    0
1070001600_velocity    0
                      ..
hour_sin               0
hour_cos               0
y_shifted              0
time_idx               0
group_id               0
Length: 69, dtype: int64

In [27]:
print(Train.columns.tolist())

['1050003300_velocity', '1050020400_velocity', '1070000200_velocity', '1070000500_velocity', '1070001600_velocity', '기온(°C)', '강수량(mm)', '풍속(m/s)', '습도(%)', '적설(cm)', 'A-12 유입_traffic', 'A-19 유입_traffic', 'A-20 유입_traffic', 'A-22 유입_traffic', 'A-22 유출_traffic', 'B-01 유입_traffic', 'B-06 유입_traffic', 'B-14 유입_traffic', 'B-14 유출_traffic', 'B-22 유입_traffic', 'B-22 유출_traffic', 'B-36 유출_traffic', 'C-02 유입_traffic', 'C-06 유입_traffic', 'C-07 유입_traffic', 'C-09 유출_traffic', 'C-17 유입_traffic', 'C-17 유출_traffic', 'C-20 유입_traffic', 'C-21 유출_traffic', 'D-04 유입_traffic', 'D-12 유출_traffic', 'D-16 유출_traffic', 'D-17 유입_traffic', 'D-17 유출_traffic', 'D-21 유출_traffic', 'D-28 유입_traffic', 'D-28 유출_traffic', 'D-31 유입_traffic', 'D-35 유입_traffic', 'D-35 유출_traffic', 'D-44 유출_traffic', 'F-02 유출_traffic', 'F-03 유입_traffic', 'F-03 유출_traffic', 'F-04 유출_traffic', 'F-05 유입_traffic', 'F-05 유출_traffic', 'F-06 유입_traffic', 'F-07 유입_traffic', 'F-07 유출_traffic', 'F-08 유입_traffic', 'F-08 유출_traffic', 'F-09 유입_traffic

In [28]:
# Parameters
N_TEST = 12
N_SPLIT = 27
max_encoder_length = 24
max_prediction_length = 1
BATCH_SIZE = 64
LEARNING_RATE = 0.001
Train.astype('float32')
# Define features
TEMPORAL_FEATURES = ['1050003300_velocity', '1050020400_velocity', '1070000200_velocity',
                     '1070000500_velocity', '1070001600_velocity', 
                     'A-12 유입_traffic', 'A-19 유입_traffic', 'A-20 유입_traffic', 'A-22 유입_traffic', 'A-22 유출_traffic', 'B-01 유입_traffic', 'B-06 유입_traffic', 'B-14 유입_traffic', 'B-14 유출_traffic', 'B-22 유입_traffic', 'B-22 유출_traffic', 'B-36 유출_traffic', 'C-02 유입_traffic', 'C-06 유입_traffic', 'C-07 유입_traffic', 'C-09 유출_traffic', 'C-17 유입_traffic', 'C-17 유출_traffic', 'C-20 유입_traffic', 'C-21 유출_traffic', 'D-04 유입_traffic', 'D-12 유출_traffic', 'D-16 유출_traffic', 'D-17 유입_traffic', 'D-17 유출_traffic', 'D-21 유출_traffic', 'D-28 유입_traffic', 'D-28 유출_traffic', 'D-31 유입_traffic', 'D-35 유입_traffic', 'D-35 유출_traffic', 'D-44 유출_traffic', 'F-02 유출_traffic', 'F-03 유입_traffic', 'F-03 유출_traffic', 'F-04 유출_traffic', 'F-05 유입_traffic', 'F-05 유출_traffic', 'F-06 유입_traffic', 'F-07 유입_traffic', 'F-07 유출_traffic', 'F-08 유입_traffic', 'F-08 유출_traffic', 'F-09 유입_traffic', 'F-09 유출_traffic', 'F-10 유입_traffic', 'F-10 유출_traffic',
                     '기온(°C)', '강수량(mm)', '풍속(m/s)', '습도(%)', '적설(cm)']
STATIC_FEATURES = ['Year', 'is_weekend', 'is_holiday', 'is_morning_rush', 'is_evening_rush']

# Fix the group_id and Year issues
def preprocess_data(data):
    data['group_id'] = 0
    return data

# Split and seed settings
ts_split = TimeSeriesSplit(n_splits=N_SPLIT)
mape_scores = []

# Set random seed for reproducibility
def set_seed(seed):
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Temporal Fusion Transformer setup
def setup_tft(Train, train_idx, val_idx):
    Train = preprocess_data(Train)  # Preprocess the dataset
    train_data = Train.iloc[train_idx]
    val_data = Train.iloc[val_idx]
    assert not train_data.isnull().any().any(), "Missing values in train_data"
    assert not val_data.isnull().any().any(), "Missing values in val_data"

    print(train_data.isnull().sum().sum())
    train_dataset = TimeSeriesDataSet(
        train_data,
        time_idx="time_idx",
        target="y_shifted",
        group_ids=["group_id"],
        min_encoder_length=max_encoder_length // 2,
        max_encoder_length=max_encoder_length,
        min_prediction_length=1,
        max_prediction_length=max_prediction_length,
        static_reals=STATIC_FEATURES,
        time_varying_known_reals=TEMPORAL_FEATURES+["time_idx"],
        time_varying_unknown_reals=["y_shifted"],
        target_normalizer=None,
        add_relative_time_idx=True,
        add_target_scales=True,
        add_encoder_length=True,
    )

    val_dataset = TimeSeriesDataSet.from_dataset(
        train_dataset, val_data
    )
    
    # sample = train_dataset[0]
    # print(sample)
    #for key, value in sample.items():
    #    print(f"{key}: {value.shape if hasattr(value, 'shape') else value}")

    train_dataloader = train_dataset.to_dataloader(train=True, batch_size=128, num_workers=0)
    val_dataloader = val_dataset.to_dataloader(train=False, batch_size=128 * 10, num_workers=0)
    
    from pytorch_forecasting.metrics import MAE
    
    model = TemporalFusionTransformer.from_dataset(
        train_dataset,
        learning_rate=LEARNING_RATE,
        hidden_size=16,
        attention_head_size=4,
        dropout=0.1,
        hidden_continuous_size=8,
        output_size=1,
        loss=MAE(),
    )

    return model, train_dataloader, val_dataloader

# # Loop through splits
# for fold, (train_idx, val_idx) in enumerate(ts_split.split(Train)):
#     if fold < N_SPLIT - N_TEST:
#         continue
    
#     print(f"TFT Fold {fold + 1} start!")
#     import lightning.pytorch as lp
#     # Train TFT
#     tft_model, train_loader, val_loader = setup_tft(Train, train_idx, val_idx)
    
#     trainer = lp.Trainer(max_epochs=30, accelerator="cpu")
#     trainer.fit(tft_model, train_loader, val_loader)

#     # Evaluate TFT
#     tft_model.eval()

#     actuals = torch.cat([y for x, y in iter(val_loader)]).cpu().numpy()
#     predictions = torch.cat([tft_model.predict(x) for x, y in iter(val_loader)]).cpu().numpy()
#     mape = mean_absolute_percentage_error(actuals, predictions)
#     mape_scores.append(mape)
#     print(f"TFT Fold {fold + 1}, Validation MAPE: {mape:.4f}")

In [29]:
train_dataset = TimeSeriesDataSet(
    Train,
    time_idx="time_idx",
    target="y_shifted",
    group_ids=["group_id"],
    min_encoder_length=max_encoder_length // 2,
    max_encoder_length=max_encoder_length,
    min_prediction_length=1,
    max_prediction_length=max_prediction_length,
    static_reals=STATIC_FEATURES,
    time_varying_known_reals=TEMPORAL_FEATURES + ["time_idx"],
    time_varying_unknown_reals=["y_shifted"],
    target_normalizer=None,
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
)

test_dataset = TimeSeriesDataSet.from_dataset(train_dataset, Test)

# Dataloader setup
train_dataloader = train_dataset.to_dataloader(train=True, batch_size=BATCH_SIZE, num_workers=0)
test_dataloader = test_dataset.to_dataloader(train=False, batch_size=BATCH_SIZE * 10, num_workers=0)

# Model setup
from pytorch_forecasting.metrics import MAE
from pytorch_forecasting.models import TemporalFusionTransformer

tft_model = TemporalFusionTransformer.from_dataset(
    train_dataset,
    learning_rate=LEARNING_RATE,
    hidden_size=16,
    attention_head_size=4,
    dropout=0.1,
    hidden_continuous_size=8,
    output_size=1,
    loss=MAE(),
)

# Trainer setup
import lightning.pytorch as lp

trainer = lp.Trainer(max_epochs=30, accelerator="cpu")
trainer.fit(tft_model, train_dataloader)

/opt/conda/lib/python3.10/site-packages/lightning/pytorch/utilities/parsing.py:208: Attribute 'loss' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['loss'])`.
/opt/conda/lib/python3.10/site-packages/lightning/pytorch/utilities/parsing.py:208: Attribute 'logging_metrics' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['logging_metrics'])`.
  super().__init__(loss=loss, logging_metrics=logging_metrics, **kwargs)
INFO: GPU available: False, used: False
INFO: TPU available: False, using: 0 TPU cores
INFO: HPU available: False, using: 0 HPUs
/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
INFO: 
   | Name                               | Type                           

Training: |          | 0/? [00:00<?, ?it/s]

/opt/conda/lib/python3.10/site-packages/lightning/pytorch/loops/training_epoch_loop.py:389: ReduceLROnPlateau conditioned on metric val_loss which is not available but strict is set to `False`. Skipping learning rate update.
INFO: `Trainer.fit` stopped: `max_epochs=30` reached.


TypeError: expected Tensor as element 0 in argument 0, but got tuple

In [32]:
# Evaluate on test set
tft_model.eval()

# 실제값 수집
actuals = torch.cat([y[0] for x, y in iter(test_dataloader)]).cpu().numpy()

# 예측값 수집
predictions = tft_model.predict(test_dataloader, mode="prediction")

# 예측값 디버깅
print(f"Predictions type: {type(predictions)}")
print(f"Predictions shape: {predictions.shape if isinstance(predictions, torch.Tensor) else [p.shape for p in predictions]}")

# 2차원 텐서로 변환
if isinstance(predictions, list):
    predictions = torch.cat(predictions).cpu().numpy()
else:
    predictions = predictions.cpu().numpy()
# Calculate MAPE
from sklearn.metrics import mean_absolute_percentage_error
test_mape = mean_absolute_percentage_error(actuals, predictions)
print(f"Test MAPE: {test_mape:.4f}")


INFO: GPU available: False, used: False
INFO: TPU available: False, using: 0 TPU cores
INFO: HPU available: False, using: 0 HPUs
/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Predictions type: <class 'torch.Tensor'>
Predictions shape: torch.Size([719, 1])
Test MAPE: 0.1023
