In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import os

## データの前処理

In [2]:
# Locations of the data workbook and the index workbook
data_folder = r"C:\Users\ryoya\MasterThesis\MT_Furuie\data\Miwa_FFNN_Data\Trial_1"
data_file_name = r"Miwa_data_for_FFNN.xlsx"
idx_file_name = r"Miwa_flood_idx_for_FFNN.xlsx"

data_path, idx_path = (
    os.path.join(data_folder, file_name)
    for file_name in (data_file_name, idx_file_name)
)

# Zero-based column positions of the input and output variables.
# The time lag for each column is given here too, paired element-wise.
input_cols, input_lags = [2, 2, 1, 1], [0, 1, 1, 2]
output_cols, output_lags = [1], [0]

# [CHANGE AS NEEDED] Position of the index-workbook column whose values
# ('train' / 'test') assign each row to a split.
col_trial = 10

# Load both workbooks; the first row of each sheet is used as the header.
d_all = pd.read_excel(data_path, header=0)
idx_list = pd.read_excel(idx_path, header=0)

# Partition the index rows by their label in the selected column.
split_label = idx_list.iloc[:, col_trial]
train_idx = idx_list[split_label == 'train']
test_idx = idx_list[split_label == 'test']


In [None]:
# Extract the required data (train)
def _extract_lagged_segments(data, segments, cols, lags):
    """Build a matrix of lag-shifted columns for every segment in ``segments``.

    Parameters
    ----------
    data : pandas.DataFrame
        Source table; rows and columns are addressed by position (``iloc``).
    segments : pandas.DataFrame
        One row per segment. Column 0 holds the 1-based first row and
        column 1 the 1-based last row (inclusive) of the segment in ``data``.
    cols : sequence of int
        Zero-based column positions to extract from ``data``.
    lags : sequence of int
        Row shift for each entry of ``cols``; a lag of ``k`` reads the rows
        ``k`` positions before the segment rows.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(total_rows, len(cols))`` with all segments stacked
        vertically in the order they appear in ``segments``.

    Raises
    ------
    ValueError
        If ``cols`` and ``lags`` differ in length, if ``segments`` is empty,
        or if a lag-shifted range falls outside the rows of ``data``.
    """
    if len(cols) != len(lags):
        raise ValueError(
            f"cols and lags must have the same length "
            f"(got {len(cols)} and {len(lags)})"
        )
    if segments.shape[0] == 0:
        raise ValueError("no segments to extract")

    n_rows = data.shape[0]
    blocks = []
    for first, last in zip(segments.iloc[:, 0], segments.iloc[:, 1]):
        # Segment bounds are stored 1-based; convert to 0-based positions.
        seg_start = int(first) - 1
        seg_end = int(last) - 1

        columns = []
        for col, lag in zip(cols, lags):
            start = seg_start - lag
            stop = seg_end - lag + 1  # exclusive
            # iloc would silently wrap a negative start to the end of the
            # table and silently truncate a stop beyond the last row, so
            # reject shifted ranges that leave the table.
            if start < 0 or stop > n_rows:
                raise ValueError(
                    f"segment rows {first}-{last} shifted by lag {lag} "
                    f"fall outside the data (0..{n_rows - 1})"
                )
            columns.append(data.iloc[start:stop, col].to_numpy())

        # One column per (col, lag) pair -> (segment_length, len(cols))
        blocks.append(np.column_stack(columns))

    # Concatenate all segments along the row axis.
    return np.vstack(blocks)


# Inputs: each input column shifted by its own lag
x_train = _extract_lagged_segments(d_all, train_idx, input_cols, input_lags)

# Outputs: each output column shifted by its own lag
y_train = _extract_lagged_segments(d_all, train_idx, output_cols, output_lags)

In [7]:
# Standardization helper
def standardize_by_column(X):
    """Standardize ``X`` along axis 0 and return the statistics that were used.

    Parameters
    ----------
    X : array-like
        Samples along axis 0. Both 2-D ``(n_samples, n_features)`` and
        1-D ``(n_samples,)`` inputs are accepted.

    Returns
    -------
    X_std
        ``(X - mean) / std``.
    mean
        Mean of ``X`` over axis 0.
    std
        Population standard deviation (``ddof=0``) of ``X`` over axis 0,
        with every zero replaced by 1.0.
    """
    mean = np.mean(X, axis=0)
    std = np.std(X, axis=0, ddof=0)

    # A constant column has std == 0; substituting 1.0 avoids division by
    # zero and maps that column to all zeros. np.where is used instead of
    # in-place masked assignment so the scalar std of 1-D input also works.
    std = np.where(std == 0, 1.0, std)

    X_std = (X - mean) / std
    return X_std, mean, std


In [8]:
# Standardize the training inputs, keeping the statistics in a table
# with one row per input column.
x_train_std, x_mean, x_std = standardize_by_column(x_train)
x_train_params = pd.DataFrame({'mean': x_mean, 'std': x_std})

# Same for the training outputs.
y_train_std, y_mean, y_std = standardize_by_column(y_train)
y_train_params = pd.DataFrame({'mean': y_mean, 'std': y_std})

