In [186]:
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split, KFold
import matplotlib.pyplot as plt
from typing import Tuple, Optional
from datetime import datetime

In [187]:
import warnings

warnings.filterwarnings("ignore", category=UserWarning, module="torch.nn.modules.loss")
warnings.filterwarnings("ignore", category=FutureWarning)

In [188]:
import sys
import pandas as pd

In [189]:
from pathlib import Path

# Put the repository root on sys.path so the `src.*` imports below resolve.
# The root is taken to be two directory levels above the current working directory.
project_root = Path.cwd().parent.parent
_root_entry = str(project_root)
if _root_entry not in sys.path:
    # Prepend so this directory is searched before the other entries.
    sys.path.insert(0, _root_entry)

    
from src.utils.paths import DATASET_PATH
from src.data.processing import preprocess_data, add_daily_features
from src.features.fourier_transform import fft_features
from src.dataset.windowing import build_train_df

In [190]:
# 하이퍼파라미터


In [191]:
# Load the raw charging table from the CSV at DATASET_PATH (imported from src.utils.paths).
data = pd.read_csv(DATASET_PATH)

In [192]:
# Preview the raw frame: one row per (일자, 충전방식) pair with hourly columns '0시'..'23시'.
data

Unnamed: 0,일자,충전방식,0시,1시,2시,3시,4시,5시,6시,7시,...,14시,15시,16시,17시,18시,19시,20시,21시,22시,23시
0,2020-01-01,급속,2800,3200,2320,1600,1520,2240,3120,3800,...,12040,11080,12320,11800,9800,8160,6880,6080,4360,3800
1,2020-01-01,완속,700,805,791,259,84,91,168,189,...,595,784,952,1085,1372,903,1218,1197,1148,1078
2,2020-01-02,급속,1400,1240,1000,680,880,1280,2080,4000,...,10720,11960,11280,11080,9840,10040,7800,5600,4800,3840
3,2020-01-02,완속,413,413,133,133,28,70,105,147,...,483,945,1064,1099,1554,1449,1253,1358,1386,1680
4,2020-01-03,급속,2240,1480,1240,640,920,1320,2320,4240,...,11120,11600,11720,11080,9560,9160,7640,6080,5520,4880
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3465,2024-09-28,완속,735,714,322,245,147,126,245,434,...,1232,1113,1484,1505,1946,1911,1792,1736,1694,1645
3466,2024-09-29,급속,2440,2240,1560,1520,2120,3320,5400,6920,...,16440,14640,15320,15520,13840,11160,10280,6400,5720,3520
3467,2024-09-29,완속,749,434,315,259,147,182,196,350,...,1253,1337,1498,1715,2107,1904,1820,1953,1351,924
3468,2024-09-30,급속,1960,1800,1200,1240,2640,4080,7280,9840,...,17960,20080,20480,20160,17280,14400,12240,8800,6840,4280


## 데이터 변환: 일자별로 F1~F48 컬럼 생성
- F1: 0시 급속, F2: 0시 완속
- F3: 1시 급속, F4: 1시 완속
- ...
- F47: 23시 급속, F48: 23시 완속

In [193]:
# 원본 데이터 복사
df = data.copy()

# 일자를 datetime으로 변환
df['일자'] = pd.to_datetime(df['일자'])

# 급속과 완속 데이터 분리
fast_df = df[df['충전방식'] == '급속'].copy()
slow_df = df[df['충전방식'] == '완속'].copy()

# 시간별 컬럼 리스트
hourly_cols = [f'{h}시' for h in range(24)]

# 일자 기준으로 정렬 및 인덱스 설정
fast_df = fast_df.sort_values('일자').set_index('일자')[hourly_cols]
slow_df = slow_df.sort_values('일자').set_index('일자')[hourly_cols]

# 새로운 데이터프레임 생성
new_data = []

# 모든 날짜 가져오기 (급속과 완속 중 하나라도 있는 날짜)
all_dates = sorted(set(fast_df.index) | set(slow_df.index))

for date in all_dates:
    row = {'일자': date}
    
    # 각 시간대별로 급속(F1, F3, F5, ...), 완속(F2, F4, F6, ...) 번갈아 배치
    for hour in range(24):
        hour_col = f'{hour}시'
        
        # F1, F3, F5, ... (홀수): 급속
        f_idx_fast = hour * 2 + 1
        if date in fast_df.index:
            row[f'F{f_idx_fast}'] = fast_df.loc[date, hour_col]
        else:
            row[f'F{f_idx_fast}'] = 0
        
        # F2, F4, F6, ... (짝수): 완속
        f_idx_slow = hour * 2 + 2
        if date in slow_df.index:
            row[f'F{f_idx_slow}'] = slow_df.loc[date, hour_col]
        else:
            row[f'F{f_idx_slow}'] = 0
    
    new_data.append(row)

# 새 데이터프레임 생성
df = pd.DataFrame(new_data)

# 일자 기준 정렬
df = df.sort_values('일자').reset_index(drop=True)

print(f"변환 완료: {len(df)} rows × {len(df.columns)} columns")
print(f"날짜 범위: {df['일자'].min()} ~ {df['일자'].max()}")
print(f"\n컬럼: {list(df.columns[:5])} ... {list(df.columns[-3:])}")
df

변환 완료: 1735 rows × 49 columns
날짜 범위: 2020-01-01 00:00:00 ~ 2024-09-30 00:00:00

컬럼: ['일자', 'F1', 'F2', 'F3', 'F4'] ... ['F46', 'F47', 'F48']


Unnamed: 0,일자,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F39,F40,F41,F42,F43,F44,F45,F46,F47,F48
0,2020-01-01,2800,700,3200,805,2320,791,1600,259,1520,...,8160,903,6880,1218,6080,1197,4360,1148,3800,1078
1,2020-01-02,1400,413,1240,413,1000,133,680,133,880,...,10040,1449,7800,1253,5600,1358,4800,1386,3840,1680
2,2020-01-03,2240,539,1480,525,1240,210,640,196,920,...,9160,1365,7640,1309,6080,1302,5520,1470,4880,2030
3,2020-01-04,2360,686,2040,364,1440,217,1160,217,1200,...,8080,980,6800,917,4880,1036,3880,987,4160,1477
4,2020-01-05,1640,469,1640,294,1280,294,920,175,1200,...,7400,1162,6640,1274,4960,1246,3680,952,2960,1057
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1730,2024-09-26,2760,658,2240,378,1640,224,1200,182,2600,...,14480,2436,12240,1673,8480,1925,7600,1806,4560,1785
1731,2024-09-27,3080,602,1800,588,1680,245,1760,161,2960,...,15080,2359,12560,1960,9360,1890,6640,1743,4880,1344
1732,2024-09-28,3120,735,2080,714,2320,322,2120,245,3240,...,12760,1911,9760,1792,8240,1736,6440,1694,3520,1645
1733,2024-09-29,2440,749,2240,434,1560,315,1520,259,2120,...,11160,1904,10280,1820,6400,1953,5720,1351,3520,924


In [194]:
# Re-display the wide per-date frame (일자 + F1..F48) built in the previous cell.
df

Unnamed: 0,일자,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F39,F40,F41,F42,F43,F44,F45,F46,F47,F48
0,2020-01-01,2800,700,3200,805,2320,791,1600,259,1520,...,8160,903,6880,1218,6080,1197,4360,1148,3800,1078
1,2020-01-02,1400,413,1240,413,1000,133,680,133,880,...,10040,1449,7800,1253,5600,1358,4800,1386,3840,1680
2,2020-01-03,2240,539,1480,525,1240,210,640,196,920,...,9160,1365,7640,1309,6080,1302,5520,1470,4880,2030
3,2020-01-04,2360,686,2040,364,1440,217,1160,217,1200,...,8080,980,6800,917,4880,1036,3880,987,4160,1477
4,2020-01-05,1640,469,1640,294,1280,294,920,175,1200,...,7400,1162,6640,1274,4960,1246,3680,952,2960,1057
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1730,2024-09-26,2760,658,2240,378,1640,224,1200,182,2600,...,14480,2436,12240,1673,8480,1925,7600,1806,4560,1785
1731,2024-09-27,3080,602,1800,588,1680,245,1760,161,2960,...,15080,2359,12560,1960,9360,1890,6640,1743,4880,1344
1732,2024-09-28,3120,735,2080,714,2320,322,2120,245,3240,...,12760,1911,9760,1792,8240,1736,6440,1694,3520,1645
1733,2024-09-29,2440,749,2240,434,1560,315,1520,259,2120,...,11160,1904,10280,1820,6400,1953,5720,1351,3520,924


## 일별 총 충전량 계산
급속(홀수 F 컬럼)과 완속(짝수 F 컬럼)의 일별 합계 추가

In [195]:
# Per-day totals for each charge type.
# NOTE: this sums the raw hourly values in F1..F48, so it has to run before the
# MinMax-scaling cell, which overwrites those columns in place.

# Fast charging sits in the odd-numbered features: F1, F3, ..., F47
fast_cols = [f'F{2 * hour + 1}' for hour in range(24)]

# Slow charging sits in the even-numbered features: F2, F4, ..., F48
slow_cols = [f'F{2 * hour + 2}' for hour in range(24)]

# Row-wise sum over the 24 hourly columns of each type
df['daily_fast'] = df.loc[:, fast_cols].sum(axis='columns')
df['daily_slow'] = df.loc[:, slow_cols].sum(axis='columns')

print(f"새로운 열 추가 완료!")
print(f"daily_fast 통계: mean={df['daily_fast'].mean():.2f}, std={df['daily_fast'].std():.2f}")
print(f"daily_slow 통계: mean={df['daily_slow'].mean():.2f}, std={df['daily_slow'].std():.2f}")
print(f"\n샘플 데이터:")
df.loc[:, ['일자', 'daily_fast', 'daily_slow']].head(10)

새로운 열 추가 완료!
daily_fast 통계: mean=269571.97, std=94743.63
daily_slow 통계: mean=29114.00, std=6845.14

샘플 데이터:


Unnamed: 0,일자,daily_fast,daily_slow
0,2020-01-01,169120,16387
1,2020-01-02,155960,16240
2,2020-01-03,160400,18025
3,2020-01-04,152600,14357
4,2020-01-05,138520,15904
5,2020-01-06,144440,16814
6,2020-01-07,137000,17199
7,2020-01-08,155240,18781
8,2020-01-09,166680,18382
9,2020-01-10,168480,19201


In [196]:
# Show the frame after the daily_fast / daily_slow totals were appended.
df

Unnamed: 0,일자,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F41,F42,F43,F44,F45,F46,F47,F48,daily_fast,daily_slow
0,2020-01-01,2800,700,3200,805,2320,791,1600,259,1520,...,6880,1218,6080,1197,4360,1148,3800,1078,169120,16387
1,2020-01-02,1400,413,1240,413,1000,133,680,133,880,...,7800,1253,5600,1358,4800,1386,3840,1680,155960,16240
2,2020-01-03,2240,539,1480,525,1240,210,640,196,920,...,7640,1309,6080,1302,5520,1470,4880,2030,160400,18025
3,2020-01-04,2360,686,2040,364,1440,217,1160,217,1200,...,6800,917,4880,1036,3880,987,4160,1477,152600,14357
4,2020-01-05,1640,469,1640,294,1280,294,920,175,1200,...,6640,1274,4960,1246,3680,952,2960,1057,138520,15904
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1730,2024-09-26,2760,658,2240,378,1640,224,1200,182,2600,...,12240,1673,8480,1925,7600,1806,4560,1785,279360,26215
1731,2024-09-27,3080,602,1800,588,1680,245,1760,161,2960,...,12560,1960,9360,1890,6640,1743,4880,1344,293760,25298
1732,2024-09-28,3120,735,2080,714,2320,322,2120,245,3240,...,9760,1792,8240,1736,6440,1694,3520,1645,255800,24003
1733,2024-09-29,2440,749,2240,434,1560,315,1520,259,2120,...,10280,1820,6400,1953,5720,1351,3520,924,221880,23149


## MinMax Scaling (F1~F48)
시간별 충전량 피처들을 0~1 범위로 정규화

In [197]:
from sklearn.preprocessing import MinMaxScaler

# The 48 hourly feature columns, F1 through F48
feature_cols = ['F' + str(n) for n in range(1, 49)]

# Learn each column's min/max, then rescale those columns in place to the 0..1 range.
# The fitted scaler is kept in `scaler`.
# NOTE(review): the scaler is fitted on every row of df; if a train/test split
# happens later in the notebook, confirm that fitting on all rows is intended.
scaler = MinMaxScaler().fit(df[feature_cols])
df[feature_cols] = scaler.transform(df[feature_cols])

print("MinMax Scaling 완료!")
print(f"스케일링 범위: 0 ~ 1")
print(f"\n스케일링 후 통계:")
# Summary statistics of the first ten scaled columns, one row per feature
scaled_summary = df[feature_cols].describe().loc[['min', 'max', 'mean', 'std']].T
print(scaled_summary.head(10))

# The date column and the two daily totals are left unscaled
print(f"\n스케일링 적용 컬럼: {len(feature_cols)}개 (F1 ~ F48)")
print(f"스케일링 제외 컬럼: 일자, daily_fast, daily_slow")

MinMax Scaling 완료!
스케일링 범위: 0 ~ 1

스케일링 후 통계:
     min  max      mean       std
F1   0.0  1.0  0.238839  0.117199
F2   0.0  1.0  0.480648  0.136979
F3   0.0  1.0  0.290984  0.156170
F4   0.0  1.0  0.406706  0.149058
F5   0.0  1.0  0.273203  0.156502
F6   0.0  1.0  0.364361  0.139836
F7   0.0  1.0  0.236157  0.146115
F8   0.0  1.0  0.295274  0.138484
F9   0.0  1.0  0.337813  0.184261
F10  0.0  1.0  0.326693  0.169220

스케일링 적용 컬럼: 48개 (F1 ~ F48)
스케일링 제외 컬럼: 일자, daily_fast, daily_slow


In [198]:
# Show the frame after F1..F48 were MinMax-scaled in place; the daily totals are still raw.
df

Unnamed: 0,일자,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F41,F42,F43,F44,F45,F46,F47,F48,daily_fast,daily_slow
0,2020-01-01,0.194553,0.483696,0.390533,0.584270,0.333333,0.908333,0.219512,0.421687,0.205674,...,0.058484,0.133333,0.132331,0.189189,0.142466,0.189112,0.223881,0.217647,169120,16387
1,2020-01-02,0.058366,0.260870,0.100592,0.269663,0.121795,0.125000,0.079268,0.204819,0.092199,...,0.075090,0.144444,0.114286,0.251351,0.172603,0.286533,0.227612,0.470588,155960,16240
2,2020-01-03,0.140078,0.358696,0.136095,0.359551,0.160256,0.216667,0.073171,0.313253,0.099291,...,0.072202,0.162222,0.132331,0.229730,0.221918,0.320917,0.324627,0.617647,160400,18025
3,2020-01-04,0.151751,0.472826,0.218935,0.230337,0.192308,0.225000,0.152439,0.349398,0.148936,...,0.057040,0.037778,0.087218,0.127027,0.109589,0.123209,0.257463,0.385294,152600,14357
4,2020-01-05,0.081712,0.304348,0.159763,0.174157,0.166667,0.316667,0.115854,0.277108,0.148936,...,0.054152,0.151111,0.090226,0.208108,0.095890,0.108883,0.145522,0.208824,138520,15904
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1730,2024-09-26,0.190661,0.451087,0.248521,0.241573,0.224359,0.233333,0.158537,0.289157,0.397163,...,0.155235,0.277778,0.222556,0.470270,0.364384,0.458453,0.294776,0.514706,279360,26215
1731,2024-09-27,0.221790,0.407609,0.183432,0.410112,0.230769,0.258333,0.243902,0.253012,0.460993,...,0.161011,0.368889,0.255639,0.456757,0.298630,0.432665,0.324627,0.329412,293760,25298
1732,2024-09-28,0.225681,0.510870,0.224852,0.511236,0.333333,0.350000,0.298780,0.397590,0.510638,...,0.110469,0.315556,0.213534,0.397297,0.284932,0.412607,0.197761,0.455882,255800,24003
1733,2024-09-29,0.159533,0.521739,0.248521,0.286517,0.211538,0.341667,0.207317,0.421687,0.312057,...,0.119856,0.324444,0.144361,0.481081,0.235616,0.272206,0.197761,0.152941,221880,23149


## 로그 변환 (daily_fast, daily_slow)
타겟 변수의 분포를 정규화하기 위해 ln(1+x) 변환 적용

In [199]:
# 로그 변환 전 원본 값 저장 (나중에 역변환 시 필요하면 사용)
df['daily_fast_original'] = df['daily_fast'].copy()
df['daily_slow_original'] = df['daily_slow'].copy()

# ln(1+x) 변환 적용
df['daily_fast'] = np.log1p(df['daily_fast'])
df['daily_slow'] = np.log1p(df['daily_slow'])

print("로그 변환 완료! (ln(1+x))")
print(f"\n변환 전 통계 (원본):")
print(f"  daily_fast_original: mean={df['daily_fast_original'].mean():.2f}, std={df['daily_fast_original'].std():.2f}")
print(f"  daily_slow_original: mean={df['daily_slow_original'].mean():.2f}, std={df['daily_slow_original'].std():.2f}")

print(f"\n변환 후 통계 (로그):")
print(f"  daily_fast: mean={df['daily_fast'].mean():.4f}, std={df['daily_fast'].std():.4f}")
print(f"  daily_slow: mean={df['daily_slow'].mean():.4f}, std={df['daily_slow'].std():.4f}")

print(f"\n샘플 비교:")
comparison_df = df[['일자', 'daily_fast_original', 'daily_fast', 'daily_slow_original', 'daily_slow']].head(10)
comparison_df

로그 변환 완료! (ln(1+x))

변환 전 통계 (원본):
  daily_fast_original: mean=269571.97, std=94743.63
  daily_slow_original: mean=29114.00, std=6845.14

변환 후 통계 (로그):
  daily_fast: mean=12.4359, std=0.3825
  daily_slow: mean=10.2472, std=0.2624

샘플 비교:


Unnamed: 0,일자,daily_fast_original,daily_fast,daily_slow_original,daily_slow
0,2020-01-01,169120,12.03837,16387,9.704305
1,2020-01-02,155960,11.957361,16240,9.695294
2,2020-01-03,160400,11.985432,18025,9.79957
3,2020-01-04,152600,11.935582,14357,9.572063
4,2020-01-05,138520,11.838777,15904,9.674389
5,2020-01-06,144440,11.880626,16814,9.730027
6,2020-01-07,137000,11.827744,17199,9.752665
7,2020-01-08,155240,11.952734,18781,9.840654
8,2020-01-09,166680,12.023837,18382,9.819182
9,2020-01-10,168480,12.034578,19201,9.86277


In [200]:
# Show the frame after the log transform: scaled F1..F48, log-scale daily totals, and the *_original backups.
df

Unnamed: 0,일자,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F43,F44,F45,F46,F47,F48,daily_fast,daily_slow,daily_fast_original,daily_slow_original
0,2020-01-01,0.194553,0.483696,0.390533,0.584270,0.333333,0.908333,0.219512,0.421687,0.205674,...,0.132331,0.189189,0.142466,0.189112,0.223881,0.217647,12.038370,9.704305,169120,16387
1,2020-01-02,0.058366,0.260870,0.100592,0.269663,0.121795,0.125000,0.079268,0.204819,0.092199,...,0.114286,0.251351,0.172603,0.286533,0.227612,0.470588,11.957361,9.695294,155960,16240
2,2020-01-03,0.140078,0.358696,0.136095,0.359551,0.160256,0.216667,0.073171,0.313253,0.099291,...,0.132331,0.229730,0.221918,0.320917,0.324627,0.617647,11.985432,9.799570,160400,18025
3,2020-01-04,0.151751,0.472826,0.218935,0.230337,0.192308,0.225000,0.152439,0.349398,0.148936,...,0.087218,0.127027,0.109589,0.123209,0.257463,0.385294,11.935582,9.572063,152600,14357
4,2020-01-05,0.081712,0.304348,0.159763,0.174157,0.166667,0.316667,0.115854,0.277108,0.148936,...,0.090226,0.208108,0.095890,0.108883,0.145522,0.208824,11.838777,9.674389,138520,15904
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1730,2024-09-26,0.190661,0.451087,0.248521,0.241573,0.224359,0.233333,0.158537,0.289157,0.397163,...,0.222556,0.470270,0.364384,0.458453,0.294776,0.514706,12.540260,10.174125,279360,26215
1731,2024-09-27,0.221790,0.407609,0.183432,0.410112,0.230769,0.258333,0.243902,0.253012,0.460993,...,0.255639,0.456757,0.298630,0.432665,0.324627,0.329412,12.590522,10.138520,293760,25298
1732,2024-09-28,0.225681,0.510870,0.224852,0.511236,0.333333,0.350000,0.298780,0.397590,0.510638,...,0.213534,0.397297,0.284932,0.412607,0.197761,0.455882,12.452155,10.085976,255800,24003
1733,2024-09-29,0.159533,0.521739,0.248521,0.286517,0.211538,0.341667,0.207317,0.421687,0.312057,...,0.144361,0.481081,0.235616,0.272206,0.197761,0.152941,12.309896,10.049750,221880,23149
