In [8]:
# 魔术指令，自动加载模块
%load_ext autoreload
%autoreload 2
import os
os.chdir("/home/beihang/xihu/HZTourism/FlowPred-dev")
import sys
sys.path.append("/home/beihang/xihu/HZTourism/FlowPred-dev")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
# Experiment configuration: names, target spot and on-disk layout.
exper_name = "mse_loss"
exper_dir = f"exper/{exper_name}"
exper_data_dir = f"exper_data/{exper_name}"
os.makedirs(exper_data_dir, exist_ok=True)

# Target spot and the history / prediction horizons expressed in hours.
spot_id = 14207
his_hour = 24
pred_hour = 6

# One sub-directory per pipeline stage, each scoped to the selected spot.
raw_dir = f"{exper_data_dir}/raw/{spot_id}"
proc_dir = f"{exper_data_dir}/proc/{spot_id}"
train_dir = f"{exper_data_dir}/train/{spot_id}"
test_dir = f"{exper_data_dir}/test/{spot_id}"
res_dir = f"{exper_data_dir}/res/{spot_id}"

# Create every stage directory up front; existing directories are left as-is.
for _stage_dir in (raw_dir, proc_dir, train_dir, test_dir, res_dir):
    os.makedirs(_stage_dir, exist_ok=True)


In [10]:
# Window configuration per spot. Each key is a tuple of spot ids that share
# one sampling frequency. The lengths are sample counts: at every listed
# frequency his_len covers 24 hours and pred_len covers 6 hours.
spot_len_config_map = {
    # Spots sampled every 10 minutes
    (14210, 14211, 14212, 14213): {"freq": "10min", "his_len": 144, "pred_len": 36},

    # Spots sampled every 30 seconds
    (14207, 14209): {"freq": "30sec", "his_len": 2880, "pred_len": 720},

    # Spots sampled every minute
    (14208,): {"freq": "1min", "his_len": 1440, "pred_len": 360},
}

# Fallback configuration for any spot that is not listed above.
default_config = {"freq": "5min", "his_len": 288, "pred_len": 72}


def get_spot_len_config(spot_id):
    """Return the window configuration that applies to ``spot_id``.

    The entries of ``spot_len_config_map`` are scanned in insertion order and
    the first one whose key tuple contains ``spot_id`` is returned. When no
    entry matches, ``default_config`` is returned. In both cases the stored
    dict object itself is returned, not a copy.
    """
    matching_configs = (
        group_config
        for group_ids, group_config in spot_len_config_map.items()
        if spot_id in group_ids
    )
    return next(matching_configs, default_config)

In [11]:
# Resolve the window lengths and the sampling frequency for the selected spot.
spot_len_config = get_spot_len_config(spot_id)
his_len, pred_len, freq = (
    spot_len_config[field] for field in ("his_len", "pred_len", "freq")
)

In [12]:
# Export the raw training data of the selected spot from the database to CSV.
from src.utils.utils_data import save_csv_from_db

s_time = "2024-07-20"
e_time = "2025-07-20"

# The end date is part of the file name only when one is given.
if e_time is None:
    file_base_name = f"{spot_id}_{s_time}"
else:
    file_base_name = f"{spot_id}_{s_time}_{e_time}"
train_raw_data_file = f"{raw_dir}/{file_base_name}.csv"

save_csv_from_db(
    spot_id=spot_id,
    s_time=f"{s_time} 00:00:00",
    # Formatting None into the f-string would produce the literal text
    # "None 23:59:59", so the None case is passed through unformatted.
    # NOTE(review): assumes save_csv_from_db treats e_time=None as "no upper
    # bound" — confirm against its signature.
    e_time=None if e_time is None else f"{e_time} 23:59:59",
    output_csv_file=train_raw_data_file,
)

数据库连接成功！
正在从表 'dahua_flow' 中查询数据，时间范围: 2024-07-20 00:00:00 至 2025-07-20 23:59:59...


输出表头: ['spot_id', 'kpi_time', 'kpi_value']
CSV 文件表头已写入。
成功！筛选后的数据已导出到文件 'exper_data/mse_loss/raw/14207/14207_2024-07-20_2025-07-20.csv'，共 1290115 条记录。
数据库连接已关闭。


In [13]:
import pandas as pd

# Preprocessing: load the raw export and normalise the time axis.
df = pd.read_csv(train_raw_data_file)
# Parse kpi_time as datetime; values that do not match the format become NaT
# because of errors='coerce'.
# NOTE(review): NaT rows are not dropped here — confirm the fill helpers below
# cope with them.
df['kpi_time'] = pd.to_datetime(df['kpi_time'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
# Keep a single row per timestamp.
df = df.drop_duplicates(subset=['kpi_time'])
# Order chronologically.
df = df.sort_values(by='kpi_time')

# Spot-specific preprocessing.
if spot_id in [14210, 14211, 14212, 14213]:
    # No pipeline exists for these spots. Fail loudly instead of falling
    # through: otherwise df_proc is either undefined (NameError) or, on a
    # reused kernel, a stale frame from another spot that would be written
    # under this spot's file name.
    raise NotImplementedError(
        f"No preprocessing pipeline is implemented for spot {spot_id}"
    )
elif spot_id in [14207, 14209]:
    from src.utils.utils_data import fill_missing_value_singlespot_30s, preprocess_for_koopman_30s_enhanced
    df_proc = fill_missing_value_singlespot_30s(spot_id, df)
    # Additional Koopman-specific preprocessing.
    df_proc = preprocess_for_koopman_30s_enhanced(df_proc, spot_id)
elif spot_id in [14208]:
    # Same situation as the first branch: nothing is implemented yet.
    raise NotImplementedError(
        f"No preprocessing pipeline is implemented for spot {spot_id}"
    )
else:
    from src.utils.utils_data import fill_missing_value_singlespot_day
    df_proc = fill_missing_value_singlespot_day(df, freq=freq)

# Persist the processed frame next to the other per-spot artefacts.
df_proc.to_csv(f"{proc_dir}/{file_base_name}_proc.csv", index=False)

开始增强版Koopman预处理 - 景点14207
原始数据形状: (1032082, 3)
原始问题检查:
  变异系数: 1.052
  零值数量: 6212
  数据范围: [0.0, 719.0]
开始逐步处理:
  步骤1: 清理NaN和Inf...
  步骤2: 激进连续零值处理...
    处理了 32 个长连续零值段
  步骤3: 强力降低变异性...
    当前变异系数: 1.047
    应用对数变换进一步降低变异性...
    强力平滑后变异系数: 0.195
  步骤4: 增强异常值处理...
  步骤5: 强化数值稳定性...

=== 强化处理结果对比 ===
变异系数: 1.052 -> 0.195 (目标: <0.6)
零值数量: 6212 -> 0
最大连续零值: 120 -> 0
数据范围: [0.0, 719.0] -> [44.8, 135.6]
动态范围比: 7190.0 -> 3.0
✅ 强化处理完全成功！应该能彻底解决NaN问题
✅ 强化版Koopman预处理完成


In [14]:
# Convert the preprocessed series into model-input groups and save them per mode.

from src.pattern.pattern_train import get_group_annotation, save_mode_data

# Reload the processed data from disk, so this cell works from the saved CSV.
df_proc = pd.read_csv(f"{proc_dir}/{file_base_name}_proc.csv")
save_base_dir = train_dir
os.makedirs(save_base_dir, exist_ok=True)

groups_mode_0, groups_mode_1 = get_group_annotation(
    his_len=his_len,
    pred_len=pred_len,
    df=df_proc,
    time_interval=freq,
)

# The mode index equals the position in the returned pair (0 first, then 1).
# NOTE(review): the recorded run output suggests mode 0 holds holiday groups
# and mode 1 workday groups — confirm in get_group_annotation.
for _mode, _groups in enumerate((groups_mode_0, groups_mode_1)):
    save_mode_data(
        groups_mode=_groups,
        mode=_mode,
        data_basepath=save_base_dir,
    )

Starting group annotation...
Getting continuous groups...


Building continuous groups: 100%|██████████| 1032082/1032082 [00:57<00:00, 17892.95row/s]


Found 481 continuous groups
Processing groups (his_len=2880, pred_len=720)...


Processing continuous groups: 100%|██████████| 481/481 [11:10<00:00,  1.39s/group] 


Generated 470 holiday batches and 937 workday batches
Final merging and deduplicating...


Merging batches: 100%|██████████| 94/94 [00:14<00:00,  6.48it/s]
Merging batches: 100%|██████████| 19/19 [00:02<00:00,  6.42it/s]
Merging batches: 100%|██████████| 4/4 [00:00<00:00,  6.13it/s]
Merging batches: 100%|██████████| 1/1 [00:00<00:00,  4.73it/s]
Merging batches: 100%|██████████| 188/188 [00:28<00:00,  6.51it/s]
Merging batches: 100%|██████████| 38/38 [00:05<00:00,  6.35it/s]
Merging batches: 100%|██████████| 8/8 [00:01<00:00,  6.08it/s]
Merging batches: 100%|██████████| 2/2 [00:00<00:00,  5.34it/s]
Merging batches: 100%|██████████| 1/1 [00:00<00:00,  4.61it/s]


Final holiday data: 443020 rows
Final workday data: 744790 rows
Getting final continuous groups...


Building continuous groups: 100%|██████████| 443020/443020 [00:25<00:00, 17513.59row/s]
Building continuous groups: 100%|██████████| 744790/744790 [00:41<00:00, 17799.60row/s]


Final holiday groups: 57
Final workday groups: 76
Saving 57 groups to mode_0...


Saving mode_0 files: 100%|██████████| 57/57 [00:01<00:00, 40.84file/s]


Mode_0 data saved successfully!
Saving 76 groups to mode_1...


Saving mode_1 files: 100%|██████████| 76/76 [00:02<00:00, 32.68file/s]

Mode_1 data saved successfully!



