In [1]:
import sys
# Add ../external/tslib to the module search path so that modules located
# there can be imported from this notebook. The entry is relative, so it is
# resolved against the kernel's current working directory.
sys.path.append('../external/tslib')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import csv
import os
import time

# Seed both the standard-library and the NumPy global random number
# generators with the same fixed value so that random draws made later in
# the notebook are repeatable after a kernel restart.
for seed_fn in (random.seed, np.random.seed):
    seed_fn(42)

### Preprocessing

In [None]:
# Load the raw request time series from CSV.
df = pd.read_csv('../data/request_timeseries_all.csv')

# Convert the epoch-seconds 'timestamp' column into a datetime 'date' column.
df['date'] = pd.to_datetime(df['timestamp'], unit='s')

# Keep only 'date' and 'requests', with 'date' first. Selecting these two
# columns also discards the original 'timestamp' column.
df = df[['date', 'requests']]

# Roll up every `group_size` consecutive rows into one aggregated sample.
group_size = 5

# Fixed divisor applied to each bucket's summed request count.
# NOTE(review): the original comment calls the result a "normalized request
# count"; what 120 represents is not visible in this notebook — confirm.
requests_divisor = 120

# Because the divisor is fixed, a trailing bucket holding fewer than
# `group_size` rows would be under-counted and appear as a spurious dip at
# the very end of the series. Drop that incomplete remainder before grouping.
n_complete_rows = (len(df) // group_size) * group_size
df = df[:n_complete_rows]

# Bucket rows by integer-dividing the default 0..n-1 row index.
grouped = df.groupby(df.index // group_size)

# Build the result frame: keep each bucket's first timestamp and its summed
# requests, then scale the sums by the fixed divisor.
result = grouped.agg({
    "date": "first",
    "requests": "sum"
})
result["requests"] = result["requests"] / requests_divisor

result.to_csv('../data/request_timeseries_aggregated.csv', index=False)

### Experiment

In [None]:
# 内部用后30%作为测试集
python ../external/tslib/run.py \
  --task_name long_term_forecast \
  --is_training 1 \
  --root_path ../data/ \
  --data_path request_timeseries_aggregated.csv\
  --model_id ts_48_48 \
  --model iTransformer \
  --data custom \
  --features S \
  --target requests \
  --freq h \
  --seq_len 48 \
  --label_len 24 \
  --pred_len 48 \
  --e_layers 2 \
  --d_layers 1 \
  --factor 3 \
  --enc_in 1 \
  --dec_in 1 \
  --c_out 1 \
  --d_model 16 \
  --d_ff 32 \
  --des Exp \
  --itr 1 \
  --top_k 5

In [4]:
# Split the raw (un-aggregated) series, in file order, into a training set
# and a test set: the first 70% of rows are used for training and the
# remaining 30% for testing. Each part is written to its own CSV.

df = pd.read_csv('../data/request_timeseries_all.csv')

# Row position at which the test portion begins.
train_size = int(len(df) * 0.7)
train_data, test_data = df[:train_size], df[train_size:]

# Write both partitions without the index column.
split_outputs = {
    '../data/request_timeseries_train.csv': train_data,
    '../data/request_timeseries_test.csv': test_data,
}
for out_path, subset in split_outputs.items():
    subset.to_csv(out_path, index=False)