In [None]:
from datadog import initialize, api
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
import random
import csv
import os
import time

random.seed(42)
np.random.seed(42)

### Fetch time series data from datadog

In [None]:
# 👉 先配置 API 和 APP key
options = {
    "api_key": os.getenv("DATADOG_API_KEY"),
    "app_key": os.getenv("DATADOG_APP_KEY"),
    "api_host": "https://api.us5.datadoghq.com"
}
initialize(**options)

# 👉 配置参数
metric_query = "avg:metrics.mCreateCardPick.request{*}.as_count()"
interval_hours = 12  # 每次拉取12小时
save_to_one_file = True  # 设置是否合并写入一个 CSV

# 👉 时间范围
start_datetime = datetime(2024, 12, 23)
end_datetime = datetime(2025, 4, 21)

output_file = "../data/request_timeseries_all.csv"

# 👉 写入 CSV 的初始化（统一文件）
if save_to_one_file:
    f = open(output_file, "w", newline="")
    writer = csv.writer(f)
    writer.writerow(["timestamp", "requests"])

# 👉 循环拉取
cursor = start_datetime
while cursor < end_datetime:
    window_start = int(cursor.timestamp())
    window_end = int((cursor + timedelta(hours=interval_hours)).timestamp())
    
    print(f"⏳ 查询时间段: {datetime.utcfromtimestamp(window_start)} ~ {datetime.utcfromtimestamp(window_end)}")

    try:
        result = api.Metric.query(
            start=window_start,
            end=min(window_end, int(end_datetime.timestamp())),
            query=metric_query
        )
        
        if "series" in result and result["series"]:
            points = result["series"][0]["pointlist"]
            rows = []
            for point in points:
                timestamp = int(point[0] / 1000)
                value = round(point[1], 4) if point[1] is not None else 0.0
                rows.append([timestamp, value])

            if save_to_one_file:
                writer.writerows(rows)
            else:
                file_name = f"../data/request_rate_{cursor.strftime('%Y%m%d_%H%M')}.csv"
                with open(file_name, "w", newline="") as temp_f:
                    temp_writer = csv.writer(temp_f)
                    temp_writer.writerow(["timestamp", "requests"])
                    temp_writer.writerows(rows)
                print(f"✅ 写入 {file_name}")

        else:
            print("⚠️ 无数据返回, 终止", result.errors)
            break

    except Exception as e:
        print(f"❌ 错误: {e}")

    time.sleep(1)  # 避免 API 限流

    # 下一段
    cursor += timedelta(hours=interval_hours)

# 👉 关闭 CSV 文件
if save_to_one_file:
    f.close()
    print(f"✅ 所有数据写入 {output_file}")

⏳ 查询时间段: 2024-12-22 13:00:00 ~ 2024-12-23 01:00:00
⏳ 查询时间段: 2024-12-23 01:00:00 ~ 2024-12-23 13:00:00
⏳ 查询时间段: 2024-12-23 13:00:00 ~ 2024-12-24 01:00:00
⏳ 查询时间段: 2024-12-24 01:00:00 ~ 2024-12-24 13:00:00
⏳ 查询时间段: 2024-12-24 13:00:00 ~ 2024-12-25 01:00:00
⏳ 查询时间段: 2024-12-25 01:00:00 ~ 2024-12-25 13:00:00
⏳ 查询时间段: 2024-12-25 13:00:00 ~ 2024-12-26 01:00:00
⏳ 查询时间段: 2024-12-26 01:00:00 ~ 2024-12-26 13:00:00
⏳ 查询时间段: 2024-12-26 13:00:00 ~ 2024-12-27 01:00:00
⏳ 查询时间段: 2024-12-27 01:00:00 ~ 2024-12-27 13:00:00
⏳ 查询时间段: 2024-12-27 13:00:00 ~ 2024-12-28 01:00:00
⏳ 查询时间段: 2024-12-28 01:00:00 ~ 2024-12-28 13:00:00
⏳ 查询时间段: 2024-12-28 13:00:00 ~ 2024-12-29 01:00:00
⏳ 查询时间段: 2024-12-29 01:00:00 ~ 2024-12-29 13:00:00
⏳ 查询时间段: 2024-12-29 13:00:00 ~ 2024-12-30 01:00:00
⏳ 查询时间段: 2024-12-30 01:00:00 ~ 2024-12-30 13:00:00
⏳ 查询时间段: 2024-12-30 13:00:00 ~ 2024-12-31 01:00:00
⏳ 查询时间段: 2024-12-31 01:00:00 ~ 2024-12-31 13:00:00
⏳ 查询时间段: 2024-12-31 13:00:00 ~ 2025-01-01 01:00:00
⏳ 查询时间段: 2025-01-01 01:00:00 ~ 

In [29]:
def extract_continuous_segment(csv_path, week_count, day_count, time_scale, request_scale, output_path):
    # 读取 CSV 文件
    df = pd.read_csv(csv_path)
    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
    df.sort_values('timestamp', inplace=True)
    df.reset_index(drop=True, inplace=True)

    # 设置时间戳为索引
    df.set_index('timestamp', inplace=True)

    # 获取数据的起始和结束时间
    start_time = df.index.min()
    end_time = df.index.max()

    # 计算所有可能的周一 00:00 的时间点
    all_mondays = pd.date_range(start=start_time, end=end_time, freq='W-MON')

    # 计算所需的总天数
    total_days = week_count * 7 + day_count

    # 查找所有满足条件的连续时间段
    valid_starts = []
    for monday in all_mondays:
        segment_end = monday + timedelta(days=total_days)
        if segment_end <= end_time:
            valid_starts.append(monday)

    if not valid_starts:
        print("❌ 数据中没有满足条件的连续时间段。")
        return

    # 随机选择一个起始时间
    selected_start = random.choice(valid_starts)
    selected_end = selected_start + timedelta(days=total_days)
    print(f"✅ 选中的时间段：{selected_start} 到 {selected_end}")

    # 提取选中的数据段
    segment = df.loc[selected_start:selected_end].copy()
    if segment.empty:
        print("⚠️ 选中的时间段内没有数据。")
        return

    # 重置时间戳，从 0 开始，并应用时间缩放
    segment.reset_index(inplace=True)
    base_time = segment['timestamp'].min()
    segment['timestamp'] = segment['timestamp'].apply(lambda x: int((x - base_time).total_seconds() / time_scale))

    # 应用请求数缩放
    segment['requests'] = segment['requests'] / request_scale

    # 保存到新的 CSV 文件
    segment.to_csv(output_path, index=False)
    print(f"📁 处理后的数据已保存到：{output_path}")

In [43]:
def schedule_requests_from_csv(requests_csv_path, rate_csv_path, output_csv_path):
    # 读取请求参数和请求速率数据
    requests_df = pd.read_csv(requests_csv_path)
    rate_df = pd.read_csv(rate_csv_path)

    # 打乱请求顺序
    requests_df = requests_df.sample(frac=1).reset_index(drop=True)

    result_rows = []
    request_index = 0
    accum = 0.0  # 累积速率

    for _, rate_row in rate_df.iterrows():
        timestamp_base = float(rate_row['timestamp'])
        rps = float(rate_row['requests'])

        accum += rps
        num_requests = int(accum)
        accum -= num_requests  # 保留小数部分

        for _ in range(num_requests):
            if request_index >= len(requests_df):
                break
            row = requests_df.iloc[request_index].copy()
            row['timestamp'] = int(timestamp_base)
            result_rows.append(row)
            request_index += 1

        if request_index >= len(requests_df):
            break

    # 创建结果 DataFrame
    output_df = pd.DataFrame(result_rows)

    # 保存到 CSV 文件
    output_df.to_csv(output_csv_path, index=False)
    print(f"📁 调度后的请求已保存到：{output_csv_path}")


In [44]:
extract_continuous_segment("../data/request_timeseries_all.csv", 0, 1, 120, 120*2, "../data/request_timeseries_1day.csv")
schedule_requests_from_csv("../data/train_mini.csv", "../data/request_timeseries_1day.csv", "../experiment/input/scheduled_requests_1day.csv")

✅ 选中的时间段：2025-02-24 13:00:00 到 2025-02-25 13:00:00
📁 处理后的数据已保存到：../data/request_timeseries_1day.csv
📁 调度后的请求已保存到：../experiment/input/scheduled_requests_1day.csv
