In [1]:
# Third-party data libraries plus pathlib for building the output paths.
import pandas as pd
import numpy as np
from pathlib import Path

# Status message confirming the import cell ran.
print("✅ 库导入完成")

✅ 库导入完成


## 1. 加载数据

In [2]:
# Read both splits from the sibling data directory (train first, then test).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)

# Report the (rows, columns) shape of each split.
print(f"训练集形状: {train_df.shape}")
print(f"测试集形状: {test_df.shape}")

# Report the smallest and largest Timestamp value found in each split.
print(
    f"\n训练集时间范围: {train_df['Timestamp'].min()} 到 {train_df['Timestamp'].max()}"
)
print(
    f"测试集时间范围: {test_df['Timestamp'].min()} 到 {test_df['Timestamp'].max()}"
)

训练集形状: (484202, 7)
测试集形状: (2881, 6)

训练集时间范围: 2012-01-01 10:00:00 到 2025-10-23 23:15:00
测试集时间范围: 2025-10-23 23:30:00 到 2025-11-22 23:30:00


## 2. 检查时间重叠

In [3]:
# Distinct Timestamp values of each split, kept as sets for fast membership tests.
train_timestamps = set(train_df['Timestamp'].values)
test_timestamps = set(test_df['Timestamp'].values)

# Timestamps that appear in both the training and the test split.
overlap_timestamps = train_timestamps & test_timestamps

n_train_ts = len(train_timestamps)
n_test_ts = len(test_timestamps)
n_overlap_ts = len(overlap_timestamps)

print(f"训练集时间戳数量: {n_train_ts:,}")
print(f"测试集时间戳数量: {n_test_ts:,}")
print(f"重叠的时间戳数量: {n_overlap_ts:,}")
# Share of distinct test timestamps that also exist in the training split.
print(f"\n重叠比例: {n_overlap_ts / n_test_ts * 100:.2f}%")

训练集时间戳数量: 484,202
测试集时间戳数量: 2,881
重叠的时间戳数量: 0

重叠比例: 0.00%


## 3. 使用 Overlap 方法预测

In [4]:
# Lookup table: training Timestamp -> Target.
# When a Timestamp repeats, the last occurrence wins (plain dict semantics).
train_target_map = dict(zip(train_df['Timestamp'], train_df['Target']))

# Fallback prediction for test timestamps that never occur in training:
# the mean of the training Target column.
default_target = train_df['Target'].mean()

# One prediction per test row: the looked-up Target when the timestamp is
# known, otherwise the fallback value.
predictions = [
    train_target_map.get(test_ts, default_target)
    for test_ts in test_df['Timestamp']
]
# Number of test rows whose timestamp was found in the lookup table.
overlap_count = sum(
    1 for test_ts in test_df['Timestamp'] if test_ts in train_target_map
)

print(f"成功匹配 (overlap): {overlap_count:,} / {len(test_df):,}")
print(f"匹配比例: {overlap_count / len(test_df) * 100:.2f}%")
print(f"\n默认值 (Target均值): {default_target:.6f}")

成功匹配 (overlap): 0 / 2,881
匹配比例: 0.00%

默认值 (Target均值): 0.000021


In [5]:
# Summary statistics of the v1 predictions.
# Convert to an ndarray so the numpy reductions below apply.
predictions = np.array(predictions)

print("预测统计:")
print(f"  数量: {len(predictions):,}")
print(f"  均值: {np.mean(predictions):.6f}")
print(f"  标准差: {np.std(predictions):.6f}")
print(f"  最小值: {np.min(predictions):.6f}")
print(f"  最大值: {np.max(predictions):.6f}")

预测统计:
  数量: 2,881
  均值: 0.000021
  标准差: 0.000000
  最小值: 0.000021
  最大值: 0.000021


## 4. 生成提交文件

In [None]:
# Build the submission table: one row per test Timestamp with its prediction.
# NOTE(review): the recorded preview output of this cell shows a `Target`
# header while the code writes `Prediction` -- confirm which column name the
# submission format requires.
submission_df = test_df[['Timestamp']].assign(Prediction=predictions)

# Make sure the output directory exists (no error if it is already there).
submission_dir = Path('../submissions')
submission_dir.mkdir(exist_ok=True)

# Write the CSV without the DataFrame index column.
submission_file = submission_dir / 'overlap_submission.csv'
submission_df.to_csv(submission_file, index=False)

print(f"✅ 提交文件已保存: {submission_file}")
print(f"\n提交文件预览:")
preview_rows = submission_df.head(10)
print(preview_rows)

✅ 提交文件已保存: ..\submissions\overlap_submission.csv

提交文件预览:
             Timestamp    Target
0  2025-10-23 23:30:00  0.000021
1  2025-10-23 23:45:00  0.000021
2  2025-10-24 00:00:00  0.000021
3  2025-10-24 00:15:00  0.000021
4  2025-10-24 00:30:00  0.000021
5  2025-10-24 00:45:00  0.000021
6  2025-10-24 01:00:00  0.000021
7  2025-10-24 01:15:00  0.000021
8  2025-10-24 01:30:00  0.000021
9  2025-10-24 01:45:00  0.000021


## 5. 高级版本：使用最近的 Target 值填充

In [7]:
# For test timestamps with no overlap, fall back to the most recent known
# Target value from the training data.

# Parse the timestamps first and sort on the parsed values, so the ordering is
# chronological rather than lexicographic on the raw strings. A stable sort
# keeps the original row order among equal timestamps, which makes the "last"
# row deterministic.
train_sorted = (
    train_df
    .assign(Timestamp_dt=pd.to_datetime(train_df['Timestamp']))
    .sort_values('Timestamp_dt', kind='stable')
    .reset_index(drop=True)
)

# Copy of the test split carrying the same parsed-datetime helper column.
test_df_copy = test_df.copy()
test_df_copy['Timestamp_dt'] = pd.to_datetime(test_df_copy['Timestamp'])

# Target of the chronologically last training row.
last_target = train_sorted['Target'].iloc[-1]
print(f"训练集最后的 Target 值: {last_target:.6f}")

训练集最后的 Target 值: -0.000191


In [8]:
# v2 fill strategy: reuse the training Target on an exact timestamp match,
# otherwise fall back to `last_target` (the last known training Target).
predictions_v2 = np.array([
    train_target_map.get(test_ts, last_target)
    for test_ts in test_df['Timestamp']
])

# Rows answered by an exact match vs. rows filled with the fallback value.
overlap_count_v2 = sum(
    1 for test_ts in test_df['Timestamp'] if test_ts in train_target_map
)
last_known_count = len(predictions_v2) - overlap_count_v2

print(f"Overlap 匹配: {overlap_count_v2:,}")
print(f"使用最后值填充: {last_known_count:,}")
print(f"\n预测统计 (v2):")
print(f"  均值: {np.mean(predictions_v2):.6f}")
print(f"  标准差: {np.std(predictions_v2):.6f}")

Overlap 匹配: 0
使用最后值填充: 2,881

预测统计 (v2):
  均值: -0.000191
  标准差: 0.000000


In [None]:
# Persist the v2 predictions next to the v1 file.
# Relies on `submission_dir` created by the earlier submission cell.
submission_v2 = test_df[['Timestamp']].assign(Prediction=predictions_v2)

submission_file_v2 = submission_dir / 'overlap_v2_submission.csv'
# Write the CSV without the DataFrame index column.
submission_v2.to_csv(submission_file_v2, index=False)

print(f"✅ 提交文件 v2 已保存: {submission_file_v2}")

✅ 提交文件 v2 已保存: ..\submissions\overlap_v2_submission.csv


## 6. 使用 Target 滞后值预测 (Shift 方法)

In [10]:
# Alternative strategy: predict from lagged Target values.
# Hypothesis to check: Target may be some transform of Close, or a future return.

# Pairwise correlation of every price/volume column with Target.
corr_columns = ['Open', 'High', 'Low', 'Close', 'Volume', 'Target']

print("Target 和 Close 的相关性:")
corr_with_target = train_df[corr_columns].corr()['Target']
print(corr_with_target)

Target 和 Close 的相关性:
Open     -0.002005
High     -0.001986
Low      -0.002013
Close    -0.002013
Volume    0.026614
Target    1.000000
Name: Target, dtype: float64


In [11]:
# Test whether Target matches a return computed from Close.
# Work on a copy so the helper columns do not leak into train_df.
train_check = train_df.copy()
close_price = train_check['Close']

# Log return and simple percentage change versus the previous row.
train_check['log_return'] = np.log(close_price / close_price.shift(1))
train_check['close_pct'] = close_price.pct_change()

target_series = train_check['Target']

# Correlation of Target with the same-row returns.
print("检查 Target 的定义:")
print(f"Target vs log_return 相关性: {target_series.corr(train_check['log_return']):.6f}")
print(f"Target vs close_pct 相关性: {target_series.corr(train_check['close_pct']):.6f}")

# Correlation of Target with the NEXT row's log return (shifted one row up).
train_check['future_log_return'] = train_check['log_return'].shift(-1)
print(f"Target vs future_log_return 相关性: {target_series.corr(train_check['future_log_return']):.6f}")

检查 Target 的定义:
Target vs log_return 相关性: -0.076986
Target vs close_pct 相关性: -0.075029

Target vs log_return 相关性: -0.076986
Target vs close_pct 相关性: -0.075029
Target vs future_log_return 相关性: 1.000000
Target vs future_log_return 相关性: 1.000000


In [12]:
# Lag-based prediction: predict each row with the Target of the row that
# immediately precedes it in time.

# Tag every row with its origin before concatenating. Selecting test rows by
# `Timestamp.isin(test_df['Timestamp'])` would also pick up training rows that
# share a timestamp with the test set and make the result longer than test_df.
combined = pd.concat(
    [train_df.assign(_is_test=False), test_df.assign(_is_test=True)],
    ignore_index=True,
)
# Stable sort: on equal timestamps, training rows stay ahead of test rows.
combined = combined.sort_values('Timestamp', kind='stable').reset_index(drop=True)

# Target of the previous row in time order.
# NOTE(review): a row whose predecessor has no Target gets NaN here; the
# recorded run reports only 1 test row with a lag value, so nearly every
# prediction comes from the downstream fill value -- confirm this is intended.
combined['Target_lag1'] = combined['Target'].shift(1)

# Remove the helper flag from `combined` and use it as the row mask.
is_test_row = combined.pop('_is_test')
test_with_lag = combined[is_test_row].copy()
test_with_lag = test_with_lag.sort_values('Timestamp', kind='stable').reset_index(drop=True)

print(f"测试集预测数量: {len(test_with_lag)}")
print(f"有 lag 值的数量: {test_with_lag['Target_lag1'].notna().sum()}")

测试集预测数量: 2881
有 lag 值的数量: 1


In [None]:
# Use the lagged Target as the prediction; rows without a lag value fall back
# to the mean of the training Target column.
lag_fill_value = train_df['Target'].mean()
predictions_lag = test_with_lag['Target_lag1'].fillna(lag_fill_value).values

print("Lag 预测统计:")
print(f"  均值: {np.mean(predictions_lag):.6f}")
print(f"  标准差: {np.std(predictions_lag):.6f}")

# Persist the lag-based submission.
# Relies on `submission_dir` created by the earlier submission cell.
submission_lag = test_with_lag[['Timestamp']].assign(Prediction=predictions_lag)

submission_file_lag = submission_dir / 'overlap_lag_submission.csv'
# Write the CSV without the DataFrame index column.
submission_lag.to_csv(submission_file_lag, index=False)

print(f"\n✅ Lag 提交文件已保存: {submission_file_lag}")

Lag 预测统计:
  均值: 0.000021
  标准差: 0.000004

✅ Lag 提交文件已保存: ..\submissions\overlap_lag_submission.csv


## 总结

### Overlap 方法核心思想:

1. **直接重叠**: 测试集时间戳如果在训练集中出现，直接使用训练集的 Target 值
2. **最近值填充**: 对于不重叠的时间戳，使用最近已知的 Target 值
3. **滞后值预测**: 使用前一个时间点的 Target 值作为当前预测

当测试集与训练集的时间戳存在重叠（即数据泄露）时，这个方法在时间序列预测比赛中非常有效。但在本数据集中重叠比例为 0%，因此上述三种策略实际上都退化为常数（或近似常数）预测，只能作为基线参考。