# 数据处理

In [9]:
import numpy as np
import pandas as pd

df_raw = pd.read_excel('全因子.xlsx')
df_raw.set_index('date', inplace=True)
#df_raw = df_raw.loc[df_raw.index > '2021-08-13']#永续回测需要

df = pd.DataFrame(index=df_raw.index)

# 0:63 直接差分（保持原逻辑）
df = pd.concat([df, df_raw.iloc[:, 0:63].diff(1)], axis=1)

# 63:65 先前向填充再差分（保持原逻辑）
df = pd.concat([df, df_raw.iloc[:, 63:65].fillna(method='ffill').diff(1)], axis=1)

# 65: 之后 —— 改为先前向填充再差分（这是唯一改动）
df = pd.concat([df, df_raw.iloc[:, 65:].fillna(method='ffill').diff(1)], axis=1)

# 行/列缺失清理
df = df.dropna(thresh=df.shape[1] / 2)  # 删除一行超过一半 NaN（周末等）
for col in df.columns:  # 删除 NaN 过多的列
    if df[col].isnull().sum() > df.shape[0] / 2:
        print(col, '前端缺失，已删除。')
        df.drop([col], axis=1, inplace=True)

# 转数值 + 异常值置 NaN + 线性插补
for col in df.columns:
    df[col] = pd.to_numeric(df[col])
    # z-score 绝对值大于 5 判定为异常
    df[col] = df[col].apply(lambda x: np.nan if abs((x - df[col].mean()) / df[col].std()) > 5 else x)
    df[col] = df[col].interpolate(method='linear', axis=0)
    
df.fillna(0, inplace=True)
df_clean = df.dropna()
print(df_clean.info())

# 标签：下一期 Δ(二级5yYTM) 是否 > 0
df_clean['value_sort'] = df_clean['二级5yYTM'].shift(-1).apply(lambda x: 1 if x > 0 else 0)
df_clean = df_clean.iloc[:-1]

  df = pd.concat([df, df_raw.iloc[:, 63:65].fillna(method='ffill').diff(1)], axis=1)
  df = pd.concat([df, df_raw.iloc[:, 65:].fillna(method='ffill').diff(1)], axis=1)


SHIBOR_2Y-FR007_2Y 前端缺失，已删除。
TL当季价格 前端缺失，已删除。
TL下季价格 前端缺失，已删除。
TL成交量 前端缺失，已删除。
TL持仓量 前端缺失，已删除。
TL跨期价差 前端缺失，已删除。
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1783 entries, 2019-01-03 to 2025-08-28
Columns: 189 entries, 同业存单_1Y-FR007_1Y to 永续债_7Y-二级资本债_7Y
dtypes: float64(189)
memory usage: 2.6 MB
None


In [3]:
import numpy as np
import pandas as pd

df_raw = pd.read_excel('全因子.xlsx')
df_raw.set_index('date', inplace=True)

# === 1) 先从原始数据抽出 ytm，并据此生成标签（不做任何插值/差分） ===
ytm = df_raw['二级5yYTM'].copy()
ytm = ytm.dropna()  # 去掉原始缺失的样本
labels = (ytm.shift(-5) - ytm > 0).astype(int)  # 周一→下周一（5个交易日）
labels = labels.dropna()  # 去掉尾部 5 行

# === 2) 特征：把 ytm 剔除后再做你原来的分段差分与清洗 ===
feat_raw = df_raw.drop(columns=['二级5yYTM'])

df = pd.DataFrame(index=feat_raw.index)
df = pd.concat([df, feat_raw.iloc[:, 0:63].diff(1)], axis=1)
df = pd.concat([df, feat_raw.iloc[:, 63:65].ffill().diff(1)], axis=1)
df = pd.concat([df, feat_raw.iloc[:, 65:].ffill().diff(1)], axis=1)

# 行/列缺失清理
df = df.dropna(thresh=df.shape[1] / 2)
for col in list(df.columns):
    if df[col].isna().sum() > df.shape[0] / 2:
        print(col, '前端缺失，已删除。')
        df.drop(columns=[col], inplace=True)

# 数值化 + 异常值→NaN + 线性插补（仅对特征做）
for col in df.columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')
    mu, sd = df[col].mean(), df[col].std()
    z = (df[col] - mu) / (sd if sd not in (0, np.nan) else 1)
    df[col] = df[col].mask(z.abs() > 5)
    df[col] = df[col].interpolate(method='linear', axis=0)

df = df.fillna(0)

# === 3) 与标签按时间对齐，得到最终 df_clean ===
common_idx = df.index.intersection(labels.index)
df_clean = df.loc[common_idx].copy()
df_clean['value_sort'] = labels.loc[common_idx].astype(int)

print(df_clean.info())

SHIBOR_2Y-FR007_2Y 前端缺失，已删除。
TL当季价格 前端缺失，已删除。
TL下季价格 前端缺失，已删除。
TL成交量 前端缺失，已删除。
TL持仓量 前端缺失，已删除。
TL跨期价差 前端缺失，已删除。
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1662 entries, 2019-01-03 to 2025-08-28
Columns: 189 entries, 同业存单_1Y-FR007_1Y to value_sort
dtypes: float64(188), int64(1)
memory usage: 2.4 MB
None


# GBM

In [10]:
import warnings
warnings.filterwarnings('ignore')

from hypernets.utils import logging

from sklearn.model_selection import train_test_split

from hypergbm import make_experiment
from hypernets.tabular.datasets import dsutils

In [11]:
# 假设 df_clean 已经准备好，索引为时间或顺序
label_col = "value_sort"
split_point = int(len(df_clean) * 0.7)

# 按顺序切分
df_train = df_clean.iloc[:split_point]
df_test  = df_clean.iloc[split_point:]

# 拆分特征和标签
X_train, y_train = df_train.drop(columns=[label_col]), df_train[label_col]
X_test,  y_test  = df_test.drop(columns=[label_col]),  df_test[label_col]

train_data = pd.concat([X_train,y_train],axis=1)
experiment = make_experiment(train_data, target='value_sort')
estimator = experiment.run()

### Input Data

Unnamed: 0,X_train.shape,y_train.shape,X_eval.shape,y_eval.shape,X_test.shape,Task
0,"(1163, 188)","(1163,)",,,,binary(2)


ValueError: object __array__ method not producing an array

<Figure size 640x320 with 1 Axes>

## Data Adaption

### Initliazed parameters

Unnamed: 0,key,value
0,memory_limit,0.05
1,min_cols,0.3
2,name,data_adaption
3,target,


### Fitted parameters

Unnamed: 0,key,value
0,input_features,"[同业存单_1Y-FR007_1Y, 国债_1Y-FR007_1Y, 国债_2Y-FR007..."
1,selected_features,"[同业存单_1Y-FR007_1Y, 国债_1Y-FR007_1Y, 国债_2Y-FR007..."
2,unselected_features,[]


### Elapsed

* 0.007 seconds

## Data Clean

### Initliazed parameters

Unnamed: 0,key,value
0,cv,True
1,data_cleaner_args,"{'nan_chars': None, 'correct_object_dtype': Tr..."
2,name,data_clean
3,train_test_split_strategy,


### Fitted parameters

Unnamed: 0,key,value
0,input_features,"[同业存单_1Y-FR007_1Y, 国债_1Y-FR007_1Y, 国债_2Y-FR007..."
1,selected_features,"[同业存单_1Y-FR007_1Y, 国债_1Y-FR007_1Y, 国债_2Y-FR007..."
2,unselected_features,"[OMO007, SLF007]"
3,X_train.shape,"(1163, 186)"
4,y_train.shape,"(1163,)"
5,X_eval.shape,
6,y_eval.shape,
7,X_test.shape,
8,unselected_reason,"{'OMO007': 'constant', 'SLF007': 'constant'}"
9,kept/dropped feature count,186/2


### Elapsed

* 0.034 seconds

## Space Searching

### Initliazed parameters

Unnamed: 0,key,value
0,cv,True
1,name,space_searching
2,num_folds,3


#### Experiment Settings:

HyperGBM(searcher=EvolutionSearcher(space_fn=GeneralSearchSpaceGenerator(kwargs=None, n_estimators=200), population_size=30, sample_size=10, regularized=True, optimize_direction='max', random_state=RandomState(MT19937) at 0x7FDEA0E12E40), callbacks=[EarlyStoppingCallback(max_no_improvement_trials=10, mode='max', time_limit=3599.95792555809), NotebookCallback(), ProgressiveCallback(), FitCrossValidationCallback()], task='binary', discriminator=OncePercentileDiscriminator(percentile=50, history=TrialHistory(direction='max'), optimize_direction='max'))

Unnamed: 0,X,y,X_eval,y_eval,cv,num_folds,max_trials,fit_kwargs
0,"(1163, 186)","(1163,)",,,True,3,10,()


#### Trials Summary:

Unnamed: 0,Trial No.,Previous reward,Best trial,Best reward,Total elapsed,Valid trials,Max trials
0,10,ERR,2,[0.6672398968185727],5.112949,2,10


#### Best Trial:

key,value
signature,07eb114c3cebfac1fa1af984886326ec
vectors,"[2, 3, 0, 2, 0, 2]"
0-estimator_options.hp_or,2
,
1-numeric_imputer_0.strategy,most_frequent
,
2-numeric_or_scaler_0.hp_or,0
,
3-Module_CatBoostEstimator_1.learning_rate,0.5
,


#### Top trials:

Unnamed: 0,Trial No.,Reward,Elapsed,Space Vector
0,2,[0.6672398968185727],1.155156,"[2, 3, 0, 2, 0, 2]"
1,7,[0.650042992261393],1.030864,"[2, 1, 1, 3, 0, 1]"


search:   0%|          | 0/10 [00:00<?, ?it/s]

fit_cross_validation:   0%|          | 0/3 [00:00<?, ?it/s]

09-16 22:31:02 E hypernets.m.hyper_model.py 104 - run_trail failed! trail_no=1
09-16 22:31:02 E hypernets.m.hyper_model.py 106 - Traceback (most recent call last):
  File "/root/miniconda3/envs/hypergbm/lib/python3.9/site-packages/hypernets/model/hyper_model.py", line 95, in _run_trial
    ret_data = estimator.fit_cross_validation(X, y, stratified=True, num_folds=num_folds, shuffle=False,
  File "/root/miniconda3/envs/hypergbm/lib/python3.9/site-packages/hypergbm/hyper_gbm.py", line 338, in fit_cross_validation
    fold_est.fit(x_train_fold, y_train_fold, **fit_kwargs)
  File "/root/miniconda3/envs/hypergbm/lib/python3.9/site-packages/hypergbm/estimators.py", line 517, in fit
    return self.fit_with_encoder(super().fit, X, y, kwargs)
  File "/root/miniconda3/envs/hypergbm/lib/python3.9/site-packages/hypergbm/estimators.py", line 198, in fit_with_encoder
    return fn_fit(X, y, **kwargs)
  File "/root/miniconda3/envs/hypergbm/lib/python3.9/site-packages/xgboost/core.py", line 726, in i

### Fitted parameters

Unnamed: 0,key,value
0,input_features,"[同业存单_1Y-FR007_1Y, 国债_1Y-FR007_1Y, 国债_2Y-FR007..."
1,best_reward,[0.6672398968185727]
2,history,"TrialHistory(direction='max', size=10, succeed..."


### Elapsed

* 5.146 seconds

## Final Ensemble

### Initliazed parameters

Unnamed: 0,key,value
0,ensemble_size,20
1,name,final_ensemble
2,scorer,"make_scorer(accuracy_score, response_method='p..."


### Fitted parameters

Unnamed: 0,0,1
0,weights,"[1.0, 0.0]"
1,scores,"[0.6672398968185727, 0.6672398968185727, 0.667..."
2,best_stack,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,hits,{0: 20}
4,ensemble_size,20


### Elapsed

* 4.770 seconds

In [12]:
import pickle
with open('model.pkl','wb') as f:
  pickle.dump(estimator, f)

In [13]:
from sklearn.metrics import classification_report

y_pred=estimator.predict(df_test)
print(classification_report(y_test, y_pred, digits=5))

              precision    recall  f1-score   support

           0    0.66324   0.80124   0.72574       322
           1    0.41818   0.25989   0.32056       177

    accuracy                        0.60922       499
   macro avg    0.54071   0.53056   0.52315       499
weighted avg    0.57631   0.60922   0.58202       499



In [14]:
from sklearn.metrics import roc_auc_score

# 假设 estimator 是你训练好的模型
y_prob = estimator.predict_proba(X_test)[:, 1]   # 取正类概率
auc = roc_auc_score(y_test, y_prob)
print("AUC:", auc)

AUC: 0.5608134189563814


In [15]:
pip show scikit-learn

Name: scikit-learn
Version: 1.2.2
Summary: A set of python modules for machine learning and data mining
Home-page: http://scikit-learn.org
Author: 
Author-email: 
License: new BSD
Location: /root/miniconda3/envs/hypergbm/lib/python3.9/site-packages
Requires: joblib, numpy, scipy, threadpoolctl
Required-by: hypergbm, hypernets, imbalanced-learn
Note: you may need to restart the kernel to use updated packages.


# AUTOGLUON

In [4]:
from autogluon.tabular import TabularDataset, TabularPredictor

## 处理二级债

In [2]:
import numpy as np
import pandas as pd

df_raw = pd.read_excel('全因子.xlsx')
df_raw.set_index('date', inplace=True)


ytm = df_raw[['二级5yYTM']].copy()

labels = (ytm['二级5yYTM'].shift(-1) - ytm['二级5yYTM'] > 0).astype(int)

df_clean = df_raw.copy()
df_clean['value_sort'] = labels

# 删除最后一行（因为没有未来数据做标签）
df_clean = df_clean.iloc[:-1]

In [58]:
import pandas as pd

# 读取数据
df_1 = pd.read_excel('全因子.xlsx')
#df_2 = pd.read_csv('all_data_5.csv')
df_2 = pd.read_excel('FR007_PRED.xlsx',sheet_name = 1)
# 将 'date' 列设置为索引
df_1.set_index('date', inplace=True)
df_2.set_index('date', inplace=True)

# 删除 df_2 中的 'Y1' 列
df_2 = df_2.drop('Y1', axis=1)

# 确保索引是 datetime 类型
df_1.index = pd.to_datetime(df_1.index, errors='coerce')
df_2.index = pd.to_datetime(df_2.index, errors='coerce')

# 合并两个 DataFrame，基于索引进行合并
df_clean = pd.merge(df_1, df_2, left_index=True, right_index=True, how='inner')

df_3 = pd.read_excel('FR007.xlsx')
df_3.set_index('date', inplace=True)
df_clean = pd.merge(df_clean, df_3, left_index=True, right_index=True, how='inner')

In [6]:
import pandas as pd

# ====== 配置 ======
path1, sheet1 = "merged.xlsx", 0
path2, sheet2 = "FR007_PRED.xlsx", 1   # 确认这个 sheet 索引是否正确！
path3, sheet3 = "FR007.xlsx", 0
target_col = "Y1"                       # 你要差分的列名（如 "Y1" 或 "Y2"）

# ====== 工具函数 ======
def ensure_date_index(df, date_col=None):
    """
    将 df 的日期列设为索引，自动处理常见格式：
    - 指定了 date_col 就用该列，否则用第一列
    - 先常规 pd.to_datetime 解析；若大量 NaT，尝试按 Excel 序列号解析
    - 归一化到日期、丢 NaT、去重、排序
    """
    # 选索引列
    if date_col is not None:
        if date_col not in df.columns:
            raise ValueError(f"指定的日期列 '{date_col}' 不存在；可用列: {df.columns.tolist()[:10]} ...")
        idx = df[date_col]
        df = df.drop(columns=[date_col])
    else:
        idx = df.iloc[:, 0]
        df = df.drop(df.columns[0], axis=1)

    # 常规解析
    dt = pd.to_datetime(idx, errors="coerce")

    # 若解析失败较多，尝试 Excel 序列号（天数，自 1899-12-30 起）
    if dt.isna().mean() > 0.5:
        s = pd.to_numeric(idx, errors="coerce")
        if s.notna().mean() > 0.5 and s.between(40000, 90000).mean() > 0.5:
            dt = pd.to_datetime(s, unit="d", origin="1899-12-30", errors="coerce")

    dt = dt.dt.normalize()
    mask = dt.notna()
    df = df.loc[mask].copy()
    df.index = dt[mask]
    df = df[~df.index.duplicated(keep="first")].sort_index()
    return df

def describe(name, df):
    mn = df.index.min()
    mx = df.index.max()
    print(f"{name}: 行数={len(df)}, 日期范围=[{mn} ~ {mx}]")

# ====== 读取与预处理 ======
df1 = pd.read_excel(path1, sheet_name=sheet1)
df2 = pd.read_excel(path2, sheet_name=sheet2)
df3 = pd.read_excel(path3, sheet_name=sheet3)

# 优先用名为 'date' 的列；没有则默认第一列
df1 = ensure_date_index(df1, 'date' if 'date' in df1.columns else None)
df2 = ensure_date_index(df2, 'date' if 'date' in df2.columns else None)
df3 = ensure_date_index(df3, 'date' if 'date' in df3.columns else None)

describe("df1", df1)
describe("df2", df2)
describe("df3", df3)

# ====== 求三表的日期交集并合并 ======
common_idx = df1.index.intersection(df2.index).intersection(df3.index)
print(f"共同日期个数={len(common_idx)}, 共同日期范围=[{common_idx.min()} ~ {common_idx.max()}]")

df_clean = pd.concat([df1, df2, df3], axis=1).loc[common_idx].sort_index()

# ====== 差分与标签（统一使用你提供的 target_col）======
if target_col not in df_clean.columns:
    raise ValueError(f"目标列 '{target_col}' 不在合并后的数据中。可用列示例: {df_clean.columns.tolist()[:20]}")

df_clean["diff"]  = df_clean[target_col].diff()
df_clean["label"] = (df_clean["diff"].shift(-1) > 0).astype("float")
df_clean = df_clean.dropna(subset=["diff"])
if df_clean["label"].isna().any():
    df_clean = df_clean.iloc[:-1, :]
df_clean["label"] = df_clean["label"].astype(int)

describe("df_clean", df_clean)
print(f"标签基于列: {target_col}")
# df_clean.to_excel("df_clean.xlsx")


df1: 行数=3183, 日期范围=[2017-01-01 00:00:00 ~ 2025-09-18 00:00:00]
df2: 行数=571, 日期范围=[2023-06-01 00:00:00 ~ 2025-09-16 00:00:00]
df3: 行数=2273, 日期范围=[2017-01-02 00:00:00 ~ 2025-09-17 00:00:00]
共同日期个数=560, 共同日期范围=[2023-06-01 00:00:00 ~ 2025-09-16 00:00:00]
df_clean: 行数=559, 日期范围=[2023-06-02 00:00:00 ~ 2025-09-16 00:00:00]
标签基于列: Y1


In [1]:
import pandas as pd

# ====== 配置 ======
path1, sheet1 = "merged.xlsx", 0
path3, sheet3 = "国债.xlsx", 0
target_col = "国债_10Y"   # 作为标签基准的列（来自 FR007.xlsx）

# ====== 工具函数 ======
def ensure_date_index(df, date_col=None):
    """将 df 的日期列设为索引，并做日期规范化、去重、排序。"""
    if date_col is not None and date_col in df.columns:
        idx = df[date_col]
        df = df.drop(columns=[date_col])
    else:
        idx = df.iloc[:, 0]
        df = df.drop(df.columns[0], axis=1)

    # 常规解析
    dt = pd.to_datetime(idx, errors="coerce")

    # 若解析失败较多，尝试 Excel 序列号（以 1899-12-30 为起点的天数）
    if dt.isna().mean() > 0.5:
        s = pd.to_numeric(idx, errors="coerce")
        if s.notna().mean() > 0.5 and s.between(40000, 90000).mean() > 0.5:
            dt = pd.to_datetime(s, unit="d", origin="1899-12-30", errors="coerce")

    dt = dt.dt.normalize()
    mask = dt.notna()
    df = df.loc[mask].copy()
    df.index = dt[mask]
    df = df[~df.index.duplicated(keep="first")].sort_index()
    return df

def describe(name, df):
    mn, mx = df.index.min(), df.index.max()
    print(f"{name}: 行数={len(df)}, 日期范围=[{mn} ~ {mx}]")

# ====== 读取与预处理 ======
df1 = pd.read_excel(path1, sheet_name=sheet1)
df3 = pd.read_excel(path3, sheet_name=sheet3)

# 优先使用名为 'date' 的列作为索引；没有则默认第一列
df1 = ensure_date_index(df1, 'date' if 'date' in df1.columns else None)
df3 = ensure_date_index(df3, 'date' if 'date' in df3.columns else None)

# 删除 FR007_5Y 为空的行（在 FR007.xlsx 侧先清理）
if target_col not in df3.columns:
    raise ValueError(f"FR007.xlsx 中未找到列 {target_col}，可用列有：{list(df3.columns)}")
df3 = df3.dropna(subset=[target_col])

describe("df1", df1)
describe("df3 (清理后)", df3)

# ====== 求交集并合并 ======
common_idx = df1.index.intersection(df3.index)
print(f"共同日期个数={len(common_idx)}, 范围=[{common_idx.min()} ~ {common_idx.max()}]")

df_clean = pd.concat([df1, df3], axis=1).loc[common_idx].sort_index()

# ====== 差分与标签（基于 FR007_5Y）=====
df_clean["diff"] = df_clean[target_col].diff()

# 用“下一期”的涨跌作为当前样本标签：>0 -> -1，<0 -> 1，=0 -> 0
df_clean["label"] = df_clean["diff"].shift(-1).apply(
    lambda x: (-1 if x > 0 else (1 if x < 0 else 0)) if pd.notna(x) else pd.NA
)

# 去掉首行(diff 为 NaN) 与末行(label 为 NA)
df_clean = df_clean.dropna(subset=["diff", "label"])
df_clean["label"] = df_clean["label"].astype(int)

# 如果不想保留“持平=0”的样本，取消下一行注释：
# df_clean = df_clean[df_clean["label"] != 0]

describe("df_clean", df_clean)
print(f"标签基于列: {target_col}")

# 可选：保存结果
df_clean.to_excel("df_clean.xlsx")

df1: 行数=3183, 日期范围=[2017-01-01 00:00:00 ~ 2025-09-18 00:00:00]
df3 (清理后): 行数=2115, 日期范围=[2017-01-03 00:00:00 ~ 2025-09-11 00:00:00]
共同日期个数=2115, 范围=[2017-01-03 00:00:00 ~ 2025-09-11 00:00:00]
df_clean: 行数=2113, 日期范围=[2017-01-04 00:00:00 ~ 2025-09-10 00:00:00]
标签基于列: 国债_10Y


## 处理FR007

In [1]:
import numpy as np
import pandas as pd

df_clean = pd.read_csv('all_data_5.csv')
df_clean.set_index('date', inplace=True)
df_clean = df_clean.drop('Y1',axis=1)

In [2]:
# 假设 df_clean 已经准备好，索引为时间或顺序

label_col = "label"
split_point = int(len(df_clean) * 0.8)

# 按顺序切分
df_train = df_clean.iloc[:split_point]
df_test  = df_clean.iloc[split_point:]

# 拆分特征和标签
X_train, y_train = df_train.drop(columns=[label_col]), df_train[label_col]
X_test,  y_test  = df_test.drop(columns=[label_col]),  df_test[label_col]

train_data = pd.concat([X_train,y_train],axis=1)
test_data = pd.concat([X_test, y_test],axis = 1)

In [3]:
from autogluon.tabular import TabularDataset, TabularPredictor
predictor = TabularPredictor(label=label_col).fit(train_data)

No path specified. Models will be saved in: "AutogluonModels/ag-20250919_035048"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.4.0
Python Version:     3.12.3
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #25-Ubuntu SMP Wed Mar 30 15:54:22 UTC 2022
CPU Count:          20
Memory Avail:       974.53 GB / 1007.51 GB (96.7%)
Disk Space Avail:   14.19 GB / 30.00 GB (47.3%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='extreme' : New in v1.4: Massively better than 'best' on datasets <30000 samples by using new models meta-learned on https://tabarena.ai: TabPFNv2, TabICL, Mitra, and TabM. Absolute best accuracy. Requires a GPU. Recommended 64 GB CPU memory and 32+ GB GPU memory.
	presets='best'    : Maximize accuracy. Recommended 

In [10]:
predictor.evaluate(test_data)

{'accuracy': 0.5224586288416075,
 'balanced_accuracy': np.float64(0.36499226436807924),
 'mcc': 0.10013290053350657}

In [7]:
predictor.leaderboard(test_data)

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,LightGBM,0.607143,0.688889,accuracy,0.02621,0.025625,0.429002,0.02621,0.025625,0.429002,1,True,2
1,XGBoost,0.607143,0.655556,accuracy,0.031925,0.025985,0.959421,0.031925,0.025985,0.959421,1,True,9
2,LightGBMXT,0.5625,0.677778,accuracy,0.025678,0.023011,0.822087,0.025678,0.023011,0.822087,1,True,1
3,ExtraTreesGini,0.5625,0.7,accuracy,0.056777,0.091613,1.02877,0.056777,0.091613,1.02877,1,True,6
4,WeightedEnsemble_L2,0.535714,0.755556,accuracy,0.367625,0.238382,21.0639,0.002021,0.000618,0.040376,2,True,12
5,ExtraTreesEntr,0.526786,0.666667,accuracy,0.057842,0.090468,1.02181,0.057842,0.090468,1.02181,1,True,7
6,NeuralNetFastAI,0.517857,0.733333,accuracy,0.045053,0.034264,2.587463,0.045053,0.034264,2.587463,1,True,8
7,RandomForestEntr,0.508929,0.7,accuracy,0.059717,0.102804,0.899581,0.059717,0.102804,0.899581,1,True,4
8,RandomForestGini,0.5,0.7,accuracy,0.057804,0.087081,1.038618,0.057804,0.087081,1.038618,1,True,3
9,NeuralNetTorch,0.491071,0.644444,accuracy,0.052368,0.049133,4.19476,0.052368,0.049133,4.19476,1,True,10


In [11]:
# 1. 确保 'test_data' 是正确的，并且返回的预测结果是 Series 类型
predictions = predictor.predict(test_data)

# 2. 将预测结果的索引对齐到 'test_data' 的索引
df_result = test_data.reset_index()[['date', '国债_10Y', 'label']].copy()

# 3. 将预测结果添加到 DataFrame
df_result['predictions'] = predictions.values  # 使用 .values 确保是一个可插入的值

# 4. 输出查看
print(df_result.head())

# 5. 保存到 Excel 文件
output_file = 'prediction_results_国债_2025.xlsx'
df_result.to_excel(output_file, index=False)

print(f"预测结果已保存到 {output_file}")

        date  国债_10Y  label  predictions
0 2023-12-15  2.5612      1            1
1 2023-12-18  2.5546     -1           -1
2 2023-12-19  2.5666     -1           -1
3 2023-12-20  2.5760      1           -1
4 2023-12-21  2.5353      1            1
预测结果已保存到 prediction_results_国债_2025.xlsx


In [70]:
predictions = predictor.predict(test_data)
predictor.predict(test_data)

date
2025-03-21   -1
2025-03-24    1
2025-03-25   -1
2025-03-26    1
2025-03-27   -1
             ..
2025-08-22    1
2025-08-25    1
2025-08-26    1
2025-08-27   -1
2025-08-28   -1
Name: label, Length: 110, dtype: int64

In [43]:
type(predictions)

pandas.core.series.Series

In [4]:
from autogluon.tabular import TabularPredictor
import pandas as pd
import matplotlib.pyplot as plt

# 用验证集/测试集计算特征重要性（需要包含label列，AutoGluon会用它评估）
imp = predictor.feature_importance(test_data, subsample_size=None)  # 或 train_data

# 1) 保存到 Excel/CSV
imp.to_excel('feature_importance.xlsx')
imp.to_csv('feature_importance.csv', index=True)

# 2) 画前30个特征的重要性条形图并保存
topk = 30
plot_df = imp.head(topk).sort_values('importance')  # importance越大越重要
ax = plot_df['importance'].plot(kind='barh', figsize=(8, 10))
ax.set_xlabel('Permutation Importance')
ax.set_ylabel('Feature')
ax.set_title(f'AutoGluon Feature Importance (Top {topk})')
plt.tight_layout()
plt.savefig('feature_importance_top30.png', dpi=150)
plt.close()

print('已生成：feature_importance.xlsx / feature_importance.csv / feature_importance_top30.png')

These features in provided data are not utilized by the predictor and will be ignored: ['国债_10Y-国开债_10Y', '国债_10Y-企业债AAA+_10Y', '国债_10Y-中短期票据AAA+_10Y', '国债_10Y-国债_1Y', '国债_10Y-国债_3Y', '国债_10Y-国债_5Y', '国债_10Y-国债_7Y', '国债_30Y-国债_10Y', '国开债_10Y-国债_10Y', '国开债_5Y-国债_5Y', '国开债_1Y-国债_1Y', '中短期票据AAA+_1Y-国开债1Y', '中短期票据AAA+_3Y-国开债3Y', '中短期票据AAA+_5Y-国开债5Y', '中短期票据AAA+_7Y-国开债7Y', '中短期票据AAA+_10Y-国开债10Y', '中短期票据AAA+_30Y-国开债30Y', '国债_10Y-同业存单AAA_1Y', '国开债_10Y-国开债_5Y', '国开债_10Y-国开债_1Y', '活跃券_10Y-国债_10Y', '国债_5Y-FR007_IRS_5Y', '国债_1Y_收益率变动_5D', '国债_1Y_收益率变动_30D', '国债_1Y_收益率变动_90D', '国债_5Y_收益率变动_5D', '国债_5Y_收益率变动_30D', '国债_5Y_收益率变动_90D', '国债_7Y_收益率变动_5D', '国债_7Y_收益率变动_30D', '国债_7Y_收益率变动_90D', '国债_10Y_收益率变动_5D', '国债_10Y_收益率变动_30D', '国债_10Y_收益率变动_90D', '国债_30Y_收益率变动_5D', '国债_30Y_收益率变动_30D', '国债_30Y_收益率变动_90D', '国债_1Y_动量', '国债_5Y_动量', '国债_7Y_动量', '国债_10Y_动量', '国债_30Y_动量', '国债_1Y_macd', '国债_5Y_macd', '国债_7Y_macd', '国债_10Y_macd', '国债_30Y_macd', '交易日期是否交易日', '交易日期月', '交易日期日', '交易日期是否15号', '交易日期是否20号', '交易日期星期

已生成：feature_importance.xlsx / feature_importance.csv / feature_importance_top30.png
