<a href="https://colab.research.google.com/github/Sam-ai904/Huatai-Model/blob/main/%E9%81%97%E4%BC%A0%E8%A7%84%E5%88%92%2B%E5%9B%A0%E5%AD%90%E5%90%88%E6%88%90%2B%E5%9B%9E%E6%B5%8B%EF%BC%88%E7%A0%94%E6%8A%A5%E7%89%88%2B%E9%AB%98%E6%80%A7%E8%83%BD%EF%BC%89.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
#!pip install tushare bottleneck
import tushare as ts
import pandas as pd
import numpy as np
import random
from scipy.stats import spearmanr
from sklearn.linear_model import LinearRegression
import bottleneck as bn


In [17]:
# 设置Tushare API Token并初始化
ts.set_token('2876ea85cb005fb5fa17c809a98174f2d5aae8b1f830110a5ead6211')
pro = ts.pro_api()


In [18]:
# 获取沪深300成分股前50
def get_hs300_top50(start_date , end_date):
    try:
        df = pro.index_weight(index_code='000001.SH', start_date= start_date, end_date= end_date)
        if df.empty:
            raise ValueError("未获取到沪深300成分股数据")

        df = df.sort_values('weight', ascending=False)
        stock_list = df['con_code'].unique()[:100].tolist()
        print(f"成功获取沪深300前50只股票: {stock_list[:6]}...（共 {len(stock_list)} 只）")
        return stock_list

    except Exception as e:
        print(f"获取沪深300成分股失败: {e}")
        return []

# 获取股票日频数据
def get_data(start_date, end_date, stock_list):
    df_list = []
    for stock in stock_list:
        try:
            temp_df = pro.daily(
                ts_code=stock,
                start_date=start_date,
                end_date=end_date,
                fields='ts_code,trade_date,open,close,high,low,vol,pct_chg'
            )
            if not temp_df.empty:
                # print(f"股票 {stock} 在 {start_date} 至 {end_date} 获取到 {len(temp_df)} 条数据")
                df_list.append(temp_df)
            else:
                print(f"股票 {stock} 在 {start_date} 至 {end_date} 无数据")
        except Exception as e:
            print(f"获取股票 {stock} 数据失败: {e}")

    if not df_list:
        print(f"时间范围 {start_date} 至 {end_date} 无任何股票数据")
        return pd.DataFrame()

    try:
        df = pd.concat(df_list)
        df.rename(columns={'vol': 'volume', 'pct_chg': 'return'}, inplace=True)
        df['return'] = df['return'] / 100
        df['trade_date'] = pd.to_datetime(df['trade_date'])

        # 检查并移除重复的 trade_date 和 ts_code 组合
        duplicates = df.duplicated(subset=['trade_date', 'ts_code'], keep=False)
        if duplicates.any():
            print(f"警告: 发现重复数据，共 {duplicates.sum()} 条，自动保留最后一条")
            df = df.drop_duplicates(subset=['trade_date', 'ts_code'], keep='last')

        # 重塑数据
        df_pivot = df.pivot(index='trade_date', columns='ts_code')

        # 检查收益率数据
        if 'return' in df_pivot:
            return_stats = df_pivot['return'].describe()
            print(f"收益率统计: {return_stats}")
            # 过滤收益率全为 NaN 或常数的股票
            valid_stocks = return_stats.loc['std'] > 0
            valid_stocks = valid_stocks[valid_stocks].index.tolist()
            if not valid_stocks:
                print("所有股票的收益率均为常数或 NaN，无法继续")
                return pd.DataFrame()
            df_pivot = df_pivot.loc[:, df_pivot.columns.get_level_values(1).isin(valid_stocks)]

        return df_pivot

    except Exception as e:
        print(f"数据合并失败: {e}")
        return pd.DataFrame()

In [19]:
# 中位数去极值函数
def winsorize_median(factor, n_mad=5):
    factor = factor.copy()
    # 计算中位数
    median = np.nanmedian(factor)
    # 计算中位数绝对偏差 (MAD)
    mad = np.nanmedian(np.abs(factor - median))
    # 设定上下限
    upper = median + n_mad * mad
    lower = median - n_mad * mad
    # 截断极值
    factor = np.clip(factor, lower, upper)
    return factor

In [20]:
# 标准化函数
def standardize(factor):
    factor = factor.copy()
    mean = np.nanmean(factor)
    std = np.nanstd(factor)
    return (factor - mean) / (std + 1e-10)  # 避免除以零

In [21]:
# 计算所有因子的值
def calculate_all_factors(data, factor_expressions, target_shape):
    factors = []
    # 使用 target 的展平长度来确定样本数
    expected_length = target_shape[0]
    for expr in factor_expressions:
        try:
            # 计算因子值
            factor_values = eval(expr, {'np': np, 'bn': bn}, {'data': data})
            # 展平为1维数组
            if isinstance(factor_values, pd.DataFrame):
                factor_values_flat = factor_values.values.flatten()
            else:
                factor_values_flat = factor_values.flatten()
            # 确保因子值长度与 target 对齐
            if len(factor_values_flat) != expected_length:
                print(f"因子 {expr} 的展平长度 {len(factor_values_flat)} 与目标长度 {expected_length} 不一致，调整中...")
                # 截断或填充因子值
                if len(factor_values_flat) > expected_length:
                    factor_values_flat = factor_values_flat[:expected_length]
                else:
                    # 填充 NaN 至目标长度
                    factor_values_flat = np.pad(factor_values_flat, (0, expected_length - len(factor_values_flat)),
                                               mode='constant', constant_values=np.nan)
            # 去极值和标准化
            factor_values_flat = standardize(winsorize_median(factor_values_flat))
            factors.append(factor_values_flat)
        except Exception as e:
            print(f"计算因子 {expr} 失败: {e}")
            factors.append(np.full(expected_length, np.nan))

    # 转换为 (样本数, 因子数) 的矩阵
    factors = np.array(factors).T
    return factors

In [22]:
# 使用随机森林进行因子合成
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# 使用随机森林进行因子合成
def synthesize_with_random_forest(factors, target, test_size=0.2, random_state=42):
    # 确保 factors 和 target 的样本数一致
    if factors.shape[0] != len(target):
        raise ValueError(f"factors 形状 {factors.shape[0]} 与 target 长度 {len(target)} 不一致")

    # 确保没有 NaN 值
    valid_mask = ~np.any(np.isnan(factors), axis=1) & ~np.isnan(target)
    factors_clean = factors[valid_mask]
    target_clean = target[valid_mask]

    if len(factors_clean) < 2:
        print("有效数据点少于2，无法进行随机森林训练")
        return np.full(len(target), np.nan), None

    # 划分训练集和测试集
    X_train, X_test, y_train, y_test = train_test_split(
        factors_clean, target_clean, test_size=test_size, random_state=random_state
    )

    # 训练随机森林模型
    rf = RandomForestRegressor(
        n_estimators=100,
        max_depth=10,
        random_state=random_state,
        n_jobs=-1
    )
    rf.fit(X_train, y_train)

    # 预测
    y_pred_train = rf.predict(X_train)
    y_pred_test = rf.predict(X_test)

    # 计算 MSE
    mse_train = mean_squared_error(y_train, y_pred_train)
    mse_test = mean_squared_error(y_test, y_pred_test)
    print(f"Train MSE: {mse_train:.6f}, Test MSE: {mse_test:.6f}")

    # 预测整个数据集
    final_factor = np.full(len(target), np.nan)
    final_factor[valid_mask] = rf.predict(factors[valid_mask])

    # 标准化最终因子
    final_factor = standardize(final_factor)

    return final_factor, rf

In [23]:
# 保存结果
def save_results(final_factor, data, output_path="synthetic_factor.csv"):
    # 确保 final_factor 的长度与 data.index 对齐
    if len(final_factor) != len(data.index):
        print(f"final_factor 长度 {len(final_factor)} 与 data.index 长度 {len(data.index)} 不一致，调整中...")
        min_length = min(len(final_factor), len(data.index))
        final_factor = final_factor[:min_length]
        data_subset = data.iloc[:min_length]
    else:
        data_subset = data

    result_df = pd.DataFrame({
        'date': data_subset.index,
        'synthetic_factor': final_factor
    })
    result_df.to_csv(output_path, index=False)
    print(f"合成因子已保存至 {output_path}")


In [24]:
# 初始化种群
def initialize_population(size, function_list):
    population = []
    for _ in range(size):
        formula = random.choice(function_list)
        population.append(formula)
    return population


In [25]:
# 修改后的 calculate_fitness 函数
def calculate_fitness(formula, data):
    try:
        factor_values = eval(formula, {'np': np, 'bn': bn}, {'data': data})
        # 将 factor_values 转换为 numpy 数组并展平
        if isinstance(factor_values, pd.DataFrame):
            factor_values_flat = factor_values.values.flatten()
        else:
            factor_values_flat = factor_values.flatten()
        # 将 returns 转换为 numpy 数组并展平
        returns_flat = data['return'].values.flatten()

        # 检查数据
        # print(f"因子公式: {formula}")
        # print(f"因子值形状: {factor_values.shape if isinstance(factor_values, (pd.DataFrame, np.ndarray)) else 'N/A'}, 展平后: {factor_values_flat.shape}")
        print(f"因子值统计: min={np.nanmin(factor_values_flat):.4f}, max={np.nanmax(factor_values_flat):.4f}, std={np.nanstd(factor_values_flat):.4f}")
        # print(f"因子值 NaN 比例: {np.isnan(factor_values_flat).mean():.4f}")
        print(f"收益率形状: {data['return'].shape}, 展平后: {returns_flat.shape}")
        print(f"收益率统计: min={np.nanmin(returns_flat):.4f}, max={np.nanmax(returns_flat):.4f}, std={np.nanstd(returns_flat):.4f}")
        # print(f"收益率 NaN 比例: {np.isnan(returns_flat).mean():.4f}")

        mask = ~(np.isnan(factor_values_flat) | np.isnan(returns_flat))
        if mask.sum() < 2:
            print("有效数据点少于 2，无法计算相关性")
            return -1

        # 检查是否为常数
        if np.nanstd(factor_values_flat[mask]) == 0 or np.nanstd(returns_flat[mask]) == 0:
            print("因子值或收益率是常数，无法计算相关性")
            return -1

        corr, _ = spearmanr(factor_values_flat[mask], returns_flat[mask])
        print(f"RankIC(corr): {corr:.4f}")
        return corr if not np.isnan(corr) else -1
    except Exception as e:
        print(f"计算适应度失败: {e}")
        return -1

In [26]:
def evolve_population(population, data, generations):
    for gen in range(generations):
        fitness_scores = [(formula, calculate_fitness(formula, data)) for formula in population]
        # 打印 fitness_scores 的统计信息
        scores = [score for _, score in fitness_scores if not np.isnan(score)]
        print(f"第 {gen+1} 代，适应度统计: min={min(scores) if scores else 'N/A'}, max={max(scores) if scores else 'N/A'}, mean={np.mean(scores) if scores else 'N/A'}")

        fitness_scores = sorted(fitness_scores, key=lambda x: x[1], reverse=True)[:int(len(fitness_scores)*0.6)]
        # 暂时移除筛选条件，确保种群不为空
        population = [item[0] for item in fitness_scores if item[1] > 0.015]

        print(f"第 {gen+1} 代，筛选后种群大小: {len(population)}")

        if not population:
            print("种群为空，停止进化，可能是因子公式无效或数据问题")
            return []

        new_population = []
        while len(new_population) < len(population):
            parent1, parent2 = random.sample(population, 2)
            new_formula = f"({parent1}) + ({parent2})"
            if random.random() < 0.1:
                operations = ['+', '*', '-']
                new_formula = new_formula.replace('+', random.choice(operations))
            new_population.append(new_formula)
        population = new_population[:len(population)]

    return population

In [27]:
# 计算残差收益率
def calculate_residual_return(data, factor_pool):
    if not factor_pool:
        # 展平 data['return'] 为一维数组
        residual = data['return'].values.flatten()
        print(f"初始残差收益率形状: {data['return'].shape}, 展平后: {residual.shape}")
        return pd.Series(residual)
    try:
        X = np.column_stack([eval(formula, {'np': np, 'bn': bn}, {'data': data}) for formula in factor_pool])
        y = data['return'].values.flatten()
        mask = ~(np.isnan(X).any(axis=1) | np.isnan(y))
        if mask.sum() < 2:
            print("残差计算数据点不足，返回原始收益率")
            return pd.Series(y)
        lr = LinearRegression()
        lr.fit(X[mask], y[mask])
        residuals = y - lr.predict(X)
        print(f"残差计算后形状: {residuals.shape}")
        return pd.Series(residuals)
    except Exception as e:
        print(f"残差计算错误: {e}")
        return pd.Series(data['return'].values.flatten())


In [28]:
# 修改后的 rolling_factor_extraction 函数（更新了 function_list）
def rolling_factor_extraction(start_date, end_date, interval_years=2):
    stock_list = get_hs300_top50(start_date, end_date)
    if not stock_list:
        print("无法获取股票列表，退出")
        return []

    current_date = pd.to_datetime(start_date)
    end_date = pd.to_datetime(end_date)
    factor_pool = []
    # 更新因子公式，增加复杂性和多样性
    function_list = [
        "data['open'] / (data['close'] + 1e-10)",  # 价格比率
        "(data['high'] - data['low']) / (data['close'] + 1e-10)",  # 波动性
        "bn.move_mean(data['volume'], window=5) / (bn.move_mean(data['volume'], window=20) + 1e-10)",  # 成交量均值比率
        "np.log(data['close'] + 1) - np.log(bn.move_mean(data['close'], window=5) + 1)",  # 价格对数差
        "bn.nanrankdata(data['close'].diff(1), axis=0) / (data['close'] + 1e-10)",  # 价格变化排名
        "(data['close'] - bn.move_mean(data['close'], window=10)) / bn.move_std(data['close'], window=10)"  # 标准化价格偏差
    ]

    # 设置最早数据日期
    earliest_date = pd.to_datetime('20150101')

    while current_date < end_date:
        sample_start = (current_date - pd.DateOffset(years=2))
        if sample_start < earliest_date:
            sample_start = earliest_date
        sample_start = sample_start.strftime('%Y%m%d')
        sample_end = current_date.strftime('%Y%m%d')

        print(f"正在处理窗口: {sample_start} 至 {sample_end}")
        data = get_data(sample_start, sample_end, stock_list)

        if data.empty:
            print(f"无数据: {sample_start} 至 {sample_end}")
            current_date += pd.DateOffset(years=interval_years)
            continue

        population_size = 200
        generations = 2
        population = initialize_population(population_size, function_list)

        try:
            residual_return = calculate_residual_return(data, factor_pool)
            # 确保 residual_return 是一维数组
            if isinstance(residual_return, pd.Series):
                residual_return_flat = residual_return.values
            else:
                residual_return_flat = residual_return.flatten()
            print(f"残差收益率统计: min={np.nanmin(residual_return_flat):.4f}, max={np.nanmax(residual_return_flat):.4f}, std={np.nanstd(residual_return_flat):.4f}")

            print("开始进化种群...")
            final_population = evolve_population(population, data, generations)
            print(f"最终种群大小: {len(final_population)}")

            for formula in final_population:
                try:
                    factor_values = eval(formula, {'np': np, 'bn': bn}, {'data': data})
                    # 展平二维数组，确保顺序一致
                    if isinstance(factor_values, pd.DataFrame):
                        factor_values_flat = factor_values.values.flatten()
                    else:
                        factor_values_flat = factor_values.flatten()
                    mask = ~(np.isnan(factor_values_flat) | np.isnan(residual_return_flat))
                    if mask.sum() < 2:
                        print(f"因子 {formula} 有效数据点少于 2，跳过")
                        continue

                    corr, _ = spearmanr(factor_values_flat[mask], residual_return_flat[mask])
                    # print(f"因子 {formula} 与残差的相关性: {corr:.4f}")

                    if abs(corr) < 0.7 and not np.isnan(corr):
                        factor_pool.append(formula)
                        # print(f"添加因子: {formula}, 相关性: {corr:.4f}")
                except Exception as e:
                    print(f"因子 {formula} 处理失败: {e}")
                    continue
        except Exception as e:
            print(f"处理错误: {e}")
            continue

        current_date += pd.DateOffset(years=interval_years)
        factor_pool = factor_pool[-100:]

    return factor_pool,data

In [32]:
# 回测函数
def backtest_synthetic_factor(start_date, end_date, stock_list, factor_file="synthetic_factor.csv"):
    """
    利用合成因子进行回测，评估其表现。

    参数：
    start_date (str): 回测开始日期，格式 'YYYYMMDD'
    end_date (str): 回测结束日期，格式 'YYYYMMDD'
    stock_list (list): 股票池
    factor_file (str): 合成因子文件路径

    返回：
    None
    """
    import matplotlib.pyplot as plt

    # 1. 加载合成因子
    try:
        factor_df = pd.read_csv(factor_file)
        factor_df['date'] = pd.to_datetime(factor_df['date'])
        factor_df.set_index('date', inplace=True)
        print(f"加载合成因子数据：{factor_df.shape}")
    except Exception as e:
        print(f"加载合成因子文件失败：{e}")
        return

    # 2. 获取回测期间的股票数据
    df_list = []
    for stock in stock_list:
        try:
            temp_df = pro.daily(
                ts_code=stock,
                start_date=start_date,
                end_date=end_date,
                fields='ts_code,trade_date,close,pct_chg'
            )
            if not temp_df.empty:
                df_list.append(temp_df)
            else:
                print(f"股票 {stock} 在 {start_date} 至 {end_date} 无数据")
        except Exception as e:
            print(f"获取股票 {stock} 数据失败：{e}")

    if not df_list:
        print(f"时间范围 {start_date} 至 {end_date} 无任何股票数据")
        return

    # 合并数据
    df = pd.concat(df_list)
    df['trade_date'] = pd.to_datetime(df['trade_date'])
    df['pct_chg'] = df['pct_chg'] / 100  # 转换为小数
    df_pivot = df.pivot(index='trade_date', columns='ts_code', values=['close', 'pct_chg'])
    print(f"回测数据形状：{df_pivot.shape}")

    # 3. 准备因子和收益数据
    # 假设因子值是展平的，需要重新整理为 (日期, 股票) 的形状
    dates = df_pivot.index
    stocks = df_pivot['close'].columns
    n_dates = len(dates)
    n_stocks = len(stocks)
    expected_length = n_dates * n_stocks

    # 检查因子数据长度
    if len(factor_df) != expected_length:
        print(f"因子数据长度 {len(factor_df)} 与预期长度 {expected_length} 不一致，调整中...")
        factor_df = factor_df.iloc[:expected_length]

    # 重塑因子值为 (日期, 股票) 的矩阵
    factor_values = factor_df['synthetic_factor'].values.reshape(n_dates, n_stocks)
    factor_df_reshaped = pd.DataFrame(factor_values, index=dates, columns=stocks)

    # 获取每日收益率
    returns = df_pivot['pct_chg']

    # 4. 构建投资组合
    portfolio_returns = []
    for date in returns.index:
        # 获取当日的因子值和收益率
        if date not in factor_df_reshaped.index:
            portfolio_returns.append(0.0)
            continue
        factor_values_day = factor_df_reshaped.loc[date]
        returns_day = returns.loc[date]

        # 过滤掉因子值或收益率为 NaN 的股票
        valid_mask = ~factor_values_day.isna() & ~returns_day.isna()
        factor_values_day = factor_values_day[valid_mask]
        returns_day = returns_day[valid_mask]

        if len(factor_values_day) < 2:
            portfolio_returns.append(0.0)
            continue

        # 根据因子值排序
        factor_rank = factor_values_day.rank()
        total_stocks = len(factor_rank)
        top_threshold = int(total_stocks * 0.8)  # 前 20%
        bottom_threshold = int(total_stocks * 0.2)  # 后 20%

        # 多头组合：因子值排名前 20% 的股票
        long_stocks = factor_rank[factor_rank > top_threshold].index
        # 空头组合：因子值排名后 20% 的股票
        short_stocks = factor_rank[factor_rank <= bottom_threshold].index

        # 计算多头和空头组合的收益率（等权重）
        long_return = returns_day[long_stocks].mean() if len(long_stocks) > 0 else 0.0
        short_return = returns_day[short_stocks].mean() if len(short_stocks) > 0 else 0.0

        # 多空组合收益率
        portfolio_return = long_return - short_return
        portfolio_returns.append(portfolio_return if not np.isnan(portfolio_return) else 0.0)

    # 5. 计算回测指标
    portfolio_returns = pd.Series(portfolio_returns, index=returns.index)

    # 累计收益率
    cumulative_returns = (1 + portfolio_returns).cumprod()

    # 年化收益率
    n_days = len(portfolio_returns)
    n_years = n_days / 252  # 假设一年252个交易日
    annualized_return = (cumulative_returns.iloc[-1]) ** (1 / n_years) - 1

    # 年化波动率
    annualized_volatility = portfolio_returns.std() * np.sqrt(252)

    # 最大回撤
    cumulative_max = cumulative_returns.cummax()
    drawdowns = (cumulative_returns - cumulative_max) / cumulative_max
    max_drawdown = drawdowns.min()

    # 夏普比率（假设无风险利率为0）
    sharpe_ratio = (annualized_return / annualized_volatility) if annualized_volatility != 0 else 0.0

    # 6. 输出回测结果
    print("\n回测结果：")
    print(f"累计收益率：{cumulative_returns.iloc[-1] - 1:.4f}")
    print(f"年化收益率：{annualized_return:.4f}")
    print(f"年化波动率：{annualized_volatility:.4f}")
    print(f"最大回撤：{max_drawdown:.4f}")
    print(f"夏普比率：{sharpe_ratio:.4f}")

    # 7. 可视化累计收益率
    plt.figure(figsize=(10, 6))
    plt.plot(cumulative_returns, label='累计收益率')
    plt.title('合成因子多空组合累计收益率')
    plt.xlabel('日期')
    plt.ylabel('累计收益率')
    plt.legend()
    plt.grid()
    plt.show()

In [33]:
# 主函数
def main():
    start_date = '20180101'
    end_date = '20240101'
    print(f"因子挖掘范围: {start_date} 至 {end_date}")

    # 因子挖掘
    rolling_factors, data = rolling_factor_extraction(start_date, end_date)
    print("挖掘的因子公式:")
    if not rolling_factors:
        print("因子池为空，可能是因子公式无效或数据问题，请检查日志")
        return

    for factor in rolling_factors:
        print(f'{factor}\n')

    # 因子合成
    print("开始因子合成...")
    # 准备目标变量（下一期收益率）
    target = data['return'].shift(-1).values.flatten()
    # 计算所有因子的值，传递 target 的形状
    factors = calculate_all_factors(data, rolling_factors, target_shape=target.shape)

    # 确保 factors 和 target 的样本数一致
    if factors.shape[0] != len(target):
        print(f"factors 形状 {factors.shape[0]} 与 target 长度 {len(target)} 不一致，调整中...")
        min_length = min(factors.shape[0], len(target))
        factors = factors[:min_length]
        target = target[:min_length]

    # 使用随机森林进行因子合成
    final_factor, rf_model = synthesize_with_random_forest(factors, target)

    # 保存结果
    save_results(final_factor, data)

    # 输出特征重要性
    if rf_model is not None:
        feature_importances = pd.Series(rf_model.feature_importances_, index=[f"Factor_{i+1}" for i in range(len(rolling_factors))])
        print("\n特征重要性（前10个因子）：")
        print(feature_importances.sort_values(ascending=False).head(10))

    # 回测
    print("\n开始回测...")
    stock_list = get_hs300_top50(start_date, end_date)  # 获取股票池
    backtest_synthetic_factor(start_date, end_date, stock_list)

In [34]:
# # 主函数
# def main():
#     start_date = '20180101'
#     end_date = '20240101'
#     print(f"因子挖掘范围: {start_date} 至 {end_date}")

#     # 因子挖掘
#     rolling_factors, data = rolling_factor_extraction(start_date, end_date)
#     print("挖掘的因子公式:")
#     if not rolling_factors:
#         print("因子池为空，可能是因子公式无效或数据问题，请检查日志")
#         return

#     for factor in rolling_factors:
#         print(f'{factor}\n')

#     # 因子合成
#     print("开始因子合成...")
#     # 准备目标变量（下一期收益率）
#     target = data['return'].shift(-1).values.flatten()
#     # 计算所有因子的值，传递 target 的形状
#     factors = calculate_all_factors(data, rolling_factors, target_shape=target.shape)

#     # 确保 factors 和 target 的样本数一致
#     if factors.shape[0] != len(target):
#         print(f"factors 形状 {factors.shape[0]} 与 target 长度 {len(target)} 不一致，调整中...")
#         min_length = min(factors.shape[0], len(target))
#         factors = factors[:min_length]
#         target = target[:min_length]

#     # 使用随机森林进行因子合成
#     final_factor, rf_model = synthesize_with_random_forest(factors, target)

#     # 保存结果
#     save_results(final_factor, data)

#     # 输出特征重要性
#     if rf_model is not None:
#         feature_importances = pd.Series(rf_model.feature_importances_, index=[f"Factor_{i+1}" for i in range(len(rolling_factors))])
#         print("\n特征重要性（前10个因子）：")
#         print(feature_importances.sort_values(ascending=False).head(10))


In [None]:
if __name__ == "__main__":
    main()

因子挖掘范围: 20180101 至 20240101
成功获取沪深300前50只股票: ['600519.SH', '601398.SH', '601288.SH', '601857.SH', '601988.SH', '601628.SH']...（共 100 只）
正在处理窗口: 20160101 至 20180101
股票 601728.SH 在 20160101 至 20180101 无数据
股票 601658.SH 在 20160101 至 20180101 无数据
股票 601138.SH 在 20160101 至 20180101 无数据
股票 601816.SH 在 20160101 至 20180101 无数据
