<a href="https://colab.research.google.com/github/Sam-ai904/Huatai-Model/blob/main/%E9%81%97%E4%BC%A0%E8%A7%84%E5%88%92_%E7%A0%94%E6%8A%A5%E7%89%88%2B%E9%AB%98%E6%80%A7%E8%83%BD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#!pip install tushare
!pip install bottleneck
import tushare as ts
import pandas as pd
import numpy as np
import random
from scipy.stats import spearmanr
from sklearn.linear_model import LinearRegression
import bottleneck as bn




In [2]:
# Configure the Tushare API token and create the Pro client.
# The token is taken from the TUSHARE_TOKEN environment variable, or prompted for
# interactively, so that it is never stored in the notebook itself.
# NOTE(review): a token was previously committed in this cell; it should be revoked.
import os
from getpass import getpass

_tushare_token = os.environ.get('TUSHARE_TOKEN') or getpass('Tushare API token: ')
ts.set_token(_tushare_token)
pro = ts.pro_api()


In [3]:
# Fetch the 50 highest-weighted CSI 300 (沪深300) constituents
def get_hs300_top50(start_date, end_date):
    """Return up to 50 distinct CSI 300 constituent codes, highest weight first.

    Args:
        start_date: Start of the query range, passed through to ``pro.index_weight``.
        end_date: End of the query range, passed through to ``pro.index_weight``.

    Returns:
        A list of ``con_code`` values ordered by descending ``weight``. Each code
        appears at most once. An empty list is returned when the query yields no
        rows or fails for any reason (the failure is printed, not raised).
    """
    try:
        df = pro.index_weight(index_code='000300.SH', start_date=start_date, end_date=end_date)
        if df.empty:
            raise ValueError("未获取到沪深300成分股数据")
        print("已经获取数据，正在运行模型...")

        # If the response holds several rows for the same constituent (e.g. one per
        # weight snapshot inside the range), keep only its highest-weight row so the
        # result is 50 distinct stocks rather than a few stocks repeated.
        ranked = df.sort_values('weight', ascending=False)
        top = ranked.drop_duplicates(subset='con_code', keep='first').head(50)
        return top['con_code'].tolist()
    except Exception as e:
        print(f"获取沪深300成分股失败: {e}")
        return []

# Download daily bars per stock and reshape them into a date-indexed panel
def get_data(start_date, end_date, stock_list):
    """Fetch daily bars for every stock and pivot them into one wide frame.

    Each stock is requested separately through ``pro.daily``; stocks that return
    no rows or raise are reported with ``print`` and skipped. The combined frame
    has ``vol`` renamed to ``volume`` and ``pct_chg`` renamed to ``return`` (and
    divided by 100), ``trade_date`` parsed as datetime, and repeated
    (trade_date, ts_code) pairs reduced to their last occurrence.

    Returns:
        The frame pivoted with ``trade_date`` as index and ``ts_code`` as columns
        (one column group per remaining field), or an empty DataFrame when
        nothing was fetched or the merge step failed.
    """
    frames = []
    for code in stock_list:
        try:
            bars = pro.daily(
                ts_code=code,
                start_date=start_date,
                end_date=end_date,
                fields='ts_code,trade_date,open,close,high,low,vol,pct_chg'
            )
            if bars.empty:
                print(f"股票 {code} 在 {start_date} 至 {end_date} 无数据")
                continue
            frames.append(bars)
        except Exception as e:
            print(f"获取股票 {code} 数据失败: {e}")

    if len(frames) == 0:
        print(f"时间范围 {start_date} 至 {end_date} 无任何股票数据")
        return pd.DataFrame()

    try:
        merged = pd.concat(frames).rename(columns={'vol': 'volume', 'pct_chg': 'return'})
        merged['return'] = merged['return'] / 100
        merged['trade_date'] = pd.to_datetime(merged['trade_date'])

        # Report and collapse repeated (trade_date, ts_code) pairs before pivoting
        key_columns = ['trade_date', 'ts_code']
        duplicate_count = merged.duplicated(subset=key_columns, keep=False).sum()
        if duplicate_count > 0:
            print(f"警告: 发现重复数据，共 {duplicate_count} 条，自动保留最后一条")
            merged = merged.drop_duplicates(subset=key_columns, keep='last')

        return merged.pivot(index='trade_date', columns='ts_code')
    except Exception as e:
        print(f"数据合并失败: {e}")
        return pd.DataFrame()


In [4]:
# Build the starting population
def initialize_population(size, function_list):
    """Return `size` formulas drawn at random, with replacement, from `function_list`."""
    return [random.choice(function_list) for _ in range(size)]


In [5]:
# Fitness evaluation
def calculate_fitness(formula, data):
    """Score a factor formula by the Spearman correlation of its ranks with return ranks.

    The formula string is evaluated with ``np`` and ``bn`` as globals and ``data``
    as the only local. Factor values and ``data['return']`` are each ranked along
    axis 0 with ``bn.nanrankdata``, positions where either rank is NaN are dropped,
    and the remaining values are passed to ``spearmanr``.

    Args:
        formula: Python expression string referring to ``data``, ``np`` and/or ``bn``.
        data: Object indexable by field name; ``data['return']`` must expose ``.values``.

    Returns:
        The correlation coefficient, or -1 when fewer than two valid pairs remain,
        the correlation is NaN, or evaluation raises an ordinary exception.
    """
    try:
        # NOTE(review): eval runs arbitrary Python; formulas must come from trusted code only.
        factor_values = eval(formula, {'np': np, 'bn': bn}, {'data': data})
        factor_ranks = bn.nanrankdata(factor_values, axis=0)
        return_ranks = bn.nanrankdata(data['return'].values, axis=0)
        mask = ~(np.isnan(factor_ranks) | np.isnan(return_ranks))
        if mask.sum() < 2:
            return -1
        corr, _ = spearmanr(factor_ranks[mask], return_ranks[mask])
        return corr if not np.isnan(corr) else -1
    except Exception:
        # Only ordinary errors mark the formula as unfit; KeyboardInterrupt and
        # SystemExit propagate so a long run can still be interrupted.
        return -1


In [6]:
# Evolution loop
def evolve_population(population, data, generations):
    """Run selection, crossover and mutation for a number of generations.

    Per generation: every formula is scored with ``calculate_fitness``, the top
    60% by score are kept, those with score <= 0.015 are discarded, and a new
    population of the same size as the survivors is bred by joining two sampled
    parents with ``+``. With probability 0.1 every ``+`` in the child is replaced
    by one operator drawn from ``+``, ``*``, ``-``.

    Args:
        population: List of formula strings (duplicates allowed).
        data: Passed through unchanged to ``calculate_fitness``.
        generations: Number of generations to run.

    Returns:
        The population after the last generation. These are freshly bred formulas
        that have not been scored. Returns ``[]`` when no formula survives
        selection, and the single survivor when only one does (crossover needs
        two parents).
    """
    for _ in range(generations):
        # Score each distinct formula once; the population typically holds many
        # copies of the same string and the score depends only on (formula, data).
        scores = {formula: calculate_fitness(formula, data) for formula in dict.fromkeys(population)}
        fitness_scores = [(formula, scores[formula]) for formula in population]

        # Selection: best 60%, then drop anything at or below the fitness floor
        keep = int(len(fitness_scores) * 0.6)
        fitness_scores = sorted(fitness_scores, key=lambda item: item[1], reverse=True)[:keep]
        population = [formula for formula, score in fitness_scores if score > 0.015]

        if not population:
            return []
        if len(population) < 2:
            # random.sample(population, 2) would raise ValueError with a single parent
            return population

        # Crossover and mutation
        new_population = []
        while len(new_population) < len(population):
            parent1, parent2 = random.sample(population, 2)
            new_formula = f"({parent1}) + ({parent2})"
            if random.random() < 0.1:  # mutation
                # NOTE(review): this rewrites every '+' in the child, including those
                # inside the parents' own sub-expressions - confirm that is intended.
                operations = ['+', '*', '-']
                new_formula = new_formula.replace('+', random.choice(operations))
            new_population.append(new_formula)
        population = new_population
    return population


In [7]:
# Residual returns after regressing out the factors already in the pool
def calculate_residual_return(data, factor_pool):
    """Regress returns on the pooled factors and return the residuals.

    Every formula in ``factor_pool`` is evaluated against ``data`` and must yield
    values with the same shape as ``data['return']``. Returns and factors are
    flattened so each observation is one regression row; rows with a non-finite
    return or factor value are excluded from the fit and come back as NaN.

    Args:
        data: Object indexable by field name; ``data['return']`` is a pandas
            Series or DataFrame.
        factor_pool: List of formula strings; may be empty.

    Returns:
        Residuals with the same shape, index (and columns, if 2-D) as
        ``data['return']``. ``data['return']`` itself is returned when the pool
        is empty, fewer than two valid rows exist, or the computation fails
        (the error is printed).
    """
    returns = data['return']
    if not factor_pool:
        return returns
    try:
        return_values = np.asarray(returns, dtype=float)
        y = return_values.ravel()

        factor_columns = []
        for formula in factor_pool:
            # NOTE(review): eval runs arbitrary Python; formulas must come from trusted code only.
            values = np.asarray(eval(formula, {'np': np, 'bn': bn}, {'data': data}), dtype=float)
            if values.shape != return_values.shape:
                raise ValueError(f"factor shape {values.shape} != return shape {return_values.shape}: {formula}")
            factor_columns.append(values.ravel())
        X = np.column_stack(factor_columns)

        mask = np.isfinite(X).all(axis=1) & np.isfinite(y)
        if mask.sum() < 2:
            return returns

        lr = LinearRegression()
        lr.fit(X[mask], y[mask])

        # Predict only on valid rows: the regressor cannot score rows containing NaN
        residuals = np.full(y.shape, np.nan)
        residuals[mask] = y[mask] - lr.predict(X[mask])
        residuals = residuals.reshape(return_values.shape)

        if residuals.ndim == 1:
            return pd.Series(residuals, index=returns.index)
        return pd.DataFrame(residuals, index=returns.index, columns=returns.columns)
    except Exception as e:
        print(f"残差计算错误: {e}")
        return returns



In [8]:
def _pooled_correlation(left, right):
    """Pearson correlation of two equally shaped arrays, flattened, over finite pairs only.

    Returns NaN when the shapes differ, fewer than two finite pairs exist, or
    either side is constant.
    """
    x = np.asarray(left, dtype=float).ravel()
    y = np.asarray(right, dtype=float).ravel()
    if x.shape != y.shape:
        return np.nan
    valid = np.isfinite(x) & np.isfinite(y)
    if valid.sum() < 2:
        return np.nan
    x, y = x[valid], y[valid]
    if x.std() == 0 or y.std() == 0:
        return np.nan
    return float(np.corrcoef(x, y)[0, 1])


def rolling_factor_extraction(start_date, end_date, interval_years=1, population_size=500, generations=4):
    """Mine factor formulas over rolling windows and accumulate them in a pool.

    The stock universe is fetched once with ``get_hs300_top50``. Starting at
    ``start_date`` and stepping by ``interval_years`` while the current date is
    before ``end_date``, each step loads data from up to six years earlier
    (never before 2015-01-01) through the current date, evolves a population of
    formulas, and adds a resulting formula to the pool when the absolute
    correlation between its values and the residual return is a number below 0.8.
    The pool keeps at most the 100 most recent formulas, without duplicates.

    Args:
        start_date: First window end date, anything ``pd.to_datetime`` accepts.
        end_date: Exclusive upper bound for window end dates.
        interval_years: Step between windows, in years.
        population_size: Number of formulas in each initial population.
        generations: Number of generations passed to ``evolve_population``.

    Returns:
        The list of pooled formula strings; ``[]`` when no stock list is available.
    """
    stock_list = get_hs300_top50(start_date, end_date)
    if not stock_list:
        print("无法获取股票列表，退出")
        return []

    current_date = pd.to_datetime(start_date)
    end_date = pd.to_datetime(end_date)
    factor_pool = []
    function_list = [
        "data['open'] + data['close']",
        "data['high'] - data['low']",
        # axis=0: rows of the pivoted panel are trade dates, so the window must run
        # down the rows; bottleneck's default axis=-1 would average across stocks.
        "bn.move_mean(data['volume'], window=5, axis=0)",
        "np.log(data['open'] + 1)",
        "data['close'] / (data['high'] + 1e-10)"
    ]

    # Lower bound on the sample start so no window reaches further back than this
    earliest_date = pd.to_datetime('20150101')

    while current_date < end_date:
        sample_start = max(current_date - pd.DateOffset(years=6), earliest_date)
        sample_start = sample_start.strftime('%Y%m%d')
        sample_end = current_date.strftime('%Y%m%d')

        print(f"正在处理窗口: {sample_start} 至 {sample_end}")
        data = get_data(sample_start, sample_end, stock_list)

        if data.empty:
            print(f"无数据: {sample_start} 至 {sample_end}")
            current_date += pd.DateOffset(years=interval_years)
            continue

        population = initialize_population(population_size, function_list)

        try:
            residual_return = calculate_residual_return(data, factor_pool)
            final_population = evolve_population(population, data, generations)

            for formula in final_population:
                if formula in factor_pool:
                    # Already pooled; a repeated copy adds nothing to later regressions
                    continue
                try:
                    # NOTE(review): eval runs arbitrary Python; formulas are generated in this file.
                    factor_values = eval(formula, {'np': np, 'bn': bn}, {'data': data})
                    # Factor values and residuals are date x stock panels, so they are
                    # compared position by position over the flattened panel.
                    corr = _pooled_correlation(factor_values, residual_return)
                    if not np.isnan(corr) and abs(corr) < 0.8:
                        factor_pool.append(formula)
                except Exception as e:
                    print(f"因子评估失败，已跳过: {formula} ({e})")
                    continue
        except Exception as e:
            print(f"处理错误: {e}")

        current_date += pd.DateOffset(years=interval_years)
        factor_pool = factor_pool[-100:]

    return factor_pool

In [None]:
# Entry point
def main():
    """Run the rolling factor search from 20200101 to 20240101 and print each formula found."""
    mining_start, mining_end = '20200101', '20240101'
    print(f"因子挖掘范围: {mining_start} 至 {mining_end}")
    mined_formulas = rolling_factor_extraction(mining_start, mining_end)
    print("挖掘的因子公式:")
    for formula in mined_formulas:
        print(formula)


if __name__ == "__main__":
    main()

因子挖掘范围: 20200101 至 20240101
已经获取数据，正在运行模型...
正在处理窗口: 20150101 至 20200101
