# 加载模块

In [1]:
import os
os.chdir("..")
import pandas as pd 
import numpy as np
import yfinance as yf
from tqdm import tqdm
from models.PortfolioModel import PortfolioModel
from DataPipeline.DataBuilder import build_dataset
import pandas_market_calendars as mcal

# 准备 tickers 和 原始数据 RawData

In [17]:
# Load the ticker list from the 'Ticker' column of the CSV.
ticker_table = pd.read_csv('data/RawData/misc/test_tickers_Name.csv')
symbols = ticker_table['Ticker'].to_list()

# Wrap all symbols (space-separated) in a single yfinance Tickers object.
tickers = yf.Tickers(" ".join(symbols))

In [18]:
# Download the price history of every ticker from yfinance and store one
# raw CSV per symbol under data/RawData/.
download_window = {"start": "2023-01-01", "end": "2025-01-01"}
for symbol in symbols:
    stock_data = tickers.tickers[symbol].history(**download_window)
    stock_data.to_csv(f"data/RawData/{symbol}.csv")

# 特征工程

## 用于计算生成特征的函数

In [19]:
def add_rsi(df, column='Close', window=14):
    """Add an RSI (Relative Strength Index) column named ``RSI_<window>``.

    The averages are plain rolling means of the up and down moves, so the
    leading rows that lack a full window come out as NaN.

    Args:
        df: DataFrame containing ``column``; it is modified in place.
        column: Name of the column the indicator is computed from.
        window: Number of rows in each rolling average.

    Returns:
        The same ``df`` object with the new column added.
    """
    change = df[column].diff()

    # Split every move into its upward part and its (sign-flipped) downward part.
    ups = change.clip(lower=0)
    downs = -change.clip(upper=0)

    mean_up = ups.rolling(window=window).mean()
    mean_down = downs.rolling(window=window).mean()
    relative_strength = mean_up / mean_down

    df[f'RSI_{window}'] = 100 - (100 / (1 + relative_strength))
    return df

def add_macd_diff(df, column='Close', fast=12, slow=26, signal=9):
    """Add a ``MACD_diff`` column: the DIF line minus its signal (DEA) line.

    Args:
        df: DataFrame containing ``column``; it is modified in place.
        column: Name of the column the indicator is computed from.
        fast: Span of the fast exponential moving average.
        slow: Span of the slow exponential moving average.
        signal: Span of the EMA applied to the DIF line.

    Returns:
        The same ``df`` object with the new column added.
    """
    prices = df[column]

    def _ema(series, span):
        # Recursive (adjust=False) exponential moving average.
        return series.ewm(span=span, adjust=False).mean()

    dif_line = _ema(prices, fast) - _ema(prices, slow)
    dea_line = _ema(dif_line, signal)

    df['MACD_diff'] = dif_line - dea_line
    return df

def add_bollinger_width(df, column='Close', window=20, num_std=2):
    """Add a ``bollinger_width`` column: upper band minus lower band.

    Args:
        df: DataFrame containing ``column``; it is modified in place.
        column: Name of the column the bands are computed from.
        window: Number of rows in the rolling mean and standard deviation.
        num_std: Number of standard deviations between the mean and each band.

    Returns:
        The same ``df`` object with the new column added.
    """
    rolling_view = df[column].rolling(window=window)
    middle = rolling_view.mean()
    offset = num_std * rolling_view.std()

    # Kept as an explicit upper-minus-lower difference.
    df['bollinger_width'] = (middle + offset) - (middle - offset)
    return df

def add_volume_bias(df, column='Volume', window=10):
    """Add a ``volume_bias`` column: relative deviation from the rolling mean.

    Each value is ``(volume - rolling_mean) / rolling_mean``.

    Args:
        df: DataFrame containing ``column``; it is modified in place.
        column: Name of the column the bias is computed from.
        window: Number of rows in the rolling mean.

    Returns:
        The same ``df`` object with the new column added.
    """
    volume = df[column]
    baseline = volume.rolling(window=window).mean()

    df['volume_bias'] = (volume - baseline) / baseline
    return df

## 特征生成

In [20]:
# Feature engineering: derive indicator columns for every ticker and write
# the result to data/FeatureData/<symbol>.csv.
indicator_steps = (
    add_rsi,              # RSI_14
    add_macd_diff,        # MACD_diff
    add_bollinger_width,  # bollinger_width
    add_volume_bias,      # volume_bias
)
unused_columns = ['Open', 'High', 'Low', 'Volume', 'Dividends', 'Stock Splits', 'Capital Gains']

for symbol in symbols:
    Ticker_Rawdata = pd.read_csv(f'data/RawData/{symbol}.csv')
    close = Ticker_Rawdata['Close']

    # Log return ln(P_t / P_{t-1}).
    Ticker_Rawdata['log_return'] = np.log(close / close.shift(1))

    # 10-row simple moving average and the relative deviation of Close from it.
    sma_10 = close.rolling(window=10).mean()
    Ticker_Rawdata['SMA_10'] = sma_10
    Ticker_Rawdata['price_bias'] = (close - sma_10) / sma_10

    # Apply the indicator helpers in the same fixed order.
    for add_indicator in indicator_steps:
        Ticker_Rawdata = add_indicator(Ticker_Rawdata)

    # Back-fill: each NaN takes the next valid value below it, so the warm-up
    # rows of the diff/rolling features are filled from later rows.
    # NOTE(review): this is look-ahead on the leading rows - confirm it is
    # acceptable for the downstream model.
    Ticker_Rawdata = Ticker_Rawdata.bfill()

    # Parse the dates as UTC, then drop the timezone to get naive timestamps.
    utc_dates = pd.to_datetime(Ticker_Rawdata['Date'], utc=True)
    Ticker_Rawdata['Date'] = utc_dates.dt.tz_convert(None)

    # errors='ignore' keeps the drop from failing when a column is absent.
    Ticker_Rawdata = Ticker_Rawdata.drop(columns=unused_columns, errors='ignore')

    # NOTE(review): the row index is written as well (no index=False) -
    # confirm downstream readers expect that extra column.
    Ticker_Rawdata.to_csv(f'data/FeatureData/{symbol}.csv')

# 交易日日期生成

In [None]:
# NYSE trading calendar (original note: applicable to most iShares ETFs).
nyse = mcal.get_calendar('NYSE')

# Session schedule for the requested date range.
# NOTE(review): this range ends 2025-01-31 while the price download in this
# file ends 2025-01-01 - confirm the extra month is intentional.
schedule = nyse.schedule(start_date='2023-01-01', end_date='2025-01-31')

# Daily-frequency timestamps from the schedule, reduced to plain dates.
dates = mcal.date_range(schedule, frequency='1D').date

# Save as CSV (single 'Date' column, no index) for later reuse.
dates_df = pd.DataFrame({'Date': dates})
dates_df.to_csv('data/TradingDay_info.csv', index=False)