In [67]:
import pandas as pd
import numpy as np
import calendar

In [68]:
# Load the index history, parse the "Date" column into timestamps, and use it
# as a chronologically sorted index so the time-based resampling below works.
df = (
    pd.read_csv('data/sp500_index.csv')
    .assign(Date=lambda frame: pd.to_datetime(frame["Date"]))
    .set_index("Date")
    .sort_index()
)

In [69]:
# Preview the first five rows to check the date index and the price column.
df.head(n=5)

Unnamed: 0_level_0,S&P500
Date,Unnamed: 1_level_1
2014-12-22,2078.54
2014-12-23,2082.17
2014-12-24,2081.88
2014-12-26,2088.77
2014-12-29,2090.57


In [70]:
# Daily return: percentage change of the index level from the previous row.
# The first row has no predecessor, so its return is NaN.
df = df.assign(Return=df["S&P500"].pct_change())

In [71]:
# === 1. Build monthly features ===
# Collapse the daily index level into one row per calendar month (labelled by
# month end): first and last value of the month plus the monthly high and low.
monthly = (
    df["S&P500"]
    .resample("ME")
    .agg(["first", "last", "max", "min"])
    .rename(
        columns={
            "first": "Month_start",
            "last": "Month_end",
            "max": "High",
            "min": "Low",
        }
    )
)

In [72]:
# Return within each month, measured from the month's first value to its last.
# It is called "prev_month" because it serves as a feature for predicting the
# following month (the label is this same column shifted up by one row).
monthly["Return_prev_month"] = (
    monthly["Month_end"]
    .sub(monthly["Month_start"])
    .div(monthly["Month_start"])
)

In [73]:
# Monthly volatility: sample standard deviation (ddof=1) of the daily returns
# that fall inside each calendar month.
monthly["Volatility_prev_month"] = df["Return"].resample("ME").std(ddof=1)

In [74]:
# Monthly high-low spread, in index points (not scaled by the index level).
monthly["Range_prev_month"] = monthly["High"].sub(monthly["Low"])

In [75]:
# === 2. Moving-average features ===
# Daily moving averages (5-day and 20-day). Each month keeps the last
# non-missing value of their spread observed within that month.
df["MA5d"] = df["S&P500"].rolling(5).mean()
df["MA20d"] = df["S&P500"].rolling(20).mean()
monthly["MA5d_minus_MA20d"] = (df["MA5d"] - df["MA20d"]).resample("ME").last()

# Monthly moving averages (3-month and 6-month) of the month-end level.
# The 6-month window leaves the first five months without a value.
monthly["MA3m"] = monthly["Month_end"].rolling(3).mean()
monthly["MA6m"] = monthly["Month_end"].rolling(6).mean()
monthly["MA3m_minus_MA6m"] = monthly["MA3m"] - monthly["MA6m"]

# === 3. Calendar features ===
monthly["Year"] = monthly.index.year
# One-hot encode the English month name. "Month" is only a temporary input
# for get_dummies and is dropped again once the indicator columns are joined.
monthly["Month"] = monthly.index.month_name()
dummies = pd.get_dummies(monthly["Month"])
monthly = pd.concat([monthly, dummies], axis=1).drop(columns=["Month"])

# === 4. Label: the following month's return ===
# shift(-1) pulls each next row's return onto the current row, so the features
# of month t are paired with the return of month t+1.
monthly["Label_next_month"] = monthly["Return_prev_month"].shift(-1)
# NOTE(review): if the data ends partway through a month, the last label is
# computed from an incomplete month -- confirm the data ends on a month boundary.

# Drop every row that has a missing value. This removes the final month (it has
# no label) and also the leading months where the moving averages are undefined.
monthly = monthly.dropna()

print(monthly.head())

            Month_start  Month_end     High      Low  Return_prev_month  \
Date                                                                      
2015-05-31      2108.29    2107.39  2130.82  2080.15          -0.000427   
2015-06-30      2111.73    2063.11  2124.20  2057.64          -0.023024   
2015-07-31      2077.42    2103.84  2128.28  2046.68           0.012718   
2015-08-31      2098.04    1972.18  2104.18  1867.61          -0.059989   
2015-09-30      1913.85    1920.03  1995.31  1881.77           0.003229   

            Volatility_prev_month  Range_prev_month  MA5d_minus_MA20d  \
Date                                                                    
2015-05-31               0.006810             50.67            4.4405   
2015-06-30               0.007012             66.56          -11.5195   
2015-07-31               0.007372             81.60            0.5380   
2015-08-31               0.016831            236.57          -85.5915   
2015-09-30               0.014185   

In [76]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
summary_stats = monthly.describe()
print(summary_stats)

       Month_start    Month_end         High          Low  Return_prev_month  \
count   115.000000   115.000000   115.000000   115.000000         115.000000   
mean   3359.621130  3388.974870  3466.601304  3267.279043           0.008778   
std    1040.994727  1064.993842  1082.298827  1019.182374           0.046372   
min    1913.850000  1920.030000  1951.700000  1829.080000          -0.163625   
25%    2476.450000  2513.105000  2550.215000  2422.465000          -0.010676   
50%    3066.910000  3044.310000  3153.630000  2964.330000           0.010953   
75%    4197.350000  4187.485000  4302.685000  4090.205000           0.036962   
max    5728.800000  6032.380000  6032.380000  5712.690000           0.178883   

       Volatility_prev_month  Range_prev_month  MA5d_minus_MA20d         MA3m  \
count             115.000000        115.000000        115.000000   115.000000   
mean                0.009386        199.322261          7.534796  3355.550812   
std                 0.006436        

In [77]:
# Persist the feature table; the date index is written as the first CSV column.
output_path = "monthly_features.csv"
monthly.to_csv(output_path, index=True)