**划分数据集，构建滞后特征, 并在训练集上进行特征选择**

例如，使用 5 天的滞后特征，并且有 6 个原始特征，
则滞后特征的总数量就是 5 天 * 6 个特征 = 30 个滞后特征

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 读取数据
data = pd.read_csv('data/smci_sorted.csv')
features = ['DPRIME', 'Close','Volume', 'Open','High', 'Low']
target = 'Close'

# 创建滞后特征
def create_lag_features(data, features, lag=5):
    for feature in features:
        for i in range(1, lag + 1):
            data[f'{feature}_lag_{i}'] = data[feature].shift(i)
    return data

# 构建滞后特征
data_lagged = create_lag_features(data, features, lag=5)

# 去除NA值
data_lagged = data_lagged.dropna()

# 定义特征和目标变量
X = data_lagged[[f'{feature}_lag_{i}' for feature in features for i in range(1, 6)]]
y = data_lagged[target]

# 数据标准化
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 按时间顺序划分数据集
train_size = int(len(X_scaled) * 0.8)
X_train, X_test = X_scaled[:train_size], X_scaled[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# 输出数据集大小
print("train set:", X_train.shape, y_train.shape)
print("test set:", X_test.shape, y_test.shape)

train set: (2008, 30) (2008,)
test set: (503, 30) (503,)


**相关性分析：选择与目标变量相关性绝对值大于0.3的特征**

In [3]:
# 计算相关性矩阵
correlation_matrix = pd.DataFrame(X_train, columns=[f'{feature}_lag_{i}' for feature in features for i in range(1, 6)]).assign(target=y_train).corr()
correlations = correlation_matrix['target'].abs().sort_values(ascending=False)
selected_features_corr = correlations[correlations > 0.3].index.drop('target').tolist()
print("每个特征的相关性评分:")
print(correlations)
print("重要特征（相关性分析）:", selected_features_corr)

每个特征的相关性评分:
Close_lag_5     1.000000
target          1.000000
Open_lag_4      0.998130
High_lag_5      0.997993
Low_lag_5       0.997908
Open_lag_5      0.995688
High_lag_4      0.995570
Low_lag_4       0.995167
Close_lag_4     0.992954
Open_lag_3      0.990942
Low_lag_3       0.988356
High_lag_3      0.987952
Close_lag_3     0.986128
Open_lag_2      0.984013
Low_lag_2       0.981704
High_lag_2      0.981302
Close_lag_2     0.979586
Open_lag_1      0.977618
Low_lag_1       0.975120
High_lag_1      0.974737
Close_lag_1     0.972893
DPRIME_lag_5    0.640535
DPRIME_lag_4    0.639053
DPRIME_lag_3    0.638083
DPRIME_lag_2    0.635972
DPRIME_lag_1    0.633663
Volume_lag_5    0.056832
Volume_lag_4    0.038395
Volume_lag_3    0.036261
Volume_lag_2    0.032699
Volume_lag_1    0.028307
Name: target, dtype: float64
重要特征（相关性分析）: ['Close_lag_5', 'Open_lag_4', 'High_lag_5', 'Low_lag_5', 'Open_lag_5', 'High_lag_4', 'Low_lag_4', 'Close_lag_4', 'Open_lag_3', 'Low_lag_3', 'High_lag_3', 'Close_lag_3', 'O

**递归特征消除（RFE）：选择RFE标记为True的特征** 

In [4]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
import numpy as np

# 递归特征消除
model = LinearRegression()
rfe = RFE(model, n_features_to_select=10)  # 假设选择10个最重要的特征
fit = rfe.fit(X_train, y_train)

rfe_support = fit.support_
rfe_ranking = fit.ranking_

selected_features_rfe = [f'{feature}_lag_{i}' for i in range(1, 6) for feature in features if fit.support_[features.index(feature) * 5 + i - 1]]
rfe_scores = dict(zip([f'{feature}_lag_{i}' for i in range(1, 6) for feature in features], rfe_ranking))

print("每个特征的RFE评分（排名）:")
print(rfe_scores)
print("重要特征（RFE）:", selected_features_rfe)


每个特征的RFE评分（排名）:
{'DPRIME_lag_1': 16, 'Close_lag_1': 13, 'Volume_lag_1': 12, 'Open_lag_1': 14, 'High_lag_1': 15, 'Low_lag_1': 1, 'DPRIME_lag_2': 1, 'Close_lag_2': 1, 'Volume_lag_2': 5, 'Open_lag_2': 6, 'High_lag_2': 19, 'Low_lag_2': 20, 'DPRIME_lag_3': 18, 'Close_lag_3': 17, 'Volume_lag_3': 21, 'Open_lag_3': 3, 'High_lag_3': 11, 'Low_lag_3': 1, 'DPRIME_lag_4': 1, 'Close_lag_4': 8, 'Volume_lag_4': 2, 'Open_lag_4': 1, 'High_lag_4': 1, 'Low_lag_4': 1, 'DPRIME_lag_5': 7, 'Close_lag_5': 1, 'Volume_lag_5': 10, 'Open_lag_5': 1, 'High_lag_5': 9, 'Low_lag_5': 4}
重要特征（RFE）: ['Close_lag_1', 'Low_lag_1', 'Close_lag_2', 'High_lag_2', 'Close_lag_3', 'Open_lag_3', 'High_lag_3', 'Low_lag_3', 'Open_lag_4', 'High_lag_4']


**L1正则化（Lasso）：选择Lasso回归选择的特征** 

In [5]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

# 使用Lasso进行特征选择
lasso = Lasso(alpha=0.01)
lasso.fit(X_train, y_train)

model = SelectFromModel(lasso, prefit=True)
lasso_support = model.get_support()
lasso_coefficients = lasso.coef_

selected_features_lasso = [f'{feature}_lag_{i}' for i in range(1, 6) for feature in features if lasso_support[features.index(feature) * 5 + i - 1]]
lasso_scores = dict(zip([f'{feature}_lag_{i}' for i in range(1, 6) for feature in features], lasso_coefficients))

print("每个特征的Lasso评分（系数）:")
print(lasso_scores)
print("重要特征（Lasso）:", selected_features_lasso)


每个特征的Lasso评分（系数）:
{'DPRIME_lag_1': -0.0, 'Close_lag_1': -0.0, 'Volume_lag_1': -0.0, 'Open_lag_1': -0.6184330844549351, 'High_lag_1': -0.0, 'Low_lag_1': 130.81290088313614, 'DPRIME_lag_2': 0.0, 'Close_lag_2': 0.0, 'Volume_lag_2': 0.0, 'Open_lag_2': 0.0, 'High_lag_2': 0.0, 'Low_lag_2': 0.0, 'DPRIME_lag_3': -0.0, 'Close_lag_3': 0.0, 'Volume_lag_3': -0.0, 'Open_lag_3': 0.0, 'High_lag_3': 0.0, 'Low_lag_3': 0.0, 'DPRIME_lag_4': 0.0, 'Close_lag_4': 0.0, 'Volume_lag_4': 0.0, 'Open_lag_4': 0.0, 'High_lag_4': 0.0, 'Low_lag_4': 0.0, 'DPRIME_lag_5': 0.0, 'Close_lag_5': 33.40320159196775, 'Volume_lag_5': 0.0, 'Open_lag_5': 0.0, 'High_lag_5': 0.0, 'Low_lag_5': 0.0}
重要特征（Lasso）: ['Close_lag_1', 'Low_lag_1', 'DPRIME_lag_4']
