In [1]:
# 鲁棒性测试-逻辑回归
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# 读取数据
df = pd.read_csv("/Users/wangyijie/Visual_Studio_code/毕业论文项目/03金融市场分析与建模/sentiment_SPY_features.csv")
df["date"] = pd.to_datetime(df["date"])

# 构造标签变量
df["return"] = df["price"].pct_change()
df["high_volatility"] = (df["return"].abs() > 0.01).astype(int)

# 清洗缺失值
df = df.dropna().reset_index(drop=True)

# 特征与标签
features = ["positive_ratio", "negative_ratio", "neutral_ratio", "total", "prev_return", "rolling_std"]
X = df[features]
y = df["high_volatility"]

# 时间顺序划分训练集和测试集
split_index = int(len(df) * 0.8)
X_train, X_test = X[:split_index], X[split_index:]
y_train, y_test = y[:split_index], y[split_index:]

# 标准化特征
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 训练逻辑回归模型
log_model = LogisticRegression()
log_model.fit(X_train_scaled, y_train)

# 模型预测与评估
y_pred = log_model.predict(X_test_scaled)
y_score = log_model.predict_proba(X_test_scaled)[:, 1]

# 输出评估结果
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred, output_dict=True)
roc_auc = roc_auc_score(y_test, y_score)

conf_matrix, report, roc_auc


(array([[142,  10],
        [ 45,  25]]),
 {'0': {'precision': 0.7593582887700535,
   'recall': 0.9342105263157895,
   'f1-score': 0.8377581120943953,
   'support': 152.0},
  '1': {'precision': 0.7142857142857143,
   'recall': 0.35714285714285715,
   'f1-score': 0.47619047619047616,
   'support': 70.0},
  'accuracy': 0.7522522522522522,
  'macro avg': {'precision': 0.7368220015278839,
   'recall': 0.6456766917293233,
   'f1-score': 0.6569742941424357,
   'support': 222.0},
  'weighted avg': {'precision': 0.7451462157344511,
   'recall': 0.7522522522522522,
   'f1-score': 0.7237502989715379,
   'support': 222.0}},
 0.7886278195488723)

In [2]:
#引入滞后变量，滞后一日

# 生成滞后变量（滞后1天）
df['positive_ratio_lag1'] = df['positive_ratio'].shift(1)
df['negative_ratio_lag1'] = df['negative_ratio'].shift(1)
df['neutral_ratio_lag1'] = df['neutral_ratio'].shift(1)
df['total_lag1'] = df['total'].shift(1)

# 删除含有缺失值的行
df_lagged = df.dropna().reset_index(drop=True)

# 保存新数据集
output_path = "sentiment_SPY_features_with_lag.csv"
df_lagged.to_csv(output_path, index=False)

output_path


'sentiment_SPY_features_with_lag.csv'

In [3]:
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os

# 读取数据
df = pd.read_csv("/Users/wangyijie/Visual_Studio_code/毕业论文项目/03金融市场分析与建模/sentiment_SPY_features_with_lag.csv")

# 构建标签：高波动性（二分类）
df["return"] = df["price"].pct_change()
df["high_volatility"] = (df["return"].abs() > 0.01).astype(int)
df = df.dropna().reset_index(drop=True)

# 特征选择：加入滞后变量
features = ['positive_ratio_lag1', 'negative_ratio_lag1', 'neutral_ratio_lag1', 
            'total_lag1', 'prev_return', 'rolling_std']
X = df[features]
y = df["high_volatility"]

# 特征标准化
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 时间顺序划分训练集和测试集（80/20）
split_index = int(len(df) * 0.8)
X_train, X_test = X_scaled[:split_index], X_scaled[split_index:]
y_train, y_test = y[:split_index], y[split_index:]

param_grid = {
    'C': [0.01, 0.1, 0.5, 1, 2, 10, 100],
    'gamma': ['scale', 0.01, 0.1, 1],
    'kernel': ['rbf', 'linear', 'poly', 'sigmoid']
}

cv = TimeSeriesSplit(n_splits=5)
svc = SVC(probability=True)
grid = GridSearchCV(svc, param_grid, scoring='roc_auc', cv=cv)
grid.fit(X_train, y_train)


# 最佳模型
best_model = grid.best_estimator_
print("✅ Best Parameters:", grid.best_params_)

# === 7. 模型评估 ===
y_score = best_model.decision_function(X_test)

# === 8. PR曲线与最优阈值查找 ===
precision, recall, thresholds = precision_recall_curve(y_test, y_score)
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-8)
best_idx = f1_scores.argmax()
best_threshold = thresholds[best_idx]
print("最佳阈值:", best_threshold)
print("对应的 Recall:", recall[best_idx])
print("对应的 F1 Score:", f1_scores[best_idx])

# 应用最佳阈值
y_pred_custom = (y_score >= best_threshold).astype(int)

# 混淆矩阵
cm = confusion_matrix(y_test, y_pred_custom)
report = classification_report(y_test, y_pred_custom, output_dict=True)
roc_auc = roc_auc_score(y_test, y_score)

# 可视化混淆矩阵
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title("Confusion Matrix with Lagged Sentiment Features")
plt.xlabel("Predicted")
plt.ylabel("Actual")
conf_matrix_path = "confusion_matrix_lagged.png"
plt.savefig(conf_matrix_path)
plt.close()

# PR曲线
precision, recall, thresholds = precision_recall_curve(y_test, y_score)
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-8)
best_idx = np.argmax(f1_scores)

plt.figure(figsize=(7, 5))
plt.plot(recall, precision, label="PR Curve")
plt.scatter(recall[best_idx], precision[best_idx], color='red', label=f'Best F1 = {f1_scores[best_idx]:.2f}')
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve with Lagged Features")
plt.legend()
pr_curve_path = "pr_curve_lagged.png"
plt.savefig(pr_curve_path)
plt.close()

# ROC曲线
from sklearn.metrics import roc_curve
fpr, tpr, _ = roc_curve(y_test, y_score)
plt.figure(figsize=(7, 5))
plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve with Lagged Sentiment Features")
plt.legend()
roc_curve_path = "roc_curve_lagged.png"
plt.savefig(roc_curve_path)
plt.close()

{
    "confusion_matrix": conf_matrix_path,
    "pr_curve": pr_curve_path,
    "roc_curve": roc_curve_path,
    "classification_report": report,
    "roc_auc": roc_auc,
    "best_f1": f1_scores[best_idx],
    "best_recall": recall[best_idx],
    "best_precision": precision[best_idx]
}

✅ Best Parameters: {'C': 2, 'gamma': 'scale', 'kernel': 'linear'}
最佳阈值: -0.5912238068258984
对应的 Recall: 0.6285714285714286
对应的 F1 Score: 0.6197183048601469


{'confusion_matrix': 'confusion_matrix_lagged.png',
 'pr_curve': 'pr_curve_lagged.png',
 'roc_curve': 'roc_curve_lagged.png',
 'classification_report': {'0': {'precision': 0.8266666666666667,
   'recall': 0.8157894736842105,
   'f1-score': 0.8211920529801324,
   'support': 152.0},
  '1': {'precision': 0.6111111111111112,
   'recall': 0.6285714285714286,
   'f1-score': 0.6197183098591549,
   'support': 70.0},
  'accuracy': 0.7567567567567568,
  'macro avg': {'precision': 0.7188888888888889,
   'recall': 0.7221804511278196,
   'f1-score': 0.7204551814196436,
   'support': 222.0},
  'weighted avg': {'precision': 0.7586986986986987,
   'recall': 0.7567567567567568,
   'f1-score': 0.7576642961401846,
   'support': 222.0}},
 'roc_auc': 0.7805451127819549,
 'best_f1': 0.6197183048601469,
 'best_recall': 0.6285714285714286,
 'best_precision': 0.6111111111111112}