In [None]:
import openai
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Set your OpenAI API key.
# The key is read from the OPENAI_API_KEY environment variable so that a real
# secret never has to be written into the notebook. The original placeholder
# is kept as the fallback, so behavior is unchanged when the variable is unset.
import os

openai.api_key = os.environ.get("OPENAI_API_KEY", "YOUR_API_KEY")

# Sample data: short review texts, each paired with a sentiment label
# (1 = Positive, 0 = Negative).
data = pd.DataFrame(
    [
        ("I love this product. It is amazing!", 1),
        ("Terrible service, will never use again.", 0),
        ("Absolutely fantastic experience.", 1),
        ("Worst purchase I've ever made.", 0),
        ("Highly recommend to everyone.", 1),
        ("Do not buy this. Waste of money.", 0),
        ("Great value for the price!", 1),
        ("Awful. Completely useless.", 0),
        ("Loved it. Five stars.", 1),
        ("Very disappointed.", 0),
    ],
    columns=['text', 'label'],
)

# Helper that fetches the embedding vector for a piece of text
def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding of *text* produced by the given OpenAI model.

    Sends one request through ``openai.embeddings.create`` and returns the
    ``embedding`` attribute of the first item in the response's ``data``.
    """
    result = openai.embeddings.create(model=model, input=text)
    first_item = result.data[0]
    return first_item.embedding

# Extract the embeddings and build the feature matrix.
# The text column is iterated directly instead of going through
# DataFrame.iterrows(), which yielded an index that was never used and
# constructs a Series for every row just to read one field.
# NOTE: this still issues one get_embedding() call (one API request) per text.
embedding_list = [get_embedding(text) for text in data['text']]

# Stack the per-text vectors into a 2-D array; labels come straight from the frame
X = np.array(embedding_list)
y = data['label'].values

# Split into training and test sets: 20% held out, fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build the model and fit it on the training embeddings
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predict labels for the held-out samples
y_pred = model.predict(X_test)

# Evaluate: print the classification report for the held-out samples
report = classification_report(y_test, y_pred)
print("Classification Report:", report, sep="\n")

# Visualization: project the embeddings down to 2D with PCA
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
X_reduced = pca.fit_transform(X)

# Scatter the two principal components, coloring each point by its label
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y, cmap='coolwarm', s=60)
ax.set_title("Text Embeddings Visualized with PCA")
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.grid(True)
plt.show()

In [None]:
import openai
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Set your OpenAI API key.
# The key is read from the OPENAI_API_KEY environment variable so that a real
# secret never has to be written into the notebook. The original placeholder
# is kept as the fallback, so behavior is unchanged when the variable is unset.
import os

openai.api_key = os.environ.get(
    "OPENAI_API_KEY",
    "sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
)

# Sample text data: news summaries published after each day's market close
# (replace with real data in practice). Each summary is paired with the sample
# target variable: the next-day stock move (1 = up, 0 = down).
_labeled_news = [
    ("Apple announced a new line of M4 chips with better energy efficiency.", 1),
    ("Federal Reserve indicates possible rate cuts in the second quarter.", 1),
    ("Unexpected decline in global smartphone demand hurts tech shares.", 0),
    ("Tesla expands manufacturing to India to reduce dependency on China.", 1),
    ("Major cyberattack impacts global financial data systems.", 0),
]

news_texts = [summary for summary, _ in _labeled_news]
labels = [direction for _, direction in _labeled_news]

# Fetch an embedding vector (using an OpenAI embedding model)
def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector for *text* from the OpenAI embeddings API.

    Uses ``openai.embeddings.create`` with attribute access on the response,
    the same call style as the first cell of this notebook. The previous
    ``openai.Embedding.create(...)`` / ``response['data']`` form belongs to the
    pre-1.0 SDK and is not available in openai>=1.0.

    Args:
        text: The text to embed.
        model: Name of the embedding model; defaults to
            "text-embedding-3-small" ("text-embedding-ada-002" is an
            alternative).

    Returns:
        The ``embedding`` of the first item in the response's ``data``.
    """
    response = openai.embeddings.create(
        input=text,
        model=model,
    )
    return response.data[0].embedding

# Build the feature matrix: one embedding per news text, plus the label array
embeddings = list(map(get_embedding, news_texts))
X = np.asarray(embeddings)
y = np.asarray(labels)

# Train the model on the training split (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train,
    y_train,
)

# Evaluate the model on the held-out split
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))