# **Trial**

In [21]:
import json
import jieba
import numpy as np
import matplotlib.pyplot as plt
import matplotlib

from gensim.models.ldamodel import LdaModel
from gensim import corpora, models
from collections import defaultdict

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.autograd import Variable
from transformers import BertModel, BertTokenizer

In [2]:
json_path = r'C:\Users\11435\Desktop\clutter\research\data\stock\BV1LuxZeVE25.json'

with open(json_path, 'r', encoding='utf-8') as file:
    # 加载 JSON 数据
    data = json.load(file)
    
doc_data = [info['review'] for info in data]
doc_data

['22年入市，昨天早上几乎最高位离场，达到预期收益就收手，纪律性是对抗贪婪本性的唯一办法[OK]',
 '今天已经暴跌了，但是我认为下午会有国家队。。。这才第二天就砸盘这么猛，主力有点不给面子了，盲猜收盘之前会翻红，但是也让散户看明白了，什么政策什么经济复苏，还是那套玩意，就是嘎散户的钱，盲目入坑的绝对冷静了[doge_金箍]反正我是没入，不亏就是赚了',
 '昨天在店内一边吃鸡翅一边用手机看股票。一个乞丐进来乞讨，我给他一块鸡翅后继续看股票。乞丐啃着鸡翅没走，也在一旁看着，他说：“长期均线金叉，KDJ数值底部反复钝化，MACD底背离，能量潮喇叭口扩大，这股要涨了。”我惊诧地问：“这个你也懂？” 乞丐说：“不懂我能有今天？”',
 '几天的大阳线把股民们以前的记忆都干没了[doge]，你们根本不知道有多疯狂，我一个从未接触过股票的朋友都开户来问我怎么玩了，你可知道他竟然连板块都不会找股票代码什么是沪深的也不知道就投了几万块进去',
 '经历2008年股灾的老人回忆到，股市有政策底，还有市场底。政策到位，上涨一轮，引人下场后再跌停，让你来不及逃生。我巨亏40%，狠狠心抛了，逃过了后面的下跌。',
 '一个特别微观又直观的例子：大家可以看下各银行的大额存单转让区，这里基本都是居民存款，节前转让收益率上升了10-20bp。说明这轮行情真真实实的撬动了银行存款，之前那么多zc没办到的事情这次竟然办到了。',
 '我始终坚信，一个赌场般的市场，绝对不会让大多数人赚到钱。',
 '今天已经开始技术性回调了[脱单doge]',
 '别人都是看涨，导致目前赛道过于拥挤。而老王直接看空，以此来安慰没上车的群体，别出心裁、另辟蹊径。看似在谈经济，实则在玩自媒体。老王高啊[支持]',
 '无论是涨是跌，最终能赚到并且保住这些钱的人都是极少数人，关键在于能不能清晰地认识到在这些人里面包不包括自己。',
 '我重申一下：股市是穷人最少的地方，中产最多的地方。\n在股市放水，造成的结果就是，中产变成富人，穷人还是穷人。\n【Doge】动动猪脑子想想，穷人能有多少钱玩？万儿八千，好点的十万八万，顶天了。\n富人可是百万千万。\n股市翻一倍，富人100万变200万，穷人5万变10万。\n\n 贫富差距增加了 = 最终贫富差距 - 初始贫富差距 = 190万 - 95万 = 95万\n\n因此。\

In [3]:
# 读取停用词，并去停用词
stopwords_path1 = r'C:\Users\11435\Desktop\clutter\research\data\corpus\stopwords_scu.txt'
with open(stopwords_path1, 'r', encoding='utf-8') as f:
    stopwords1 = set([line.strip() for line in f])

stopwords_path2 = r'C:\Users\11435\Desktop\clutter\research\data\corpus\stopwords_hit.txt'
with open(stopwords_path2, 'r', encoding='utf-8') as f:
    stopwords2 = set([line.strip() for line in f])

stopwords = stopwords1.union(stopwords2)

texts = []
for doc in doc_data:
    words = jieba.cut(doc)
    filter_words = [word for word in words if word not in stopwords and word.strip() != '']
    texts.append(filter_words)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\11435\AppData\Local\Temp\jieba.cache
Loading model cost 0.483 seconds.
Prefix dict has been built successfully.


In [4]:
# 过滤频次
FREQ_LIMIT = 1
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
texts = [[token for token in text if frequency[token] > FREQ_LIMIT]for text in texts]

In [5]:
# 构建词典和语料库
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

In [6]:
lda = models.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15)

In [7]:
# 提取主题向量
lda_features = []
for document in corpus:
    topic_distribution = lda.get_document_topics(document)
    lda_features.append([topic[1] for topic in topic_distribution])

[[(0, 1), (1, 1), (2, 1), (3, 1)],
 [(2, 1), (4, 1), (5, 1), (6, 1)],
 [(2, 1), (7, 1), (8, 1), (9, 1), (10, 1)]]

In [8]:
# 加载已经训练好的bert模型
model_path = r'D:\tool\toolkit\nlp\distiluse-base-multilingual-cased-v2-finetuned-stsb_multi_mt-es'
model = BertModel.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)

You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
  return self.fget.__get__(instance, owner)()
Some weights of BertModel were not initialized from the model checkpoint at D:\tool\toolkit\nlp\distiluse-base-multilingual-cased-v2-finetuned-stsb_multi_mt-es and are newly initialized: ['embeddings.token_type_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder

In [16]:
bert_features = []
for article in doc_data:
    # 对文章进行分词和编码
    encoded_input = tokenizer(article, return_tensors='pt', padding=True, truncation=True, max_length=512)
    # 获取模型的输出
    with torch.no_grad():
        outputs = model(**encoded_input)
    # 获取CLS token的嵌入作为文章的特征向量
    bert_features.append(outputs.last_hidden_state[:, 0, :].squeeze().detach().numpy())

In [18]:
# lda和bert特征拼接
concatenated_features = []
for bert_feature, lda_feature in zip(bert_features, lda_features):
    # 将BERT特征向量和LDA主题向量拼接
    concatenated_feature = np.concatenate((bert_feature, lda_feature))
    concatenated_features.append(concatenated_feature)

In [22]:
# 定义自编码器
class AutoEncoder(nn.Module):
    def __init__(self, input_dim, hidden_size):
        super(AutoEncoder, self).__init__()

        self.encoder  = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(True),
            nn.Linear(128, 64),
            nn.ReLU(True)
        )
        self.en_fc = nn.Linear(64, hidden_size)
        self.de_fc = nn.Linear(hidden_size, 64)
        self.decoder = nn.Sequential(
            nn.Linear(64, 128),
            nn.ReLU(True),
            nn.Linear(128, input_dim)
        )

    def forward(self, x):
        en = self.encoder(x)
        code = self.en_fc(en)
        de = self.de_fc(code)
        decoded = self.decoder(de)
        return code, decoded

In [20]:
# 开始训练


773