# 利用深度学习模型进行情感分类
下面给出情感分类的基本步骤，以及一个用于情感分类的简单深度学习模型：

数据预处理：
- 文本清洗：去除停用词、标点符号、数字等。
- 分词：可以使用jieba分词等工具进行中文分词。
- 词嵌入：可以使用预训练的word2vec、GloVe、FastText或BERT等模型进行词嵌入。

建立模型：
- 可以使用一个简单的卷积神经网络 (CNN) 或长短期记忆网络 (LSTM)；下文的示例代码使用 LSTM。

In [5]:
import pandas as pd
import jieba
import re
from gensim.models import Word2Vec

# 1. Load the data
# NOTE: the path below is machine-specific; replace it with the path to your own Excel file.
data = pd.read_excel(r'D:\Mo\数据挖掘学习\情感分类\tagged_data.xlsx')
comments = data['评价'].tolist()  # the review text is read from the '评价' column

# 2. Text cleaning
# Load the stop words from 'stop_words.txt' (read line by line, one entry per line).
# They are kept in a set so the per-token membership test in clean_text is a
# hash lookup rather than a scan over the whole list.
with open('stop_words.txt', 'r', encoding='utf-8') as f:
    stopwords = {line.strip() for line in f}

def clean_text(text):
    """Remove punctuation and digits from *text*, segment it with jieba and drop stop words.

    Args:
        text: Raw comment. A non-string value (for example a NaN or None
            standing in for a missing cell) is treated as an empty comment.

    Returns:
        The surviving tokens joined by single spaces; '' when nothing is left.
    """
    if not isinstance(text, str):
        # re.sub raises TypeError on non-strings; an empty comment is the safe equivalent.
        return ''
    # Raw string literal: in a normal literal the regex escapes are invalid
    # string escape sequences and trigger a warning on recent Python versions.
    text = re.sub(r'[\W\d]', ' ', text)
    # The replacement spaces come back from jieba as tokens of their own; skip
    # whitespace-only tokens so the output has exactly one space between words.
    words = [w for w in jieba.cut(text) if w.strip() and w not in stopwords]
    return ' '.join(words)

# Run the cleaner over every comment; each result is a space-separated token string.
cleaned_comments = list(map(clean_text, comments))

# 3. Train word2vec embeddings on the tokenised comments
sentences = list(map(str.split, cleaned_comments))
model_w2v = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)
model_w2v.save("word2vec.model")

# Example: look up the embedding of a single word
# (replace '空间' with any word that occurs in the training comments).
vector = model_w2v.wv['空间']



Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\Stepstar\AppData\Local\Temp\jieba.cache
Loading model cost 0.686 seconds.
Prefix dict has been built successfully.


In [6]:
# Notebook cell: echo the embedding vector looked up for '空间' in the previous cell.
vector

array([ 0.39101592,  0.12942822,  0.4920325 , -0.18685408,  0.7078181 ,
       -1.0550503 ,  0.6823373 ,  2.0524466 , -0.45004106, -1.19479   ,
        0.11892094, -2.2406209 ,  0.09476876,  0.52514696, -0.8230334 ,
       -1.1572946 , -0.08168875, -0.30617237, -0.54241955, -2.0897238 ,
        1.525842  ,  0.11144078,  1.7312609 , -0.2571589 ,  0.8108609 ,
        0.47013488, -1.1424893 , -0.6036548 , -0.544022  , -0.7644484 ,
       -0.9025774 ,  0.7495082 ,  1.2455838 , -1.7388631 , -0.50644886,
        1.3724557 ,  0.01343953, -1.5020944 , -0.7228506 , -0.9775503 ,
        0.27302843, -1.0356295 , -0.45428723,  0.6712555 ,  0.9425834 ,
       -0.02672654, -0.8253072 ,  0.13480127,  1.357672  ,  0.9381895 ,
        0.5456603 ,  0.11008839,  0.3756892 ,  0.26532152,  0.7732765 ,
       -1.2476687 ,  0.6257581 , -0.23031253, -0.61135375,  0.62689525,
       -0.47409973, -0.5765718 ,  0.5467307 ,  0.5880336 , -1.5586724 ,
       -0.07922017,  0.41843972,  1.1845555 , -0.9592036 ,  1.05

In [13]:
import re

import jieba
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
def pad_or_truncate(sequence, maxlen):
    """Return a copy of *sequence* that is exactly *maxlen* items long.

    Args:
        sequence: List of token indices.
        maxlen: Target length.

    Returns:
        A new list: right-padded with zeros when *sequence* is shorter than
        *maxlen*, otherwise cut down to its first *maxlen* items.
    """
    missing = maxlen - len(sequence)
    if missing <= 0:
        # Already long enough: keep only the leading maxlen items.
        return sequence[:maxlen]
    # Too short: append as many zeros as are missing.
    return sequence + [0] * missing
# ... [数据加载、文本清洗、词嵌入等代码，如之前所示]

# 1. Split into training and test sets
labels = data['标注分类'].values
X_train, X_test, y_train, y_test = train_test_split(
    cleaned_comments, labels, test_size=0.2, random_state=42
)

# 2. Convert the text into integer sequences
# Index 0 is reserved for padding and unknown words, so real words start at 1.
# Without the offset the first vocabulary word would share index 0 with the
# padding value and every padded position would be embedded as that word.
PAD_IDX = 0
VOCAB_SIZE = len(model_w2v.wv.index_to_key) + 1  # +1 for the reserved row 0
EMBEDDING_DIM = model_w2v.wv.vector_size  # read from the model instead of hard-coding
MAX_LENGTH = 200

# Vocabulary lookup tables (word -> index and back)
word2idx = {word: idx for idx, word in enumerate(model_w2v.wv.index_to_key, start=1)}
idx2word = {idx: word for word, idx in word2idx.items()}

# Embedding matrix aligned with word2idx: row 0 is all zeros (padding/unknown),
# row i holds the word2vec vector of the word whose index is i.
embedding_matrix = np.vstack([
    np.zeros((1, EMBEDDING_DIM), dtype=model_w2v.wv.vectors.dtype),
    model_w2v.wv.vectors,
])


def text_to_sequence(text, maxlen):
    """Map a space-separated token string to a list of vocabulary indices.

    Args:
        text: Cleaned comment whose tokens are separated by whitespace.
        maxlen: Maximum number of indices to return; longer texts are cut off.

    Returns:
        A list of at most *maxlen* integers; words missing from the
        vocabulary map to PAD_IDX.
    """
    sequence = [word2idx.get(word, PAD_IDX) for word in text.split()]
    return sequence[:maxlen]


X_train_sequences = [text_to_sequence(text, MAX_LENGTH) for text in X_train]
X_test_sequences = [text_to_sequence(text, MAX_LENGTH) for text in X_test]

# Model.fit documents NumPy arrays as its input format (a Python list is
# documented as "one array per model input"), so build explicit 2-D integer
# arrays instead of passing nested lists.
X_train_pad = np.array(
    [pad_or_truncate(seq, MAX_LENGTH) for seq in X_train_sequences], dtype='int32'
)
X_test_pad = np.array(
    [pad_or_truncate(seq, MAX_LENGTH) for seq in X_test_sequences], dtype='int32'
)

# 3. Build and train the deep learning model
model = Sequential([
    # Frozen embedding layer initialised with the word2vec vectors.
    Embedding(
        VOCAB_SIZE,
        EMBEDDING_DIM,
        input_length=MAX_LENGTH,
        weights=[embedding_matrix],
        trainable=False,
    ),
    LSTM(64, return_sequences=True),
    Dropout(0.5),
    LSTM(64),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()
# NOTE: the held-out test split doubles as the validation set here.
model.fit(X_train_pad, y_train, validation_data=(X_test_pad, y_test), epochs=10, batch_size=64)


ModuleNotFoundError: No module named 'tensorflow.python.data.experimental.ops.data_service_ops'

## 模型评估

In [9]:
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score

# Predict probabilities on the test set
y_pred_probabilities = model.predict(X_test_pad)
# The model ends in Dense(1, sigmoid), i.e. one probability per sample. Flatten
# and threshold at 0.5 in one vectorised step instead of looping in Python and
# relying on the truth value of 1-element arrays; the result stays a list of ints.
y_pred = (y_pred_probabilities.ravel() > 0.5).astype(int).tolist()

# Compute the evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"准确率: {accuracy:.4f}")
print(f"精确率: {precision:.4f}")
print(f"召回率: {recall:.4f}")
print(f"F1分数: {f1:.4f}")


NameError: name 'model' is not defined