In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np
import logging
import pandas as pd
import gensim

logging.basicConfig(level=logging.INFO, format='%(asctime)-15s %(levelname)s: %(message)s')

In [7]:
#数据集处理
class TextDataset(Dataset):
    def __init__(self, texts, labels,word_vectors,max_len):
        """
        文本数据集类
        :param text: 文本数据列表，每个元素是一个词索引列表。
        :param label: 标签列表，与 texts 中的文本一一对应。
        :param word_vectors:词向量字典，key 是词索引，value 是对应的词向量。
        :param max_len:每个文本的最大长度，用于填充或截断文本。
        """
        self.text =[self.pad_text(text,max_len) for text in texts]
        self.label = labels
        self.word_vectors = word_vectors

    def pad_text(self,text,max_len):
        """
        填充或截断文本
        :param text: 文本数据列表，每个元素是一个词索引列表。
        :param max_len:每个文本的最大长度，用于填充或截断文本。
        """
        """填充或截断文本以匹配指定的最大长度"""
            # 确保 text 是一个列表
        if not isinstance(text, list):
            raise ValueError("Expected a list of integers as input.")

        padded = text[:max_len]  # 截取前 max_len 个元素
        padding_needed = max_len - len(padded)
        if padding_needed > 0:
            padded.extend([0] * padding_needed)  # 使用extend方法来添加多个0
        return padded

    def __getitem__(self, index):
        """
        获取指定索引的数据
        :param index: 数据索引。
        :return: 一个元组，包含文本数据和对应的标签。
        """

        text_tensor = torch.tensor([self.word_vectors[w] for w in self.texts[index]],dtype=torch.float)
        label_tensor = torch.tensor([self.label[index]],dtype=torch.long)
        return text_tensor, label_tensor

In [8]:
class TextCNN(nn.Module):
    def __init__(self,vocab_size,embedding_dim,num_filters,filter_sizes,num_classes,dropout):
        """
        文本分类模型
        :param vocab_size: 词汇表大小。
        :param embedding_dim: 词嵌入维度。
        :param num_filters: 卷积核数量。
        :param filter_sizes: 卷积核大小列表。
        :param num_classes: 分类类别数量。
        :param dropout: 丢弃率。
        """
        super(TextCNN,self).__init__()
        self.embedding = nn.Embedding(vocab_size,embedding_dim) #词嵌入层
        self.convs = nn.ModuleList([nn.Conv2d(1,num_filters,(fs,embedding_dim)) for fs in filter_sizes]) #卷积层
        self.dropout = nn.Dropout(dropout) #丢弃层
        self.fc = nn.Linear(len(filter_sizes)*num_filters,num_classes) #全连接层

        def forward(self,x):
            """
            前向传播
            :param x: 输入数据，形状为 (batch_size, seq_len)。
            :return: 输出数据，形状为 (batch_size, num_classes)。
            """
            x = self.embedding(x).unsqueeze(1) #形状为 (batch_size, 1, seq_len, embedding_dim)
            x = [F.relu(conv(x)).squeeze(3) for conv in self.convs] #relu激活函数，形状为 (batch_size, num_filters, seq_len - filter_size + 1)
            x = [F.max_pool1d(i,i.size(2)).squeeze(2) for i in x] #最大池化，形状为 (batch_size, num_filters)
            x = torch.cat(x, 1) #形状为 (batch_size, num_filters * len(filter_sizes))
            x=self.dropout(x) #丢弃层
            logits = self.fc(x) #全连接层
            return logits

In [9]:
def train_model(model,dataloader,criterion,optimizer,num_epochs=25):
    """
    训练模型
    :param model: 模型。
    :param dataloader: 数据加载器。
    :param criterion: 损失函数。
    :param optimizer: 优化器。
    :param num_epochs: 训练轮数。
    :return: 训练损失列表和准确率列表。
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    history = {'train_loss': [], 'val_loss': []}

    for epoch in range(num_epochs):
        logging.info('Epoch {}/{}'.format(epoch+1, num_epochs))
        logging.info('-' * 10)

        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # 训练模式
            else:
                model.eval()   # 评估模式

            running_loss = 0.0
            running_corrects = 0

            for inputs, labels in dataloader[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                optimizer.zero_grad() #梯度清零

                with torch.set_grad_enabled(phase == 'train'): #是否计算梯度
                    outputs = model(inputs) #前向传播
                    _, preds = torch.max(outputs, 1) #预测结果
                    loss = criterion(outputs, labels) #损失函数

                    if phase == 'train':
                        loss.backward() #反向传播
                        optimizer.step() #更新参数

                running_loss += loss.item() * inputs.size() #损失累加
                running_corrects += torch.sum(preds == labels.data) #正确预测数量累加

            epoch_loss = running_loss / len(dataloader[phase].dataset) #平均损失
            epoch_acc = running_corrects.double() / len(dataloader[phase].dataset) #准确率
            logging.info('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

            history[f'{phase}_loss'].append(epoch_loss) #记录损失

def plot_train_history(history):
    """
    绘制训练和验证损失随时间的变化图
    """
    epochs = range(len(history['train_loss']))

    plt.figure(figsize=(12,4))
    plt.subplot(121)
    plt.plot(epochs, history['train_loss'],label='train_loss')
    plt.plot(epochs, history['val_loss'], label='val_loss')
    plt.title('model loss')
    plt.xlabel('epoch')
    plt.legend()

    plt.tight_layout()
    plt.show()

In [10]:
#载入数据
data_file = './data/train_set.csv'
data = pd.read_csv(data_file,sep='\t')
labels = data['label'].values
texts = data['text'].values

In [11]:
# 预训练的词向量文件路径
vec_path = "word2vec.txt"  # 替换为实际路径
# 加载词向量文件
word_vectors = gensim.models.KeyedVectors.load_word2vec_format(vec_path, binary=False)

2025-01-22 07:16:00,862 INFO: loading projection weights from word2vec.txt
2025-01-22 07:16:01,249 INFO: KeyedVectors lifecycle event {'msg': 'loaded (5971, 200) matrix of type float32 from word2vec.txt', 'binary': False, 'encoding': 'utf8', 'datetime': '2025-01-22T07:16:01.248409', 'gensim': '4.3.3', 'python': '3.12.8 | packaged by Anaconda, Inc. | (main, Dec 11 2024, 16:48:34) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'load_word2vec_format'}


In [15]:
X_train

array(['5298 5310 4386 5659 5282 7042 6902 307 6656 3300 1999 2465 2212 6350 1344 7525 3613 945 4439 5491 6811 1580 2662 7495 2435 1168 5718 134 2465 669 19 6508 4939 669 4939 3263 4659 5598 5282 7042 6902 7492 2448 5510 2109 84 913 751 648 2087 6286 3750 5298 5310 4386 5491 2662 1401 4853 5659 5282 7042 6902 4480 38 5445 4269 5948 1778 512 4939 307 6656 512 4939 4469 3731 3750 7509 4811 2847 1044 648 5436 6508 900 5530 7123 2218 6267 4469 1638 648 5298 5310 4386 5659 5282 7042 6902 7400 1963 2741 4500 6040 152 4893 2007 7541 5780 5530 7239 2109 3772 4063 4125 600 3792 3107 1103 3750 669 5640 4939 7399 5278 4848 7328 2525 4190 4939 7399 6542 6127 5430 6542 5445 3750 4967 3605 340 5282 7042 6902 7400 1963 4293 3099 6063 2975 1899 873 3750 2522 3099 4811 4490 1116 2210 1699 648 4967 4128 669 4118 6350 4893 4167 5397 7399 6542 6127 5659 7400 1963 4469 7525 7150 6198 900 637 1871 4046 3568',
       '4704 5977 922 7543 1332 5085 7493 5589 1099 1070 1036 2465 3971 903 803 5445 6543 3605 7399

In [None]:
[int(word) for word in text.split()]

In [12]:
#分割数据集为训练集和验证集
X_train,X_test,y_train,y_test = train_test_split(texts,labels,test_size=0.2,random_state=42)

#创建数据集对象
train_dataset = TextDataset(X_train,y_train,word_vectors,max_len=900)
test_dataset = TextDataset(X_test,y_test,word_vectors,max_len=900)

#创建数据加载器
dataloaders = {
    'train': DataLoader(train_dataset,batch_size=64,shuffle=True),
    'val': DataLoader(test_dataset,batch_size=64,shuffle=False)
}

TypeError: can only concatenate str (not "list") to str

In [None]:
#初始化模型、损失函数和优化器
model = TextCNN(vocab_size=len(word_vectors),
                embedding_dim=200,
                num_filters=128,
                filter_sizes=[3,4,5],
                num_classes=14,
                dropout=0.5)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(),lr=0.001)

In [None]:
#开始训练模型
trained_model, history = train_model(model,dataloaders,criterion,optimizer,num_epochs=25)

In [None]:
#可视化训练历史情况
plot_train_history(history)