In [1]:
import sys
from kaitoupao_wsl import *

storage dir: /mnt/d/forCoding_data/Tianchi_NLPNewsClassification
code dir: /mnt/d/forCoding_code/Tianchi_NLPNewsClassification 

21 09 47
先天八卦数: 1乾, 2兑, 3离, 4震, 5巽, 6坎, 7艮, 8坤
本卦上：5 本卦下：1 变爻：5


Unnamed: 0,风天小畜,火泽睽,山天大畜
上卦,☴巽木,☲离火,☶艮土
下卦,☰乾金,☱兑金,☰乾金


01 24 6 巳时
先天八卦数: 1乾, 2兑, 3离, 4震, 5巽, 6坎, 7艮, 8坤
本卦上：1 本卦下：8 变爻：6


Unnamed: 0,天地否,风山渐,泽地萃
上卦,☰乾金,☴巽木,☱兑金
下卦,☷坤土,☶艮土,☷坤土


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset, Dataset
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import f1_score

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
device = torch.device("cuda")

In [5]:
scheme_type = "20250221_1" 

# 加载训练集和测试集，将全量字符列表给它弄出来

In [9]:
data_train = pd.read_csv(create_originalData_path("train_set.csv"), sep="\t", nrows=None)#.sample(1000) , nrows=1000
data_test = pd.read_csv(create_originalData_path("test_a.csv"), sep="\t", nrows=None)#.sample(1000)

In [10]:
data_train.shape, data_test.shape

((200000, 2), (50000, 1))

In [11]:
type_of_class = data_train.label.nunique()

In [12]:
train_data, valid_data = train_test_split(data_train, test_size=0.3, random_state=42)

In [13]:
train_labels = torch.tensor(train_data.label.to_list(), dtype=torch.long)
valid_labels = torch.tensor(valid_data.label.to_list(), dtype=torch.long)
test_labels = torch.tensor([-1 for x in range(data_test.shape[0])], dtype=torch.long) ## test_labels的label是假的。

## 使用TF-IDF提取特征

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
tfidf_train_features = tfidf_vectorizer.fit_transform(train_data['text'])
tfidf_valid_features = tfidf_vectorizer.transform(valid_data['text'])
tfidf_test_features = tfidf_vectorizer.transform(data_test['text'])

In [15]:
train_features = torch.tensor(tfidf_train_features.toarray(), dtype=torch.float32)
valid_features = torch.tensor(tfidf_valid_features.toarray(), dtype=torch.float32)
test_features = torch.tensor(tfidf_test_features.toarray(), dtype=torch.float32)

In [16]:
sc_input_dim = train_features.shape[1]
sc_input_dim

6695

## 创建适合于语言序列的数据

In [17]:
# 下载数据并进行预处理
vocab_size = 8000  # 只考虑前 20k 词汇
maxlen = 800  # 只考虑每条评论的前 200 个词

In [18]:
def preprocess_seq_str_2_int(seq, len_lim = maxlen):
    rst = [int(wd) for idx, wd in enumerate(seq.strip().split()) if idx < len_lim]
    return rst

In [19]:
x_train = [torch.tensor(preprocess_seq_str_2_int(seq), dtype=torch.long) for seq in train_data.text]
x_valid = [torch.tensor(preprocess_seq_str_2_int(seq), dtype=torch.long) for seq in valid_data.text]
x_test = [torch.tensor(preprocess_seq_str_2_int(seq), dtype=torch.long) for seq in data_test.text]

In [20]:
x_train = pad_sequence(x_train, batch_first=True, padding_value=0)
x_valid = pad_sequence(x_valid, batch_first=True, padding_value=0)
x_test = pad_sequence(x_test, batch_first=True, padding_value=0)

## 准备数据集

In [21]:
batchsize = 32

In [22]:
class MyData(Dataset):
    def __init__(
        self, 
        ori_data, tfidf_feats, label,
    ):
        self.ori_data = ori_data
        self.tfidf_feats = tfidf_feats
        self.label = label
 
    def __len__(self):
        return len(self.ori_data)
 
    def __getitem__(self, idx):
        tuple_ = (
            self.ori_data[idx], 
            self.tfidf_feats[idx], 
            self.label[idx]
        )
        return tuple_

In [23]:
train_loader = DataLoader(MyData(x_train, train_features, train_labels), batch_size=batchsize, shuffle=True,)
val_loader = DataLoader(MyData(x_valid, valid_features, valid_labels), batch_size=batchsize, shuffle=True,)
test_loader = DataLoader(MyData(x_test, test_features, test_labels), batch_size=batchsize, shuffle=False,) ## 这个不要shuffle，否则传到oj上面去就GG了。

# Scheme 1: base

并行使用CNN+BiLSTM+Transformer：
* CNN捕捉局部n-gram特征（kernel_size=3,5,7）
* BiLSTM捕获长距离时序依赖
* Transformer处理全局关系

In [24]:
scheme_type_here = f"{scheme_type}__base"

In [25]:
class TransformerBlock(nn.Module):
    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
        super(TransformerBlock, self).__init__()
        self.att = nn.MultiheadAttention(embed_dim, num_heads)
        self.ffn = nn.Sequential(
            nn.Linear(embed_dim, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, embed_dim)
        )
        self.layernorm1 = nn.LayerNorm(embed_dim)
        self.layernorm2 = nn.LayerNorm(embed_dim)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x):
        attn_output, _ = self.att(x, x, x)
        attn_output = self.dropout1(attn_output)
        out1 = self.layernorm1(x + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        return self.layernorm2(out1 + ffn_output)

class TokenAndPositionEmbedding(nn.Module):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.pos_emb = nn.Embedding(maxlen, embed_dim)

    def forward(self, x):
        maxlen = x.size(1)
        positions = torch.arange(0, maxlen, device=x.device).unsqueeze(0).expand(x.size(0), x.size(1)) # torch.arange(0, maxlen, device=x.device).unsqueeze(0).expand_as(x)
        # print(positions.shape)
        return x + self.pos_emb(positions)

class TransformerModel(nn.Module):
    def __init__(self, maxlen, vocab_size, embed_dim, num_heads, ff_dim):
        super(TransformerModel, self).__init__()
        self.embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
        self.transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
        self.global_avg_pool = nn.AdaptiveAvgPool1d(1)
        
    def forward(self, x):
        x = self.embedding_layer(x).transpose(0, 1)  # Transformer expects (seq_len, batch_size, embed_dim)
        x = self.transformer_block(x)
        x = x.transpose(0, 1)
        x = self.global_avg_pool(x.permute(0, 2, 1)).squeeze(-1)
        return x

In [26]:
## TextCNN：捕捉短时间的关系

# https://blog.51cto.com/u_15764210/6844118
class GlobalMaxPool1d(nn.Module):
    def __init__(self):
        super(GlobalMaxPool1d, self).__init__()
    def forward(self, x):
        return F.max_pool1d(x, kernel_size=x.shape[2]) # shape: (batch_size, channel, 1)
        
class TextCNN(nn.Module):
    def __init__(
        self, 
        embedding_dim=128, 
        kernel_sizes=[3, 4, 5, 6], num_channels=[256, 256, 256, 256], 
    ):
        '''
        ：param num_classes：输出维度（类别数num_Classes）
        ：param num_embeddings: size of the dictionary of embeddings，词典的大小（vocab_size），当num_embeddings<O，模型会去除embedding层
        ：param embedding_dim: the size of each embedding vector，词向量特征长度
        ：param kernel_sizes: CNN层卷积核大小
        ：param num_channels: CNN层卷积核通道数
        : return:
        '''
        assert len(kernel_sizes) == len(num_channels), "len(kernel_sizes) should be equal to len(num_channels)"
        super(TextCNN, self).__init__()
        # self.num_classes = num_classes
    
        # 卷积层
        self.cnn_layers = nn.ModuleList() # 创建多个一维卷积层
        for c, k in zip(num_channels, kernel_sizes):
            cnn = nn.Sequential(
                nn.Conv1d(
                    in_channels=embedding_dim,
                    out_channels=c, 
                    kernel_size=k
                ),
                nn.BatchNorm1d(c),
                nn.ReLU(inplace=True),
            )
            self.cnn_layers.append(cnn)
        # 最大池化层
        self.pool = GlobalMaxPool1d()
            
    def forward(self, input_):
        '''
        :param input: (batch_size, context_size, embedding_size(in_channels))
        :return:
        '''
        input_ = input_.permute(0, 2, 1)
        y = []
        for layer in self.cnn_layers:
            x = layer(input_)
            x = self.pool(x).squeeze(-1)
            y.append(x)
        y = torch.cat(y, dim=1)
        return y

In [27]:
# BiLSTM 
class LSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, bidirectional=False):
        super(LSTMClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bi_dir = bidirectional
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=bidirectional)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers *(2 if self.bi_dir else 1), x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers *(2 if self.bi_dir else 1), x.size(0), self.hidden_size).to(x.device)
        out, _ = self.lstm(x, (h0, c0))
        return out[:, -1, :]

In [28]:
# 多层神经网络：
class SentimentClassifier(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.fc = nn.Linear(input_dim, 32)
        self.dropout1 = nn.Dropout(0.2)
        self.dense1 = nn.Linear(32, 32)
        self.dropout2 = nn.Dropout(0.2)
        self.dense2 = nn.Linear(32, output_dim)
 
    def forward(self, x):
        x = F.relu(self.fc(x))
        x = self.dropout1(x)
        x = F.relu(self.dense1(x))
        x = self.dropout2(x)
        x = self.dense2(x)        
        return x

# # 单层神经网络：
# class SentimentClassifier(nn.Module):
#     def __init__(self, input_dim, output_dim):
#         super().__init__()
#         self.fc = nn.Linear(input_dim, output_dim)
 
#     def forward(self, x):
#         return self.fc(x)

In [29]:
# 定义模型
class FinalModel(nn.Module):
    def __init__(self, 
                 maxlen, vocab_size, embed_dim, num_heads, ff_dim,
                tcnn_ks = [3,5,7,10], tcnn_nc = [32,64,64,64],
                 lstm_hs = 128, lstm_nlyr = 4, lstm_bd = True
                ):
        super(FinalModel, self).__init__()
        ## emb层
        self.token_emb = nn.Embedding(vocab_size, embed_dim)
        ## Tsfm部分：
        self.tsfm = TransformerModel(
            maxlen, 
            vocab_size, 
            embed_dim, 
            num_heads, 
            ff_dim
        ) # embed_dim
        ## TextCNN部分：
        self.textcnn = TextCNN(
            embedding_dim = embed_dim, 
            kernel_sizes = tcnn_ks,
            num_channels = tcnn_nc
        ) # sum(tcnn_nc)
        ## BiLSTM:
        self.lstm = LSTMClassifier(
            input_size = embed_dim, 
            hidden_size = lstm_hs, 
            num_layers = lstm_nlyr,  
            bidirectional = lstm_bd
        ) # lstm_hs * (2 if lstm_bd else 1)
        self.mix = nn.Sequential(
            ## 基于这个数字 embed_dim + sum(tcnn_nc) + lstm_hs * (2 if lstm_bd else 1) ，做一个全连接神经网络吧。
            nn.Linear(
                embed_dim + sum(tcnn_nc) + lstm_hs * (2 if lstm_bd else 1),
                ff_dim
            ),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(ff_dim, ff_dim),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(ff_dim, type_of_class)
        )
        self.sc_net = SentimentClassifier(sc_input_dim, type_of_class)
        
    def forward(self, x, x_tfidf):
        x_ori = x
        x_emb = self.token_emb(x)
        ## Tsfm部分：
        x_tsfm = self.tsfm(x_emb) 
        ## TextCNN部分：
        x_tcnn = self.textcnn(x_emb)
        ## BiLSTM部分:
        x_lstm = self.lstm(x_emb)
        ## 综合：
        x_cat = torch.cat(
            [
                x_tsfm, 
                x_tcnn, x_lstm
            ], axis=1
        )        
        return F.log_softmax(self.mix(x_cat) + self.sc_net(x_tfidf), dim=-1)

## 构建模型以及训练

In [30]:
model = FinalModel(maxlen, vocab_size, embed_dim=128, num_heads=8, ff_dim=128)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

In [31]:
model

FinalModel(
  (token_emb): Embedding(8000, 128)
  (tsfm): TransformerModel(
    (embedding_layer): TokenAndPositionEmbedding(
      (pos_emb): Embedding(800, 128)
    )
    (transformer_block): TransformerBlock(
      (att): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
      )
      (ffn): Sequential(
        (0): Linear(in_features=128, out_features=128, bias=True)
        (1): ReLU()
        (2): Linear(in_features=128, out_features=128, bias=True)
      )
      (layernorm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (layernorm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
    )
    (global_avg_pool): AdaptiveAvgPool1d(output_size=1)
  )
  (textcnn): TextCNN(
    (cnn_layers): ModuleList(
      (0): Sequential(
        (0): Conv1d(128, 32, kernel_size=(3,), stride=(1,))
        (1): BatchN

In [32]:
# 训练和评估模型
from torch.utils.tensorboard import SummaryWriter
import shutil

def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=2):
    # device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    
    log_path = "runs/Logs"
    if os.path.exists(
        os.path.dirname(log_path)
    ):
        shutil.rmtree(os.path.dirname(log_path))

    writer = SummaryWriter(log_dir=log_path)
    
    total_train_step = 0
    
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        correct = 0
        total = 0
        loss_sum = 0
        for inputs, input_tfidfs, targets in tqdm.tqdm(train_loader):
            inputs, input_tfidfs, targets = inputs.to(device), input_tfidfs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(inputs, input_tfidfs)
            loss = criterion(outputs, targets)
            loss.requires_grad_(True)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
            
            total_train_step += 1
            loss_sum += loss.item()
            if total_train_step % 100 == 0:
                # print(total_train_step//100)
                writer.add_scalar(f"train_loss_detail-{scheme_type_here}", loss_sum/100, total_train_step//100)
                loss_sum = 0
                
        print(f'Epoch {epoch + 1}/{epochs}, Loss: {train_loss/len(train_loader)}, Accuracy: {100.*correct/total}%')
        writer.add_scalar(f"epoch_loss-{scheme_type_here}", train_loss/len(train_loader), epoch+1)
        
        model.eval()
        val_loss = 0
        correct = 0
        total = 0
        with torch.no_grad():
            total_predicted = []
            total_label = []
            for inputs, input_tfidfs, targets in tqdm.tqdm(val_loader):
                inputs, input_tfidfs, targets = inputs.to(device), input_tfidfs.to(device), targets.to(device)
                outputs = model(inputs, input_tfidfs)
                loss = criterion(outputs, targets)
                val_loss += loss.item()
                _, predicted = torch.max(outputs, 1)
                total += targets.size(0)
                correct += predicted.eq(targets).sum().item()

                total_predicted += list(predicted.cpu())
                total_label += list(targets.cpu())
        f1 = f1_score(total_label, total_predicted, average='macro')
                
        print(f'Validation Loss: {val_loss/len(val_loader)}, Accuracy: {100.*correct/total}%, f1 score is {f1}')
    writer.close()

In [33]:
train_model(model, train_loader, val_loader, criterion, optimizer, epochs=15)

100%|███████████████████████████████████████████████████████████████████████████████| 4375/4375 [04:22<00:00, 16.67it/s]


Epoch 1/15, Loss: 0.3906377124946032, Accuracy: 88.15%


100%|███████████████████████████████████████████████████████████████████████████████| 1875/1875 [00:41<00:00, 44.84it/s]


Validation Loss: 0.228994959337761, Accuracy: 93.02666666666667%, f1 score is 0.9165290498517022


100%|███████████████████████████████████████████████████████████████████████████████| 4375/4375 [04:20<00:00, 16.81it/s]


Epoch 2/15, Loss: 0.20031537963630897, Accuracy: 93.92214285714286%


100%|███████████████████████████████████████████████████████████████████████████████| 1875/1875 [00:41<00:00, 45.26it/s]


Validation Loss: 0.20809106399888794, Accuracy: 93.50833333333334%, f1 score is 0.9221683104951861


100%|███████████████████████████████████████████████████████████████████████████████| 4375/4375 [04:19<00:00, 16.83it/s]


Epoch 3/15, Loss: 0.15410339160199676, Accuracy: 95.16214285714285%


100%|███████████████████████████████████████████████████████████████████████████████| 1875/1875 [00:41<00:00, 45.15it/s]


Validation Loss: 0.18277862702794373, Accuracy: 94.41%, f1 score is 0.9319527445370067


100%|███████████████████████████████████████████████████████████████████████████████| 4375/4375 [04:19<00:00, 16.83it/s]


Epoch 4/15, Loss: 0.12084832793064415, Accuracy: 96.16214285714285%


100%|███████████████████████████████████████████████████████████████████████████████| 1875/1875 [00:41<00:00, 45.19it/s]


Validation Loss: 0.20541956249661744, Accuracy: 94.14166666666667%, f1 score is 0.9290737165845923


100%|███████████████████████████████████████████████████████████████████████████████| 4375/4375 [04:19<00:00, 16.85it/s]


Epoch 5/15, Loss: 0.09360727666479403, Accuracy: 96.97%


100%|███████████████████████████████████████████████████████████████████████████████| 1875/1875 [00:41<00:00, 45.12it/s]


Validation Loss: 0.20924845323693006, Accuracy: 94.31833333333333%, f1 score is 0.9322833407628525


100%|███████████████████████████████████████████████████████████████████████████████| 4375/4375 [04:19<00:00, 16.86it/s]


Epoch 6/15, Loss: 0.07289789363950758, Accuracy: 97.59071428571428%


100%|███████████████████████████████████████████████████████████████████████████████| 1875/1875 [00:41<00:00, 45.33it/s]


Validation Loss: 0.22137073640283197, Accuracy: 94.26666666666667%, f1 score is 0.9274365931256724


100%|███████████████████████████████████████████████████████████████████████████████| 4375/4375 [04:19<00:00, 16.87it/s]


Epoch 7/15, Loss: 0.05838802612501396, Accuracy: 98.045%


100%|███████████████████████████████████████████████████████████████████████████████| 1875/1875 [00:41<00:00, 45.23it/s]


Validation Loss: 0.24911891760854052, Accuracy: 94.16166666666666%, f1 score is 0.9290523147117392


 14%|███████████                                                                     | 606/4375 [00:35<03:43, 16.83it/s]


KeyboardInterrupt: 

## 保存模型

In [None]:
save_pickle_object(model, create_trained_models_path(f"ori_9319-{scheme_type_here}.pkl"))

# Scheme 2: 共用基座但是各自投影

In [34]:
scheme_type_here = f"{scheme_type}__com_bench"

In [35]:
# 定义模型
class FinalModel(nn.Module):
    def __init__(self, 
                 maxlen, vocab_size, 
                 ori_emb_dim = 512,
                 embed_dim=128, 
                 num_heads=8, ff_dim=128,
                tcnn_ks = [3,5,7,10], tcnn_nc = [32,64,64,64],
                 lstm_hs = 128, lstm_nlyr = 4, lstm_bd = True
                ):
        super(FinalModel, self).__init__()
        ## emb层
        self.token_emb = nn.Embedding(vocab_size, ori_emb_dim)
        ## Tsfm部分：
        self.proj_trans = nn.Linear(ori_emb_dim, embed_dim)
        self.tsfm = TransformerModel(
            maxlen, 
            vocab_size, 
            embed_dim, 
            num_heads, 
            ff_dim
        ) # embed_dim
        ## TextCNN部分：
        self.proj_cnn = nn.Linear(ori_emb_dim, embed_dim)
        self.textcnn = TextCNN(
            embedding_dim = embed_dim, 
            kernel_sizes = tcnn_ks,
            num_channels = tcnn_nc
        ) # sum(tcnn_nc)
        ## BiLSTM:
        self.proj_lstm = nn.Linear(ori_emb_dim, embed_dim)
        self.lstm = LSTMClassifier(
            input_size = embed_dim, 
            hidden_size = lstm_hs, 
            num_layers = lstm_nlyr,  
            bidirectional = lstm_bd
        ) # lstm_hs * (2 if lstm_bd else 1)
        self.mix = nn.Sequential(
            ## 基于这个数字 embed_dim + sum(tcnn_nc) + lstm_hs * (2 if lstm_bd else 1) ，做一个全连接神经网络吧。
            nn.Linear(
                embed_dim + sum(tcnn_nc) + lstm_hs * (2 if lstm_bd else 1),
                ff_dim
            ),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(ff_dim, ff_dim),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(ff_dim, type_of_class)
        )
        self.sc_net = SentimentClassifier(sc_input_dim, type_of_class)
        
    def forward(self, x, x_tfidf):
        x_ori = x
        x_emb = self.token_emb(x)
        ## Tsfm部分：
        x_tsfm = self.tsfm(self.proj_trans(x_emb)) 
        ## TextCNN部分：
        x_tcnn = self.textcnn(self.proj_cnn(x_emb))
        ## BiLSTM部分:
        x_lstm = self.lstm(self.proj_lstm(x_emb))
        ## 综合：
        x_cat = torch.cat(
            [
                x_tsfm, 
                x_tcnn, x_lstm
            ], axis=1
        )        
        return F.log_softmax(self.mix(x_cat) + self.sc_net(x_tfidf), dim=-1)

## 构建模型以及训练

In [36]:
model = FinalModel(maxlen, vocab_size, embed_dim=128, num_heads=8, ff_dim=128)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

In [37]:
model

FinalModel(
  (token_emb): Embedding(8000, 512)
  (proj_trans): Linear(in_features=512, out_features=128, bias=True)
  (tsfm): TransformerModel(
    (embedding_layer): TokenAndPositionEmbedding(
      (pos_emb): Embedding(800, 128)
    )
    (transformer_block): TransformerBlock(
      (att): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
      )
      (ffn): Sequential(
        (0): Linear(in_features=128, out_features=128, bias=True)
        (1): ReLU()
        (2): Linear(in_features=128, out_features=128, bias=True)
      )
      (layernorm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (layernorm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
    )
    (global_avg_pool): AdaptiveAvgPool1d(output_size=1)
  )
  (proj_cnn): Linear(in_features=512, out_features=128, bias=True)
  (textcnn): TextC

In [38]:
# 训练和评估模型
from torch.utils.tensorboard import SummaryWriter
import shutil

def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=2):
    # device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    
    log_path = "runs/Logs"
    if os.path.exists(
        os.path.dirname(log_path)
    ):
        shutil.rmtree(os.path.dirname(log_path))

    writer = SummaryWriter(log_dir=log_path)
    
    total_train_step = 0
    
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        correct = 0
        total = 0
        loss_sum = 0
        for inputs, input_tfidfs, targets in tqdm.tqdm(train_loader):
            inputs, input_tfidfs, targets = inputs.to(device), input_tfidfs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(inputs, input_tfidfs)
            loss = criterion(outputs, targets)
            loss.requires_grad_(True)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
            
            total_train_step += 1
            loss_sum += loss.item()
            if total_train_step % 100 == 0:
                # print(total_train_step//100)
                writer.add_scalar(f"train_loss_detail-{scheme_type_here}", loss_sum/100, total_train_step//100)
                loss_sum = 0
                
        print(f'Epoch {epoch + 1}/{epochs}, Loss: {train_loss/len(train_loader)}, Accuracy: {100.*correct/total}%')
        writer.add_scalar(f"epoch_loss-{scheme_type_here}", train_loss/len(train_loader), epoch+1)
        
        model.eval()
        val_loss = 0
        correct = 0
        total = 0
        with torch.no_grad():
            total_predicted = []
            total_label = []
            for inputs, input_tfidfs, targets in tqdm.tqdm(val_loader):
                inputs, input_tfidfs, targets = inputs.to(device), input_tfidfs.to(device), targets.to(device)
                outputs = model(inputs, input_tfidfs)
                loss = criterion(outputs, targets)
                val_loss += loss.item()
                _, predicted = torch.max(outputs, 1)
                total += targets.size(0)
                correct += predicted.eq(targets).sum().item()

                total_predicted += list(predicted.cpu())
                total_label += list(targets.cpu())
        f1 = f1_score(total_label, total_predicted, average='macro')
                
        print(f'Validation Loss: {val_loss/len(val_loader)}, Accuracy: {100.*correct/total}%, f1 score is {f1}')
    writer.close()

In [39]:
train_model(model, train_loader, val_loader, criterion, optimizer, epochs=15)

100%|███████████████████████████████████████████████████████████████████████████████| 4375/4375 [04:30<00:00, 16.18it/s]


Epoch 1/15, Loss: 0.3740772521625672, Accuracy: 88.70214285714286%


100%|███████████████████████████████████████████████████████████████████████████████| 1875/1875 [00:42<00:00, 44.03it/s]


Validation Loss: 0.22246923875659705, Accuracy: 93.135%, f1 score is 0.9176475562128336


100%|███████████████████████████████████████████████████████████████████████████████| 4375/4375 [04:29<00:00, 16.23it/s]


Epoch 2/15, Loss: 0.21009888617928538, Accuracy: 93.66857142857143%


100%|███████████████████████████████████████████████████████████████████████████████| 1875/1875 [00:42<00:00, 44.05it/s]


Validation Loss: 0.20521633251508076, Accuracy: 93.795%, f1 score is 0.9243268199072087


100%|███████████████████████████████████████████████████████████████████████████████| 4375/4375 [04:29<00:00, 16.24it/s]


Epoch 3/15, Loss: 0.1737617210941123, Accuracy: 94.64714285714285%


100%|███████████████████████████████████████████████████████████████████████████████| 1875/1875 [00:42<00:00, 44.08it/s]


Validation Loss: 0.20258353450223804, Accuracy: 93.99833333333333%, f1 score is 0.9266895965302121


100%|███████████████████████████████████████████████████████████████████████████████| 4375/4375 [04:29<00:00, 16.24it/s]


Epoch 4/15, Loss: 0.14536175990280295, Accuracy: 95.53857142857143%


100%|███████████████████████████████████████████████████████████████████████████████| 1875/1875 [00:42<00:00, 44.08it/s]


Validation Loss: 0.1924387978874147, Accuracy: 94.18833333333333%, f1 score is 0.9280358862229895


100%|███████████████████████████████████████████████████████████████████████████████| 4375/4375 [04:29<00:00, 16.24it/s]


Epoch 5/15, Loss: 0.12350369844596301, Accuracy: 96.02785714285714%


100%|███████████████████████████████████████████████████████████████████████████████| 1875/1875 [00:42<00:00, 44.17it/s]


Validation Loss: 0.19282185274759928, Accuracy: 94.38833333333334%, f1 score is 0.9348072835427755


100%|███████████████████████████████████████████████████████████████████████████████| 4375/4375 [04:29<00:00, 16.24it/s]


Epoch 6/15, Loss: 0.10505266443258152, Accuracy: 96.63071428571429%


100%|███████████████████████████████████████████████████████████████████████████████| 1875/1875 [00:42<00:00, 44.05it/s]


Validation Loss: 0.20738096985661736, Accuracy: 94.18666666666667%, f1 score is 0.9303970876535498


100%|███████████████████████████████████████████████████████████████████████████████| 4375/4375 [04:29<00:00, 16.24it/s]


Epoch 7/15, Loss: 0.09142157223665022, Accuracy: 97.02428571428571%


100%|███████████████████████████████████████████████████████████████████████████████| 1875/1875 [00:42<00:00, 44.06it/s]


Validation Loss: 0.20044405336625254, Accuracy: 94.17666666666666%, f1 score is 0.9330439160772191


100%|███████████████████████████████████████████████████████████████████████████████| 4375/4375 [04:29<00:00, 16.25it/s]


Epoch 8/15, Loss: 0.07691480434419189, Accuracy: 97.42285714285714%


100%|███████████████████████████████████████████████████████████████████████████████| 1875/1875 [00:42<00:00, 44.26it/s]


Validation Loss: 0.2259684952680953, Accuracy: 94.29833333333333%, f1 score is 0.9337127426218473


100%|███████████████████████████████████████████████████████████████████████████████| 4375/4375 [04:29<00:00, 16.25it/s]


Epoch 9/15, Loss: 0.06871094892332996, Accuracy: 97.70071428571428%


100%|███████████████████████████████████████████████████████████████████████████████| 1875/1875 [00:42<00:00, 44.08it/s]


Validation Loss: 0.22469598795470472, Accuracy: 94.28%, f1 score is 0.9327652180117952


100%|███████████████████████████████████████████████████████████████████████████████| 4375/4375 [04:29<00:00, 16.25it/s]


Epoch 10/15, Loss: 0.061109739370617484, Accuracy: 97.97928571428571%


100%|███████████████████████████████████████████████████████████████████████████████| 1875/1875 [00:42<00:00, 44.26it/s]


Validation Loss: 0.24996951154020305, Accuracy: 93.88333333333334%, f1 score is 0.9301360155054346


100%|███████████████████████████████████████████████████████████████████████████████| 4375/4375 [04:29<00:00, 16.24it/s]


Epoch 11/15, Loss: 0.05355396345885321, Accuracy: 98.16214285714285%


100%|███████████████████████████████████████████████████████████████████████████████| 1875/1875 [00:42<00:00, 44.16it/s]


Validation Loss: 0.25289686508687836, Accuracy: 94.115%, f1 score is 0.9261031055217089


 17%|█████████████▍                                                                  | 733/4375 [00:45<03:44, 16.23it/s]


KeyboardInterrupt: 

## 保存模型

In [None]:
save_pickle_object(model, create_trained_models_path(f"ori_9319-{scheme_type_here}.pkl"))

# Scheme 3: 独立embedding

In [40]:
scheme_type_here = f"{scheme_type}__idp_emb"

In [41]:
# 定义模型
class FinalModel(nn.Module):
    def __init__(self, 
                 maxlen, vocab_size, 
                 embed_dim=128, 
                 num_heads=8, ff_dim=128,
                tcnn_ks = [3,5,7,10], tcnn_nc = [32,64,64,64],
                 lstm_hs = 128, lstm_nlyr = 4, lstm_bd = True
                ):
        super(FinalModel, self).__init__()
        ## Tsfm部分：
        self.emb_trans = nn.Embedding(vocab_size, embed_dim)
        self.tsfm = TransformerModel(
            maxlen, 
            vocab_size, 
            embed_dim, 
            num_heads, 
            ff_dim
        ) # embed_dim
        ## TextCNN部分：
        self.emb_cnn = nn.Embedding(vocab_size, embed_dim)
        self.textcnn = TextCNN(
            embedding_dim = embed_dim, 
            kernel_sizes = tcnn_ks,
            num_channels = tcnn_nc
        ) # sum(tcnn_nc)
        ## BiLSTM:
        self.emb_lstm = nn.Embedding(vocab_size, embed_dim)
        self.lstm = LSTMClassifier(
            input_size = embed_dim, 
            hidden_size = lstm_hs, 
            num_layers = lstm_nlyr,  
            bidirectional = lstm_bd
        ) # lstm_hs * (2 if lstm_bd else 1)
        self.mix = nn.Sequential(
            ## 基于这个数字 embed_dim + sum(tcnn_nc) + lstm_hs * (2 if lstm_bd else 1) ，做一个全连接神经网络吧。
            nn.Linear(
                embed_dim + sum(tcnn_nc) + lstm_hs * (2 if lstm_bd else 1),
                ff_dim
            ),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(ff_dim, ff_dim),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(ff_dim, type_of_class)
        )
        self.sc_net = SentimentClassifier(sc_input_dim, type_of_class)
        
    def forward(self, x, x_tfidf):
        x_ori = x
        ## Tsfm部分：
        x_tsfm = self.tsfm(self.emb_trans(x_ori)) 
        ## TextCNN部分：
        x_tcnn = self.textcnn(self.emb_cnn(x_ori))
        ## BiLSTM部分:
        x_lstm = self.lstm(self.emb_lstm(x_ori))
        ## 综合：
        x_cat = torch.cat(
            [
                x_tsfm, 
                x_tcnn, x_lstm
            ], axis=1
        )        
        return F.log_softmax(self.mix(x_cat) + self.sc_net(x_tfidf), dim=-1)

## 构建模型以及训练

In [42]:
model = FinalModel(maxlen, vocab_size, embed_dim=128, num_heads=8, ff_dim=128)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

In [43]:
model

FinalModel(
  (emb_trans): Embedding(8000, 128)
  (tsfm): TransformerModel(
    (embedding_layer): TokenAndPositionEmbedding(
      (pos_emb): Embedding(800, 128)
    )
    (transformer_block): TransformerBlock(
      (att): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
      )
      (ffn): Sequential(
        (0): Linear(in_features=128, out_features=128, bias=True)
        (1): ReLU()
        (2): Linear(in_features=128, out_features=128, bias=True)
      )
      (layernorm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (layernorm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
    )
    (global_avg_pool): AdaptiveAvgPool1d(output_size=1)
  )
  (emb_cnn): Embedding(8000, 128)
  (textcnn): TextCNN(
    (cnn_layers): ModuleList(
      (0): Sequential(
        (0): Conv1d(128, 32, kernel_size=(3,)

In [44]:
# 训练和评估模型
from torch.utils.tensorboard import SummaryWriter
import shutil

def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=2):
    # device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    
    log_path = "runs/Logs"
    if os.path.exists(
        os.path.dirname(log_path)
    ):
        shutil.rmtree(os.path.dirname(log_path))

    writer = SummaryWriter(log_dir=log_path)
    
    total_train_step = 0
    
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        correct = 0
        total = 0
        loss_sum = 0
        for inputs, input_tfidfs, targets in tqdm.tqdm(train_loader):
            inputs, input_tfidfs, targets = inputs.to(device), input_tfidfs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(inputs, input_tfidfs)
            loss = criterion(outputs, targets)
            loss.requires_grad_(True)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
            
            total_train_step += 1
            loss_sum += loss.item()
            if total_train_step % 100 == 0:
                # print(total_train_step//100)
                writer.add_scalar(f"train_loss_detail-{scheme_type_here}", loss_sum/100, total_train_step//100)
                loss_sum = 0
                
        print(f'Epoch {epoch + 1}/{epochs}, Loss: {train_loss/len(train_loader)}, Accuracy: {100.*correct/total}%')
        writer.add_scalar(f"epoch_loss-{scheme_type_here}", train_loss/len(train_loader), epoch+1)
        
        model.eval()
        val_loss = 0
        correct = 0
        total = 0
        with torch.no_grad():
            total_predicted = []
            total_label = []
            for inputs, input_tfidfs, targets in tqdm.tqdm(val_loader):
                inputs, input_tfidfs, targets = inputs.to(device), input_tfidfs.to(device), targets.to(device)
                outputs = model(inputs, input_tfidfs)
                loss = criterion(outputs, targets)
                val_loss += loss.item()
                _, predicted = torch.max(outputs, 1)
                total += targets.size(0)
                correct += predicted.eq(targets).sum().item()

                total_predicted += list(predicted.cpu())
                total_label += list(targets.cpu())
        f1 = f1_score(total_label, total_predicted, average='macro')
                
        print(f'Validation Loss: {val_loss/len(val_loader)}, Accuracy: {100.*correct/total}%, f1 score is {f1}')
    writer.close()

In [45]:
train_model(model, train_loader, val_loader, criterion, optimizer, epochs=15)

 77%|████████████████████████████████████████████████████████████▋                  | 3361/4375 [03:21<01:00, 16.65it/s]


KeyboardInterrupt: 

## 保存模型

In [None]:
save_pickle_object(model, create_trained_models_path(f"ori_9319-{scheme_type_here}.pkl"))

# 选最好的模型来预测

In [24]:
scheme_type_best = "TBD"

In [36]:
model = load_pickle_object(create_trained_models_path(f"ori_9319-{scheme_type_best}.pkl"))

In [37]:
with torch.no_grad():
    total_predicted = []
    for inputs, input_tfidfs, targets in tqdm.tqdm(test_loader):
        inputs, input_tfidfs, targets = inputs.to(device), input_tfidfs.to(device), targets.to(device)
        outputs = model(inputs, input_tfidfs)
        _, predicted = torch.max(outputs, 1)
        total_predicted += list(predicted)

100%|███████████████████████████████████████████| 32/32 [00:01<00:00, 25.93it/s]


In [38]:
oot_rst = [int(x) for x in total_predicted]
len(oot_rst)

1000

In [39]:
store_data_to_newbasepath(pd.DataFrame({"label": oot_rst}), f"rst-{scheme_type_best}", fmt="csv")

df.to_csv("/Users/minkexiu/Downloads/GitHub/Tianchi_NLPNewsClassification/preprocessedData/rst-20250217_1.csv", index=False)
data saved.


'/Users/minkexiu/Downloads/GitHub/Tianchi_NLPNewsClassification/preprocessedData/rst-20250217_1.csv'