In [1]:
import sys
from kaitoupao import *

storage dir: /Users/minkexiu/Downloads/GitHub/Tianchi_NLPNewsClassification
code dir: /Users/minkexiu/Documents/GitHub/Tianchi_NLPNewsClassification 

13 16 24
先天八卦数: 1乾, 2兑, 3离, 4震, 5巽, 6坎, 7艮, 8坤
本卦上：5 本卦下：8 变爻：6


Unnamed: 0,风地观,山地剥,水地比
上卦,☴巽木,☶艮土,☵坎水
下卦,☷坤土,☷坤土,☷坤土


12 14 9 申时
先天八卦数: 1乾, 2兑, 3离, 4震, 5巽, 6坎, 7艮, 8坤
本卦上：4 本卦下：6 变爻：3


Unnamed: 0,雷水解,水火既济,雷风恒
上卦,☳震木,☵坎水,☳震木
下卦,☵坎水,☲离火,☴巽木


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset, Dataset
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import f1_score

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
# Everything runs on the CPU; train_model moves the model and every batch to this device.
device = torch.device("cpu")

# 加载训练集和测试集，将全量字符列表给它弄出来

In [5]:
# Read the tab-separated train and test files.
# create_originalData_path (and pd) come from the `kaitoupao` star import; it presumably
# resolves a file name inside the raw-data directory -- TODO confirm.
data_train = pd.read_csv(create_originalData_path("train_set.csv"), sep="\t")#.sample(1000)
data_test = pd.read_csv(create_originalData_path("test_a.csv"), sep="\t")#.sample(1000)

In [6]:
data_train.shape, data_test.shape

((200000, 2), (50000, 1))

In [7]:
# Number of distinct labels; used as the output width of both classifier heads in TransformerModel.
type_of_class = data_train.label.nunique()

In [8]:
# Hold out 30% of the labelled rows for validation; random_state makes the split repeatable.
train_data, valid_data = train_test_split(data_train, test_size=0.3, random_state=42)

In [9]:
# Label tensors (int64) for each split.
train_labels = torch.tensor(train_data.label.to_list(), dtype=torch.long)
valid_labels = torch.tensor(valid_data.label.to_list(), dtype=torch.long)
test_labels = torch.tensor([-1 for x in range(data_test.shape[0])], dtype=torch.long) ## placeholder labels (-1): the test set has no ground truth

## 使用TF-IDF提取特征

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vocabulary and IDF weights on the training split only, then reuse the
# fitted vectorizer for the validation and test texts so no information leaks from them.
tfidf_vectorizer = TfidfVectorizer()
tfidf_train_features = tfidf_vectorizer.fit_transform(train_data['text'])
tfidf_valid_features = tfidf_vectorizer.transform(valid_data['text'])
tfidf_test_features = tfidf_vectorizer.transform(data_test['text'])

In [11]:
# Convert the sparse TF-IDF matrices to dense float32 tensors.
# NOTE(review): .toarray() materialises the full (rows x vocabulary) matrix in memory.
train_features = torch.tensor(tfidf_train_features.toarray(), dtype=torch.float32)
valid_features = torch.tensor(tfidf_valid_features.toarray(), dtype=torch.float32)
test_features = torch.tensor(tfidf_test_features.toarray(), dtype=torch.float32)

In [12]:
# Width of one TF-IDF feature vector; TransformerModel reads this as the input size of its linear TF-IDF head.
sc_input_dim = train_features.shape[1]
sc_input_dim

6695

## 创建适合于语言序列的数据

In [13]:
# Hyperparameters for the token-sequence input
vocab_size = 8000  # number of rows in the token-embedding table; token ids must be < vocab_size
maxlen = 800  # keep only the first 800 tokens of each text; also the size of the position-embedding table

In [14]:
def preprocess_seq_str_2_int(seq, len_lim = maxlen):
    """Turn a whitespace-separated string of integer tokens into a list of ints.

    Only the first ``len_lim`` tokens are converted; any tokens after that are
    ignored without being parsed.
    """
    token_ids = []
    for position, token in enumerate(seq.strip().split()):
        if not (position < len_lim):
            # Past the limit: nothing further can qualify, so stop here.
            break
        token_ids.append(int(token))
    return token_ids

In [15]:
# One int64 tensor of token ids per text (truncated by preprocess_seq_str_2_int); lengths still vary.
x_train = [torch.tensor(preprocess_seq_str_2_int(seq), dtype=torch.long) for seq in train_data.text]
x_valid = [torch.tensor(preprocess_seq_str_2_int(seq), dtype=torch.long) for seq in valid_data.text]
x_test = [torch.tensor(preprocess_seq_str_2_int(seq), dtype=torch.long) for seq in data_test.text]

In [16]:
# Right-pad with 0 to a (num_texts, longest_sequence) tensor; each split is padded to its own longest sequence.
# NOTE(review): the model applies no padding mask, so the padded positions take part in
# attention and in the average pooling -- confirm that id 0 is not also a real token.
x_train = pad_sequence(x_train, batch_first=True, padding_value=0)
x_valid = pad_sequence(x_valid, batch_first=True, padding_value=0)
x_test = pad_sequence(x_test, batch_first=True, padding_value=0)

## 准备数据集

In [17]:
# Mini-batch size shared by the train, validation and test DataLoaders.
batchsize = 32

In [18]:
class MyData(Dataset):
    """Dataset yielding ``(token sequence, TF-IDF features, label)`` for one sample.

    The three containers are indexed with the same position, so they are
    expected to be aligned and of equal length.
    """

    def __init__(self, ori_data, tfidf_feats, label):
        self.ori_data = ori_data
        self.tfidf_feats = tfidf_feats
        self.label = label

    def __len__(self):
        # The token-sequence container defines the dataset size.
        return len(self.ori_data)

    def __getitem__(self, idx):
        return self.ori_data[idx], self.tfidf_feats[idx], self.label[idx]

In [19]:
# Training batches are reshuffled every epoch.
train_loader = DataLoader(MyData(x_train, train_features, train_labels), batch_size=batchsize, shuffle=True)
# Evaluation loaders keep dataset order. Validation metrics do not depend on order, and the
# test predictions are written out by position, so batch order must match the rows of
# data_test -- shuffling here would scramble the saved labels.
val_loader = DataLoader(MyData(x_valid, valid_features, valid_labels), batch_size=batchsize, shuffle=False)
test_loader = DataLoader(MyData(x_test, test_features, test_labels), batch_size=batchsize, shuffle=False)

# 定义网络结构

In [20]:
class TransformerBlock(nn.Module):
    """Encoder block: self-attention and a feed-forward net, each followed by
    dropout, a residual connection and LayerNorm (normalisation after the add).

    ``nn.MultiheadAttention`` is built with its default layout, so ``forward``
    takes input shaped ``(seq_len, batch, embed_dim)`` and returns the same shape.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        self.att = nn.MultiheadAttention(embed_dim, num_heads)
        # Position-wise feed-forward net: embed_dim -> ff_dim -> embed_dim.
        self.ffn = nn.Sequential(
            nn.Linear(embed_dim, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, embed_dim),
        )
        self.layernorm1 = nn.LayerNorm(embed_dim)
        self.layernorm2 = nn.LayerNorm(embed_dim)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x):
        # Self-attention: query, key and value are all x; the attention weights are discarded.
        attended = self.att(x, x, x)[0]
        hidden = self.layernorm1(x + self.dropout1(attended))
        transformed = self.dropout2(self.ffn(hidden))
        return self.layernorm2(hidden + transformed)

In [21]:
class TokenAndPositionEmbedding(nn.Module):
    """Sum of a learned token embedding and a learned absolute-position embedding.

    ``maxlen`` is the number of positions in the position table, so inputs
    must not be longer than ``maxlen`` along dimension 1.
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, embed_dim)
        self.pos_emb = nn.Embedding(maxlen, embed_dim)

    def forward(self, x):
        # Position ids 0..seq_len-1, repeated for every row of the batch.
        seq_len = x.size(1)
        position_ids = torch.arange(seq_len, device=x.device).expand_as(x)
        return self.token_emb(x) + self.pos_emb(position_ids)

In [22]:
# Single-layer network
class SentimentClassifier(nn.Module):
    """Linear classifier: one fully connected layer with no activation."""

    def __init__(self, input_dim, output_dim):
        super(SentimentClassifier, self).__init__()
        self.fc = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        scores = self.fc(x)
        return scores

In [23]:
# Model definition
class TransformerModel(nn.Module):
    """Text classifier combining a one-block Transformer encoder with a linear TF-IDF head.

    The token sequence goes through embedding, one TransformerBlock, average
    pooling over the sequence axis and two dense layers. The TF-IDF vector goes
    through a single linear layer. The two sets of class scores are added and
    returned as log-probabilities.

    The number of classes and the TF-IDF width are read from the module-level
    names ``type_of_class`` and ``sc_input_dim`` when the model is constructed.
    """

    def __init__(self, maxlen, vocab_size, embed_dim, num_heads, ff_dim):
        super().__init__()
        self.embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
        self.transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
        self.global_avg_pool = nn.AdaptiveAvgPool1d(1)
        self.dropout1 = nn.Dropout(0.1)
        self.dense1 = nn.Linear(embed_dim, 20)
        self.dropout2 = nn.Dropout(0.1)
        self.dense2 = nn.Linear(20, type_of_class)

        # Linear head over the TF-IDF features.
        self.sc_net = SentimentClassifier(sc_input_dim, type_of_class)

    def forward(self, x, x_tfidf):
        # The attention block works on (seq_len, batch, embed_dim), so swap the first two axes.
        seq_first = self.embedding_layer(x).transpose(0, 1)
        encoded = self.transformer_block(seq_first).transpose(0, 1)  # back to (batch, seq_len, embed_dim)
        # AdaptiveAvgPool1d pools the last axis, so move the sequence axis there and average it away.
        pooled = self.global_avg_pool(encoded.permute(0, 2, 1)).squeeze(-1)
        hidden = F.relu(self.dense1(self.dropout1(pooled)))
        sequence_scores = self.dense2(self.dropout2(hidden))
        tfidf_scores = self.sc_net(x_tfidf)
        return F.log_softmax(sequence_scores + tfidf_scores, dim=-1)

# 构建模型以及训练

In [24]:
# Build the model, an Adam optimizer over all of its parameters, and the loss.
# The model already returns log-probabilities. nn.CrossEntropyLoss applies log_softmax
# internally, which leaves log-probabilities unchanged, so the loss value is still correct.
model = TransformerModel(maxlen, vocab_size, embed_dim=64, num_heads=4, ff_dim=32)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

In [25]:
model

TransformerModel(
  (embedding_layer): TokenAndPositionEmbedding(
    (token_emb): Embedding(8000, 64)
    (pos_emb): Embedding(800, 64)
  )
  (transformer_block): TransformerBlock(
    (att): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
    )
    (ffn): Sequential(
      (0): Linear(in_features=64, out_features=32, bias=True)
      (1): ReLU()
      (2): Linear(in_features=32, out_features=64, bias=True)
    )
    (layernorm1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
    (layernorm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.1, inplace=False)
    (dropout2): Dropout(p=0.1, inplace=False)
  )
  (global_avg_pool): AdaptiveAvgPool1d(output_size=1)
  (dropout1): Dropout(p=0.1, inplace=False)
  (dense1): Linear(in_features=64, out_features=20, bias=True)
  (dropout2): Dropout(p=0.1, inplace=False)
  (dense2): Linear(in_features=20, out_features=14, bias=True)
  (sc_net)

In [26]:
# Train and evaluate the model
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=2):
    """Train ``model`` for ``epochs`` epochs and validate after each one.

    Each loader must yield ``(token_ids, tfidf_features, targets)`` batches.
    After every epoch the function prints the mean training loss and accuracy,
    then the mean validation loss, accuracy and macro-averaged F1 score.

    The model and all batches are moved to the module-level ``device``.
    The model is left in eval mode when the function returns. Nothing is
    returned; results are only printed.
    """
    model.to(device)

    for epoch in range(epochs):
        # ---- training pass ----
        model.train()
        train_loss = 0
        correct = 0
        total = 0
        for inputs, input_tfidfs, targets in tqdm.tqdm(train_loader):
            inputs, input_tfidfs, targets = inputs.to(device), input_tfidfs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(inputs, input_tfidfs)
            loss = criterion(outputs, targets)
            # The loss is already attached to the autograd graph through the model
            # parameters; forcing requires_grad on it would only hide a detached graph.
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
        # Training accuracy is measured in train mode, i.e. with dropout active.
        print(f'Epoch {epoch + 1}/{epochs}, Loss: {train_loss/len(train_loader)}, Accuracy: {100.*correct/total}%')

        # ---- validation pass ----
        model.eval()
        val_loss = 0
        correct = 0
        total = 0
        total_predicted = []
        total_label = []
        with torch.no_grad():
            for inputs, input_tfidfs, targets in tqdm.tqdm(val_loader):
                inputs, input_tfidfs, targets = inputs.to(device), input_tfidfs.to(device), targets.to(device)
                outputs = model(inputs, input_tfidfs)
                loss = criterion(outputs, targets)
                val_loss += loss.item()
                _, predicted = torch.max(outputs, 1)
                total += targets.size(0)
                correct += predicted.eq(targets).sum().item()

                # Collect plain Python ints on the host: cheaper than lists of 0-d
                # tensors and independent of the device the model runs on.
                total_predicted.extend(predicted.cpu().tolist())
                total_label.extend(targets.cpu().tolist())
        f1 = f1_score(total_label, total_predicted, average='macro')

        print(f'Validation Loss: {val_loss/len(val_loader)}, Accuracy: {100.*correct/total}%, f1 score is {f1}')

In [27]:
# First training run: a single epoch to check that the pipeline works end to end.
train_model(model, train_loader, val_loader, criterion, optimizer, epochs=1)

100%|███████████████████████████████████████| 4375/4375 [13:34<00:00,  5.37it/s]


Epoch 1/1, Loss: 0.5823511528006622, Accuracy: 82.61785714285715%


100%|███████████████████████████████████████| 1875/1875 [02:44<00:00, 11.41it/s]


Validation Loss: 0.301023789447546, Accuracy: 90.88833333333334%, f1 score is 0.8851251362069618


In [28]:
# Continue training the same model and optimizer for 5 more epochs; the printed epoch counter restarts at 1.
train_model(model, train_loader, val_loader, criterion, optimizer, epochs=5)

100%|███████████████████████████████████████| 4375/4375 [13:27<00:00,  5.42it/s]


Epoch 1/5, Loss: 0.2687277728774718, Accuracy: 91.90857142857143%


100%|███████████████████████████████████████| 1875/1875 [02:40<00:00, 11.67it/s]


Validation Loss: 0.23909189752240975, Accuracy: 92.74666666666667%, f1 score is 0.9105030629766616


100%|███████████████████████████████████████| 4375/4375 [13:38<00:00,  5.35it/s]


Epoch 2/5, Loss: 0.2162070893308946, Accuracy: 93.41071428571429%


100%|███████████████████████████████████████| 1875/1875 [02:40<00:00, 11.68it/s]


Validation Loss: 0.23103216464718182, Accuracy: 92.77333333333333%, f1 score is 0.9138069092074492


100%|███████████████████████████████████████| 4375/4375 [13:29<00:00,  5.40it/s]


Epoch 3/5, Loss: 0.18399135501150574, Accuracy: 94.34285714285714%


100%|███████████████████████████████████████| 1875/1875 [02:42<00:00, 11.56it/s]


Validation Loss: 0.21452704050640264, Accuracy: 93.32333333333334%, f1 score is 0.918239806030278


100%|███████████████████████████████████████| 4375/4375 [13:48<00:00,  5.28it/s]


Epoch 4/5, Loss: 0.16092383600047658, Accuracy: 95.02214285714285%


100%|███████████████████████████████████████| 1875/1875 [02:44<00:00, 11.38it/s]


Validation Loss: 0.21016791744728883, Accuracy: 93.45%, f1 score is 0.9207340613838911


100%|███████████████████████████████████████| 4375/4375 [14:02<00:00,  5.20it/s]


Epoch 5/5, Loss: 0.141867220263396, Accuracy: 95.56428571428572%


100%|███████████████████████████████████████| 1875/1875 [02:44<00:00, 11.42it/s]


Validation Loss: 0.21309670954048635, Accuracy: 93.37%, f1 score is 0.9196165620639102


In [29]:
# Another 5 epochs on the same model and optimizer; the printed epoch counter restarts at 1.
train_model(model, train_loader, val_loader, criterion, optimizer, epochs=5)

100%|███████████████████████████████████████| 4375/4375 [13:55<00:00,  5.24it/s]


Epoch 1/5, Loss: 0.12510483619006615, Accuracy: 96.08642857142857%


100%|███████████████████████████████████████| 1875/1875 [03:01<00:00, 10.31it/s]


Validation Loss: 0.20768615328520537, Accuracy: 93.48666666666666%, f1 score is 0.9199339222619498


100%|███████████████████████████████████████| 4375/4375 [13:37<00:00,  5.35it/s]


Epoch 2/5, Loss: 0.10986074984408915, Accuracy: 96.56642857142857%


100%|███████████████████████████████████████| 1875/1875 [02:44<00:00, 11.43it/s]


Validation Loss: 0.2246718433737755, Accuracy: 93.28333333333333%, f1 score is 0.9182423714267137


100%|███████████████████████████████████████| 4375/4375 [14:23<00:00,  5.07it/s]


Epoch 3/5, Loss: 0.09601654694617859, Accuracy: 97.03857142857143%


100%|███████████████████████████████████████| 1875/1875 [02:48<00:00, 11.10it/s]


Validation Loss: 0.23094573213532568, Accuracy: 93.26833333333333%, f1 score is 0.9189863778766393


100%|███████████████████████████████████████| 4375/4375 [14:12<00:00,  5.13it/s]


Epoch 4/5, Loss: 0.08361826944641237, Accuracy: 97.40571428571428%


100%|███████████████████████████████████████| 1875/1875 [02:40<00:00, 11.70it/s]


Validation Loss: 0.24634052984093627, Accuracy: 93.13%, f1 score is 0.9158010193175118


100%|███████████████████████████████████████| 4375/4375 [14:00<00:00,  5.20it/s]


Epoch 5/5, Loss: 0.07296485907400825, Accuracy: 97.72071428571428%


100%|███████████████████████████████████████| 1875/1875 [02:45<00:00, 11.31it/s]


Validation Loss: 0.2607268377479166, Accuracy: 92.94833333333334%, f1 score is 0.9151629876368875


In [30]:
# train_model(model, train_loader, val_loader, criterion, optimizer, epochs=1)

In [31]:
# train_model(model, train_loader, val_loader, criterion, optimizer, epochs=1)

# 预测一下试试

In [32]:
# Predict a label for every test sample.
# NOTE: predictions are collected in loader iteration order, so test_loader must be built
# with shuffle=False for total_predicted[i] to correspond to row i of data_test.
model.eval()  # switch dropout off explicitly instead of relying on the state left by training
with torch.no_grad():
    total_predicted = []
    # The third element of each batch is the placeholder label and is not used.
    for inputs, input_tfidfs, _ in tqdm.tqdm(test_loader):
        inputs, input_tfidfs = inputs.to(device), input_tfidfs.to(device)
        outputs = model(inputs, input_tfidfs)
        _, predicted = torch.max(outputs, 1)
        # Store plain Python ints on the host rather than 0-d tensors.
        total_predicted += predicted.cpu().tolist()

100%|███████████████████████████████████████| 1563/1563 [02:22<00:00, 10.97it/s]


In [33]:
# Normalise every prediction to a plain Python int, then check that there is one per test row.
oot_rst = [int(x) for x in total_predicted]
len(oot_rst)

50000

In [34]:
# Write the predictions as a one-column ("label") CSV.
# store_data_to_newbasepath comes from the `kaitoupao` star import; it presumably picks the
# output directory itself -- TODO confirm. Rows are written in the order of oot_rst.
store_data_to_newbasepath(pd.DataFrame({"label": oot_rst}), "rst-20250113_1", fmt="csv")

df.to_csv("/Users/minkexiu/Downloads/GitHub/Tianchi_NLPNewsClassification/preprocessedData/rst-20250113_1.csv", index=False)
data saved.


'/Users/minkexiu/Downloads/GitHub/Tianchi_NLPNewsClassification/preprocessedData/rst-20250113_1.csv'

# 保存模型

In [35]:
# Persist the trained model. save_pickle_object comes from the `kaitoupao` star import and
# presumably pickles the whole object -- TODO confirm.
# NOTE(review): a pickled nn.Module can only be loaded where the same class definitions are
# importable; saving model.state_dict() is the more portable option.
save_pickle_object(model, create_trained_models_path("ori_9041-dev_tfidf.pkl"))

/Users/minkexiu/Downloads/GitHub/Tianchi_NLPNewsClassification/trained_models/ori_9041-dev_tfidf.pkl
