In [1]:
import sys
from kaitoupao import *

storage dir: /Users/minkexiu/Downloads/GitHub/Tianchi_NLPNewsClassification
code dir: /Users/minkexiu/Documents/GitHub/Tianchi_NLPNewsClassification 

28 12 41
先天八卦数: 1乾, 2兑, 3离, 4震, 5巽, 6坎, 7艮, 8坤
本卦上：4 本卦下：4 变爻：5


Unnamed: 0,雷雷震,水山蹇,泽雷随
上卦,☳震木,☵坎水,☱兑金
下卦,☳震木,☶艮土,☳震木


11 28 7 午时
先天八卦数: 1乾, 2兑, 3离, 4震, 5巽, 6坎, 7艮, 8坤
本卦上：3 本卦下：4 变爻：1


Unnamed: 0,火雷噬嗑,水山蹇,火地晋
上卦,☲离火,☵坎水,☲离火
下卦,☳震木,☶艮土,☷坤土


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import f1_score

In [3]:
# All tensors and the model are placed on this device; the notebook runs on CPU.
device = torch.device("cpu")

# 加载训练集和测试集，将全量字符列表给它弄出来

In [4]:
# Read the tab-separated training and test files from the project's raw-data
# directory (path resolution is delegated to create_originalData_path).
data_train, data_test = (
    pd.read_csv(create_originalData_path(file_name), sep="\t")
    for file_name in ("train_set.csv", "test_a.csv")
)

In [5]:
# Sanity check: (rows, columns) of the training and test frames.
data_train.shape, data_test.shape

((200000, 2), (50000, 1))

In [6]:
# Number of distinct values in the label column; used as the size of the
# classifier's output layer.
type_of_class = data_train["label"].nunique()

# 定义网络结构

In [7]:
class TransformerBlock(nn.Module):
    """Single post-norm Transformer encoder block.

    The block applies multi-head self-attention followed by a two-layer
    feed-forward network. Each sub-layer is wrapped as
    ``LayerNorm(residual + Dropout(sublayer(...)))``.

    Args:
        embed_dim: Size of the feature dimension of the input and the output.
        num_heads: Number of attention heads (passed to nn.MultiheadAttention).
        ff_dim: Hidden width of the feed-forward sub-layer.
        dropout: Dropout probability applied to each sub-layer output.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        # nn.MultiheadAttention is left at its default layout (batch_first=False),
        # so inputs are expected as (seq_len, batch, embed_dim).
        self.att = nn.MultiheadAttention(embed_dim, num_heads)
        self.ffn = nn.Sequential(
            nn.Linear(embed_dim, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, embed_dim)
        )
        self.layernorm1 = nn.LayerNorm(embed_dim)
        self.layernorm2 = nn.LayerNorm(embed_dim)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x, key_padding_mask=None):
        """Run the block.

        Args:
            x: Tensor of shape (seq_len, batch, embed_dim).
            key_padding_mask: Optional bool tensor of shape (batch, seq_len);
                True marks positions that must be ignored as attention keys
                (e.g. padding). ``None`` (the default) attends to every
                position, which is the behaviour of the original block.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # The attention weights are never used, so skip computing them.
        attn_output, _ = self.att(
            x, x, x, key_padding_mask=key_padding_mask, need_weights=False
        )
        attn_output = self.dropout1(attn_output)
        out1 = self.layernorm1(x + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        return self.layernorm2(out1 + ffn_output)

In [8]:
class TokenAndPositionEmbedding(nn.Module):
    """Sum of a learned token embedding and a learned absolute-position embedding.

    Args:
        maxlen: Number of rows in the position table; input sequences must not
            be longer than this.
        vocab_size: Number of rows in the token table; token ids must be
            smaller than this.
        embed_dim: Width of both embedding tables.
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        self.pos_emb = nn.Embedding(num_embeddings=maxlen, embedding_dim=embed_dim)

    def forward(self, x):
        """Embed ``x`` of shape (batch, seq_len) into (batch, seq_len, embed_dim)."""
        seq_len = x.shape[1]
        # Position ids 0..seq_len-1, repeated for every row of the batch.
        index_row = torch.arange(seq_len, device=x.device)
        position_ids = index_row[None, :].expand_as(x)
        token_vectors = self.token_emb(x)
        position_vectors = self.pos_emb(position_ids)
        return token_vectors + position_vectors

In [9]:
# Model definition
class TransformerModel(nn.Module):
    """Text classifier: embeddings -> one Transformer block -> mean pool -> MLP.

    Args:
        maxlen: Maximum sequence length (size of the position-embedding table).
        vocab_size: Size of the token-embedding table.
        embed_dim: Embedding / model width.
        num_heads: Number of attention heads in the Transformer block.
        ff_dim: Hidden width of the block's feed-forward sub-layer.
        num_classes: Number of output classes. ``None`` (the default) falls
            back to the notebook-level ``type_of_class`` so existing calls keep
            working.

    NOTE(review): no padding mask is applied, so padded positions take part in
    both the self-attention and the mean pooling.
    """

    def __init__(self, maxlen, vocab_size, embed_dim, num_heads, ff_dim, num_classes=None):
        super().__init__()
        if num_classes is None:
            # Resolved here rather than in the signature so the value is read
            # when the model is built, not when the class is defined.
            num_classes = type_of_class
        self.embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
        self.transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
        self.global_avg_pool = nn.AdaptiveAvgPool1d(1)
        self.dropout1 = nn.Dropout(0.1)
        self.dense1 = nn.Linear(embed_dim, 20)
        self.dropout2 = nn.Dropout(0.1)
        self.dense2 = nn.Linear(20, num_classes)

    def forward(self, x):
        """Map token ids of shape (batch, seq_len) to log-probabilities (batch, num_classes)."""
        # The Transformer block expects (seq_len, batch, embed_dim).
        x = self.embedding_layer(x).transpose(0, 1)
        x = self.transformer_block(x)
        # (seq_len, batch, embed) -> (batch, embed, seq_len) in one step, then
        # average over the sequence axis.
        x = self.global_avg_pool(x.permute(1, 2, 0)).squeeze(-1)
        x = self.dropout1(x)
        x = F.relu(self.dense1(x))
        x = self.dropout2(x)
        # Output is already log-softmax-normalised.
        return F.log_softmax(self.dense2(x), dim=-1)

# 准备数据集

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
# Hold out 30% of the labelled data for validation. The split is seeded so
# that validation metrics are comparable across kernel restarts, and
# stratified so every class keeps the same proportion in both parts.
x_train, x_valid, y_train, y_valid = train_test_split(
    data_train.text,
    data_train.label,
    test_size=0.3,
    random_state=42,
    stratify=data_train.label,
)

In [12]:
# Preprocessing hyper-parameters
vocab_size = 8000  # size of the token-embedding table; token ids must be < 8000
maxlen = 1600  # keep at most the first 1600 tokens of each text

In [13]:
def preprocess_seq_str_2_int(seq, len_lim=None):
    """Convert a whitespace-separated string of integer token ids to a list of ints.

    Args:
        seq: Text such as ``"12 7 3051"``; surrounding whitespace is ignored.
        len_lim: Maximum number of leading tokens to keep. ``None`` (the
            default) uses the notebook-level ``maxlen``, looked up at call time
            so that re-running the configuration cell takes effect without
            re-defining this function. Values <= 0 yield an empty list.

    Returns:
        A list with at most ``len_lim`` ints, in the original order.

    Raises:
        ValueError: If one of the kept tokens is not an integer literal.
    """
    if len_lim is None:
        len_lim = maxlen
    # Slice first so that tokens beyond the limit are neither visited nor parsed.
    kept_tokens = seq.split()[:max(len_lim, 0)]
    return [int(token) for token in kept_tokens]

In [14]:
# Encode every text as a variable-length 1-D int64 tensor of token ids, and
# the labels as int64 vectors.
# NOTE: this cell rebinds x_train / x_valid / y_train / y_valid, so it cannot
# be re-run on its own; re-run the train/validation split first.
x_train = [
    torch.tensor(preprocess_seq_str_2_int(text), dtype=torch.int64)
    for text in x_train
]
x_valid = [
    torch.tensor(preprocess_seq_str_2_int(text), dtype=torch.int64)
    for text in x_valid
]
y_train = torch.tensor(list(y_train), dtype=torch.int64)
y_valid = torch.tensor(list(y_valid), dtype=torch.int64)

In [15]:
# Right-pad each split with 0 to the length of its own longest sequence,
# producing (num_samples, seq_len) tensors.
x_train, x_valid = (
    pad_sequence(sequences, batch_first=True, padding_value=0)
    for sequences in (x_train, x_valid)
)

In [16]:
# Shorter aliases for the validation tensors (same objects, no copy).
x_val = x_valid
y_val = y_valid

In [17]:
# Wrap the tensors into datasets and mini-batch loaders (batch size 32).
# Only the training loader shuffles; validation keeps its order.
train_dataset, val_dataset = TensorDataset(x_train, y_train), TensorDataset(x_val, y_val)
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=32, shuffle=False)

In [18]:
# Encode and pad the unlabelled test texts the same way as the training data,
# then expose them through an ordered (non-shuffled) loader.
x_oot = pad_sequence(
    [
        torch.tensor(preprocess_seq_str_2_int(text), dtype=torch.int64)
        for text in data_test["text"]
    ],
    batch_first=True,
    padding_value=0,
)
oot_dataset = TensorDataset(x_oot)
oot_loader = DataLoader(dataset=oot_dataset, batch_size=32, shuffle=False)

# 构建模型以及训练

In [19]:
# Build the classifier, its optimizer and the loss.
model = TransformerModel(maxlen, vocab_size, embed_dim=64, num_heads=4, ff_dim=32)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# TransformerModel.forward already returns log-probabilities (log_softmax), so
# the matching criterion is NLLLoss. CrossEntropyLoss expects raw logits and
# would apply log_softmax a second time (a redundant no-op on log-probs).
criterion = nn.NLLLoss()

In [20]:
# Display the module tree to sanity-check the layer sizes.
model

TransformerModel(
  (embedding_layer): TokenAndPositionEmbedding(
    (token_emb): Embedding(8000, 64)
    (pos_emb): Embedding(1600, 64)
  )
  (transformer_block): TransformerBlock(
    (att): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
    )
    (ffn): Sequential(
      (0): Linear(in_features=64, out_features=32, bias=True)
      (1): ReLU()
      (2): Linear(in_features=32, out_features=64, bias=True)
    )
    (layernorm1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
    (layernorm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.1, inplace=False)
    (dropout2): Dropout(p=0.1, inplace=False)
  )
  (global_avg_pool): AdaptiveAvgPool1d(output_size=1)
  (dropout1): Dropout(p=0.1, inplace=False)
  (dense1): Linear(in_features=64, out_features=20, bias=True)
  (dropout2): Dropout(p=0.1, inplace=False)
  (dense2): Linear(in_features=20, out_features=14, bias=True)
)

In [21]:
# Train and evaluate the model
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=2):
    """Train ``model`` for ``epochs`` epochs, validating after each one.

    Uses the notebook-level ``device``, ``tqdm`` and ``f1_score``. After every
    epoch it prints the mean training loss and accuracy, then the mean
    validation loss, accuracy and macro-averaged F1 score.

    Args:
        model: Module mapping a batch of inputs to per-class scores
            of shape (batch, num_classes).
        train_loader: Iterable yielding ``(inputs, targets)`` training batches.
        val_loader: Iterable yielding ``(inputs, targets)`` validation batches.
        criterion: Loss callable taking ``(outputs, targets)``.
        optimizer: Optimizer over ``model``'s parameters.
        epochs: Number of passes over ``train_loader``.

    Returns:
        None. The model is updated in place and is left in eval mode.
    """
    model.to(device)

    for epoch in range(epochs):
        # ---- training pass ----
        model.train()
        train_loss = 0
        correct = 0
        total = 0
        for inputs, targets in tqdm.tqdm(train_loader):
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            # No loss.requires_grad_(True) here: the loss must already be
            # attached to the graph, and forcing the flag would only hide a
            # detached graph (backward() would then silently train nothing).
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            predicted = outputs.argmax(dim=1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
        print(f'Epoch {epoch + 1}/{epochs}, Loss: {train_loss/len(train_loader)}, Accuracy: {100.*correct/total}%')

        # ---- validation pass ----
        model.eval()
        val_loss = 0
        correct = 0
        total = 0
        total_predicted = []
        total_label = []
        with torch.no_grad():
            for inputs, targets in tqdm.tqdm(val_loader):
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, targets)
                val_loss += loss.item()
                predicted = outputs.argmax(dim=1)
                total += targets.size(0)
                correct += predicted.eq(targets).sum().item()

                # Collect plain Python ints (not 0-d tensors) so the metric
                # works regardless of the device the tensors live on.
                total_predicted += predicted.cpu().tolist()
                total_label += targets.cpu().tolist()
        f1 = f1_score(total_label, total_predicted, average='macro')

        print(f'Validation Loss: {val_loss/len(val_loader)}, Accuracy: {100.*correct/total}%, f1 score is {f1}')

In [22]:
# First run: a single epoch over the training set followed by a validation pass.
train_model(model, train_loader, val_loader, criterion, optimizer, epochs=1)

100%|███████████████████████████████████████| 4375/4375 [54:21<00:00,  1.34it/s]


Epoch 1/1, Loss: 0.7651378873177938, Accuracy: 76.07%


100%|███████████████████████████████████████| 1875/1875 [11:54<00:00,  2.63it/s]


Validation Loss: 0.3793178574979305, Accuracy: 88.62333333333333%, f1 score is 0.8205446592853455


In [23]:
# Continue training the same model and optimizer for five more epochs
# (weights carry over from the previous call; the epoch counter restarts at 1).
train_model(model, train_loader, val_loader, criterion, optimizer, epochs=5)

100%|███████████████████████████████████████| 4375/4375 [55:48<00:00,  1.31it/s]


Epoch 1/5, Loss: 0.36229306167747294, Accuracy: 89.26214285714286%


100%|███████████████████████████████████████| 1875/1875 [11:37<00:00,  2.69it/s]


Validation Loss: 0.3064984996666511, Accuracy: 90.94666666666667%, f1 score is 0.8761481725077783


100%|███████████████████████████████████████| 4375/4375 [54:43<00:00,  1.33it/s]


Epoch 2/5, Loss: 0.2894775752808367, Accuracy: 91.465%


100%|███████████████████████████████████████| 1875/1875 [11:39<00:00,  2.68it/s]


Validation Loss: 0.27855545281544325, Accuracy: 91.66666666666667%, f1 score is 0.8903979853866149


100%|███████████████████████████████████████| 4375/4375 [54:23<00:00,  1.34it/s]


Epoch 3/5, Loss: 0.24783215670074735, Accuracy: 92.59928571428571%


100%|███████████████████████████████████████| 1875/1875 [11:38<00:00,  2.68it/s]


Validation Loss: 0.26811585601170856, Accuracy: 91.96166666666667%, f1 score is 0.8986694938343415


100%|███████████████████████████████████████| 4375/4375 [54:21<00:00,  1.34it/s]


Epoch 4/5, Loss: 0.21763440595673664, Accuracy: 93.42285714285714%


100%|███████████████████████████████████████| 1875/1875 [11:38<00:00,  2.68it/s]


Validation Loss: 0.2568774534796675, Accuracy: 92.41666666666667%, f1 score is 0.9011963395304763


100%|███████████████████████████████████████| 4375/4375 [54:18<00:00,  1.34it/s]


Epoch 5/5, Loss: 0.19274628082758613, Accuracy: 94.23%


100%|███████████████████████████████████████| 1875/1875 [11:14<00:00,  2.78it/s]


Validation Loss: 0.25922012584979337, Accuracy: 92.535%, f1 score is 0.9027240434011505


In [24]:
# Five further epochs on the same model.
# NOTE(review): in the recorded output of this run the validation loss rises
# from about 0.267 to 0.330 while the training loss keeps falling, and the
# macro F1 stays near 0.90 -- this looks like overfitting; consider stopping
# earlier or keeping the best-epoch weights.
train_model(model, train_loader, val_loader, criterion, optimizer, epochs=5)

100%|███████████████████████████████████████| 4375/4375 [53:15<00:00,  1.37it/s]


Epoch 1/5, Loss: 0.17040749123756374, Accuracy: 94.835%


100%|███████████████████████████████████████| 1875/1875 [11:21<00:00,  2.75it/s]


Validation Loss: 0.26691129733622077, Accuracy: 92.37166666666667%, f1 score is 0.9012353092550038


100%|███████████████████████████████████████| 4375/4375 [54:30<00:00,  1.34it/s]


Epoch 2/5, Loss: 0.15330816777125, Accuracy: 95.30214285714285%


100%|███████████████████████████████████████| 1875/1875 [11:39<00:00,  2.68it/s]


Validation Loss: 0.2821016586912175, Accuracy: 92.31%, f1 score is 0.9000820835068899


100%|███████████████████████████████████████| 4375/4375 [54:44<00:00,  1.33it/s]


Epoch 3/5, Loss: 0.13654350867915366, Accuracy: 95.81428571428572%


100%|███████████████████████████████████████| 1875/1875 [11:39<00:00,  2.68it/s]


Validation Loss: 0.27642642535579703, Accuracy: 92.58666666666667%, f1 score is 0.9057083987103517


100%|███████████████████████████████████████| 4375/4375 [54:26<00:00,  1.34it/s]


Epoch 4/5, Loss: 0.12090337655294155, Accuracy: 96.21%


100%|███████████████████████████████████████| 1875/1875 [11:39<00:00,  2.68it/s]


Validation Loss: 0.3066738303401818, Accuracy: 92.22833333333334%, f1 score is 0.8998882469786104


100%|███████████████████████████████████████| 4375/4375 [54:43<00:00,  1.33it/s]


Epoch 5/5, Loss: 0.10727240897216168, Accuracy: 96.59642857142858%


100%|███████████████████████████████████████| 1875/1875 [11:40<00:00,  2.68it/s]


Validation Loss: 0.33006446319743993, Accuracy: 92.01333333333334%, f1 score is 0.900457546125139


In [25]:
# train_model(model, train_loader, val_loader, criterion, optimizer, epochs=1)

In [26]:
# train_model(model, train_loader, val_loader, criterion, optimizer, epochs=1)

# 预测一下试试

In [27]:
# Predict a label for every test text.
# Put the model on the working device and in eval mode explicitly, so dropout
# is disabled here even if this cell runs without a preceding train_model call.
model.to(device)
model.eval()
total_predicted = []
with torch.no_grad():
    # oot_dataset holds a single tensor, so each batch is a one-element sequence.
    for (inputs,) in tqdm.tqdm(oot_loader):
        outputs = model(inputs.to(device))
        # Highest-scoring class per row, collected as plain Python ints.
        total_predicted += outputs.argmax(dim=1).cpu().tolist()

100%|███████████████████████████████████████| 1563/1563 [09:44<00:00,  2.67it/s]


In [28]:
# Convert every prediction to a plain Python int and show how many there are.
oot_rst = list(map(int, total_predicted))
len(oot_rst)

50000

In [29]:
# Save the predictions as a one-column ("label") CSV through the project helper;
# the helper echoes the destination path, as shown in the output below.
store_data_to_newbasepath(pd.DataFrame({"label": oot_rst}), "rst-20241228_1", fmt="csv")

df.to_csv("/Users/minkexiu/Downloads/GitHub/Tianchi_NLPNewsClassification/preprocessedData/rst-20241228_1.csv", index=False)
data saved.


'/Users/minkexiu/Downloads/GitHub/Tianchi_NLPNewsClassification/preprocessedData/rst-20241228_1.csv'