In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] ="0"

from sklearn.model_selection import train_test_split
import csv
import torch
import numpy as np
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModelForMaskedLM
from torch import nn
from torch.optim import Adam
from tqdm import tqdm

from data_generation import generate_label

2022-12-28 22:40:32.178948: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
2022-12-28 22:40:36.289382: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2022-12-28 22:40:36.289734: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2022-12-28 22:40:38.386601: E tensorflow/stream_executor/cuda/cuda_driver.cc:328] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2022-12-28 22:40:38.386709: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (MSI): /proc/driver/nvidia/version does not exist
2022-12-28 22:40:42.045408: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-cr

In [2]:
sents_file = 'sents.csv'

sents = []

with open(sents_file,'r') as f:
    sents = [i for i in csv.reader(f)][1:]

In [3]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
model = AutoModelForMaskedLM.from_pretrained("bert-base-chinese")

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMaskedLM were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['cls.predictions.decoder.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
max_length = 128
p = 0.2
pos_tag = 'D'

class Dataset(torch.utils.data.Dataset):

    def __init__(self):

        pass
        # print('len:',len(self.texts))

    def build(self, sents):
        self.p = p
        self.max_length = max_length
        self.sents = sents
        self.labels = []
        self.texts = []

        for i in tqdm(self.sents):
            try:
                r = generate_label(i,pos_tag,p)
                if '1' in r[1] and len(r[0]) == self.max_length and len(r[1]) == self.max_length:
                    self.labels.append(self.label_padding(r[1], max_length))
                    self.texts.append(r[0])
            except:
                pass

    def load(self, path):
        self.labels = []
        self.texts = []

        with open(path,'r') as f:
            x = csv.reader(f)
            x = [i for i in x][1:]
            self.labels = [np.array([float(r.replace('[','').replace(']','')) for r in  i[1].split(',')]) for i in x]
        
        with open(path,'r') as f:
            x = csv.reader(f)
            self.texts = [tokenizer(i[0], 
                                    padding='max_length', max_length = max_length, truncation=True,
                                        return_tensors="pt") for i in x][1:]

            
        
        print('texts_len',len(self.texts))
        print('labels_len',len(self.labels))
        # for i in self.texts:
        #     print(i)
        # rint(self.texts[0])


    def save(self,path):
        with open(str(len(self.labels))+'_'+path+'_'+str(self.p)+'_'+str(self.max_length)+'.csv', 'w', newline='') as f:
            # 定義欄位
            fieldnames = ['texts', 'labels']

            # 將 dictionary 寫入 CSV 檔
            writer = csv.DictWriter(f, fieldnames=fieldnames)

            # 寫入第一列的欄位名稱
            writer.writeheader()

            for i in range(len(self.labels)):
                writer.writerow({'texts': self.texts[i], 'labels': self.labels[i]})
    

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def label_padding(self, label, lengh):
        while len(label) < lengh:
            label+=('0')
        label = [float(i)for i in label]
        return label

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

In [5]:
class BertClassifier(nn.Module):

    def __init__(self, dropout=0.5):

        super(BertClassifier, self).__init__()

        self.bert = model
        self.max_length = max_length
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, self.max_length)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        print(input_id)
        print(mask)
        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)

        return final_layer

In [18]:
def train(model, train_data, val_data, learning_rate, epochs):

    train_dataloader = torch.utils.data.DataLoader(train_data, batch_size=2, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_data, batch_size=2)
    print(0)
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr= learning_rate)

    if use_cuda:

            model = model.cuda()
            criterion = criterion.cuda()

    for epoch_num in range(epochs):

            total_acc_train = 0
            total_loss_train = 0

            for train_input, train_label in tqdm(train_dataloader):

                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)
                
                batch_loss = criterion(output, train_label.long())
                total_loss_train += batch_loss.item()
                
                acc = (output.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                model.zero_grad()
                batch_loss.backward()
                optimizer.step()
            
            total_acc_val = 0
            total_loss_val = 0

            with torch.no_grad():

                for val_input, val_label in val_dataloader:

                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id, mask)

                    batch_loss = criterion(output, val_label.long())
                    total_loss_val += batch_loss.item()
                    
                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc
            
            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} | Train Accuracy: {total_acc_train / len(train_data): .3f} | Val Loss: {total_loss_val / len(val_data): .3f} | Val Accuracy: {total_acc_val / len(val_data): .3f}')
                  

In [22]:
def evaluate(model, test_data):

    test = Dataset(test_data)

    test_dataloader = torch.utils.data.DataLoader(test, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:

        model = model.cuda()

    total_acc_test = 0
    with torch.no_grad():

        for test_input, test_label in test_dataloader:

              test_label = test_label.to(device)
              mask = test_input['attention_mask'].to(device)
              input_id = test_input['input_ids'].squeeze(1).to(device)

              output = model(input_id, mask)

              acc = (output.argmax(dim=1) == test_label).sum().item()
              total_acc_test += acc
    
    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')

In [8]:
sents = sents[:500]

In [9]:
df_train,df_val,_,__  = train_test_split(sents, sents, test_size=0.2,
                                                 shuffle=True, # 是否先打乱数据的顺序再划分
                                                 random_state=122)   # 控制将样本随机打乱
                            
print(len(df_train),df_train[0])

400 ['生活是活力四射的青年，总是给人躲闪不及的撞击感，令人耳目一新。']


In [10]:
for i in [0.5]:
    p = i
    train = Dataset()
    train.build(df_train)
    train.save('train')

    val = Dataset()
    val.build(df_val)
    val.save('val')

100%|██████████| 400/400 [01:07<00:00,  5.93it/s]
100%|██████████| 100/100 [00:15<00:00,  6.46it/s]


In [23]:
EPOCHS = 5
model = BertClassifier()
LR = 1e-6

train_ = Dataset()
train_.load('284_train_0.5_128.csv')
val_ = Dataset()
val_.load('80_val_0.5_128.csv')    
train(model, train_, val_, LR, EPOCHS)

texts_len 284
labels_len 284
texts_len 80
labels_len 80
0


  0%|          | 0/142 [00:00<?, ?it/s]

tensor([[ 101, 1762, 6134, 1995, 4638, 2607, 1405,  722,  678, 8024, 2769, 1469,
         6134, 1520, 1350, 1071, 2658, 2703, 4638, 6629, 6716, 8024, 2828,  671,
         1920, 1831,  691, 6205, 3021, 6822, 6134, 1995, 4638, 1309, 2147, 8024,
         6821, 4905, 2697, 6230, 8024, 6158,  782, 1461, 3341, 1600, 1343, 4638,
         2697, 6230, 8024,  671, 4157, 1036,  679, 1962, 8013,  102,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0],
        [ 101,  977, 3623, 4638, 6841, 6852, 6823, 3175, 8024,  679, 1386, 3698,
         4638, 2735, 2739, 3313, 3341, 8024, 6121, 




: 

In [12]:
# evaluate(model, df_test)