In [150]:
import pandas as pd
from torch.utils.data import DataLoader
import torch
from poprogress import simple_progress as simp

In [9]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [98]:
train_len = max(train["sent_id"] + 1)
test_len = max(test["sent_id"] + 1)
print("-"*30)
print("train_len: ",train_len)
print("test_len: ",test_len)
print("-"*30)

------------------------------
train_len:  17091
test_len:  4272
------------------------------


In [71]:
train.head()

Unnamed: 0,sent_id,word_id,word,tag
0,0,0,British,B-MISC
1,0,1,Foreign,O
2,0,2,Secretary,O
3,0,3,Malcolm,B-PER
4,0,4,Rifkind,I-PER


In [65]:
tag = list(train["tag"].unique())

In [85]:
tag_to_id = {k: v for v,k in enumerate(tag)}
id_to_tag = {k: v for k,v in enumerate(tag)}
print(tag_to_id)
print(id_to_tag)

{'B-MISC': 0, 'O': 1, 'B-PER': 2, 'I-PER': 3, 'B-LOC': 4, 'I-LOC': 5, 'B-ORG': 6, 'I-MISC': 7, 'I-ORG': 8}
{0: 'B-MISC', 1: 'O', 2: 'B-PER', 3: 'I-PER', 4: 'B-LOC', 5: 'I-LOC', 6: 'B-ORG', 7: 'I-MISC', 8: 'I-ORG'}


In [131]:
def get_sent(sent_id, data):
    sent = ''
    temp = data[data["sent_id"] == sent_id]["word"]
    for x in temp:
        sent = sent + str(x)
        if x != '.':
            sent = sent + ' '
    
    return sent
# for i in range(test_len):
#     print(get_sent(i, test))
print(get_sent(0, train))

British Foreign Secretary Malcolm Rifkind said on Tuesday that his government would only take action against a planned conference of Islamist groups in London if law was broken .


In [119]:
def get_tags(sent_id, data):
    return list(data[data["sent_id"] == sent_id]["tag"])

print(get_tags(0, train))

['B-MISC', 'O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O']


In [33]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

2024-01-16 02:16:53.015678: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [46]:
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
# model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

# nlp = pipeline("ner", model=model, tokenizer=tokenizer)
# example = "My name is Wolfgang and I live in Berlin"

# ner_results = nlp(example)
# print(ner_results)

In [120]:
example = get_sent(0, train)
example_tags = get_tags(0, train)

In [47]:
# from transformers import BertTokenizerFast
# tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')
text_tokenized = tokenizer(example, padding='max_length',
                           max_length=512, truncation=True,
                           return_tensors="pt")

In [48]:
print(text_tokenized)

{'input_ids': tensor([[  101,  1418,  4201,  2909,  8491,   155,  8914, 17215,  1163,  1113,
          9667,  1115,  1117,  1433,  1156,  1178,  1321,  2168,  1222,   170,
          2919,  3511,  1104,  6489,  1776,  2114,  1107,  1498,  1191,  1644,
          1108,  3088,   119,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

In [49]:
print(tokenizer.decode(text_tokenized.input_ids[0]))

[CLS] British Foreign Secretary Malcolm Rifkind said on Tuesday that his government would only take action against a planned conference of Islamist groups in London if law was broken. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] 

In [50]:
print(tokenizer.convert_ids_to_tokens(text_tokenized["input_ids"][0]))

['[CLS]', 'British', 'Foreign', 'Secretary', 'Malcolm', 'R', '##if', '##kind', 'said', 'on', 'Tuesday', 'that', 'his', 'government', 'would', 'only', 'take', 'action', 'against', 'a', 'planned', 'conference', 'of', 'Islam', '##ist', 'groups', 'in', 'London', 'if', 'law', 'was', 'broken', '.', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PA

In [51]:
word_ids = text_tokenized.word_ids()
print(tokenizer.convert_ids_to_tokens(text_tokenized["input_ids"][0]))
print(word_ids)

['[CLS]', 'British', 'Foreign', 'Secretary', 'Malcolm', 'R', '##if', '##kind', 'said', 'on', 'Tuesday', 'that', 'his', 'government', 'would', 'only', 'take', 'action', 'against', 'a', 'planned', 'conference', 'of', 'Islam', '##ist', 'groups', 'in', 'London', 'if', 'law', 'was', 'broken', '.', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PA

In [92]:
def align_word_id(text_tokenized, tags):
    
    word_ids = text_tokenized.word_ids()
    tags_list = []
    pre_word_id = None
    
    for word_id in word_ids:
        if word_id is None:
            tags_list.append(-99)
        elif word_id != pre_word_id:
            tags_list.append(tag_to_id[tags[word_id]])
        else:
            tags_list.append(-99)
            
        pre_word_id = word_id    
    return tags_list

In [121]:
new_label = align_word_id(text_tokenized, example_tags)
print(new_label)
print(tokenizer.convert_ids_to_tokens(text_tokenized["input_ids"][0]))

[-99, 0, 1, 1, 2, 3, -99, -99, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, -99, 1, 1, 4, 1, 1, 1, 1, 1, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -99, -9

In [149]:
class DataSequence(torch.utils.data.Dataset):
    def __init__(self, df):
        
        self.tokenized_text = []
        self.tags = []
        for i in simp(range(max(df["sent_id"] + 1))):
            sent = get_sent(i, df)
            tags = get_tags(i, df)
            temp = tokenizer(sent, padding='max_length', max_length=512,
                                truncation=True, return_tensors="pt")
            self.tokenized_text.append([temp])
            self.tags.append([tags])
            
    def __len__(self):
        return len(self.tags)
        
    def get_batch_text(self, id):
        return self.tokenized_text[id]
    
    def get_bach_tag(self, id):
        return self.tags[id]
        
    def __getitem__(self, id):
        batch_text = self.get_batch_text(id)
        batch_tag = self.get_bach_tag(id)
        return batch_text, batch_tag

In [139]:
from transformers import BertForTokenClassification
class BertModel(torch.nn.Module):
    def __init__(self):
        super(BertModel, self).__init__()
        self.bert = BertForTokenClassification.from_pretrained('bert-base-cased',
                                     num_labels=len(tag_to_id))

    def forward(self, input_id, mask, label):
        output = self.bert(input_ids=input_id, attention_mask=mask, labels=label, return_dict=False)
        return output

In [182]:
def train_loop(model, df_train, df_val):
      # 开始训练循环
      best_acc = 0
      best_loss = 1000

      for epoch_num in range(EPOCHS):
            print("-- epoch: ", epoch_num)
            total_acc_train = 0
            total_loss_train = 0
            # 训练模型
            model.train()
            # # 按批量循环训练模型
            ii = 0
            for train_data, train_label in train_dataloader:
                  print("ii: ",ii)
                  ii = ii + 1
                  # 从train_data中获取mask和input_id
            #       train_label = train_label[0].to(device)
            #       mask = train_data['attention_mask'][0].to(device)
            #       input_id = train_data['input_ids'][0].to(device)
            #       # 梯度清零！！
            #       optimizer.zero_grad()
            #       # 输入模型训练结果：损失及分类概率
            #       loss, logits = model(input_id, mask, train_label)
            #       # 过滤掉特殊token及padding的token
            #       logits_clean = logits[0][train_label != -99]
            #       label_clean = train_label[train_label != -99]
            #       # 获取最大概率值
            #       predictions = logits_clean.argmax(dim=1)
            #       # 计算准确率
            #       acc = (predictions == label_clean).float().mean()
            #       total_acc_train += acc
            #       total_loss_train += loss.item()
            #       # 反向传递
            #       loss.backward()
            #       # 参数更新
            #       optimizer.step()
                  
            # # 模型评估
            # model.eval()

            # total_acc_val = 0
            # total_loss_val = 0
            # for val_data, val_label in val_dataloader:
            # # 批量获取验证数据
            #       val_label = val_label[0].to(device)
            #       mask = val_data['attention_mask'][0].to(device)
            #       input_id = val_data['input_ids'][0].to(device)
            #       # 输出模型预测结果
            #       loss, logits = model(input_id, mask, val_label)
            #       # 清楚无效token对应的结果
            #       logits_clean = logits[0][val_label != -100]
            #       label_clean = val_label[val_label != -100]
            #       # 获取概率值最大的预测
            #       predictions = logits_clean.argmax(dim=1)          
            #       # 计算精度
            #       acc = (predictions == label_clean).float().mean()
            #       total_acc_val += acc
            #       total_loss_val += loss.item()

            # # val_accuracy = total_acc_val / len(df_val)
            # # val_loss = total_loss_val / len(df_val)

            # print(
            # f'''Epochs: {epoch_num + 1} | 
            #     Loss: {total_loss_train / len(df_train): .3f} | 
            #     Accuracy: {total_acc_train / len(df_train): .3f} |
            #     Val_Loss: {total_loss_val / len(df_val): .3f} | 
            #     Accuracy: {total_acc_val / len(df_val): .3f}''')

In [151]:
LEARNING_RATE = 1e-2
EPOCHS = 5
model = BertModel()

# 判断是否使用GPU，如果有，尽量使用，可以加快训练速度
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
if use_cuda:
    model = model.cuda()
    
# 定义优化器
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)

# 定义训练和验证集数据
train_dataset = DataSequence(train)
val_dataset = DataSequence(test)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 17091/17091 [02:24<00:00, 118.57it/s]
100%|██████████| 17091/17091 [02:12<00:00, 128.99it/s]


In [197]:
# 批量获取训练和验证集数据
train_dataloader = DataLoader(train_dataset, num_workers=1, batch_size=1, shuffle=True)
val_dataloader = DataLoader(val_dataset, num_workers=1, batch_size=1)


In [196]:

for x,y in val_dataloader:
    print(x)
    print(y)

    break

0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.


KeyboardInterrupt: 

In [183]:
train_loop(model, train, test)


-- epoch:  0


0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.


KeyboardInterrupt: 