In [3]:
import os
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset,DataLoader
from transformers import AutoTokenizer
#加载bert模型的分词器
from transformers import BertModel
#用于加载bert模型
from pathlib import Path

In [4]:
#每批次训练的数据量
batch_size=16
text_max_length=128
#训练100轮
epochs=100
lr=3e-5
validation_ratio=0.1
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#50次打印一次
log_per_step=50
#定义path
datasets_dir=Path("./datasets")
os.makedirs(datasets_dir) if not os.path.exists(datasets_dir) else ''
model_dir=Path("./model/bert_checkpoints")
os.makedirs(model_dir) if not os.path.exists(model_dir) else ''
print("Device:",device)

Device: cuda


In [5]:
pd_train_data=pd.read_csv(datasets_dir.joinpath('train.csv'),encoding_errors='ignore')

In [6]:
pd_train_data['title']=pd_train_data['title'].fillna('')
pd_train_data['abstract']=pd_train_data['abstract'].fillna('')

In [7]:
test_data=pd.read_csv(datasets_dir.joinpath('test.csv'),encoding_errors='ignore')
test_data['title']=test_data['title'].fillna('')
test_data['abstract']=test_data['abstract'].fillna('')

In [8]:
pd_train_data['text'] = pd_train_data['title'].fillna('') + ' ' +  pd_train_data['author'].fillna('') + ' ' + pd_train_data['abstract'].fillna('')+ ' ' + pd_train_data['Keywords'].fillna('')
test_data['text'] = test_data['title'].fillna('') + ' ' +  test_data['author'].fillna('') + ' ' + test_data['abstract'].fillna('')+ ' ' + pd_train_data['Keywords'].fillna('')

In [9]:
pd_train_data[:5]

Unnamed: 0,uuid,title,author,abstract,Keywords,label,text
0,0,Accessible Visual Artworks for Blind and Visua...,"Quero, Luis Cavazos; Bartolome, Jorge Iranzo; ...",Despite the use of tactile graphics and audio ...,accessibility technology; multimodal interacti...,0,Accessible Visual Artworks for Blind and Visua...
1,1,Seizure Detection and Prediction by Parallel M...,"Li, Chenqi; Lammie, Corey; Dong, Xuening; Amir...","During the past two decades, epileptic seizure...",CNN; Seizure Detection; Seizure Prediction; EE...,1,Seizure Detection and Prediction by Parallel M...
2,2,Fast ScanNet: Fast and Dense Analysis of Multi...,"Lin, Huangjing; Chen, Hao; Graham, Simon; Dou,...",Lymph node metastasis is one of the most impor...,Histopathology image analysis; computational p...,1,Fast ScanNet: Fast and Dense Analysis of Multi...
3,3,Long-Term Effectiveness of Antiretroviral Ther...,"Huang, Peng; Tan, Jingguang; Ma, Wenzhe; Zheng...",In order to assess the effectiveness of the Ch...,HIV; ART; mortality; observational cohort stud...,0,Long-Term Effectiveness of Antiretroviral Ther...
4,4,Real-Time Facial Affective Computing on Mobile...,"Guo, Yuanyuan; Xia, Yifan; Wang, Jing; Yu, Hui...",Convolutional Neural Networks (CNNs) have beco...,facial affective computing; convolutional neur...,0,Real-Time Facial Affective Computing on Mobile...


In [10]:
test_data[:5]

Unnamed: 0,uuid,title,author,abstract,Keywords,text
0,0,Monitoring Changes in Intracellular Reactive O...,"Al-Hassan M Mustafa,Ramy Ashry,Oliver H Krämer...",Reactive oxygen species (ROS) are induced by s...,Flow cytometry; HDACi; Leukemia; ROS.,Monitoring Changes in Intracellular Reactive O...
1,1,Source Printer Classification Using Printer Sp...,"Joshi, Sharad; Khanna, Nitin",The knowledge of the source printer can help i...,Printer classification; local texture patterns...,Source Printer Classification Using Printer Sp...
2,2,Plasma-processed CoSn/RGO nanocomposite: A low...,"Omelianovych, Oleksii; Larina, Liudmila L.; Oh...",The high cost of state-of-the-art Pt counter e...,Plasma reduction; Bimatalic alloy CoxSn1-x; Re...,Plasma-processed CoSn/RGO nanocomposite: A low...
3,3,Immediate Antiretroviral Therapy: The Need for...,"Mgbako, Ofole; E. Sobieszczyk, Magdalena; Olen...","Immediate antiretroviral therapy (iART), defin...",HIV; antiretroviral therapy; rapid; health equity,Immediate Antiretroviral Therapy: The Need for...
4,4,Design and analysis of an ultra-low-power LC q...,"Lee, Kin Keung; Bryant, Carl; Tormanen, Markus...",This paper presents the design of an ultra-low...,Varactor; Spiral inductor; Quadrature generati...,Design and analysis of an ultra-low-power LC q...


In [11]:
#从训练集中随机采样测试集
#frac定义从训练集中采样的比例
validation_data=pd_train_data.sample(frac=validation_ratio)
#在train_data中取得index不在测试集的数据
train_data=pd_train_data[~pd_train_data.index.isin(validation_data.index)]

In [12]:
len(train_data),len(pd_train_data)

(5400, 6000)

In [13]:
#构建Dataset Dataset定义了数据集的内容，它相当于一个类似列表的数据结构，具有确定的长度，能够用索引获取数据集中的元素。
#一般只需要实现getitem和len方法，一个是通过index返回features和labels，一个是获取数据集长度
#与DataLoader交替进行来构建通往模型的数据管道
class MyDataset(Dataset):
    #从Dataset中继承
    #定义构造函数
    def __init__(self,mode='train'):
        super(MyDataset,self).__init__()
        self.mode=mode
        if mode=='train':
            self.dataset=train_data
        elif mode=='validation':
            self.dataset=validation_data
        elif mode=='test':
            self.dataset=test_data
        else:
            raise Exception("Unknown mode{}".format(mode))
    #定义从index取得数据
    def __getitem__(self,index):
        data=self.dataset.iloc[index]
        text=data['text']
        if self.mode=='test':
            label=data['uuid']
        else:
            label=data['label']
        return text,label
    def __len__(self):
        return len(self.dataset)

In [14]:
train_dataset=MyDataset('train')
validation_dataset=MyDataset('validation')

In [15]:
train_dataset.__getitem__(0)

('Accessible Visual Artworks for Blind and Visually Impaired People: Comparing a Multimodal Approach with Tactile Graphics Quero, Luis Cavazos; Bartolome, Jorge Iranzo; Cho, Jundong Despite the use of tactile graphics and audio guides, blind and visually impaired people still face challenges to experience and understand visual artworks independently at art exhibitions. Art museums and other art places are increasingly exploring the use of interactive guides to make their collections more accessible. In this work, we describe our approach to an interactive multimodal guide prototype that uses audio and tactile modalities to improve the autonomous access to information and experience of visual artworks. The prototype is composed of a touch-sensitive 2.5D artwork relief model that can be freely explored by touch. Users can access localized verbal descriptions and audio by performing touch gestures on the surface while listening to themed background music along. We present the design requi

In [16]:
validation_dataset.__getitem__(0)

('A Scalable Near-Memory Architecture for Training Deep Neural Networks on Large In-Memory Datasets Schuiki, Fabian; Schaffner, Michael; Gurkaynak, Frank K.; Benini, Luca Most investigations into near-memory hardware accelerators for deep neural networks have primarily focused on inference, while the potential of accelerating training has received relatively little attention so far. Based on an in-depth analysis of the key computational patterns in state-of-the-art gradient-based training methods, we propose an efficient near-memory acceleration engine called NTX that can be used to train state-of-the-art deep convolutional neural networks at scale. Our main contributions are: (i) a loose coupling of RISC-V cores and NTX co-processors reducing offloading overhead by 7 x over previously published results; (ii) an optimized IEEE 754 compliant data path for fast high-precision convolutions and gradient propagation; (iii) evaluation of near-memory computing with NTX embedded into residual 

In [17]:
# tokenizer=AutoTokenizer.from_pretrained("/root/pc_code/exp4TCKE/my_model/bert-base-uncased")
tokenizer=AutoTokenizer.from_pretrained("./my_model/bert-base-uncased/")

In [18]:
tokenizer("i am a good person")
#input_ids:存储着每个token在字典中的索引值
#token_type_ids:literally识别句子的界限，单句分类任务则默认全为0，俩个句子输入的时候，第一个句子全为0，另一个全为1
#attention_mask:界定注意力的范围，1表示被关注的位置，纳入下游任务计算

{'input_ids': [101, 1045, 2572, 1037, 2204, 2711, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [19]:
#构造Dataloader中的collate_fn实现对句子编码、填充、组装batch
def collate_fn(batch):
    """
    将一个batch的文本句子转成tensor，并组成batch。
    :param batch: 一个batch的句子，例如: [('推文', target), ('推文', target), ...]
    :return: 处理后的结果，例如：
             src: {'input_ids': tensor([[ 101, ..., 102, 0, 0, ...], ...]), 'attention_mask': tensor([[1, ..., 1, 0, ...], ...])}
             target：[1, 1, 0, ...]
    """
    text,label=zip(*batch)
    text,label=list(text),list(label)
    # src is to be sent to bert, so no special processing is required, just use the tokenizer result directly
    # padding='max_length' padding for insufficient length
    #parameter return_tensors='pt', returns the pytorch tensor type
    # truncation=True .cut if the length is too long
    src=tokenizer(text,padding='max_length',max_length=text_max_length,return_tensors='pt',truncation=True)
    return src,torch.LongTensor(label)

In [20]:
train_loader=DataLoader(train_dataset,batch_size=batch_size,shuffle=True,collate_fn=collate_fn)
validation_loader=DataLoader(validation_dataset,batch_size=batch_size,shuffle=False,collate_fn=collate_fn)

In [21]:
inputs,targets=next(iter(train_loader))#next() 返回迭代器的下一个项目。next() 函数要和生成迭代器的 iter() 函数一起使用。
print("inputs:",inputs)
print("targets:",targets)
#input_ids: stores the index value of each token in the dictionary
#token_type_ids: Literally identify the boundaries of sentences, and the single-sentence classification task defaults to all 0. When two sentences are input, the first sentence is all 0, and the other is all 1
#attention_mask: Define the scope of attention, 1 indicates the position of attention, included in the calculation of downstream tasks

inputs: {'input_ids': tensor([[  101,  4895,  6342,  ..., 12515,  1996,   102],
        [  101,  9349, 29477,  ...,  4958, 11921,   102],
        [  101,  2051,  2186,  ...,  1011,  1996,   102],
        ...,
        [  101,  3145,  5876,  ...,  1996, 18749,   102],
        [  101,  7170,  1011,  ...,  2836,  1997,   102],
        [  101, 12702, 21102,  ..., 11628,  3824,   102]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]])}
targets: tensor([0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1])


In [22]:
#Define the pre-training model, which consists of the bert basic model and the prediction layer
class MyModel(nn.Module): 

    def __init__(self):
        super(MyModel,self).__init__()
        #load bert model from local disk
        self.bert=BertModel.from_pretrained("./my_model/bert-base-uncased/")
        #the prediction layer 
        self.predictor=nn.Sequential(
            #The feature vector output by bert is generally 786 dimensions
            nn.Linear(768,256), 
            nn.ReLU(), 
            nn.Linear(256,1),
            nn.Sigmoid() 
        )
    def forward(self,src): 
        '''
        :param src: Text data after tokenizer
        
        Unpack the src direct sequence and pass it to bert, because bert and tokenizer are a set, so you can do this.
         Get the output of the encoder, and use the output of the front [CLS] as the input of the final linear layer
         **src Pass src as a dictionary of keywords
         Take the shape of last_hidden_state as (batch_size, max_length, hidden_size)=(16,128,786)
         [:, 0, :] slices the output, retaining the hidden state corresponding to the first token of the sample.
         Since the input to BERT is a series of tokens, the first token is usually a special "[CLS]" token, 
         which is calculated to contain the semantic information of the entire sentence.
         Extract the hidden state of the first token ([CLS] token) corresponding to each sample
        '''
        outputs=self.bert(**src).last_hidden_state[:,0,:]
        return self.predictor(outputs)

In [23]:
model=MyModel() 
model=model.to(device) 

In [24]:
#使用二元交叉熵和Adam优化器
criteria=nn.BCELoss()
opimizer=torch.optim.Adam(model.parameters(),lr=lr)

In [25]:
#由于inputs是字典类型，定义一个函数可以支持gpu运算
def to_device(dic_tensors): 
    result_tensor={}
    for key,value in dic_tensors.items():
        result_tensor[key]=value.to(device)
    return result_tensor

In [1]:
import wandb
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33m429750130[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [26]:
wandb.init(
    project="bert4text_categorization",
    config={
        "learning_rate":lr,
        "epochs":epochs, 
        "batch_size":batch_size,
    }
)

In [27]:
#定义验证方法，获取accuracy和loss
def validate():
    model.eval()
    total_loss=0.
    total_corrate=0
    for inputs,targets in validation_loader:
        inputs,targets=to_device(inputs),targets.to(device)
        outputs=model(inputs)
        loss=criteria(outputs.view(-1),targets.float())
        total_loss+=float(loss)
        correct_num=(((outputs>=0.5).float()*1).flatten()==targets).sum()
        total_corrate+=correct_num
    acc=total_corrate/len(validation_dataset)
    loss=total_loss/len(validation_dataset)
    # wandb.log({"acc":acc,"loss":loss})
    return acc,loss

In [28]:
# import gc
# gc.collect()
# torch.cuda.empty_cache()
os.environ["WANDB_SILENT"] = "true"
os.environ["WANDB_DISABLED"] = "true"
# memory=5800/240 #(GPU total memory/number of cores)
# os.environ["PYTORCH_CUDA_ALLOC_CONF"]="max_split_size_mb:{}".format(memory)

In [29]:
#training mode
model.train()
#刷新cuda缓存
if torch.cuda.is_available():
    torch.cuda.empty_cache()
total_loss=0.
step=0 
best_accuracy=0
for epoch in range(epochs):
    model.train()
    for i,(inputs,targets) in enumerate(train_loader):
        inputs,targets=to_device(inputs),targets.to(device)
        outputs=model(inputs)   
        loss=criteria(outputs.view(-1),targets.float())
        loss.backward()
        opimizer.step()
        opimizer.zero_grad()
        total_loss+=float(loss)
        step+=1
        if step%log_per_step==0:
            wandb.log({
                "epoch":epoch,
                "step":step,
                "total loss":total_loss
            })
            total_loss=0
        del inputs,targets
    accuracy,validation_loss=validate()
    wandb.log({
        "epoch":epoch,
        "accuracy":accuracy,
        "validation loss":validation_loss
    })
    torch.save(model,model_dir/f"model_{epoch}.pt")
    if accuracy>best_accuracy:
        torch.save(model,model_dir/f"model_best.pt")
        best_accuracy=accuracy
# 保存代码
arti_code=wandb.Artifact('ipynb',type='code')
arti_code.add_file('./bert4text_categorization.ipynb')
wandb.log_artifact(arti_code)
arti_model=wandb.Artifact('bert',type='model')
arti_model.add_file('./model/bert_checkpoints/model_best.pt')
wandb.log_artifact(arti_model)

wandb.finish()

0,1
accuracy,▆█▆▇██▆█▆▄▇▇▇▇████▄▄▃▃▃▃▅▇▆▄▅▃▃▅▅▅▅▅▅▅▅▁
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
total loss,▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▂
validation loss,▁▁▃▃▁▃▄▅▅▄▅▅▆▆▆▇▇▇▄▆▇▇██▇▄▃▆▇▅▃▅▅▅▆▇▇▇▇▇

0,1
accuracy,0.945
epoch,99.0
step,33800.0
total loss,0.46664
validation loss,0.02322


In [30]:
model=torch.load(model_dir/f"model_best.pt")
model=model.eval()

In [31]:
test_dataset=MyDataset('test')
test_loader=DataLoader(test_dataset,batch_size=batch_size,shuffle=False,collate_fn=collate_fn)

In [32]:
results=[]
for inputs,ids in test_loader:
    outputs=model(to_device(inputs))
    outputs=(outputs>=0.5).int().flatten().tolist()
    ids=ids.tolist()
    results=results+[(id,result) for result,id in zip(outputs,ids)]
    

In [33]:
test_lable=[pair[1] for pair in results]
test_data['label']=test_lable
test_data[['uuid','Keywords','label']].to_csv('submit_task3.csv',index=None)