In [1]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn.functional as F
from transformers import BertTokenizer, BertConfig,AdamW, BertForSequenceClassification,get_linear_schedule_with_warmup


import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from imblearn.under_sampling import RandomUnderSampler

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
# Import and evaluate each test batch using Matthew's correlation coefficient
from sklearn.metrics import accuracy_score,matthews_corrcoef

from tqdm import tqdm, trange,tnrange,tqdm_notebook
import random
import os
import io
%matplotlib inline

In [2]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

### **Model for Whailing Wall Identification**

##### **Load Training Set**

In [3]:
sample_Whailing_Wall = pd.read_csv('LiwenliangSample_Whailing_Wall.csv')
sample_Whailing_Wall['Whailing Wall'].unique()

array([0, 1], dtype=int64)

In [4]:
# define undersampling strategy
undersample = RandomUnderSampler(sampling_strategy='majority', random_state=42)

# fit and apply the transform
X_under, y_under = undersample.fit_resample(sample_Whailing_Wall['评论内容'].to_frame(), sample_Whailing_Wall['Whailing Wall'])
# summarize class distribution
print("After undersampling: \n", y_under.value_counts())

df_whailing_wall = pd.concat([X_under,y_under],axis=1)
df_whailing_wall.rename(columns={'评论内容':'sentence','Whailing Wall':'label'},inplace=True)

After undersampling: 
 0    1881
1    1881
Name: Whailing Wall, dtype: int64


In [5]:
df_whailing_wall

Unnamed: 0,sentence,label
0,埋头事业便令大家好过,0
1,这个可怕的2020要过去了！,0
2,2021年，要好好照顾自己，要认认真真 对待每一分钟，每一天~,0
3,有的人真的不配当母亲,0
4,又哭了看来我要哭够了心情才好回来,0
...,...,...
3757,你吹哨了，但有些就是听不入耳,1
3758,李文亮元旦快乐 愿你在天国安好 ！,1
3759,致敬❤️,1
3760,李医生，一百天了。祝您安好。我今天早睡！晚安,1


In [6]:
sentences = df_whailing_wall.sentence.values
labels = df_whailing_wall.label.values

##### **Adding Codebook**

In [7]:
codebook = pd.read_csv('codebook.csv')
codebook

Unnamed: 0,Whaling Wall,Tree Hole,Not Related
0,天堂,烦,
1,李医生,今天,图片评论
2,你,自己,图片评论 网页链接
3,您,天气,。
4,英雄,明天,看看
...,...,...,...
119,,论文,
120,,保佑,
121,,备考,
122,,作业,


In [8]:
codebook_whaling_wall = pd.DataFrame(data=codebook['Whaling Wall'].unique(),columns=['code'])
codebook_whaling_wall.dropna(inplace=True)
codebook_whaling_wall

Unnamed: 0,code
0,天堂
1,李医生
2,你
3,您
4,英雄
...,...
86,学长
87,李兄
88,民族英雄
89,敬礼


In [9]:
new_tokens = []
for i in (codebook_whaling_wall['code']):
    new_tokens.append(i)
print(new_tokens)

['天堂', '李医生', '你', '您', '英雄', '致敬', '谢谢', '感谢', '文亮', '亮哥', '李哥', '一路走好', '安好', 'RIP', '谢谢您', '🙏🏻', '🙏', '抢救', '永远记得', '铭记', '[小白菊]', '安息', '李老师', '烈士', '在那边', '💐', '[蠟燭]', '吹哨人', '走好', '生日快乐', '还好吗', '老李', '恭喜', '不会忘记', '永垂不朽', '好久不见', '文亮老哥', '庆余年2', '祈祷', '再见', '您家人', 'R.I.P', '加油', '调查', '迟到的正义', '李大夫', '遗忘', '怀念', '训诫书', '天使', '放心', '先生', '千古', '庆余年', '奇迹', '永远', '青年', '公道', '没有忘记', '平安回来', '辟谣', '🕯️', '勇敢的人', '你的家人', '您的家人', '保重', '想到你', '好久没来', '回来', '安', '李醫師', '师兄', '看看你', '表彰', '祝好', '2.6', '李sir', '伟大', '亮亮', '平行世界', '悼念', '记得你', '当爸爸', '等你', '活过来', '辛苦了', '学长', '李兄', '民族英雄', '敬礼', '缅怀']


##### **Loading Model**

In [10]:
from transformers import BertTokenizer

# 加载 BERT 分词器

model_name = 'hfl/chinese-roberta-wwm-ext'

tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=True)

In [11]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

# 加载 BertForSequenceClassification, 预训练 BERT 模型 + 顶层的线性分类层 
model = BertForSequenceClassification.from_pretrained(
    model_name, # 小写的 12 层预训练模型
    num_labels = 2, # 分类数 --2 表示二分类
                    # 你可以改变这个数字，用于多分类任务  
    output_attentions = False, # 模型是否返回 attentions weights.
    output_hidden_states = False, # 模型是否返回所有隐层状态.
)

# 在 gpu 中运行该模型
model.cuda()

Some weights of the model checkpoint at hfl/chinese-roberta-wwm-ext were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model che

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [12]:
num_added_toks = tokenizer.add_tokens(new_tokens)
model.resize_token_embeddings(len(tokenizer))

Embedding(21216, 768)

#### **Validate the outcome of codebooks inplant**

In [13]:
# 输出原始句子
print(' Original: ', sentences[3760])

# 将分词后的内容输出
print('Tokenized: ', tokenizer.tokenize(sentences[3760]))

# 将每个词映射到词典下标
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences[3760])))

 Original:  李医生，一百天了。祝您安好。我今天早睡！晚安
Tokenized:  ['李医生', '，', '一', '百', '天', '了', '。', '祝', '您', '安好', '。', '我', '今', '天', '早', '睡', '！', '晚', '安']
Token IDs:  [21129, 8024, 671, 4636, 1921, 749, 511, 4867, 2644, 21138, 511, 2769, 791, 1921, 3193, 4717, 8013, 3241, 2128]


In [14]:
max_len = 0
for sent in sentences:

    input_ids = tokenizer.encode(sent, add_special_tokens=True)
    max_len = max(max_len, len(input_ids))

print('Max sentence length: ', max_len)

Max sentence length:  142


In [15]:
MAX_LEN = 142
# 将数据集分完词后存储到列表中
input_ids = []
attention_masks = []

for sent in sentences:
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # 输入文本
                        add_special_tokens = True, # 添加 '[CLS]' 和 '[SEP]'
                        max_length = MAX_LEN,           # 填充 & 截断长度
                        padding = 'max_length',
                        return_attention_mask = True,   # 返回 attn. masks.
                        return_tensors = 'pt',     # 返回 pytorch tensors 格式的数据
                   )
    
    # 将编码后的文本加入到列表  
    input_ids.append(encoded_dict['input_ids'])
    
    # 将文本的 attention mask 也加入到 attention_masks 列表
    attention_masks.append(encoded_dict['attention_mask'])

# 将列表转换为 tensor
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

# 输出第 1 行文本的原始和编码后的信息
print('Original: ', sentences[3760])
print('Token IDs:', input_ids[3760])

Original:  李医生，一百天了。祝您安好。我今天早睡！晚安
Token IDs: tensor([  101, 21129,  8024,   671,  4636,  1921,   749,   511,  4867,  2644,
        21138,   511,  2769,   791,  1921,  3193,  4717,  8013,  3241,  2128,
          102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     

In [16]:
from torch.utils.data import TensorDataset, random_split

# 将输入数据合并为 TensorDataset 对象
dataset = TensorDataset(input_ids, attention_masks, labels)

# 计算训练集和验证集大小
train_size = int(0.95 * len(dataset))
val_size = len(dataset) - train_size

# 按照数据大小随机拆分训练集和测试集
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

3,573 training samples
  189 validation samples


In [17]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# 在 fine-tune 的训练中，BERT 作者建议小批量大小设为 16 或 32
batch_size = 16

# 为训练和验证集创建 Dataloader，对训练样本随机洗牌
train_dataloader = DataLoader(
            train_dataset,  # 训练样本
            sampler = RandomSampler(train_dataset), # 随机小批量
            batch_size = batch_size # 以小批量进行训练
        )

# 验证集不需要随机化，这里顺序读取就好
validation_dataloader = DataLoader(
            val_dataset, # 验证样本
            sampler = SequentialSampler(val_dataset), # 顺序选取小批量
            batch_size = batch_size 
        )

In [18]:
# 我认为 'W' 代表 '权重衰减修复"
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8
                )

from transformers import get_linear_schedule_with_warmup

# 训练 epochs。 BERT 作者建议在 2 和 4 之间，设大了容易过拟合 
epochs = 3

# 总的训练样本数
total_steps = len(train_dataloader) * epochs

# 创建学习率调度器
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, 
                                            num_training_steps = total_steps)




In [19]:
import numpy as np

# 根据预测结果和标签数据来计算准确率
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [20]:
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # 四舍五入到最近的秒
    elapsed_rounded = int(round((elapsed)))
    
    # 格式化为 hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [21]:
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()

# For each epoch...
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    # Perform one full pass over the training set.
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    # Measure how long the training epoch takes.
    t0 = time.time()
    total_train_loss = 0
    model.train()
    for step, batch in enumerate(train_dataloader):
        # Unpack this training batch from our dataloader. 
        #
        # As we unpack the batch, we'll also copy each tensor to the device using the 
        # `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        optimizer.zero_grad()
        output = model(b_input_ids, 
                             token_type_ids=None, 
                             attention_mask=b_input_mask, 
                             labels=b_labels)        
        loss = output.loss
        total_train_loss += loss.item()
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()
        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)            
    
    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.
    print("")
    print("Running Validation...")
    t0 = time.time()
    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()
    # Tracking variables 
    total_eval_accuracy = 0
    best_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0
    # Evaluate data for one epoch
    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():        
            output= model(b_input_ids, 
                                   token_type_ids=None, 
                                   attention_mask=b_input_mask,
                                   labels=b_labels)
        loss = output.loss
        total_eval_loss += loss.item()
        # Move logits and labels to CPU if we are using GPU
        logits = output.logits
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Calculate the accuracy for this batch of test sentences, and
        # accumulate it over all batches.
        total_eval_accuracy += flat_accuracy(logits, label_ids)
    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)
    if avg_val_accuracy > best_eval_accuracy:
        torch.save(model, 'bert_model')
        best_eval_accuracy = avg_val_accuracy
    #print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    #print("  Validation took: {:}".format(validation_time))
    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )
print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...

  Average training loss: 0.37
  Training epcoh took: 0:02:06

Running Validation...
  Accuracy: 0.88

Training...

  Average training loss: 0.24
  Training epcoh took: 0:02:07

Running Validation...
  Accuracy: 0.89

Training...

  Average training loss: 0.17
  Training epcoh took: 0:02:07

Running Validation...
  Accuracy: 0.89

Training complete!
Total training took 0:06:27 (h:mm:ss)


In [22]:
trainingstatsdf = pd.DataFrame(training_stats)
trainingstatsdf

Unnamed: 0,epoch,Training Loss,Valid. Loss,Valid. Accur.,Training Time,Validation Time
0,1,0.370054,0.289748,0.884215,0:02:06,0:00:02
1,2,0.240789,0.276768,0.889423,0:02:07,0:00:02
2,3,0.168861,0.329575,0.889423,0:02:07,0:00:02


In [23]:
import plotly.graph_objs as go
from plotly.subplots import make_subplots

fig = make_subplots(rows=1, cols=2, subplot_titles=('Training and Validation Loss', 'Validation Accuracy'))

fig.add_trace(
    go.Scatter(x=trainingstatsdf['epoch'], y=trainingstatsdf['Training Loss'], mode='lines+markers', name='Training Loss'), row=1, col=1)

fig.add_trace(
    go.Scatter(x=trainingstatsdf['epoch'], y=trainingstatsdf['Valid. Loss'], mode='lines+markers', name='Valid. Loss'), row=1, col=1)

fig.add_trace(
    go.Scatter(x=trainingstatsdf['epoch'], y=trainingstatsdf['Valid. Accur.'], mode='lines+markers', name='Valid. Accur.'), row=1, col=2)

fig.update_layout(title='Training history of Whaling Wall Model', xaxis_title='Epoch', yaxis_title='Loss')

fig.show()


In [25]:
import os

# 模型存储到的路径
output_dir = './model_save/whaling_wall'

# 目录不存在则创建
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

print("Saving model to %s" % output_dir)

# 使用 `save_pretrained()` 来保存已训练的模型，模型配置和分词器
# 它们后续可以通过 `from_pretrained()` 加载
model_to_save = model.module if hasattr(model, 'module') else model  # 考虑到分布式/并行（distributed/parallel）训练
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Good practice: save your training arguments together with the trained model
#torch.save(args, os.path.join(output_dir, 'training_args.bin'))

Saving model to ./model_save/whaling_wall


('./model_save/whaling_wall\\tokenizer_config.json',
 './model_save/whaling_wall\\special_tokens_map.json',
 './model_save/whaling_wall\\vocab.txt',
 './model_save/whaling_wall\\added_tokens.json')

Evaluate the Bert Model

In [26]:
df = pd.read_csv("LiwenliangSampleAll.csv",names=['index', 'sentence', '0', 'label','2','3'])
df = df.drop(['index','0','2','3'],axis=1)
df = df.drop(df.index[0])
df['label'] = df['label'].astype(int)

In [27]:
import pandas as pd

# 加载数据集

# 打印数据集大小
print('Number of test sentences: {:,}\n'.format(df.shape[0]))
# 将数据集转换为列表
sentences = df.sentence.values
labels = df.label.values

# 分词、填充或截断
input_ids = []
attention_masks = []
for sent in sentences:
    encoded_dict = tokenizer.encode_plus(
                        sent,                      
                        add_special_tokens = True, 
                        max_length = 128,           
                        pad_to_max_length = True,
                        return_attention_mask = True,   
                        return_tensors = 'pt',     
                   )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

batch_size = 16

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Number of test sentences: 6,024




The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).



In [28]:
prediction_data = TensorDataset(input_ids, attention_masks, labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

In [29]:
print('Predicting labels for {:,} test sentences...'.format(len(input_ids)))
# 依然是评估模式
model.eval()

# Tracking variables 
predictions , true_labels = [], []

# 预测
for batch in prediction_dataloader:
  # 将数据加载到 gpu 中
  batch = tuple(t.to(device) for t in batch)
  b_input_ids, b_input_mask, b_labels = batch
  
  # 不需要计算梯度
  with torch.no_grad():
      # 前向传播，获取预测结果
      outputs = model(b_input_ids, token_type_ids=None, 
                      attention_mask=b_input_mask)

  logits = outputs[0]

  # 将结果加载到 cpu 中
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()
  
  # 存储预测结果和 labels
  predictions.append(logits)
  true_labels.append(label_ids)

print('    DONE.')

Predicting labels for 6,024 test sentences...
    DONE.


In [30]:
dataframe_logits_lables = pd.DataFrame({'logits':predictions,'labels':true_labels})

from sklearn.metrics import matthews_corrcoef

matthews_set = []

# 计算每个 batch 的 MCC
print('Calculating Matthews Corr. Coef. for each batch...')

# For each input batch...
for i in range(len(true_labels)):
  pred_labels_i = np.argmax(predictions[i], axis=1).flatten()
  
  # 计算该 batch 的 MCC  
  matthews = matthews_corrcoef(true_labels[i], pred_labels_i)                
  matthews_set.append(matthews)

dataframe1 = pd.DataFrame({'MCC':matthews_set})

Calculating Matthews Corr. Coef. for each batch...
0.83


In [40]:
print('Average Matthews Corr. Coef Score of model is: ',dataframe1['MCC'].mean().round(5))

Average Matthews Corr. Coef Score of model is:  0.8338


In [31]:
# 创建柱状图来显示每个 batch 的 MCC 分数
import plotly.express as px
bar = px.bar(x=list(range(len(matthews_set))), y=matthews_set, labels={'x':'Batch', 'y':'MCC'}, title='MCC Score per Batch')
bar.show()

### **Prediction Practical Test**

In [32]:
from transformers import TextClassificationPipeline
device = 'cuda:0'
model = model.to(device)
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer,device=0)

In [33]:
label = []
sentence = []
rate = []
for i in range(1,len(df['sentence'])):
    pipelist = pipe(df['sentence'][i])
    label.append(pipelist[0]['label'])
    sentence.append(df['sentence'][i])
    rate.append(pipelist[0]['score'])


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset



In [34]:
dataframe2 = pd.DataFrame({'label':label,'sentence':sentence,'rate':rate})
display(dataframe2)

Unnamed: 0,label,sentence,rate
0,LABEL_0,埋头事业便令大家好过,0.995447
1,LABEL_1,致敬,0.992313
2,LABEL_1,你好李医生 几天没来看你啦 晚安💤,0.995290
3,LABEL_1,疫情期间唯一一个让我牵肠挂肚祈福的人，李医生，人民不会忘记你，勋章给你，全世界的幸福给你的家人。,0.987428
4,LABEL_1,李医生坚持住，加油啊🙏,0.990866
...,...,...,...
6018,LABEL_1,缅怀,0.991868
6019,LABEL_0,今天真是在家足足躺了一天,0.993798
6020,LABEL_1,生日快乐🎂,0.993732
6021,LABEL_1,晚安，李医生,0.996489


In [35]:
dataframe2.describe()

Unnamed: 0,rate
count,6023.0
mean,0.952533
std,0.098038
min,0.500302
25%,0.974529
50%,0.992103
75%,0.995399
max,0.997876
