In [None]:
# Enable IPython's autoreload extension in mode 2 so edited modules are
# re-imported automatically instead of requiring a kernel restart.
%load_ext autoreload
%autoreload 2

# Mount Google Drive inside the Colab VM at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
% cd /content/gdrive/My\ Drive/Colab\ Notebooks/nlp_task/bert_cla
! pwd
! ls

/content/gdrive/My Drive/Colab Notebooks/nlp_task/bert_cla
/content/gdrive/My Drive/Colab Notebooks/nlp_task/bert_cla
albert_large	  __init__.py		  run_multi_label_linear.py
albert_tiny	  main.py		  runs
apex		  models		  train.tsv
base_runner.py	  outputs		  val.tsv
base_utils.py	  __pycache__		  wandb
cache_dir	  run.ipynb		  xbert_cnn_runner.py
global_config.py  run_multi_class_cnn.py  xbert_multi_label_linear_runner.py


In [None]:
# Install extra packages into the runtime.
# NOTE(review): versions are not pinned, so different sessions may get
# different releases — consider pinning for reproducibility.
!pip install wandb tensorboardX transformers

In [None]:
# Build and install the local `apex` checkout with its C++ and CUDA extensions,
# then return to the project directory. The clone step is commented out, so the
# `apex` directory must already exist in the working directory.
# !git clone https://github.com/NVIDIA/apex
%cd apex
!python setup.py install --cpp_ext --cuda_ext
%cd ..

In [None]:
# Print the GPU status reported by nvidia-smi for the current runtime.
!/opt/bin/nvidia-smi

In [None]:
# Run the project's main.py in a separate Python process.
# NOTE(review): presumably this launches training — confirm against main.py.
!python main.py

In [None]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import OneHotEncoder
from xbert_multi_label_linear_runner import MultiBinaryClaRunner


def process_tsv(file):
    """Load a label/text TSV file and replace each label string with a one-hot vector.

    Adapt this function to the way the data is stored.

    The file is read with ``pandas.read_table`` into two columns, ``label`` and
    ``text``. Each label string is split on ``'@'`` into a fixed number of
    fields; ``OneHotEncoder`` encodes every field position separately and the
    per-position encodings are concatenated into one vector per row.

    NOTE(review): the encoder is fitted on the file being loaded, so the width
    and column order of the vectors depend on the label values present in that
    file — confirm they match the encoding used at training time.

    Args:
        file: Path or buffer accepted by ``pandas.read_table``.

    Returns:
        pandas.DataFrame with columns ``label`` (one 1-D numpy array per row)
        and ``text``.
    """
    data = pd.read_table(file, names=['label', 'text'], encoding='utf-8')
    label = data.label.apply(lambda x: x.split('@'))
    label = np.array(label.to_list())
    enc = OneHotEncoder()
    # '0_-1', '0_-2', '0_0', '0_1' => 1000, 0100, 0010, 0001
    label_one_hot = enc.fit_transform(label).toarray()
    # Replace the column in a single assignment. Writing row by row through
    # `data.iloc[i]['label'] = ...` is chained indexing: it assigns into a
    # temporary row object and is not guaranteed to modify `data` itself.
    data['label'] = list(label_one_hot)
    return data


def main():
    """Check the training result: evaluate a saved checkpoint on ``val.tsv``.

    Prints the metric dictionary and the raw model outputs returned by
    ``eval_model``, then returns the model outputs.
    """
    runner_args = {
        "reprocess_input_data": False,
        "overwrite_output_dir": False,
        "num_train_epochs": 5,
        "use_cached_eval_features": True,
    }

    eval_df = process_tsv('val.tsv')
    model = MultiBinaryClaRunner(
        'albert',
        './outputs/checkpoint-12000',
        num_labels=80,
        use_cuda=False,
        args=runner_args,
    )

    # eval_model returns three values; the third (wrong predictions) is unused here.
    result, model_outputs, _wrong_predictions = model.eval_model(eval_df)
    for item in (result, model_outputs):
        print(item)
    return model_outputs

In [None]:
# Run the evaluation and keep the returned model outputs for the analysis cells below.
model_outputs = main()

INFO:transformers.configuration_utils:loading configuration file ./outputs/checkpoint-12000/config.json
INFO:transformers.configuration_utils:Model config AlbertConfig {
  "architectures": [
    "AlbertForMultiBinaryLabelSeqClassification"
  ],
  "attention_probs_dropout_prob": 0.0,
  "bos_token_id": 2,
  "classifier_dropout_prob": 0.1,
  "directionality": "bidi",
  "embedding_size": 128,
  "eos_token_id": 3,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 312,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23":

HBox(children=(FloatProgress(value=0.0, max=1875.0), HTML(value='')))


{'LRAP': 0.8503416726137365, 'eval_loss': 0.2725202091852824}
[[0.00418422 0.9829518  0.00285136 ... 0.7647685  0.01880348 0.15971665]
 [0.05347089 0.3439139  0.06759455 ... 0.47151136 0.03505317 0.62766325]
 [0.0066029  0.97901857 0.00484641 ... 0.74131244 0.04904596 0.09119504]
 ...
 [0.00493792 0.9439298  0.00663487 ... 0.34487233 0.01076601 0.67223877]
 [0.00395735 0.9822786  0.00330733 ... 0.6269512  0.01819935 0.32523066]
 [0.03622032 0.12109882 0.04457749 ... 0.4758353  0.01685664 0.529969  ]]


In [None]:
# Only the last expression of a cell is displayed, so the type() result on the
# first line is discarded and just the shape is shown.
type(model_outputs)
model_outputs.shape

(15000, 80)

In [None]:
# Split the output columns into 20 equal-width groups.
m = np.hsplit(model_outputs, 20)  # predictions for each kind of label
print(m[0].shape)

In [None]:
# Convert every group's scores into hard predictions: for each row, a length-4
# one-hot vector marking the highest-scoring column of that group.
# Afterwards y_pre[g][r] is the prediction for group g, row r.
y_pre = []
for lab in m:
    lab_pre = []
    for item in lab:
        # Index of the largest score in this row of the group.
        i = np.argmax(item)
        new = np.zeros((4))
        new[i] = 1
        lab_pre.append(new)
    y_pre.append(lab_pre)

# Sanity check: number of groups, then number of rows in the first group.
print(len(y_pre))
print(len(y_pre[0]))

20


In [None]:
# Reload the validation frame; its label column is expected to hold one
# encoded vector per row.
eval_df = process_tsv('val.tsv')
y = eval_df.label.to_numpy()
# Cut every row's vector into 20 equal pieces.
y_split = [np.split(ar, 20) for ar in y]
# Transpose from per-row to per-group order so y_zip[g][r] lines up with y_pre[g][r].
y_zip = list(zip(*y_split))

In [None]:
# Inspect the pieces of the first row's label vector.
print(y_split[0])

In [None]:
def cal_f1(label_num,predicted,truth):
    """Compute macro-averaged F1, precision and recall over binary indicator columns.

    A cell counts as positive when it equals 1. Row ``r`` of ``predicted`` is
    compared with row ``r`` of ``truth`` across the first ``label_num`` columns.

    Args:
        label_num: Number of columns scored in every row.
        predicted: Sequence of rows of predicted 0/1 indicators.
        truth: Sequence of rows of reference 0/1 indicators, indexed like
            ``predicted``.

    Returns:
        Tuple ``(f1, precision, recall)``; each value is the unweighted mean of
        the per-column metric. A column without true positives scores 1.0 on
        all three metrics if it also has no false positives and no false
        negatives, and 0.0 otherwise.
    """
    true_pos = [0] * label_num
    false_pos = [0] * label_num
    false_neg = [0] * label_num

    # Tally the confusion counts column by column.
    for row, guess in enumerate(predicted):
        actual = truth[row]
        for col in range(label_num):
            guessed_pos = guess[col] == 1
            really_pos = actual[col] == 1
            if guessed_pos and really_pos:
                true_pos[col] += 1
            elif guessed_pos:
                false_pos[col] += 1
            elif really_pos:
                false_neg[col] += 1
            # Both negative is a true negative, which none of the metrics use.

    precisions, recalls, f1_scores = [], [], []
    for tp, fp, fn in zip(true_pos, false_pos, false_neg):
        if tp != 0:
            prec = tp / (tp + fp)
            rec = tp / (tp + fn)
            score = 2 * prec * rec / (prec + rec)
        elif fp == 0 and fn == 0:
            # Nothing to find and nothing predicted: treated as a perfect column.
            prec = rec = score = 1.0
        else:
            prec = rec = score = 0.0
        precisions.append(prec)
        recalls.append(rec)
        f1_scores.append(score)

    return sum(f1_scores)/label_num, sum(precisions)/label_num, sum(recalls)/label_num

In [None]:
 # Scores of the last label group, indexed explicitly rather than through a
 # loop variable left over from whichever cell happened to run last.
 m[-1]

array([[0.01855174, 0.7647685 , 0.01880348, 0.15971665],
       [0.01238796, 0.47151136, 0.03505317, 0.62766325],
       [0.1077148 , 0.74131244, 0.04904596, 0.09119504],
       ...,
       [0.00390053, 0.34487233, 0.01076601, 0.67223877],
       [0.01361998, 0.6269512 , 0.01819935, 0.32523066],
       [0.00492287, 0.4758353 , 0.01685664, 0.529969  ]], dtype=float32)

In [None]:
# Score each of the 20 groups with cal_f1 over its 4 columns, print the
# per-group macro F1, and accumulate the values for the overall average.
results = {}
total_f1 = 0
for i in range(20):
    # print("# Get f1 score for",label_name)
    f1,precision,recall = cal_f1(4, y_pre[i], y_zip[i])
    results[i] = f1
    total_f1 += f1
    print("# {0} - {1}".format(i,f1))

# Final score: unweighted mean of the per-group macro F1 values.
final_f1 = total_f1 / len(results)
    
print(final_f1)

# 0 - 0.4350798956434013
# 1 - 0.25423651742976927
# 2 - 0.374390923252088
# 3 - 0.2343910472075645
# 4 - 0.565270394738733
# 5 - 0.25308507083415144
# 6 - 0.22945955300812967
# 7 - 0.359041490735619
# 8 - 0.319463242820002
# 9 - 0.4501493568128927
# 10 - 0.4091086744762951
# 11 - 0.3708626074415709
# 12 - 0.35268555078063407
# 13 - 0.4253311095275243
# 14 - 0.3059930302839872
# 15 - 0.38861011858904965
# 16 - 0.21045234845445976
# 17 - 0.3012516641577144
# 18 - 0.48432925660342474
# 19 - 0.293108032437173
0.35081499426170915


In [None]:
# print(model_outputs.shape)
# m = np.hsplit(model_outputs, 20)
# print(m[0].shape)

# label_map = {0:  '0_-1', 1:'0_-2', 2:'0_0', 3:'0_1' }
# label_pred = []
# for lab_item in m:
#     # print(lab_item.shape)
#     # print(lab_item)
    
#     pred = np.argmax(lab_item, axis=1)
#     # print(pred)
#     # print(type(pred))

#     l = []
#     for i in pred:
#         l.append(label_map[i])
#     label_pred.append(l)

# print(label_pred[:2])
# preds = list(zip(*label_pred))