In [1]:
import warnings
warnings.filterwarnings('ignore')

##  Config and Data

In [2]:
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertModel, BertConfig,BertForSequenceClassification, AdamW
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
import pandas as pd
import numpy as np
import os

class Config:
    def __init__(self):
        self.num_labels = 28
        self.epochs = 10
        self.lr = 2e-5
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = 'bert-base-uncased'
        self.result_path = 'result'
        self.checkpoint_path = 'saved_dict'
        self.batch_size = 32
        self.max_length = 32
        self.score_key = 'f1_score'

config = Config()
# 定义Tokenizer
tokenizer = BertTokenizer.from_pretrained(config.model)

# 自定义数据集类，用于PyTorch的DataLoader
class TextDataset(Dataset):
    def __init__(self, tokenizer, max_length, num_class=28, mode='train'):
        self.data = pd.read_csv("data/{}.tsv".format(mode), sep='\t', header=None)
        self.data.columns = ['text', 'labels', 'id']
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.num_class = num_class

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data['text'][idx]
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        labels = self.data['labels'][idx].split(',')
        return { 
            'input_ids': encoding['input_ids'].squeeze(0), 
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(convert_onehot(labels), dtype=torch.float32)
        }
    
def convert_onehot(labels):
    res = [0] * config.num_labels
    for label in labels:
        res[int(label)] = 1
    return res

train_dataset = TextDataset(tokenizer, max_length=config.max_length, mode='train')
dev_dataset = TextDataset(tokenizer, max_length=config.max_length, mode='dev')
test_dataset = TextDataset(tokenizer, max_length=config.max_length, mode='test')

train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=config.batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=config.batch_size, shuffle=False)

## Training, Evaluation and Metrics

In [3]:
from tqdm import tqdm
import json

def train(train_loader, val_loader, model, config):

    optimizer = AdamW(model.parameters(), lr=config.lr)
    
    model_name = '{}_B-{}_E-{}_Lr-{}_add'.format(config.model, config.batch_size,
                                                        config.epochs, config.lr)
    max_score = 0
    
    for epoch in range(config.epochs):
        model.train()

        preds_all = np.zeros((0, config.num_labels), dtype=int)
        labels_all = np.zeros((0, config.num_labels), dtype=int)
        loss_all, length = 0, 0

        for batch in tqdm(train_loader, desc='Training', colour='MAGENTA'):
            labels = batch['labels'].int().numpy()

            optimizer.zero_grad()
            for key, val in batch.items():
                batch[key] = val.to(config.device)
            outputs = model(**batch)

            loss, logits = outputs.loss, outputs.logits
            preds = get_preds(logits)
            preds_all = np.concatenate((preds_all, preds))
            labels_all = np.concatenate((labels_all, labels))
            loss_all += loss.item()
            length += 1

            loss.backward()
            optimizer.step()
            
        train_score = get_scores(preds_all, labels_all, loss_all, length, 'train')
        dev_score, _ =  evaluate(val_loader, model, config)
        f = open('{}/{}.all_scores.txt'.format(config.result_path, model_name), 'a')
        f.write(' ==================================================  Epoch: {}  ==================================================\n'.format(epoch))
        f.write('TrainScore: \n{}\nEvalScore: \n{}\n'.format(
                json.dumps(train_score), json.dumps(dev_score)))
        max_score = save_best(config, epoch, model_name,
                                  model, dev_score, max_score)
        print('End for {} epoch'.format(epoch))
        
def save_best(config, epoch, model_name, model, score, max_score):
    score_key = config.score_key
    curr_score = score[score_key]
    print('The epoch_{} {}: {}\nCurrent max {}: {}'.format(
        epoch, score_key, curr_score, score_key, max_score))

    if curr_score > max_score or epoch == 0:
        torch.save({
            'epoch': config.epochs,
            'model_state_dict': model.state_dict(),
        }, '{}/{}-BEST.tar'.format(config.checkpoint_path, model_name))
        return curr_score
    else:
        return max_score

def get_preds(logit):
    return (logit > 0).int().cpu().numpy()

def get_scores(preds, labels, loss_all=None, length=None, data_name='train'):
    score_dict = dict()
    union = preds | labels
    inter = preds & labels
    tp = np.sum(inter, axis=1)
    accuracy =  tp / np.sum(union, axis=1)
    precision = tp / (np.sum(preds, axis=1) + 1e-10) # 防止除0
    recall = tp / np.sum(labels, axis=1)
    f1_score = 2*(precision*recall)/(precision + recall + 1e-10)
    
    score_dict['accuracy'] = np.mean(accuracy)
    score_dict['precision'] = np.mean(precision)
    score_dict['recall'] = np.mean(recall)
    score_dict['f1_score'] = np.mean(f1_score)
    if loss_all is not None:
        score_dict['all_loss'] = loss_all / length

    print("Evaling on \"{}\" data".format(data_name))
    for s_name, s_val in score_dict.items():
        print("{}: {}".format(s_name, s_val))
    return score_dict

def evaluate(val_loader, model, config):
    model.eval()
    preds_all = np.zeros((0, config.num_labels), dtype=int)
    labels_all = np.zeros((0, config.num_labels), dtype=int)
    loss_all, length = 0, 0

    with torch.no_grad():
        for batch in tqdm(val_loader, desc='Evaling', colour='CYAN'):
            labels = batch['labels'].int().numpy()
            for key, val in batch.items():
                batch[key] = val.to(config.device)
            outputs = model(**batch)
            loss, logits = outputs.loss, outputs.logits
            preds = get_preds(logits)

            preds_all = np.concatenate((preds_all, preds))
            labels_all = np.concatenate((labels_all, labels))
            loss_all += loss.item()
            length += 1
    
    dev_score = get_scores(preds_all, labels_all, loss_all, length, 'dev')
    return dev_score, preds_all

## training process of bert

In [4]:
config.model = 'bert-base-uncased'
model = BertForSequenceClassification.from_pretrained(config.model, num_labels=config.num_labels).to(config.device)
train(train_loader, test_loader, model, config)

Training: 100%|[35m██████████[0m| 1357/1357 [01:17<00:00, 17.51it/s]


Evaling on "train" data
accuracy: 0.16295629166109565
precision: 0.17390011095568528
recall: 0.16738462719803426
f1_score: 0.166980639720203
all_loss: 0.15064760671438499


Evaling: 100%|[36m██████████[0m| 170/170 [00:03<00:00, 43.31it/s]


Evaling on "dev" data
accuracy: 0.34483446962717273
precision: 0.3729500644550949
recall: 0.34520299735888454
f1_score: 0.3541305816241977
all_loss: 0.10174957268378314
The epoch_0 f1_score: 0.3541305816241977
Current max f1_score: 0
End for 0 epoch


Training: 100%|[35m██████████[0m| 1357/1357 [01:19<00:00, 17.01it/s]


Evaling on "train" data
accuracy: 0.41066958458112574
precision: 0.44487445284677835
recall: 0.41193273439299694
f1_score: 0.4222084004800003
all_loss: 0.09398879052339534


Evaling: 100%|[36m██████████[0m| 170/170 [00:03<00:00, 42.99it/s]


Evaling on "dev" data
accuracy: 0.45130827344757696
precision: 0.49004975119519845
recall: 0.45457895706651924
f1_score: 0.4650390024718762
all_loss: 0.08856828659772872
The epoch_1 f1_score: 0.4650390024718762
Current max f1_score: 0.3541305816241977
End for 1 epoch


Training: 100%|[35m██████████[0m| 1357/1357 [01:20<00:00, 16.87it/s]


Evaling on "train" data
accuracy: 0.522343929970053
precision: 0.5664017506927211
recall: 0.5273669661368348
f1_score: 0.5384496659219061
all_loss: 0.0790788012672718


Evaling: 100%|[36m██████████[0m| 170/170 [00:03<00:00, 42.77it/s]


Evaling on "dev" data
accuracy: 0.4790092746145814
precision: 0.5203918677701805
recall: 0.4855352865303114
f1_score: 0.49479761680471285
all_loss: 0.08640128450796884
The epoch_2 f1_score: 0.49479761680471285
Current max f1_score: 0.4650390024718762
End for 2 epoch


Training: 100%|[35m██████████[0m| 1357/1357 [01:21<00:00, 16.72it/s]


Evaling on "train" data
accuracy: 0.6153359440989019
precision: 0.6673270367154913
recall: 0.6241323043845504
f1_score: 0.6353513014801683
all_loss: 0.06742131575535744


Evaling: 100%|[36m██████████[0m| 170/170 [00:03<00:00, 42.95it/s]


Evaling on "dev" data
accuracy: 0.5405841164547632
precision: 0.5848227995251423
recall: 0.5517167250168908
f1_score: 0.5589214421162294
all_loss: 0.08911058405304656
The epoch_3 f1_score: 0.5589214421162294
Current max f1_score: 0.49479761680471285
End for 3 epoch


Training: 100%|[35m██████████[0m| 1357/1357 [01:21<00:00, 16.62it/s]


Evaling on "train" data
accuracy: 0.6995868847423787
precision: 0.7558723027748888
recall: 0.7114892881824465
f1_score: 0.7221414858706713
all_loss: 0.05659970880932859


Evaling: 100%|[36m██████████[0m| 170/170 [00:03<00:00, 42.81it/s]


Evaling on "dev" data
accuracy: 0.5070480928689884
precision: 0.5520852526953847
recall: 0.5278238437442417
f1_score: 0.528984706046572
all_loss: 0.09429361322785125
The epoch_4 f1_score: 0.528984706046572
Current max f1_score: 0.5589214421162294
End for 4 epoch


Training: 100%|[35m██████████[0m| 1357/1357 [01:22<00:00, 16.43it/s]


Evaling on "train" data
accuracy: 0.7661053520694157
precision: 0.8231359900911104
recall: 0.7793680411579513
f1_score: 0.7894357235315694
all_loss: 0.04690261387218863


Evaling: 100%|[36m██████████[0m| 170/170 [00:03<00:00, 42.54it/s]


Evaling on "dev" data
accuracy: 0.5310945273631841
precision: 0.5764080829862115
recall: 0.5549720533136785
f1_score: 0.5542411399240513
all_loss: 0.10037500207476756
The epoch_5 f1_score: 0.5542411399240513
Current max f1_score: 0.5589214421162294
End for 5 epoch


Training: 100%|[35m██████████[0m| 1357/1357 [01:22<00:00, 16.50it/s]


Evaling on "train" data
accuracy: 0.8139345772863396
precision: 0.8707690239500465
recall: 0.8291703140597404
f1_score: 0.8379450641440638
all_loss: 0.03897100615650745


Evaling: 100%|[36m██████████[0m| 170/170 [00:04<00:00, 40.67it/s]


Evaling on "dev" data
accuracy: 0.5297893249800381
precision: 0.5756403168786869
recall: 0.5557705300657207
f1_score: 0.5538541858057773
all_loss: 0.10562693702385706
The epoch_6 f1_score: 0.5538541858057773
Current max f1_score: 0.5589214421162294
End for 6 epoch


Training: 100%|[35m██████████[0m| 1357/1357 [01:23<00:00, 16.33it/s]


Evaling on "train" data
accuracy: 0.8508519542348153
precision: 0.9047204944225166
recall: 0.8662355064117331
f1_score: 0.8739430786425552
all_loss: 0.03249875659242481


Evaling: 100%|[36m██████████[0m| 170/170 [00:04<00:00, 42.07it/s]


Evaling on "dev" data
accuracy: 0.533843130028868
precision: 0.5794023708067003
recall: 0.5684693814876236
f1_score: 0.5607333701304005
all_loss: 0.11245935250292806
The epoch_7 f1_score: 0.5607333701304005
Current max f1_score: 0.5589214421162294
End for 7 epoch


Training: 100%|[35m██████████[0m| 1357/1357 [01:23<00:00, 16.29it/s]


Evaling on "train" data
accuracy: 0.8800572064808416
precision: 0.9308051139397304
recall: 0.8951025109421791
f1_score: 0.9021109355086073
all_loss: 0.026930882796568297


Evaling: 100%|[36m██████████[0m| 170/170 [00:04<00:00, 42.11it/s]


Evaling on "dev" data
accuracy: 0.5234230084147166
precision: 0.5675941280710808
recall: 0.5598089797923961
f1_score: 0.5506295681536136
all_loss: 0.1200662418761674
The epoch_8 f1_score: 0.5506295681536136
Current max f1_score: 0.5607333701304005
End for 8 epoch


Training: 100%|[35m██████████[0m| 1357/1357 [01:23<00:00, 16.20it/s]


Evaling on "train" data
accuracy: 0.900697611917377
precision: 0.9470840051311054
recall: 0.9148782922521693
f1_score: 0.9210825900522344
all_loss: 0.022604327550735777


Evaling: 100%|[36m██████████[0m| 170/170 [00:04<00:00, 42.00it/s]

Evaling on "dev" data
accuracy: 0.5161507278422701
precision: 0.5591947668535475
recall: 0.5561697684417418
f1_score: 0.5442601805244636
all_loss: 0.12801834975971896
The epoch_9 f1_score: 0.5442601805244636
Current max f1_score: 0.5607333701304005
End for 9 epoch





In [9]:
config.model = 'bert-large-uncased'
model = BertForSequenceClassification.from_pretrained(config.model, num_labels=config.num_labels).to(config.device)
train(train_loader, test_loader, model, config)

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

Evaling on "train" data
accuracy: 0.25424053763604365
precision: 0.27473760199196123
recall: 0.2596847884512017
f1_score: 0.26174031054518354
all_loss: 0.1293997049891694


Evaling: 100%|[36m██████████[0m| 170/170 [00:10<00:00, 15.99it/s]


Evaling on "dev" data
accuracy: 0.4422486333763282
precision: 0.4785332595801732
recall: 0.4435999017259383
f1_score: 0.4545543885056815
all_loss: 0.09069331498707042
The epoch_0 f1_score: 0.4545543885056815
Current max f1_score: 0
End for 0 epoch


Training: 100%|[35m██████████[0m| 1357/1357 [04:09<00:00,  5.43it/s]


Evaling on "train" data
accuracy: 0.48525109421792206
precision: 0.5275205405298279
recall: 0.4894743914612608
f1_score: 0.5004760807302335
all_loss: 0.08548167836286166


Evaling: 100%|[36m██████████[0m| 170/170 [00:11<00:00, 15.26it/s]


Evaling on "dev" data
accuracy: 0.5075701738222468
precision: 0.5495669798608577
recall: 0.5121921257907991
f1_score: 0.5228364350562644
all_loss: 0.08456194291219991
The epoch_1 f1_score: 0.5228364350562644
Current max f1_score: 0.4545543885056815
End for 1 epoch


Training: 100%|[35m██████████[0m| 1357/1357 [04:12<00:00,  5.38it/s]


Evaling on "train" data
accuracy: 0.5929870229593797
precision: 0.6446018581873781
recall: 0.6013994471319973
f1_score: 0.6127096016326813
all_loss: 0.07123435584055472


Evaling: 100%|[36m██████████[0m| 170/170 [00:11<00:00, 15.34it/s]


Evaling on "dev" data
accuracy: 0.5355475707880352
precision: 0.5810761009199051
recall: 0.5491370308949083
f1_score: 0.5550334745470974
all_loss: 0.08714902153348221
The epoch_2 f1_score: 0.5550334745470974
Current max f1_score: 0.5228364350562644
End for 2 epoch


Training: 100%|[35m██████████[0m| 1357/1357 [04:15<00:00,  5.32it/s]


Evaling on "train" data
accuracy: 0.7032262151577976
precision: 0.761114950397788
recall: 0.7155808953390156
f1_score: 0.7264290649884915
all_loss: 0.056467017647518596


Evaling: 100%|[36m██████████[0m| 170/170 [00:11<00:00, 15.45it/s]


Evaling on "dev" data
accuracy: 0.5342577237270437
precision: 0.5810146796318628
recall: 0.5569989558380934
f1_score: 0.5574841839626835
all_loss: 0.0923270632238949
The epoch_3 f1_score: 0.5574841839626835
Current max f1_score: 0.5550334745470974
End for 3 epoch


Training: 100%|[35m██████████[0m| 1357/1357 [04:16<00:00,  5.29it/s]


Evaling on "train" data
accuracy: 0.7920321738462719
precision: 0.8516624432866153
recall: 0.8069116179067803
f1_score: 0.8166859730873784
all_loss: 0.043200480098326194


Evaling: 100%|[36m██████████[0m| 170/170 [00:11<00:00, 15.01it/s]


Evaling on "dev" data
accuracy: 0.547616854001597
precision: 0.5932989373545784
recall: 0.5717093544622566
f1_score: 0.5710337202306451
all_loss: 0.1026441698346068
The epoch_4 f1_score: 0.5710337202306451
Current max f1_score: 0.5574841839626835
End for 4 epoch


Training: 100%|[35m██████████[0m| 1357/1357 [04:18<00:00,  5.25it/s]


Evaling on "train" data
accuracy: 0.8525508715349766
precision: 0.9094659447788539
recall: 0.8676464716271213
f1_score: 0.8765161087756855
all_loss: 0.03292277273663657


Evaling: 100%|[36m██████████[0m| 170/170 [00:11<00:00, 15.18it/s]


Evaling on "dev" data
accuracy: 0.5327375468337326
precision: 0.5759167126779237
recall: 0.5677783919906639
f1_score: 0.5593022541056232
all_loss: 0.1140287245021147
The epoch_5 f1_score: 0.5593022541056232
Current max f1_score: 0.5710337202306451
End for 5 epoch


Training: 100%|[35m██████████[0m| 1357/1357 [04:19<00:00,  5.23it/s]


Evaling on "train" data
accuracy: 0.8867833832450279
precision: 0.938969515382418
recall: 0.9020221915073333
f1_score: 0.90927501883297
all_loss: 0.02623055455828732


Evaling: 100%|[36m██████████[0m| 170/170 [00:11<00:00, 15.24it/s]


Evaling on "dev" data
accuracy: 0.5109698421472882
precision: 0.5524230697842201
recall: 0.553236901910202
f1_score: 0.5394816042703762
all_loss: 0.12521540890721714
The epoch_6 f1_score: 0.5394816042703762
Current max f1_score: 0.5710337202306451
End for 6 epoch


Training: 100%|[35m██████████[0m| 1357/1357 [04:21<00:00,  5.20it/s]


Evaling on "train" data
accuracy: 0.9093143754212633
precision: 0.95555756046404
recall: 0.9230434615679951
f1_score: 0.9294138574428513
all_loss: 0.02132659053084061


Evaling: 100%|[36m██████████[0m| 170/170 [00:11<00:00, 15.31it/s]


Evaling on "dev" data
accuracy: 0.512146059824335
precision: 0.5535286529791712
recall: 0.5536207849640685
f1_score: 0.5401142435431397
all_loss: 0.1300198796479141
The epoch_7 f1_score: 0.5401142435431397
Current max f1_score: 0.5710337202306451
End for 7 epoch


Training: 100%|[35m██████████[0m| 1357/1357 [04:19<00:00,  5.23it/s]


Evaling on "train" data
accuracy: 0.9254707056745758
precision: 0.9659970819938573
recall: 0.9384277816171388
f1_score: 0.9434154955691967
all_loss: 0.01734548890702399


Evaling: 100%|[36m██████████[0m| 170/170 [00:11<00:00, 15.08it/s]


Evaling on "dev" data
accuracy: 0.5281739450893679
precision: 0.5698882132021142
recall: 0.5880474172348136
f1_score: 0.5624101713094561
all_loss: 0.13633491887765772
The epoch_8 f1_score: 0.5624101713094561
Current max f1_score: 0.5710337202306451
End for 8 epoch


Training: 100%|[35m██████████[0m| 1357/1357 [04:19<00:00,  5.22it/s]


Evaling on "train" data
accuracy: 0.9400341703140597
precision: 0.9747761651540955
recall: 0.951535744452123
f1_score: 0.9556017740572809
all_loss: 0.014264276805906861


Evaling: 100%|[36m██████████[0m| 170/170 [00:10<00:00, 15.60it/s]

Evaling on "dev" data
accuracy: 0.5152939008660402
precision: 0.556307966289559
recall: 0.5706037712671213
f1_score: 0.5476611650196318
all_loss: 0.1433068057631745
The epoch_9 f1_score: 0.5476611650196318
Current max f1_score: 0.5710337202306451
End for 9 epoch





## evaluation models

#### BERT

In [4]:
def test_data(model_path, test_loader, config):
    model = BertForSequenceClassification.from_pretrained(config.model, num_labels=config.num_labels).to(config.device)
    checkpoint = torch.load(model_path)
    model.load_state_dict(checkpoint['model_state_dict'])
    print('metrics of {} after {} epochs of fine_tune'.format(config.model, checkpoint['epoch']))
    test_score, preds = evaluate(test_loader, model, config)
    return test_score, preds

config.model = 'bert-base-uncased'
model_path = "saved_dict/bert-base-uncased_B-32_E-10_Lr-2e-05_add-BEST.tar"
test_score, preds = test_data(model_path, test_loader, config)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

metrics of bert-base-uncased after 10 epochs of fine_tune


Evaling: 100%|[36m██████████[0m| 170/170 [00:03<00:00, 43.69it/s]

Evaling on "dev" data
accuracy: 0.533843130028868
precision: 0.5794023708067003
recall: 0.5684693814876236
f1_score: 0.5607333701304005
all_loss: 0.11245935250292806





In [5]:
config.model = 'bert-large-uncased'
model_path = "saved_dict/bert-large-uncased_B-32_E-10_Lr-2e-05_add-BEST.tar"
test_score, preds = test_data(model_path, test_loader, config)

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

metrics of bert-large-uncased after 10 epochs of fine_tune


Evaling: 100%|[36m██████████[0m| 170/170 [00:09<00:00, 18.37it/s]


Evaling on "dev" data
accuracy: 0.547616854001597
precision: 0.5932989373545784
recall: 0.5717093544622566
f1_score: 0.5710337202306451
all_loss: 0.1026441698346068


#### llama

In [6]:
def analyze_llm(result_path, name):
    llm_result = pd.read_csv(result_path)
    preds = np.zeros((0, config.num_labels), dtype=int)
    labels = np.zeros((0, config.num_labels), dtype=int)

    for pred, label in zip(llm_result['predicted_sentiment'], llm_result['sentiment_index']):
        pred = [json.loads(pred)]
        preds = np.append(preds, pred, axis=0)
        label = [convert_onehot(label.split(','))]
        labels = np.append(labels, label, axis=0)
    print('metrics of {}'.format(name))
    get_scores(preds, labels, data_name='test')

analyze_llm("result/goemotions_with_predictions_llama_3.1_8B_instruct.csv", 'llama_zero_shot')
print("-"*100)
analyze_llm("result/goemotions_with_predictions__llama_3.1_8B_instruct_sst_finetuned.csv", 'llama_fine_tuned')

metrics of llama_zero_shot
Evaling on "test" data
accuracy: 0.2096631042319268
precision: 0.2348565812690553
recall: 0.24006510656593577
f1_score: 0.2266416301566222
----------------------------------------------------------------------------------------------------
metrics of llama_fine_tuned
Evaling on "test" data
accuracy: 0.34401493414760415
precision: 0.36251765858661245
recall: 0.41290461273877527
f1_score: 0.36660261300023267
