In [1]:
pip install transformers

Collecting transformers
  Downloading transformers-4.35.2-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m53.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.19.4-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.7/311.7 kB[0m [31m36.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.19,>=0.14 (from transformers)
  Downloading tokenizers-0.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m92.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m90.7 MB/s[0m eta [36m0:00:00[0m
Ins

In [2]:
from transformers import BertTokenizer, BertForSequenceClassification
import numpy as np
import argparse
import pandas as pd
import torch
import os

from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, classification_report, \
    precision_recall_fscore_support, precision_score, recall_score

In [3]:
!pip install evaluate
import evaluate

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill (from evaluate)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting pyarrow-hotfix (from datasets>=2.0.0-

In [4]:
def get_dict(dataset_name):
    """Look up the class-id -> class-name mapping for a supported dataset.

    Args:
        dataset_name: One of 'CAMS', 'CLP', 'DR', 'dreaddit', 'Irf',
            'loneliness', 'MultiWD', 'SAD', 'swmh' or 't-sid'.

    Returns:
        A ``(labels, num_labels)`` tuple where ``labels`` maps consecutive
        integer ids (starting at 0) to class names and ``num_labels`` is the
        number of classes.

    Raises:
        Exception: If ``dataset_name`` is not one of the supported names.
    """
    yes_no = ('no', 'yes')
    # Class names per dataset, listed in class-id order.
    class_names_by_dataset = (
        ('CAMS', ('none', 'bias', 'job', 'medication', 'relation', 'alienation')),
        ('CLP', yes_no),
        ('DR', yes_no),
        ('dreaddit', yes_no),
        ('Irf', yes_no),
        ('loneliness', yes_no),
        ('MultiWD', yes_no),
        ('SAD', ('school', 'financial', 'family', 'social', 'work',
                 'health', 'emotional', 'everyday', 'other')),
        ('swmh', ('depression', 'suicide', 'anxiety', 'bipolar', 'no mental')),
        ('t-sid', ('depression', 'suicide', 'ptsd', 'control')),
    )

    for known_name, class_names in class_names_by_dataset:
        if dataset_name == known_name:
            # A fresh dict is built on every call, so callers may mutate it freely.
            labels = dict(enumerate(class_names))
            return labels, len(labels)

    raise Exception("ERROR! please choose the correct dataset!")

In [5]:
def _match_label(text, rules):
    """Return the class id of the first rule whose keyword occurs in ``text``.

    ``rules`` is a sequence of ``(keyword, class_id)`` pairs checked in order
    against the lower-cased text; ``None`` is returned when nothing matches.
    """
    lowered = text.lower()
    for keyword, class_id in rules:
        if keyword in lowered:
            return class_id
    return None


def calculate_f1(goldens, final_labels, dataset_name):
    """Print accuracy and weighted/micro/macro F1 of predictions vs. gold answers.

    Labels are recovered by case-insensitive substring search. For each gold
    string only the text before the first "Reasoning:" is searched; each
    predicted string is stripped and searched as a whole. Keywords are tried in
    a fixed order and the first hit wins. The integer class ids are local to
    this function; they only need to agree between prediction and gold.

    Args:
        goldens: Iterable of gold answer strings.
        final_labels: Iterable of predicted label strings, paired with
            ``goldens`` by position (pairing stops at the shorter one).
        dataset_name: Any name containing 'swmh', or exactly one of 't-sid',
            'CLP', 'DR', 'dreaddit', 'loneliness', 'Irf', 'MultiWD', 'SAD',
            'CAMS'.

    Returns:
        None. The metrics (percentages rounded to 2 decimals) are printed.

    Raises:
        Exception: If ``dataset_name`` is not supported, or if a prediction
            contains none of the dataset's keywords.
    """
    # (keyword, class_id) rules in the order they must be tried.
    if 'swmh' in dataset_name:
        pred_rules = gold_rules = (
            ('no mental', 0), ('suicide', 1), ('depression', 2), ('anxiety', 3), ('bipolar', 4),
        )
    elif dataset_name == 't-sid':
        pred_rules = (('depression', 2), ('suicide', 1), ('ptsd', 3), ('control', 0))
        gold_rules = (('depression', 2), ('suicide or self-harm', 1), ('ptsd', 3), ('no mental', 0))
    elif dataset_name in ['CLP', 'DR', 'dreaddit', 'loneliness', 'Irf', 'MultiWD']:
        # 'yes' must be tested before 'no'.
        pred_rules = gold_rules = (('yes', 1), ('no', 0))
    elif dataset_name == 'SAD':
        pred_rules = (
            ('school', 0), ('financial', 1), ('family', 2), ('social', 3), ('work', 4),
            ('health', 5), ('emotional', 6), ('everyday', 7), ('other', 8),
        )
        gold_rules = (
            ('school', 0), ('financial problem', 1), ('family issues', 2),
            ('social relationships', 3), ('work', 4), ('health issues', 5),
            ('emotion turmoil', 6), ('everyday decision making', 7), ('other', 8),
        )
    elif dataset_name == 'CAMS':
        pred_rules = (
            ('none', 0), ('bias', 1), ('job', 2), ('medication', 3), ('relation', 4), ('alienation', 5),
        )
        gold_rules = (
            ('no causes', 0), ('bias or abuse', 1), ('jobs and career', 2),
            ('medication', 3), ('relationship', 4), ('alienation', 5),
        )
    else:
        # Previously an unknown name silently produced empty label lists.
        raise Exception("ERROR! please choose the correct dataset!")

    golden_label = []
    output_label = []
    for golden, label in zip(goldens, final_labels):
        ref_an = golden.split("Reasoning:")[0]
        output_an = label.strip()

        predicted = _match_label(output_an, pred_rules)
        if predicted is None:
            raise Exception('Wrong label in predictions for {}'.format(dataset_name))

        expected = _match_label(ref_an, gold_rules)
        if expected is None:
            # The gold answer has no recognisable label: drop the whole pair so the
            # two lists stay aligned. (Only the swmh branch used to do this; the
            # other datasets kept the prediction and ended up with mismatched lists.)
            continue

        output_label.append(predicted)
        golden_label.append(expected)

    avg_accuracy = round(accuracy_score(golden_label, output_label) * 100, 2)
    weighted_f1 = round(f1_score(golden_label, output_label, average='weighted') * 100, 2)
    micro_f1 = round(f1_score(golden_label, output_label, average='micro') * 100, 2)
    macro_f1 = round(f1_score(golden_label, output_label, average='macro') * 100, 2)
    print("Dataset: {}, average acc:{}, weighted F1 {}, micro F1 {}, macro F1 {}".format(dataset_name,
                                                                                         avg_accuracy, weighted_f1,
                                                                                         micro_f1, macro_f1))

In [6]:
import pandas as pd

In [10]:
# Load the evaluation CSV; later cells read its 'goldens' and 'generated text' columns.
data = pd.read_csv("/content/MultiWD123.csv")

In [13]:
# Pull the two text columns out as plain Python lists (row order preserved).
goldens = data['goldens'].to_list()  # gold/reference texts
generated_text = data['generated text'].to_list()  # generated texts to be evaluated

In [15]:
# Class-id -> class-name mapping and class count for the MultiWD dataset.
labels, num_labels = get_dict("MultiWD")

In [16]:
# NOTE(review): AutoModel and AutoTokenizer are not used in the visible cells.
from transformers import AutoModel, AutoTokenizer

# Hugging Face Hub id of the checkpoint to load for this evaluation
model_name = 'Tianlin668/MultiWD'

# Load the checkpoint as a BERT sequence classifier with `num_labels` outputs,
# together with the tokenizer stored under the same id
mentalbert = BertForSequenceClassification.from_pretrained(model_name,num_labels=num_labels)
tokenizer = BertTokenizer.from_pretrained(model_name)

(…)nlin668/MultiWD/resolve/main/config.json:   0%|          | 0.00/720 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

(…)ltiWD/resolve/main/tokenizer_config.json:   0%|          | 0.00/321 [00:00<?, ?B/s]

(…)ianlin668/MultiWD/resolve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

(…)iWD/resolve/main/special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

(…)n668/MultiWD/resolve/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [17]:
# Show the id -> name mapping that will be used to decode classifier outputs.
print(labels)

{0: 'no', 1: 'yes'}


In [18]:
# Classify every generated text with the loaded model, one example at a time.
all_labels = []
# Inference only: disable autograd so no gradient graph is built per example.
with torch.no_grad():
    for batch_data in generated_text:
        # Inputs longer than 512 tokens are truncated.
        inputs = tokenizer(batch_data, return_tensors="pt", truncation=True, max_length=512)
        # Index 0 of the model output holds the logits when no labels are passed.
        outputs = mentalbert(**inputs)[0]
        outputs = outputs.cpu().numpy()
        # Store a plain Python int so the printed list and dict lookups do not
        # depend on NumPy's scalar type.
        all_labels.append(int(np.argmax(outputs)))
print(all_labels)
# Decode class ids to label strings, then score them against the gold answers.
final_labels = [labels[num] for num in all_labels]
calculate_f1(goldens, final_labels,"MultiWD")

[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0]
Dataset: MultiWD, average acc:71.0, weighted F1 70.56, micro F1 71.0, macro F1 68.53


In [19]:
pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=f87a6aa8bea5314fd99a3bdf0de9ed2bd1745b8560c9b281745d56fa2f34797b
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [22]:
def rouge(dname="SADp", predictions=None, references=None):
    """Compute ROUGE for generated texts against references.

    Args:
        dname: Key under which the result is stored in the returned dict.
            Defaults to "SADp", the key this function has always used.
        predictions: List of generated texts. Defaults to the
            'generated text' column of the global ``data`` frame.
        references: List of reference texts. Defaults to the 'goldens'
            column of the global ``data`` frame.

    Returns:
        ``{dname: result}`` where ``result`` is whatever the ``evaluate``
        ROUGE metric's ``compute`` returns.
    """
    # Named `metric` so the local no longer shadows this function's own name.
    metric = evaluate.load('rouge')
    if predictions is None:
        predictions = data['generated text'].to_list()
    if references is None:
        references = data['goldens'].to_list()
    result = metric.compute(predictions=predictions, references=references)
    return {dname: result}


In [23]:
# Compute and print the ROUGE result dict for the loaded data.
print(rouge())

{'SADp': {'rouge1': 0.5996267783650431, 'rouge2': 0.3963551111857645, 'rougeL': 0.49629120191213183, 'rougeLsum': 0.49673748045497285}}


In [26]:
def bleu(dname="SAD", predictions=None, references=None):
    """Compute BLEU for generated texts against references.

    Args:
        dname: Key under which the score is stored in the returned dict.
            Defaults to "SAD", the key this function has always used.
        predictions: List of generated texts. Defaults to the
            'generated text' column of the global ``data`` frame.
        references: List of reference texts. Defaults to the 'goldens'
            column of the global ``data`` frame.

    Returns:
        ``{dname: score}`` where ``score`` is the 'bleu' entry of the
        ``evaluate`` BLEU metric's result.
    """
    # The metric object used to be held in a variable named `rouge`, which
    # was misleading: this is the BLEU metric.
    metric = evaluate.load('bleu')
    if predictions is None:
        predictions = data['generated text'].to_list()
    if references is None:
        references = data['goldens'].to_list()
    result = metric.compute(predictions=predictions, references=references)
    return {dname: result['bleu']}


In [27]:
# Compute and print the BLEU score for the loaded data.
print(bleu())

{'SAD': 0.34920875550994324}


In [28]:
# Fetch the BARTScore repository; `BARTScore.bart_score` is imported from it below.
!git clone https://github.com/neulab/BARTScore

Cloning into 'BARTScore'...
remote: Enumerating objects: 220, done.[K
remote: Counting objects: 100% (26/26), done.[K
remote: Compressing objects: 100% (12/12), done.[K
remote: Total 220 (delta 18), reused 14 (delta 14), pack-reused 194[K
Receiving objects: 100% (220/220), 101.98 MiB | 20.63 MiB/s, done.
Resolving deltas: 100% (47/47), done.
Updating files: 100% (192/192), done.


In [29]:

from BARTScore.bart_score import BARTScorer
def compute_bart_score(a,b):
    """Score ``a`` against ``b`` with a freshly constructed, default ``BARTScorer``.

    Args:
        a: First argument forwarded to ``BARTScorer.score``.
        b: Second argument forwarded to ``BARTScorer.score``.

    Returns:
        Whatever ``BARTScorer.score(a, b)`` returns.
    """
    # A new scorer is built on every call.
    return BARTScorer().score(a, b)


In [30]:
# Mean BARTScore over all (generated, gold) pairs. Divide by the actual number
# of scores rather than a hard-coded 200, so the average stays correct if the
# dataset size changes.
bart_scores = compute_bart_score(generated_text, goldens)
print(sum(bart_scores) / len(bart_scores))

(…)k/bart-large-cnn/resolve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

(…)k/bart-large-cnn/resolve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

(…)rt-large-cnn/resolve/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

(…)/bart-large-cnn/resolve/main/config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

(…)-cnn/resolve/main/generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

-2.3035568702220917


In [31]:
pip install bert-score


Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: bert-score
Successfully installed bert-score-0.3.13


In [32]:
from bert_score import score

def calculate_bert_score(reference, candidate):
    """Return the mean BERTScore F1 of ``candidate`` against ``reference``.

    Note the argument order: this function takes the reference first, while
    ``score`` is called with the candidate first. ``score`` runs with
    ``lang="en"`` and ``verbose=True``; of the three values it returns, only
    the last (F1) is used.

    Returns:
        The mean of the F1 values as a Python scalar.
    """
    _, _, f1_values = score(candidate, reference, lang="en", verbose=True)
    mean_f1 = f1_values.mean()
    return mean_f1.item()


# Gold texts are the references; generated texts are the candidates.
reference_sentence = goldens
candidate_sentence = generated_text

# Mean BERTScore F1 over all (reference, candidate) pairs.
bert_score = calculate_bert_score(reference_sentence,candidate_sentence)

print("BERTScore:", bert_score)


(…)o/roberta-large/resolve/main/config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

(…)co/roberta-large/resolve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

(…)co/roberta-large/resolve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

(…)oberta-large/resolve/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/7 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/4 [00:00<?, ?it/s]

done in 5.79 seconds, 34.57 sentences/sec
BERTScore: 0.9246304035186768
