## Predict emotions with fine-tuned MentalBERT

In [3]:
from pathlib import Path
import json

import pandas as pd
import torch
from tqdm.auto import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

# Notebook configuration. All three paths are relative, so they resolve
# against the directory the kernel was started in.
# Directory holding the fine-tuned checkpoint (it is also where
# `label_mapping.json` is read from when the model is loaded).
MODEL_DIR = Path('mentalbert-goemotions-ekman-model')
# Input CSV; the notebook reads the `forum_name`, `thread_id` and
# `cleaned_post_content` columns from it.
DATA_PATH = Path('dataset/final_cleaned_dataset_bb.csv')
# Destination CSV for the per-post predictions.
OUTPUT_PATH = Path('dataset/beyondb_post_predictions.csv')


In [4]:
# Load the posts and blank out missing text in one chained expression, so
# every row hands a string (possibly empty) to the classifier later on.
df = pd.read_csv(DATA_PATH).assign(
    cleaned_post_content=lambda frame: frame['cleaned_post_content'].fillna('')
)
# Preview the first rows, restricted to the columns this notebook relies on.
df.head()[['forum_name', 'thread_id', 'cleaned_post_content']]


Unnamed: 0,forum_name,thread_id,cleaned_post_content
0,anxiety,610659,hi my name is michelle and i have had anxiety ...
1,anxiety,611578,i ve never written on a forum like this before...
2,anxiety,611573,i am international student to tasmania in 2021...
3,anxiety,611587,"hello, i struggle with feeling very home sick ..."
4,anxiety,180185,and it was entirely my own fault. i tried cros...


In [5]:
# --- Label mapping -----------------------------------------------------------
# The mapping is stored as JSON next to the checkpoint. JSON object keys are
# always strings, so class indices are cast back to int on load.
with open(MODEL_DIR / 'label_mapping.json', encoding='utf-8') as f:
    raw_mapping = json.load(f)

id2label = {int(idx): label for idx, label in raw_mapping['id2label'].items()}
label2id = {label: int(idx) for label, idx in raw_mapping['label2id'].items()}
# Emotions are serialised in class-index order; label2id doubles as the sort key.
label_order = label2id

# --- Model, tokenizer, pipeline ----------------------------------------------
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
# Override the config mappings so the pipeline emits the emotion names from
# label_mapping.json rather than whatever the checkpoint config carries.
model.config.id2label = id2label
model.config.label2id = label2id

# Pipeline convention: a CUDA device index, or -1 for CPU.
device = 0 if torch.cuda.is_available() else -1
classifier = pipeline(
    task='text-classification',
    model=model,
    tokenizer=tokenizer,
    device=device,
    function_to_apply='softmax',
    # `top_k=None` returns the score of every label. It replaces the
    # deprecated `return_all_scores=True`. Results come back sorted by score,
    # which is fine because the code below never relies on their order.
    top_k=None,
)

# --- Score every post --------------------------------------------------------
texts = df['cleaned_post_content'].tolist()
predictions = []

batch_size = 32
for start in tqdm(range(0, len(texts), batch_size), desc='Scoring posts'):
    batch = texts[start:start + batch_size]
    outputs = classifier(
        batch,
        # Slicing the list does not batch anything by itself: the pipeline's
        # own batch size defaults to 1, so without this argument each post is
        # sent through the model individually and `padding=True` has no effect.
        batch_size=batch_size,
        truncation=True,
        padding=True,
    )
    predictions.extend(outputs)

# Guard against silent misalignment before attaching predictions as columns.
assert len(predictions) == len(texts), (
    f'expected {len(texts)} predictions, got {len(predictions)}'
)

# --- Collect top label + full distribution per post --------------------------
pred_emotions = []
pred_probabilities = []

for pred in predictions:
    # `pred` is a list of {'label', 'score'} dicts, one per emotion.
    best = max(pred, key=lambda x: x['score'])
    pred_emotions.append(best['label'])
    # Re-order by class index so every row's JSON lists emotions identically.
    sorted_probs = sorted(pred, key=lambda x: label_order[x['label']])
    pred_probabilities.append(json.dumps({item['label']: round(float(item['score']), 6) for item in sorted_probs}))

# Work on a copy so `df` stays as loaded and this cell can be re-run safely.
result_df = df.copy()
result_df['pred_emotions'] = pred_emotions
result_df['pred_probabilities'] = pred_probabilities
result_df[['forum_name', 'thread_id', 'pred_emotions', 'pred_probabilities']].head()


Device set to use cuda:0
Scoring posts:   3%|▎         | 10/399 [00:01<00:55,  6.99it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Scoring posts: 100%|██████████| 399/399 [01:00<00:00,  6.61it/s]


Unnamed: 0,forum_name,thread_id,pred_emotions,pred_probabilities
0,anxiety,610659,fear,"{""anger"": 0.036796, ""disgust"": 0.010219, ""fear..."
1,anxiety,611578,fear,"{""anger"": 0.024068, ""disgust"": 0.002226, ""fear..."
2,anxiety,611573,sadness,"{""anger"": 0.062954, ""disgust"": 0.003347, ""fear..."
3,anxiety,611587,fear,"{""anger"": 0.023452, ""disgust"": 0.003108, ""fear..."
4,anxiety,180185,sadness,"{""anger"": 0.005593, ""disgust"": 0.001109, ""fear..."


In [6]:
# Keep only the identifying columns, the scored text and the two prediction
# columns, then write them out without the pandas index.
export_columns = [
    'forum_name',
    'thread_id',
    'cleaned_post_content',
    'pred_emotions',
    'pred_probabilities',
]
output_df = result_df.loc[:, export_columns]
output_df.to_csv(OUTPUT_PATH, index=False)
# Echo the destination so the saved location shows up as the cell output.
OUTPUT_PATH


PosixPath('dataset/beyondb_post_predictions.csv')

Predictions saved to `dataset/beyondb_post_predictions.csv` (the value of `OUTPUT_PATH`).