## Predict emotions for author replies

In [1]:
from pathlib import Path
import json

import pandas as pd
import torch
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# All three paths are relative, so they resolve against the notebook's working directory.
# Directory of the classifier checkpoint; the model, the tokenizer and
# label_mapping.json are all loaded from here.
MODEL_DIR = Path('mentalbert-goemotions-ekman-model')
# Input CSV that is read into the working DataFrame.
DATA_PATH = Path('dataset/beyondb_authors_reply.csv')
# Destination CSV for the exported emotion predictions.
OUTPUT_PATH = Path('dataset/beyondb_authors_reply_with_emotions.csv')


  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the input table and replace missing comment text with an empty string
# so every selected reply is a plain str when handed to the classifier.
df = pd.read_csv(DATA_PATH).assign(
    cleaned_authors_comment=lambda frame: frame['cleaned_authors_comment'].fillna('')
)

# Rows flagged with Author_replied == 1 are the ones that get scored.
reply_mask = df['Author_replied'].eq(1)
reply_texts = df['cleaned_authors_comment'][reply_mask].tolist()

print(f'Total rows: {len(df):,}; replies: {reply_mask.sum():,}')


Total rows: 12,763; replies: 7,321


In [3]:
# Load the label mapping stored next to the model checkpoint.
with open(MODEL_DIR / 'label_mapping.json') as f:
    raw_mapping = json.load(f)

# JSON object keys are always strings, so the ids are cast back to int.
# id2label: {id -> label}; label2id: {label -> id}.
id2label = {int(idx): label for idx, label in raw_mapping['id2label'].items()}
label2id = {label: int(idx) for label, idx in raw_mapping['label2id'].items()}
# Alias used later to sort per-label scores by class id.
label_order = label2id

model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
# Make the pipeline report the label names from label_mapping.json.
model.config.id2label = id2label
model.config.label2id = label2id

# Derive the truncation length from the model's position-embedding limit,
# keeping two positions spare (presumably headroom for special tokens --
# TODO confirm; the tokenizer's max_length may already account for them).
max_position_embeddings = getattr(model.config, 'max_position_embeddings', None)
if max_position_embeddings and max_position_embeddings > 2:
    MAX_SEQ_LENGTH = max_position_embeddings - 2
else:
    # Fall back to the tokenizer's limit; values >= 10_000 are treated as
    # "no real limit configured" and replaced by 512.
    model_max = tokenizer.model_max_length if tokenizer.model_max_length < 10_000 else 512
    MAX_SEQ_LENGTH = model_max if model_max > 0 else 512

# Pipeline device convention: 0 selects the first CUDA device, -1 selects the CPU.
device = 0 if torch.cuda.is_available() else -1
classifier = pipeline(
    task='text-classification',
    model=model,
    tokenizer=tokenizer,
    device=device,
    function_to_apply='softmax',
    # top_k=None returns the score of every label; it replaces the deprecated
    # return_all_scores=True. Consumers should not rely on the order of the
    # returned per-label dicts (look labels up by name instead).
    top_k=None,
)
MAX_SEQ_LENGTH


Device set to use cuda:0


510

In [4]:
# Score the reply texts in fixed-size chunks; each element appended to
# `predictions` is the list of per-label {'label', 'score'} dicts for one reply.
batch_size = 32
predictions = []
for offset in tqdm(range(0, len(reply_texts), batch_size), desc='Scoring replies'):
    chunk = reply_texts[offset:offset + batch_size]
    predictions.extend(
        classifier(
            chunk,
            truncation=True,
            padding=True,
            max_length=MAX_SEQ_LENGTH,
        )
    )

# Top label per reply (highest score wins).
pred_emotions = [
    max(label_scores, key=lambda entry: entry['score'])['label']
    for label_scores in predictions
]

# Full distribution per reply, serialised as JSON with labels in class-id order
# and scores rounded to six decimals.
pred_probabilities = [
    json.dumps({
        entry['label']: round(float(entry['score']), 6)
        for entry in sorted(label_scores, key=lambda entry: label_order[entry['label']])
    })
    for label_scores in predictions
]

# Write the predictions back onto a copy of the input frame; rows outside
# reply_mask are left without a value in the new columns.
result_df = df.copy()
for column_name, column_values in (
    ('pred_authors_reply_emotions', pred_emotions),
    ('pred_authors_reply_probabilities', pred_probabilities),
):
    result_df.loc[reply_mask, column_name] = column_values

result_df[['forum_name', 'thread_id', 'pred_authors_reply_emotions']].head()


Scoring replies:   4%|▍         | 10/229 [00:01<00:32,  6.74it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Scoring replies: 100%|██████████| 229/229 [00:36<00:00,  6.27it/s]


Unnamed: 0,forum_name,thread_id,pred_authors_reply_emotions
0,anxiety,610659,
1,anxiety,611578,
2,anxiety,611573,
3,anxiety,611587,
4,anxiety,180185,


In [5]:
# Columns written to the export, in output order: four columns carried over
# from the input followed by the two prediction columns.
export_columns = [
    'forum_name',
    'thread_id',
    'Author_replied',
    'cleaned_authors_comment',
    'pred_authors_reply_emotions',
    'pred_authors_reply_probabilities',
]
output_df = result_df[export_columns].copy()

# index=False keeps the DataFrame index out of the CSV.
output_df.to_csv(OUTPUT_PATH, index=False)
OUTPUT_PATH


PosixPath('dataset/beyondb_authors_reply_with_emotions.csv')

In [14]:
# Sanity check: reload the exported CSV and count the predicted emotions.
# Reads OUTPUT_PATH (the same path the export cell writes to) instead of a
# duplicated string literal, and uses its own name so the raw input frame
# `df` is not overwritten.
saved_df = pd.read_csv(OUTPUT_PATH)
# value_counts() skips missing values, so rows without a prediction are not counted.
count = saved_df["pred_authors_reply_emotions"].value_counts()
print(count)

pred_authors_reply_emotions
joy         5924
sadness      727
fear         281
surprise     257
anger         87
neutral       41
disgust        4
Name: count, dtype: int64


Emotion predictions saved to `project_B/dataset/beyondb_authors_reply_with_emotions.csv`.