In [1]:
import pandas as pd
import torch
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline


In [2]:
# Colab-only setup: attach Google Drive so the fine-tuned checkpoint and the
# CSV files used below are reachable through the local filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

# Directory holding the saved tokenizer + classification model that the next
# cell passes to `from_pretrained`.
MODEL_PATH = "/content/drive/MyDrive/mentalbert_models/"


Mounted at /content/drive


In [3]:
# Restore the tokenizer and the sequence-classification model from the
# directory configured in MODEL_PATH.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)

# Device index handed to the pipeline: 0 when CUDA is available, -1 otherwise.
if torch.cuda.is_available():
    device = 0
else:
    device = -1

# Text-classification pipeline wrapping the model and tokenizer; it is the
# callable used later to score every row.
clf = pipeline(
    "text-classification",
    tokenizer=tokenizer,
    model=model,
    device=device,
)


Device set to use cuda:0


In [4]:
# CSV of threads to score; loaded into `df`, which every later cell works on.
UNLABELED_PATH = '/content/drive/My Drive/collab_dataset/comments_author_doesnt_reply.csv'
df = pd.read_csv(filepath_or_buffer=UNLABELED_PATH)

# Quick sanity check: report (rows, columns) and preview the first five rows.
print(f"Shape: {df.shape}")
df.head(5)


Shape: (5442, 9)


Unnamed: 0,forum_name,thread_id,cleaned_post_content,cleaned_comments_content,comments_sentiment,Author_replied,cleaned_authors_comment,pred_authors_reply_emotions,pred_authors_reply_probabilities
0,anxiety,610659,hi my name is michelle and i have had anxiety ...,dear michelle ~welcome here to the support for...,positive || positive,0,,,
1,anxiety,611578,i ve never written on a forum like this before...,"welcome, and thanks for posting. i understand ...",positive || neutral || negative || positive ||...,0,,,
2,anxiety,611573,i am international student to tasmania in 2021...,"hello tevont, sometimes it can feel like every...",neutral || positive,0,,,
3,anxiety,611587,"hello, i struggle with feeling very home sick ...","hello olive, i can understand how it can be ha...",neutral || positive,0,,,
4,anxiety,180185,and it was entirely my own fault. i tried cros...,hi grovi. i'm so sorry for you with having had...,negative || negative || negative || neutral ||...,0,,,


In [5]:
# Make sure text & sentiment are clean.
# All three columns are interpolated into the model prompt below, so missing
# values are replaced with "" *before* casting to str. Without the fillna,
# `.astype(str)` turns NaN into the literal text "nan", which would surface in
# the prompt as "[SENTIMENT: NAN]".
# NOTE(review): keep this in sync with whatever preprocessing was used when the
# model was trained -- that code is not visible from this notebook.
df["cleaned_post_content"] = df["cleaned_post_content"].fillna("").astype(str)
df["cleaned_comments_content"] = df["cleaned_comments_content"].fillna("").astype(str)
df["comments_sentiment"] = (
    df["comments_sentiment"]
    .fillna("")      # same missing-value handling as the two text columns
    .astype(str)
    .str.lower()
    .str.strip()
)

def build_input_text(row):
    """Render one row as the single text prompt passed to the classifier.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the keys
            "cleaned_post_content", "cleaned_comments_content" and
            "comments_sentiment".

    Returns:
        A string with three sections separated by blank lines: the post,
        the comments, and a bracketed sentiment tag whose value is the
        upper-cased string form of "comments_sentiment".
    """
    sections = [
        f"POST:\n{row['cleaned_post_content']}",
        f"COMMENTS:\n{row['cleaned_comments_content']}",
        f"[SENTIMENT: {str(row['comments_sentiment']).upper()}]",
    ]
    return "\n\n".join(sections)

# Build the prompt for every row (row-wise apply) and store it alongside the
# source columns, then preview the new column.
input_text_column = df.apply(build_input_text, axis=1)
df["input_text"] = input_text_column
df[["input_text"]].head()


Unnamed: 0,input_text
0,POST:\nhi my name is michelle and i have had a...
1,POST:\ni ve never written on a forum like this...
2,POST:\ni am international student to tasmania ...
3,"POST:\nhello, i struggle with feeling very hom..."
4,POST:\nand it was entirely my own fault. i tri...


In [6]:
texts = df["input_text"].tolist()
batch_size = 32   # you can change if needed

# Score every prompt in ONE pipeline call and let the pipeline do the batching.
# Slicing the list by hand and calling `clf(batch)` without `batch_size` does
# not batch on the device: the pipeline still forwards items one at a time,
# which is what triggers the "using the pipelines sequentially on GPU" warning.
#
# NOTE(review): inputs longer than 256 tokens are truncated. Assuming the
# tokenizer's default right-side truncation, long posts lose the trailing
# "[SENTIMENT: ...]" tag (and possibly the comments) -- confirm this matches
# how the model was trained.
outputs = (
    clf(texts, batch_size=batch_size, truncation=True, max_length=256)
    if texts
    else []   # nothing to score; avoid calling the pipeline on an empty list
)

# One result dict per input, in input order.
pred_labels = [o["label"] for o in outputs]   # e.g. 'sadness', 'joy', ...
pred_scores = [o["score"] for o in outputs]   # confidence 0-1


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [7]:
# Write the predicted label and its score back onto the frame; both lists are
# in the same order as the rows they were computed from.
for column_name, column_values in {
    "pred_authors_reply_emotions": pred_labels,
    "pred_authors_reply_probabilities": pred_scores,
}.items():
    df[column_name] = column_values

# Preview the model inputs next to the new prediction columns.
preview_columns = [
    "cleaned_post_content",
    "cleaned_comments_content",
    "comments_sentiment",
    "pred_authors_reply_emotions",
    "pred_authors_reply_probabilities",
]
df[preview_columns].head()


Unnamed: 0,cleaned_post_content,cleaned_comments_content,comments_sentiment,pred_authors_reply_emotions,pred_authors_reply_probabilities
0,hi my name is michelle and i have had anxiety ...,dear michelle ~welcome here to the support for...,positive || positive,joy,0.92831
1,i ve never written on a forum like this before...,"welcome, and thanks for posting. i understand ...",positive || neutral || negative || positive ||...,joy,0.890014
2,i am international student to tasmania in 2021...,"hello tevont, sometimes it can feel like every...",neutral || positive,joy,0.790593
3,"hello, i struggle with feeling very home sick ...","hello olive, i can understand how it can be ha...",neutral || positive,joy,0.944687
4,and it was entirely my own fault. i tried cros...,hi grovi. i'm so sorry for you with having had...,negative || negative || negative || neutral ||...,joy,0.706641


In [9]:
# Persist the full frame (original columns, the generated `input_text` prompt
# and the two prediction columns) to Drive, without the pandas index.
OUTPUT_PATH = '/content/drive/My Drive/collab_dataset/unlabeled_with_author_emotion_predictions.csv'
df.to_csv(path_or_buf=OUTPUT_PATH, index=False)

# Echo the destination so the saved location shows up in the cell output.
OUTPUT_PATH


'/content/drive/My Drive/collab_dataset/unlabeled_with_author_emotion_predictions.csv'

In [10]:
# Distribution of predicted labels across all scored rows (most frequent first).
predicted_emotions = df["pred_authors_reply_emotions"]
predicted_emotions.value_counts()


Unnamed: 0_level_0,count
pred_authors_reply_emotions,Unnamed: 1_level_1
joy,5078
sadness,353
fear,11
