In [2]:
import pandas as pd
import re
import json
from tqdm import tqdm

from llama_index.llms.openai import OpenAI

from dotenv import load_dotenv
import os
load_dotenv()

True

In [3]:
# Slang lookup table: each key is a slang token, each value its meaning
json_file_path = "dictionary/slang.json"

with open(json_file_path, mode="r", encoding="utf-8") as f:
    slang_dict = json.loads(f.read())

In [None]:
# Pool both saved splits, then keep every distinct comment text exactly once
train_val_df = pd.read_excel("backup_v2/train_val_df.xlsx")
test_df = pd.read_excel("backup_v2/test_df.xlsx")
df = pd.concat(objs=[train_val_df, test_df])
unique_text_df = pd.DataFrame({"text": df["text"].unique()})

In [5]:
# LLM client used for every extraction request (temperature 0, up to 20 retries)
myWorker = OpenAI(
    model="gpt-4o-mini",
    temperature=0,
    max_retries=20,
)

# Segmentation and Entity/Aspect, Opinion, and Emotion Extraction

In [73]:
def prompt_update(comment, slang_part):
    """Build the few-shot extraction prompt for a single comment.

    Args:
        comment: The comment text; inserted verbatim between double quotes
            on the "Here is the comment:" line.
        slang_part: Extra slang-hint text placed on the line after the
            comment; pass an empty string when there are no hints.

    Returns:
        The full prompt string: objective, rules, output format, five worked
        examples, then the comment and the slang hints.
    """
    # Doubled braces are literal JSON braces; only {comment} and {slang_part} are substituted.
    return f"""# Objective: You are an assistant that processes long user comments. Your task is to:

# Rules:
1. Split the comment into short, coherent segments.
2. For each segment, extract all (aspect, opinion, emotion) triplets.
3. Use ONLY the following emotion labels: ["NEUTRAL", "SADNESS", "JOY", "ANGER", "DISGUST", "FEAR", "SURPRISE", "INVALID"]
4. If no valid or clear emotion is expressed, label the triplet with "INVALID".
5. When the tone is sarcastic, mocking, or ironic, infer the actual emotion conveyed.
6. Do not include explanatory text. Only return the JSON in the exact format shown below.

# Format:
[
  {{
    "segment": "...",
    "triplets": [
      {{"aspect": "...", "opinion": "...", "emotion": "..."}}
    ]
  }}
]

# Examples:

Comment: "> You don't take off your mask in malls either (only in restaurants). People are allowed to take off their masks to consume food and drink. Personally, I see many people eat their OCK and sip their BBT while walking around malls."
Response:
[
  {{
    "segment": "You don't take off your mask in malls either (only in restaurants). People are allowed to take off their masks to consume food and drink.",
    "triplets": [
      {{"aspect": "You", "opinion": "don't take off in malls", "emotion": "NEUTRAL"}},
      {{"aspect": "people", "opinion": "allowed to take off their masks to consume food", "emotion": "INVALID"}}
    ]
  }},
  {{
    "segment": "Personally, I see many people eat their OCK and sip their BBT while walking around malls.",
    "triplets": [
      {{"aspect": "people", "opinion": "eat their OCK", "emotion": "NEUTRAL"}},
      {{"aspect": "people", "opinion": "sip their BBT", "emotion": "NEUTRAL"}}
    ]
  }}
]

Comment: "> Fools still think the shot is helping. Awesome"
Response:
[
  {{
    "segment": "Fools still think the shot is helping. Awesome",
    "triplets": [
      {{"aspect": "shot", "opinion": "is helping", "emotion": "DISGUST"}}
    ]
  }}
]

Comment: "> Sure, the government *really* cares about us."
Response:
[
  {{
    "segment": "Sure, the government really cares about us.",
    "triplets": [
      {{"aspect": "government", "opinion": "really cares about us", "emotion": "DISGUST"}}
    ]
  }}
]

Comment: "> It was raining yesterday."
Response:
[
  {{
    "segment": "It was raining yesterday.",
    "triplets": [
      {{"aspect": "weather", "opinion": "raining", "emotion": "INVALID"}}
    ]
  }}
]

Comment: "Singapore's nightlife not bad eh, got some hidden gems."
Response:
[
  {{
    "segment": "Singapore's nightlife not bad eh, got some hidden gems.",
    "triplets": [
      {{"aspect": "Singapore's nightlife", "opinion": "not bad", "emotion": "JOY"}},
      {{"aspect": "Singapore's nightlife", "opinion": "hidden gems", "emotion": "JOY"}}
    ]
  }}
]

Here is the comment: "{comment}"
{slang_part}
"""

In [None]:
# Resume support: set start_index to the number of comments already processed
# to continue an interrupted run; leave it at 0 for a fresh run.
start_index = 0

if start_index == 0:
    text_list = []
elif len(text_list) < start_index:
    # Responses are later paired with unique_text_df by position, so a gap
    # here would attach every following response to the wrong comment.
    raise ValueError(
        f"Cannot resume at {start_index}: only {len(text_list)} responses are available"
    )
else:
    # Keep the responses gathered so far and redo everything from start_index on
    del text_list[start_index:]

pending_texts = unique_text_df["text"].iloc[start_index:]

for raw_text in tqdm(pending_texts, total=len(pending_texts)):
    row_text = raw_text.replace('\n\n', ' ')

    # Collect slang meanings, one hint per distinct slang word in order of
    # first appearance (a repeated word would otherwise repeat the same hint)
    seen_slangs = set()
    hint_lines = []
    for token in row_text.split():
        # Compare on the lower-cased token with punctuation stripped
        slang = re.sub(r"[^\w\s]", "", token.strip().lower())
        if slang in slang_dict and slang not in seen_slangs:
            seen_slangs.add(slang)
            hint_lines.append(
                f"\nAdditional information: The slang '{slang}' in the comment could mean '{slang_dict[slang]}'. Use this potential meaning to refine your response."
            )

    slang_part = "".join(hint_lines).replace('\n\n', '\n')
    prompt = prompt_update(row_text, slang_part)

    # Store the raw completion text; JSON parsing happens in a later step
    response = myWorker.complete(prompt).text
    text_list.append(response)

100%|██████████| 3574/3574 [3:53:40<00:00,  3.92s/it]  


## Saving the raw responses

In [90]:
# Write the raw model replies to disk as a JSON array of strings
folder_path = "backup_v4"
os.makedirs(folder_path, exist_ok=True)
json_file_path = os.path.join(folder_path, "all_data_raw_processed.json")

with open(json_file_path, "w") as file:
    json.dump(text_list, fp=file, indent=4)

# Convert the raw responses into a DataFrame

In [91]:
# Reload the saved raw responses from disk
with open("backup_v4/all_data_raw_processed.json", mode="r", encoding="utf-8") as f:
    loaded_text = json.loads(f.read())

In [104]:
processed_list = []
failed_comments = []  # comments whose response could not be turned into triplets

# Responses are paired with comments purely by position; zip() would hide a
# length mismatch by silently truncating to the shorter input.
if len(loaded_text) != len(unique_text_df):
    print(
        f"Warning: {len(loaded_text)} responses but {len(unique_text_df)} comments; "
        "only the common prefix is processed"
    )

for cleaned_replies, original in zip(loaded_text, unique_text_df.itertuples(index=False)):
    comment = original.text

    try:
        annotations = json.loads(cleaned_replies)
        # Build all rows for this comment before committing any, so a
        # half-valid response never contributes a partial set of triplets.
        comment_rows = [
            {
                "text": comment,                   # Full original comment
                "segment": annotation["segment"],  # The local segment
                "aspect": triplet["aspect"],
                "opinion": triplet["opinion"],
                "emotion": triplet["emotion"],
            }
            for annotation in annotations
            for triplet in annotation["triplets"]
        ]
    except (json.JSONDecodeError, KeyError, TypeError) as err:
        # Record the failure and carry on: stopping here would silently drop
        # every comment that comes after the first bad response.
        failed_comments.append({"text": comment, "error": repr(err)})
        continue

    processed_list.extend(comment_rows)

if failed_comments:
    print(f"Error parsing JSON for {len(failed_comments)} comment(s)")
    for failure in failed_comments:
        print(failure)


In [154]:
# One row per extracted (aspect, opinion, emotion) triplet
final_df = pd.DataFrame(data=processed_list)

# Clean emotions

In [None]:
# Maps every emotion label observed in the responses onto the final label set.
# "INVALID" and the off-list labels are folded into one of the allowed emotions.
emotion_mapping = {
    "INVALID": "NEUTRAL",
    "JOY": "JOY",
    "NEUTRAL": "NEUTRAL",
    "DISGUST": "DISGUST",
    "SADNESS": "SADNESS",
    "FEAR": "FEAR",
    "ANGRY": "ANGER",
    "ANGER": "ANGER",
    "SURPRISE": "SURPRISE",
    "CONFUSION": "SADNESS",
    "FRUSTRATION": "ANGER",
    "HOPE": "JOY",
    "SKEPTICISM": "NEUTRAL",
    "SKEPTICAL": "NEUTRAL"
}

# Normalise case and surrounding whitespace first, so variants such as "joy"
# or " JOY " are mapped instead of becoming NaN.
normalized_emotion = final_df["emotion"].map(
    lambda label: label.strip().upper() if isinstance(label, str) else label
)
mapped_emotion = normalized_emotion.map(emotion_mapping)

# Series.map turns labels missing from the dict into NaN; report them so they
# are not lost silently (value_counts() ignores NaN by default).
unmapped_labels = normalized_emotion[mapped_emotion.isna()]
if not unmapped_labels.empty:
    print(f"{len(unmapped_labels)} triplet(s) have an emotion outside emotion_mapping:")
    print(unmapped_labels.value_counts(dropna=False))

final_df["emotion"] = mapped_emotion

# Clean Opinions

In [157]:
# Opinion strings treated as placeholders rather than real opinions
# (presumably the model echoing the "INVALID" label; confirm against the data).
# Each entry is listed once; entries are compared as exact, case-sensitive strings.
invalid_opinion = [
    "not invalid",
    "invalid",
    "INVALID",
    "is invalid",
    "invalidated",
    "not invaliding",
    "are invalid",
    "is invalidating",
    "doesn't mean either point is invalid",
]

In [158]:
# Remove rows whose opinion is one of the placeholder strings, reporting the row count before and after
print(f"before dropping: {len(final_df)}")
is_placeholder = final_df["opinion"].isin(invalid_opinion)
final_df = final_df.loc[~is_placeholder]
print(f"after dropping: {len(final_df)}")

before dropping: 17292
after dropping: 17266


In [159]:
# Discard rows with a missing opinion, then rows whose opinion is only whitespace
final_df = final_df[final_df["opinion"].notna()]
final_df = final_df[final_df["opinion"].str.strip().ne("")]

# Clean unmatched triplets

In [160]:
def clean_unmatched_triplets(triplet_data):
    """Keep only the triplets whose aspect and opinion both occur in their segment.

    The comparison is a case-insensitive substring check. Rows whose segment,
    aspect or opinion is not a string (e.g. a JSON null or NaN) cannot be
    matched and are dropped instead of raising.

    Args:
        triplet_data: pandas DataFrame with at least the columns
            'segment', 'aspect' and 'opinion'.

    Returns:
        A list of the matching rows (pandas Series, each keeping its original
        index label); wrap it in ``pd.DataFrame`` to get a frame back.
    """
    cleaned = []

    for _, row in triplet_data.iterrows():
        fields = (row["segment"], row["aspect"], row["opinion"])

        # Non-string values have no .lower() and can never be found in the segment
        if not all(isinstance(value, str) for value in fields):
            continue

        segment, aspect, opinion = (value.lower() for value in fields)
        if aspect in segment and opinion in segment:
            cleaned.append(row)

    return cleaned

In [161]:
# Filter the triplets with clean_unmatched_triplets and rebuild a frame from the kept rows
cleaned_data = clean_unmatched_triplets(triplet_data=final_df)
final_df_cleaned = pd.DataFrame(data=cleaned_data)

In [162]:
# Share of each emotion label, followed by the number of remaining rows
print(final_df_cleaned["emotion"].value_counts(normalize=True))
print(len(final_df_cleaned.index))

emotion
NEUTRAL     0.337441
JOY         0.195355
SADNESS     0.182654
DISGUST     0.172796
ANGER       0.074408
FEAR        0.027393
SURPRISE    0.009953
Name: proportion, dtype: float64
10550


# Saving

In [None]:
# Export the cleaned triplets as a spreadsheet and as JSON records
final_df_cleaned.to_excel("backup_v4/final_df.xlsx", index=False)
final_df_cleaned.to_json(
    "backup_v4/final_df.json",
    orient="records",
    force_ascii=False,
    indent=2,
)

In [164]:
# Read the exported spreadsheet back and display it as the cell output
df_testing = pd.read_excel(io="backup_v4/final_df.xlsx")
df_testing

Unnamed: 0,text,segment,aspect,opinion,emotion
0,pei shan lin there won't be any complaint?,pei shan lin there won't be any complaint?,complaint,won't be any,NEUTRAL
1,breaking: singapore will repeal section 377a a...,breaking: singapore will repeal section 377a a...,section 377a,will repeal,JOY
2,breaking: singapore will repeal section 377a a...,breaking: singapore will repeal section 377a a...,sex between men,decriminalise,JOY
3,"Kama Dege That’s how propaganda works. Russia,...",Kama Dege That’s how propaganda works.,propaganda,works,NEUTRAL
4,"Kama Dege That’s how propaganda works. Russia,...",So no love from the world.,world,no love,SADNESS
...,...,...,...,...,...
10545,"sanip said if u abolish it, a whole lot more w...","sanip said if u abolish it, a whole lot more w...",abolish it,a whole lot more will come,NEUTRAL
10546,Koh Ah Chwee I do support them lifting for oth...,Koh Ah Chwee I do support them lifting for oth...,lifting,support them lifting for other issues,JOY
10547,Koh Ah Chwee I do support them lifting for oth...,And lifting doesn't automatically mean they ha...,PAP,got no excuse to hide,DISGUST
10548,Koh Ah Chwee I do support them lifting for oth...,So they are actually even more accountable now...,actions,have to explain their actions,NEUTRAL
