In [1]:
import pandas as pd
import numpy as np

In [2]:
# Target emotion labels. Their order fixes the position of each 0/1 flag in the
# per-tweet prediction vectors built further down in this notebook.
emotions = ['anger', 'fear', 'joy', 'sadness', 'surprise']

In [None]:
import os

# Read the OpenAI key from the environment instead of pasting it into the
# notebook, so the secret never ends up in the saved file or its outputs.
# Falls back to the previous empty-string placeholder when the variable is unset.
api_key = os.environ.get("OPENAI_API_KEY", "")

In [4]:
# Load the evaluation set.
test_df = pd.read_csv('test1.csv')

# Tweet texts in row order. Direct column access replaces a row-by-row
# iterrows() loop: same values, same order, without constructing a Series
# object for every row.
test_text = test_df['text'].tolist()

In [5]:
from openai import OpenAI
# Shared OpenAI client, constructed with the notebook-level `api_key`.
client = OpenAI(api_key=api_key)

In [6]:
import pandas as pd
import json
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter
from langchain.docstore.document import Document

# Loaded FAISS stores keyed by index directory, so the index is read from disk
# once per kernel session instead of once per retrieve_similar() call.
_VECTOR_DB_CACHE = {}


def _load_vector_db(index_path="faiss_index"):
    """Return the FAISS store saved at `index_path`, loading it on first use."""
    if index_path not in _VECTOR_DB_CACHE:
        embeddings = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_key=api_key)
        # NOTE(security): allow_dangerous_deserialization=True lets the loader
        # unpickle the stored index. Only point this at index directories that
        # were produced by a trusted source.
        _VECTOR_DB_CACHE[index_path] = FAISS.load_local(
            index_path, embeddings, allow_dangerous_deserialization=True
        )
    return _VECTOR_DB_CACHE[index_path]


def retrieve_similar(query_text, k=5, vector_db=None):
    """Look up the `k` stored examples most similar to `query_text`.

    Args:
        query_text: Text to search the vector store with.
        k: Number of neighbours to return.
        vector_db: Optional store exposing `similarity_search(text, k=...)`.
            When omitted, the "faiss_index" store is loaded once and reused on
            later calls (the `api_key` in effect at first load is the one used).

    Returns:
        A list of `(page_content, label, reasoning)` tuples, one per hit, in the
        order returned by the store.

    Raises:
        KeyError: If a hit's metadata lacks 'label' or 'reasoning'.
    """
    if vector_db is None:
        vector_db = _load_vector_db()
    results = vector_db.similarity_search(query_text, k=k)
    return [
        (doc.page_content, doc.metadata['label'], doc.metadata['reasoning'])
        for doc in results
    ]

In [17]:
# Query the chat model once per test tweet and keep the raw completion objects.
# No parsing happens in this cell; the bracketed label list is extracted from
# the stored responses in later cells.
import ast
import re

# Maps the tweet's position in test_text to the completion returned for it.
preds = {}
for idx, p in enumerate(test_text):
    # Coarse progress indicator: prints the position every 200 tweets.
    if idx % 200 == 0:
        print(idx)
        
    # retrieve_similar() yields (text, label, reasoning) tuples; only the
    # reasoning strings (element 2) are inserted into the prompt as examples.
    example = retrieve_similar(p)
    reasoning = '\n'.join(i[2] for i in example)
    
    # The prompt text below is sent to the model verbatim, including its
    # leading indentation and blank lines.
    prompt2 = f"""    
    Task: Classify tweets according to the emotions likely experienced by the author while writing them.
    
    Available Emotions:
    - Joy
    - Fear
    - Anger
    - Sadness
    - Surprise
    
    Classification Rules:
    1. Focus on the author's emotional state, not your reaction as a reader
    2. A tweet can contain multiple emotions simultaneously
    3. Consider both explicit emotional words and implicit context
    4. Identify emotional indicators only when they are strongly supported by explicit or contextually clear evidence.
    5. The only emotions you are categorizing are 'anger', 'fear', 'joy', 'sadness', and 'surprise'
    6. Slight emotions do count. However, make sure to add your reasoning to why you think it is present.
    7. The dataset is classified by 3 different humans. They may have biases. Use the examples to assist you in predicting what THE HUMANS LABELERS would classify the emotions as.

    Thinking Process:
    1. Go through every emotion and think step by step if the emotion is in the tweet.
    2. Think of every possible tone or context the author has.

    After making initial analysis, explicitly ask:
    1. "Am I confident in this conclusion?"
    2. "What evidence supports or contradicts this?"
    3. "What assumptions am I making?"
    4. "Would a reasonable person with no background context agree that this emotion is clearly present based solely on the tweet?"
        
    After asking those questions, make your final prediction.
    
    Output Format:
    For each tweet, provide:
    1. A clear explanation of why each emotion was identified, referencing specific words or phrases from the tweet and other reasoning from the guidelines.
    2. The emotions detected [in brackets], with quotes around each emotion. Like this: ['joy', 'sadness']
    
    Examples:
    {reasoning}

    Please classify the following tweet:
    {p}

    """


    # Single-turn request with the whole prompt as one user message.
    # NOTE(review): store=True presumably asks the API to retain the completion
    # on the provider side — confirm against the OpenAI API reference.
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        store=True,
        messages=[
            {"role": "user", "content": prompt2}
        ]
    )
    # Keep the whole completion object rather than parsing it here.
    preds.update({idx: completion})

0
200
400
600
800
1000
1200


In [70]:
# Build preds_list from `preds` here: it is not assigned anywhere else in the
# visible notebook, so this cell would raise NameError on a fresh kernel.
# Sorting the keys keeps the completions in test_text order.
preds_list = [preds[idx] for idx in sorted(preds)]

# Raw text of the first choice of every completion, in tweet order.
q = [completion.choices[0].message.content for completion in preds_list]

In [59]:
# Sanity check on the first response only: show every bracketed span the regex
# finds, then the Python list parsed from the first of those spans.
for i in q[:1]:
    spans = re.findall(r'\[.*?\]', i)
    print(spans)
    k = ast.literal_eval(spans[0])
    print(k)

["['fear', 'anger']"]
['fear', 'anger']


In [75]:
# NOTE(review): `q` is built from preds_list, which is presumably
# list(preds.values()) — confirm.

def _parse_emotions(response_text):
    """Extract the predicted emotion labels from one model response.

    Scans the bracketed spans in order and returns the first one that parses as
    a list/tuple of strings, with every label stripped and lower-cased so that
    'Joy' and ' joy' both match the entries of `emotions`.

    Returns:
        A list of label strings, or None when the response is not a string or
        contains no parsable label list (for example only '[in brackets]').
    """
    if not isinstance(response_text, str):
        return None
    for span in re.findall(r'\[.*?\]', response_text):
        try:
            value = ast.literal_eval(span)
        except (ValueError, SyntaxError, TypeError):
            # Not a Python literal (e.g. prose in brackets); try the next span.
            continue
        if isinstance(value, (list, tuple)) and all(isinstance(v, str) for v in value):
            return [v.strip().lower() for v in value]
    return None


parsed = [_parse_emotions(text) for text in q]

# Report unparsable responses instead of letting one of them abort the cell.
unparsed = [pos for pos, labels in enumerate(parsed) if labels is None]
if unparsed:
    print(f"{len(unparsed)} response(s) had no parsable emotion list; "
          f"treated as no emotions. First positions: {unparsed[:10]}")

cleaned = [labels if labels is not None else [] for labels in parsed]

# One row per tweet, one 0/1 flag per entry of `emotions`, in that order.
binary_preds = [
    [1 if emotion in labels else 0 for emotion in emotions]
    for labels in cleaned
]

In [71]:
# ast.literal_eval(re.findall(r'\[.*?\]', q[1])[0])

['fear', 'sadness']

In [76]:
# Spot-check: flags for the second tweet, in the order of `emotions`.
binary_preds[1]

[0, 1, 0, 1, 0]

In [77]:
# Column headers are derived from `emotions` so they always line up with the
# order of the flags inside each binary_preds row
# (gives 'Anger', 'Fear', 'Joy', 'Sadness', 'Surprise' for the current list).
emotion_columns = [emotion.capitalize() for emotion in emotions]

# Fail loudly on a mismatch rather than mislabelling or dropping rows.
if len(binary_preds) != len(test_df):
    raise ValueError(
        f"Expected {len(test_df)} predictions, got {len(binary_preds)}"
    )

# Pair ids and predictions by position. This does not rely on test_df having a
# default 0..n-1 index and does not shadow the builtin `id`.
qk = [
    [tweet_id] + list(flags)
    for tweet_id, flags in zip(test_df['id'], binary_preds)
]

output_df = pd.DataFrame(qk, columns=['id'] + emotion_columns)


In [78]:
# Preview the submission frame: `id` plus one 0/1 column per emotion.
output_df

Unnamed: 0,id,Anger,Fear,Joy,Sadness,Surprise
0,eng_test_track_a_00001,1,1,0,0,0
1,eng_test_track_a_00002,0,1,0,1,0
2,eng_test_track_a_00003,1,0,0,0,0
3,eng_test_track_a_00004,0,0,0,1,0
4,eng_test_track_a_00005,0,0,1,1,0
...,...,...,...,...,...,...
1345,eng_test_track_a_01346,0,1,0,0,0
1346,eng_test_track_a_01347,0,1,0,0,0
1347,eng_test_track_a_01348,0,0,0,1,0
1348,eng_test_track_a_01349,0,0,1,0,0


In [79]:
# Write the predictions to disk; index=False keeps the row index out of the CSV.
output_df.to_csv('1_results.csv', index=False)