In [1]:
from datasets import load_dataset
import pandas as pd
import numpy as np
from tqdm import tqdm
from transformers import pipeline
import torch

In [2]:
def load_sst5(split="train", num_samples=5000):
    dataset = load_dataset("SetFit/sst5", "default", split=split)
    dataset = dataset.filter(lambda x: x["label_text"] is not None)
    dataset = dataset.shuffle(seed=42).select(range(min(num_samples, len(dataset))))

    texts = []
    labels = []
    for sample in dataset:
        text = sample["text"].strip().replace("\n", " ")
        label_text = sample["label_text"]
        full_text = f"Text: {text}\nLabel: {label_text}"
        texts.append(full_text)
        labels.append(label_text)
    return texts, labels

In [3]:
train_texts, train_labels = load_sst5()
test_texts, test_labels = load_sst5("test", 1000)

Repo card metadata block was not found. Setting CardData to empty.
Repo card metadata block was not found. Setting CardData to empty.


In [4]:
def organize_SST5_into_df(texts, labels):
    input_data = []
    for each_text in texts:
        sentiment_line = next(line for line in each_text.split('\n') if line.lower().startswith("text"))
        input_data.append(sentiment_line.lower().split('text:')[1].strip())
    df = pd.DataFrame({
        'text': input_data,
        'sentiment': labels
    })
    return df

In [5]:
df_test_1000 = organize_SST5_into_df(test_texts, test_labels)

In [6]:
from sentence_transformers import SentenceTransformer

def encode_text(text):
    model = SentenceTransformer('all-MiniLM-L6-v2')
    return model.encode(text)

from sklearn.metrics.pairwise import cosine_similarity

def select_top_k(query_embedding, candidate_embeddings, k):
    similarities = cosine_similarity([query_embedding], candidate_embeddings)[0]
    top_k_indices = np.argsort(similarities)[-k:][::-1]
    return top_k_indices

In [7]:
df_train_5000_with_embeddings = pd.read_pickle("train_5000_with_embeddings_SST5.pkl")

In [None]:
# from openai import OpenAI
# client = OpenAI(api_key="your_apikey")

def zero_shot_cot_gpt_4_o(input_):
    completion = client.chat.completions.create(
      model="gpt-4o-mini",
      messages=[
        {"role": "user", "content": "What is the sentiment of the input? negative, neutral, positive, very negative, very positive?\nThe response should follow the format: Sentiment:{negative, neutral, positive, very negative, very positive}\nReason:{reason}" + f"\nInput: {input_}" + "\nLet's think step by step."}
      ] 
    )
    return completion.choices[0].message.content

def few_shot_cot_gpt_4_o(examples, input_):
    completion = client.chat.completions.create(
      model="gpt-4o-mini",
      messages=[
        {"role": "user", "content": "What is the sentiment of the input? negative, neutral, positive, very negative, very positive?" + f"\n{examples}" +"\nThe response should follow the format: Sentiment:{negative, neutral, positive, very negative, very positive}\nReason:{reason}\nHere is the test data" + f"\nInput: {input_}" + "\nLet's think step by step."}
      ] 
    )
    return completion.choices[0].message.content

In [9]:
from collections import Counter

def iterative_demonstration_selection(test_sample, train_samples, k=4, q=2):

    train_embeddings = np.stack(train_samples['embedding'].to_numpy())
    all_answers = []
    reasoning_path = zero_shot_cot_gpt_4_o(test_sample)

    for _ in range(q):
        query_embedding = encode_text(reasoning_path)
        selected_indices = select_top_k(query_embedding, train_embeddings, k)
        demonstrations = [train_samples.iloc[i] for i in selected_indices]

        examples_prompt = "\n".join(
            [f"Input: {row['text']}\nTopic: {row['sentiment']}" for row in demonstrations]
        )

        result = few_shot_cot_gpt_4_o(examples_prompt, test_sample)

        try:
            sentiment_line = next(line for line in result.split('\n') if line.lower().startswith("sentiment"))
            reason_line = next(line for line in result.split('\n') if line.lower().startswith("reason"))
        except StopIteration:
            sentiment_line = "Sentiment: unknown"
            reason_line = "Reason: unknown"

        reasoning_path = reason_line
        answer = sentiment_line.split(":", 1)[-1].strip().lower()
        all_answers.append(answer)

    final_answer = Counter(all_answers).most_common(1)[0][0]
    return final_answer

In [10]:
def run_ids_on_test_set(df_test, df_train, k=4, q=3):
    predictions = []
    reasoning_paths = []

    for i, row in tqdm(df_test.iterrows(), total=len(df_test), desc="Running IDS on test set"):
        test_text = row['text']
        true_label = row['sentiment']
        
        # Run IDS
        try:
            final_answer = iterative_demonstration_selection(test_text, df_train, k=k, q=q)
        except Exception as e:
            final_answer = "error"
            print(f"Error on test sample {i}: {e}")

        predictions.append(final_answer)
        reasoning_paths.append(test_text)

    df_test_result = df_test.copy()
    df_test_result['prediction'] = predictions
    df_test_result['input'] = reasoning_paths

    return df_test_result

In [11]:
# import torch
# torch.cuda.empty_cache()

In [12]:
df_test_result = run_ids_on_test_set(df_test_1000, df_train_5000_with_embeddings, k=4, q=3)

Running IDS on test set: 100%|██████████| 1000/1000 [2:19:22<00:00,  8.36s/it] 


In [13]:
df_test_result

Unnamed: 0,text,sentiment,prediction,input
0,a 93-minute condensation of a 26-episode tv se...,negative,neutral,a 93-minute condensation of a 26-episode tv se...
1,the premise is overshadowed by the uberviolenc...,negative,very negative,the premise is overshadowed by the uberviolenc...
2,this is a startling film that gives you a fasc...,positive,neutral,this is a startling film that gives you a fasc...
3,but you 'll definitely want the t-shirt .,neutral,positive,but you 'll definitely want the t-shirt .
4,i tried to read the time on my watch .,negative,neutral,i tried to read the time on my watch .
...,...,...,...,...
995,focuses on joan 's raging hormones and sledgeh...,very negative,very negative,focuses on joan 's raging hormones and sledgeh...
996,polished korean political-action film is just ...,neutral,neutral,polished korean political-action film is just ...
997,-lrb- fessenden -rrb- is much more into ambigu...,positive,negative,-lrb- fessenden -rrb- is much more into ambigu...
998,lovingly choreographed bloodshed taking place ...,positive,negative,lovingly choreographed bloodshed taking place ...


In [14]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(df_test_result['sentiment'], df_test_result['prediction'])
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 52.60%


In [15]:
df_test_result[['input','sentiment','prediction']].to_csv("SST5_ids_results_gpt4o.csv", index=False)