In [1]:
import numpy as np
import pandas as pd
from datasets import load_dataset
from tqdm import tqdm
from transformers import pipeline
import torch

In [2]:
def load_commonsenseqa(split="train", num_samples=5000):
    """Load a shuffled subset of CommonsenseQA as prompts and gold answer texts.

    Args:
        split: Dataset split name passed to ``load_dataset``.
        num_samples: Maximum number of examples to keep after shuffling with
            seed 42. ``None`` keeps the whole split.

    Returns:
        A ``(texts, labels)`` tuple of equal-length lists. Each text is
        ``"Question: <question>\\n<label>. <choice>\\n..."`` (the answer is never
        included). Each label is the text of the correct choice, or ``None``
        when the example carries no usable answer key.
    """
    dataset = load_dataset("commonsense_qa", split=split)
    limit = len(dataset) if num_samples is None else min(num_samples, len(dataset))
    dataset = dataset.shuffle(seed=42).select(range(limit))

    texts = []
    labels = []

    for example in dataset:
        choice_texts = example['choices']['text']
        choice_labels = example['choices']['label']

        # Render the options as "A. foo\nB. bar\n..." under the question.
        choices_text = "\n".join(
            f"{label}. {text}" for label, text in zip(choice_labels, choice_texts)
        )
        texts.append(f"Question: {example['question']}\n{choices_text}")  # prompt only, no answer

        # The answerKey column is present in every split, so a bare
        # `"answerKey" in example` check is always true. Unlabeled examples carry
        # an empty key, which must map to None instead of crashing in .index().
        answer_key = example.get('answerKey')
        if answer_key in choice_labels:
            labels.append(choice_texts[choice_labels.index(answer_key)])
        else:
            labels.append(None)

    return texts, labels

In [3]:
# Build prompts and gold answer texts. The first call uses the function
# defaults (train split, at most 5000 examples); the second takes at most
# 1000 examples from the validation split, which serves as the test set here.
train_texts, train_labels = load_commonsenseqa()
test_texts, test_labels = load_commonsenseqa('validation', 1000)

In [4]:
def organize_CQA_into_df(texts, labels):
    """Build a DataFrame of lower-cased prompts and their labels.

    Each prompt is lower-cased and everything up to and including the first
    ``"question:"`` marker is removed. Only the first marker is consumed, so a
    question that itself contains the word "question:" is kept intact. A prompt
    without the marker is kept whole (lower-cased and stripped) instead of
    raising ``IndexError``.

    Args:
        texts: Iterable of prompt strings.
        labels: Labels aligned with ``texts``; stored unchanged.

    Returns:
        A DataFrame with columns ``text`` and ``label``.
    """
    input_data = []
    for each_text in texts:
        lowered = each_text.lower()
        _, marker, remainder = lowered.partition('question:')
        # partition() splits on the first occurrence only; an empty marker
        # means the prefix was absent, so fall back to the full text.
        input_data.append(remainder.strip() if marker else lowered.strip())

    return pd.DataFrame({
        'text': input_data,
        'label': labels
    })

In [5]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')


def encode_text(text):
    """Return the embedding of ``text`` produced by the notebook-level ``model``."""
    embedding = model.encode(text)
    return embedding

from sklearn.metrics.pairwise import cosine_similarity

def select_top_k(query_embedding, candidate_embeddings, k):
    """Return the indices of the ``k`` candidates most similar to the query.

    Args:
        query_embedding: 1-D embedding of the query.
        candidate_embeddings: 2-D array with one candidate embedding per row.
        k: Number of indices to return. ``0`` yields an empty array; values
            larger than the number of candidates return all of them.

    Returns:
        Integer array of row indices into ``candidate_embeddings``, ordered
        from most to least cosine-similar.

    Raises:
        ValueError: If ``k`` is negative.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")
    if k == 0:
        # A tail slice `[-0:]` is the same as `[0:]` and would return every
        # index, so the empty selection is handled explicitly.
        return np.array([], dtype=np.intp)

    similarities = cosine_similarity([query_embedding], candidate_embeddings)[0]
    # Reverse the ascending argsort, then truncate: same order as taking the
    # last k and reversing, but without the negative-zero slice pitfall.
    return np.argsort(similarities)[::-1][:k]

In [6]:
# Demonstration pool. The pickle is not created anywhere in this notebook, so
# it must already exist in the working directory. Downstream code reads its
# 'embedding', 'text' and 'label' columns.
# NOTE(review): presumably built from train_texts/train_labels with
# encode_text -- confirm, and only unpickle files from a trusted source
# (unpickling can execute arbitrary code).
df_train_5000_with_embeddings = pd.read_pickle("train_5000_with_embeddings_CQA.pkl")
# Test frame: lower-cased prompt text plus the gold answer text.
df_test_1000 = organize_CQA_into_df(test_texts, test_labels)

In [None]:
# The OpenAI client is resolved lazily by `_get_openai_client()`, so this cell
# works after Restart & Run All without a hand-edited, hard-coded API key:
# export OPENAI_API_KEY in the environment before running. A `client` defined
# earlier in the notebook is still honoured.

# Shared task instructions. The braces are literal placeholders shown to the
# model, so this must stay a plain (non-f) string.
_CQA_INSTRUCTIONS = (
    "Choose the most commonsense answer to the question below.\n"
    "You are given a question and five choices labeled A to E.\n"
    "The response should follow the format:\n"
    "Answer: {your answer}\n"
    "Reason:{your reasoning here}"
)
_COT_TRIGGER = "\nLet's think step by step."


def _get_openai_client():
    """Return the notebook-level ``client``, creating it on first use."""
    global client
    try:
        return client
    except NameError:
        # Imported here so that defining these helpers never requires the
        # package; OpenAI() picks up OPENAI_API_KEY from the environment.
        from openai import OpenAI
        client = OpenAI()
        return client


def _chat_once(prompt, model):
    """Send ``prompt`` as a single user message and return the reply text."""
    completion = _get_openai_client().chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    # The message content may be None; normalise to "" so callers can always
    # treat the result as a string.
    return completion.choices[0].message.content or ""


def zero_shot_cot_gpt_4_o(input_, model="gpt-4o-mini"):
    """Ask the model to answer ``input_`` with zero-shot chain-of-thought.

    Args:
        input_: Question text including its answer choices.
        model: Chat model name. Note that the default is ``gpt-4o-mini``
            despite the function name.

    Returns:
        The raw reply text (expected to contain ``Answer:`` and ``Reason:``
        lines), or ``""`` if the reply has no content.
    """
    prompt = _CQA_INSTRUCTIONS + f"\nInput: {input_}" + _COT_TRIGGER
    return _chat_once(prompt, model)


def few_shot_cot_gpt_4_o(examples, input_, model="gpt-4o-mini"):
    """Ask the model to answer ``input_`` after showing it demonstrations.

    Args:
        examples: Pre-formatted demonstration text inserted between the task
            instructions and the input.
        input_: Question text including its answer choices.
        model: Chat model name. Note that the default is ``gpt-4o-mini``
            despite the function name.

    Returns:
        The raw reply text, or ``""`` if the reply has no content.
    """
    prompt = _CQA_INSTRUCTIONS + f"\n\n{examples}" + f"\n\nInput: {input_}" + _COT_TRIGGER
    return _chat_once(prompt, model)

In [8]:
from collections import Counter

def iterative_demonstration_selection(test_sample, train_samples, k=4, q=2):
    """Answer one question with iterative demonstration selection and voting.

    A zero-shot reply provides the first retrieval query. Each of the ``q``
    rounds then embeds the current query, retrieves the ``k`` most similar
    training rows, asks the model again with those rows as demonstrations, and
    uses the reply's ``Reason`` line as the next query. The returned answer is
    the most frequent answer across rounds.

    Args:
        test_sample: Question text including its answer choices.
        train_samples: DataFrame with ``embedding``, ``text`` and ``label``
            columns.
        k: Demonstrations retrieved per round.
        q: Number of few-shot rounds; must be at least 1.

    Returns:
        The lower-cased majority answer, or ``"unknown"`` if no round produced
        a parsable ``Answer`` line.

    Raises:
        ValueError: If ``q`` is smaller than 1.
    """
    # Validate first so that a bad q fails before any model call is made
    # (previously q=0 ended in an IndexError on the empty vote).
    if q < 1:
        raise ValueError(f"q must be at least 1, got {q}")

    train_embeddings = np.stack(train_samples['embedding'].to_numpy())
    all_answers = []
    # Fall back to the question itself if the zero-shot reply is empty.
    reasoning_path = zero_shot_cot_gpt_4_o(test_sample) or test_sample

    for _ in range(q):
        query_embedding = encode_text(reasoning_path)
        selected_indices = select_top_k(query_embedding, train_embeddings, k)
        demonstrations = [train_samples.iloc[i] for i in selected_indices]

        # NOTE(review): demonstrations are labelled "Topic:" while the task
        # instructions ask for "Answer:" -- this looks like a leftover from
        # another task. Left unchanged because it alters the prompt (and thus
        # the reported results); confirm before changing.
        examples_prompt = "\n".join(
            f"Input: {row['text']}\nTopic: {row['label']}" for row in demonstrations
        )

        result = few_shot_cot_gpt_4_o(examples_prompt, test_sample)
        reply_lines = [line.strip() for line in (result or "").split('\n')]

        # Parse the two lines independently: a reply without a "Reason" line
        # must not throw away an answer that was found, and vice versa.
        answer_line = next(
            (line for line in reply_lines if line.lower().startswith("answer")), None
        )
        reason_line = next(
            (line for line in reply_lines if line.lower().startswith("reason")), None
        )

        # Keep the previous query when no reasoning was returned, instead of
        # retrieving demonstrations for a placeholder string.
        if reason_line is not None:
            reasoning_path = reason_line

        if answer_line is None:
            answer = "unknown"
        else:
            answer = answer_line.split(":", 1)[-1].strip().lower()
        all_answers.append(answer)

    # Unparsable rounds only decide the vote when every round was unparsable.
    parsed_answers = [answer for answer in all_answers if answer != "unknown"]
    final_answer = Counter(parsed_answers or all_answers).most_common(1)[0][0]
    return final_answer

In [9]:
def run_ids_on_test_set(df_test, df_train, k=4, q=3):
    """Run iterative demonstration selection on every row of ``df_test``.

    Args:
        df_test: DataFrame with a ``text`` column holding the questions. Other
            columns are carried through untouched.
        df_train: Demonstration pool passed to
            ``iterative_demonstration_selection``.
        k: Demonstrations retrieved per round.
        q: Number of few-shot rounds per question.

    Returns:
        A copy of ``df_test`` with two added columns: ``prediction`` (the
        predicted answer, or ``"error"`` if the sample raised) and ``input``
        (the question text that was sent, identical to ``text``).
    """
    predictions = []

    # Only the text is needed, so iterate over that column directly; this also
    # removes the previous unused lookup of row['label'].
    for i, test_text in tqdm(df_test['text'].items(), total=len(df_test), desc="Running IDS on test set"):
        try:
            final_answer = iterative_demonstration_selection(test_text, df_train, k=k, q=q)
        except Exception as e:
            # Deliberately best-effort: one failing sample must not abort a
            # long run. tqdm.write keeps the progress bar intact.
            final_answer = "error"
            tqdm.write(f"Error on test sample {i}: {e}")

        predictions.append(final_answer)

    df_test_result = df_test.copy()
    df_test_result['prediction'] = predictions
    df_test_result['input'] = df_test['text'].tolist()

    return df_test_result

In [10]:
# Full evaluation run: for each test question, one zero-shot call plus q=3
# few-shot calls, each round retrieving k=4 demonstrations. The recorded run
# below took about 1h42m for 1000 questions.
df_test_result = run_ids_on_test_set(df_test_1000, df_train_5000_with_embeddings, k=4, q=3)

Running IDS on test set: 100%|██████████| 1000/1000 [1:42:34<00:00,  6.15s/it] 


In [11]:
# Inspect the raw predictions next to the gold labels.
df_test_result

Unnamed: 0,text,label,prediction,input
0,if you are prone to postpone work what will yo...,hasten,b. hasten,if you are prone to postpone work what will yo...
1,what is a person who is good at sports conside...,talented,c. talented,what is a person who is good at sports conside...
2,where could you find hundreds of thousands of ...,city or town,d. city or town,where could you find hundreds of thousands of ...
3,why would you take a bus to work?\na. commute\...,commute,a. commute,why would you take a bus to work?\na. commute\...
4,where is there a telephone book in almost ever...,at hotel,a. at hotel,where is there a telephone book in almost ever...
...,...,...,...,...
995,you can do knitting to get the feeling of what...,relaxation,a. relaxation,you can do knitting to get the feeling of what...
996,where is a salt shaker most often kept?\na. cr...,table setting,b. table setting,where is a salt shaker most often kept?\na. cr...
997,john was not happy with his marriage. he and h...,unfortunate,d. unfortunate,john was not happy with his marriage. he and h...
998,where do people live?\na. apartment\nb. eat ca...,surface of earth,e. surface of earth,where do people live?\na. apartment\nb. eat ca...


In [12]:
def clean_prediction(pred):
    """Normalise a predicted answer to bare, lower-cased answer text.

    The prediction is stripped and lower-cased. If it starts with a choice
    letter ``a``-``e`` followed by a dot (``"b. hasten"`` or ``"b.hasten"``),
    that prefix is removed. Anything else is returned unchanged apart from the
    stripping and lower-casing.

    Args:
        pred: Raw prediction string.

    Returns:
        The cleaned prediction string.
    """
    pred = pred.strip().lower()
    if len(pred) > 2 and pred[1] == '.' and pred[0] in 'abcde':
        # Drop exactly the two-character "x." prefix and let strip() remove any
        # following whitespace; slicing from 3 would eat the first letter of
        # the answer when there is no space after the dot.
        return pred[2:].strip()
    return pred
# Add a column with the choice-letter prefix removed so predictions can be
# compared with the gold answer text.
df_test_result['prediction_clean'] = df_test_result['prediction'].apply(clean_prediction)

In [13]:
# Inspect the frame again, now including the prediction_clean column.
df_test_result

Unnamed: 0,text,label,prediction,input,prediction_clean
0,if you are prone to postpone work what will yo...,hasten,b. hasten,if you are prone to postpone work what will yo...,hasten
1,what is a person who is good at sports conside...,talented,c. talented,what is a person who is good at sports conside...,talented
2,where could you find hundreds of thousands of ...,city or town,d. city or town,where could you find hundreds of thousands of ...,city or town
3,why would you take a bus to work?\na. commute\...,commute,a. commute,why would you take a bus to work?\na. commute\...,commute
4,where is there a telephone book in almost ever...,at hotel,a. at hotel,where is there a telephone book in almost ever...,at hotel
...,...,...,...,...,...
995,you can do knitting to get the feeling of what...,relaxation,a. relaxation,you can do knitting to get the feeling of what...,relaxation
996,where is a salt shaker most often kept?\na. cr...,table setting,b. table setting,where is a salt shaker most often kept?\na. cr...,table setting
997,john was not happy with his marriage. he and h...,unfortunate,d. unfortunate,john was not happy with his marriage. he and h...,unfortunate
998,where do people live?\na. apartment\nb. eat ca...,surface of earth,e. surface of earth,where do people live?\na. apartment\nb. eat ca...,surface of earth


In [14]:
from sklearn.metrics import accuracy_score

# Predictions are stripped and lower-cased during parsing and cleaning, so the
# gold labels are normalised the same way before comparing. Otherwise a label
# containing capitals or padding could never be counted as correct.
gold_labels = df_test_result['label'].str.strip().str.lower()
accuracy = accuracy_score(gold_labels, df_test_result['prediction_clean'])
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 82.90%


In [15]:
# Persist the question text, gold label and cleaned prediction (no index
# column). The file name says "gpt4o", but the chat helpers in this notebook
# request the "gpt-4o-mini" model.
df_test_result[['input','label','prediction_clean']].to_csv("CQA_ids_results_gpt4o.csv", index=False)