In [None]:
# llama CSQA

import numpy as np
import torch
import re
from tqdm import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class QLearningAgent:
    """Tabular Q-learning store keyed by ``(state, action)`` pairs.

    ``actions`` is empty after construction and has to be assigned by the
    caller. Pairs that were never updated have an implicit value of ``0.0``.
    """

    def __init__(self, alpha=0.1, gamma=0.9, epsilon=0.1):
        # Learning rate, discount factor and exploration probability.
        self.alpha, self.gamma, self.epsilon = alpha, gamma, epsilon
        self.actions = []
        self.q_table = {}

    def get_q_value(self, state, action):
        """Return the stored value for the pair, or ``0.0`` when unseen."""
        key = (state, action)
        return self.q_table.get(key, 0.0)

    def update_q_value(self, state, action, reward, next_state):
        """Apply one temporal-difference update to ``Q(state, action)``."""
        # Best value reachable from the next state; 0 when no actions exist.
        best_next = max(
            (self.get_q_value(next_state, candidate) for candidate in self.actions),
            default=0,
        )
        key = (state, action)
        current = self.q_table.get(key, 0.0)
        self.q_table[key] = current + self.alpha * (reward + self.gamma * best_next - current)

class RDESelector:
    """Pick in-context demonstrations with TF-IDF retrieval refined by Q-learning.

    The state for a question is the tuple of its ``k`` most TF-IDF-similar
    knowledge-base indices. An action is the index of one knowledge-base
    example that is forced into the demonstration set, replacing the
    lowest-ranked retrieved example. The reward is 1 when the language model
    answers the question correctly with that set, else 0.
    """

    # Explicit final answer such as "Answer: C" or "the answer is (B)". The
    # keyword is case-insensitive, the letter is not, so the article "a" in
    # "the answer is a bank" is not mistaken for option A.
    _EXPLICIT_ANSWER = re.compile(r'(?i:answer)(?i:\s+is)?[\s:*(\-]*([A-E])\b')
    # Any standalone upper-case option letter.
    _STANDALONE_LETTER = re.compile(r'\b([A-E])\b')

    def __init__(self, knowledge_base, model, tokenizer, pipeline, k=5):
        """Index the knowledge base and create the Q-learning agent.

        Args:
            knowledge_base: Sequence of examples with ``question``,
                ``choices`` (``label``/``text`` lists) and ``answerKey``.
            model: The loaded language model (stored, not called directly).
            tokenizer: Tokenizer providing ``apply_chat_template`` and
                ``eos_token_id``.
            pipeline: Callable text-generation pipeline built on ``model``.
            k: Number of demonstrations placed in each prompt.
        """
        self.knowledge_base = knowledge_base
        self.model = model
        self.tokenizer = tokenizer
        self.k = k
        self.vectorizer = TfidfVectorizer()
        self.tfidf_matrix = self.vectorizer.fit_transform([ex['question'] for ex in knowledge_base])
        self.agent = QLearningAgent()
        self.agent.actions = list(range(len(knowledge_base)))
        self.pipe = pipeline

    def _predict_answer(self, prompt):
        """Generate a completion and extract the chosen option letter.

        Only the newly generated text is inspected; the prompt itself contains
        option letters (and "A-E" in the system message) that would otherwise
        be matched first.

        Returns:
            One of ``'A'``-``'E'``, or ``None`` when generation fails or no
            option letter can be found.
        """
        try:
            generated = self.pipe(
                prompt,
                max_new_tokens=50,
                pad_token_id=self.tokenizer.eos_token_id,
                return_full_text=False,
            )[0]['generated_text']
        except Exception as e:
            # Best effort: a failed generation counts as "no answer".
            print(f"Prediction error: {e}")
            return None

        # Prefer the last explicit "answer ..." statement, since step-by-step
        # reasoning may mention other options before the final answer.
        explicit = self._EXPLICIT_ANSWER.findall(generated)
        if explicit:
            return explicit[-1]

        fallback = self._STANDALONE_LETTER.search(generated)
        return fallback.group(1) if fallback else None

    def _merge_selection(self, state_indices, action_idx):
        """Return at most ``k`` indices that are guaranteed to contain the action.

        The best-ranked ``k - 1`` retrieved indices are kept and the action is
        appended, so the agent's choice is never truncated away.
        """
        if self.k <= 0:
            return ()
        kept = tuple(i for i in state_indices if i != action_idx)[:self.k - 1]
        return kept + (action_idx,)

    def _retrieve_state(self, input_text):
        """Return the indices of the ``k`` most TF-IDF-similar questions as a tuple."""
        input_vec = self.vectorizer.transform([input_text])
        similarities = cosine_similarity(input_vec, self.tfidf_matrix).flatten()
        sorted_indices = np.argsort(similarities)[::-1]
        return tuple(int(i) for i in sorted_indices[:self.k])

    def _get_reward(self, state_indices, action_idx, current_example):
        """Return 1 if the model answers correctly with the merged demo set, else 0."""
        selected_indices = self._merge_selection(tuple(state_indices), action_idx)
        selected = [self.knowledge_base[i] for i in selected_indices]
        prompt = self._create_prompt(selected, current_example)
        prediction = self._predict_answer(prompt)
        return 1 if prediction == current_example['answerKey'] else 0

    def _create_prompt(self, demos, current_q=None):
        """Render demonstrations (and optionally a target question) as a chat prompt.

        When a target question is given, the generation prompt is appended so
        the model starts a new assistant turn instead of continuing the user
        message.
        """
        messages = [{
            "role": "system",
            "content": "Answer commonsense questions by selecting the correct option (A-E). First think step-by-step, then provide your final answer."
        }]

        for d in demos:
            q_text = f"{d['question']}\nOptions:\n" + "\n".join(
                [f"{label}: {text}" for label, text in zip(d['choices']['label'], d['choices']['text'])]
            )
            messages.append({"role": "user", "content": q_text})
            messages.append({"role": "assistant", "content": f"Answer: {d['answerKey']}"})

        if current_q:
            target_q = f"{current_q['question']}\nOptions:\n" + "\n".join(
                [f"{label}: {text}" for label, text in zip(current_q['choices']['label'], current_q['choices']['text'])]
            )
            messages.append({"role": "user", "content": target_q})

        return self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=bool(current_q),
        )

    def train(self, train_data, num_episodes=100):
        """Run epsilon-greedy Q-learning over ``train_data`` for ``num_episodes`` passes."""
        progress_bar = tqdm(range(num_episodes), desc="Training")
        num_examples = len(train_data)
        for episode in progress_bar:
            total_reward = 0
            for ex in train_data:
                state = self._retrieve_state(ex['question'])
                action = self._choose_action(state)
                reward = self._get_reward(state, action, ex)
                # The successor state is the demonstration set actually used.
                next_state = self._merge_selection(state, action)
                self.agent.update_q_value(state, action, reward, next_state)
                total_reward += reward

            progress_bar.set_postfix({
                "Episode": episode+1,
                "Avg Reward": total_reward / num_examples if num_examples else 0.0
            })

    def _choose_action(self, state, explore=True):
        """Choose a knowledge-base index for ``state``.

        Args:
            state: Tuple of retrieved indices.
            explore: When true, take a uniformly random action with
                probability ``epsilon``; pass ``False`` at inference time.

        Returns:
            The action with the highest learned value. When nothing positive
            has been learned for this state, the last retrieved index is
            returned, which leaves the plain retrieval result unchanged.
        """
        actions = self.agent.actions
        if explore and np.random.random() < self.agent.epsilon:
            return int(np.random.choice(actions))
        best = max(actions, key=lambda a: self.agent.get_q_value(state, a))
        if self.agent.get_q_value(state, best) <= 0 and len(state) > 0:
            return state[-1]
        return best

    def select_demos(self, input_text):
        """Return up to ``k`` demonstrations for ``input_text`` without exploring."""
        state = self._retrieve_state(input_text)
        action = self._choose_action(state, explore=False)
        selected_indices = self._merge_selection(state, action)
        return [self.knowledge_base[i] for i in selected_indices]

# Load the instruction-tuned Llama checkpoint and its tokenizer.
model_id = "meta-llama/Llama-3.2-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse EOS as the padding token and pad on the left.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=50,
    temperature=0.7,
    device_map="auto",
    pad_token_id=tokenizer.eos_token_id
)


csqa = load_dataset('tau/commonsense_qa')
print("Sample validation example:", csqa['validation'][0])

# Knowledge base of demonstrations built from the training split.
# To subsample, iterate over csqa['train'].select(range(5000)) instead.
# The closing bracket used to sit inside the trailing comment, which made the
# whole cell a SyntaxError.
train_data = [{
    'question': ex['question'],
    'choices': ex['choices'],
    'answerKey': ex['answerKey']
} for ex in csqa['train']]

selector = RDESelector(train_data, model, tokenizer, pipe, k=5)

# Q-learning runs on the first 400 validation rows.
selector.train(csqa['validation'].select(range(400)), num_episodes=7)

def evaluate(selector, test_data):
    """Return the selector's accuracy over the examples that carry an answer key.

    Args:
        selector: Object providing ``select_demos``, ``_create_prompt`` and
            ``_predict_answer``.
        test_data: Iterable of examples; entries without ``answerKey`` are
            skipped and do not count towards the accuracy.

    Returns:
        Fraction of evaluated examples answered correctly, or ``0.0`` when
        nothing was evaluated.
    """
    correct = 0
    evaluated = 0
    progress_bar = tqdm(test_data, desc="Evaluating")
    for ex in progress_bar:
        if 'answerKey' not in ex:
            continue

        demos = selector.select_demos(ex['question'])
        prompt = selector._create_prompt(demos, ex)
        prediction = selector._predict_answer(prompt)

        # Keep an explicit counter: tqdm refreshes ``progress_bar.n`` only
        # periodically, so it is not a reliable running denominator.
        evaluated += 1
        if prediction and prediction == ex['answerKey']:
            correct += 1

        progress_bar.set_postfix({
            "Current Accuracy": f"{correct/evaluated:.2f}",
            "Prediction": prediction or 'None',
            "Correct": ex.get('answerKey', '?')
        })

    return correct / evaluated if evaluated else 0.0


def verify_answers(dataset_split):
    """Keep only the examples whose ``answerKey`` is one of the labels A-E."""
    valid_labels = ('A', 'B', 'C', 'D', 'E')
    kept = []
    for example in dataset_split:
        if example['answerKey'] in valid_labels:
            kept.append(example)
    return kept

# Rows 0-399 of the validation split were consumed by selector.train(...), so
# score only the remaining rows to keep training examples out of the test set.
# (The closing parenthesis of this call used to sit inside a trailing comment,
# which made the cell a SyntaxError.)
held_out = csqa['validation'].select(range(400, len(csqa['validation'])))
test_samples = verify_answers(held_out)
accuracy = evaluate(selector, test_samples)
print(f"Final Test Accuracy: {accuracy:.2f}")


In [None]:
# GPT2 CSQA

import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from datasets import load_dataset
from typing import List, Dict, Tuple

class QLearningAgent:
    """Tabular Q-learning agent storing one ``{action: value}`` row per state."""

    def __init__(self, alpha: float = 0.1, gamma: float = 0.9, epsilon: float = 0.1):
        # Learning rate, discount factor and exploration probability.
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        # state -> {action: value}; rows are created on first update.
        self.q_table = {}
        # state -> the action list that was supplied when its row was created.
        self.valid_actions_map = {}

    def get_action(self, state: str, valid_actions: List[int]) -> int:
        """Pick an action epsilon-greedily; unknown states always get a random one."""
        explore = np.random.random() < self.epsilon
        if explore or state not in self.q_table:
            return int(np.random.choice(valid_actions))
        action_values = self.q_table[state]
        return max(action_values, key=action_values.get)

    def update_q_table(self, state: str, action: int, reward: float,
                      next_state: str, valid_actions: List[int]):
        """Blend the old value of ``(state, action)`` with the new TD target."""
        if state not in self.q_table:
            self.q_table[state] = dict.fromkeys(valid_actions, 0.0)
            self.valid_actions_map[state] = valid_actions

        row = self.q_table[state]
        previous = row.get(action, 0)
        if next_state in self.q_table:
            best_next = max(self.q_table[next_state].values())
        else:
            best_next = 0
        row[action] = (1 - self.alpha) * previous + self.alpha * (reward + self.gamma * best_next)

class Environment:
    """GPT-2 backed environment that scores demonstration sets on multiple-choice items."""

    def __init__(self, knowledge_base: List[Dict]):
        """Load GPT-2 and keep a reference to the demonstration knowledge base."""
        self.kb = knowledge_base
        self.vectorizer = TfidfVectorizer()
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.tokenizer.padding_side = "left"
        # Truncate from the left so an over-long prompt loses its earliest
        # demonstrations instead of the target question and the final "A:".
        self.tokenizer.truncation_side = "left"
        self.model = GPT2LMHeadModel.from_pretrained('gpt2')
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        self.model.resize_token_embeddings(len(self.tokenizer))

    def calculate_diversity(self, selected_demos: List[Dict]) -> float:
        """Return the share of distinct answer keys among the demos (0.0 if empty)."""
        answers = [demo['answerKey'] for demo in selected_demos]
        return len(set(answers)) / len(answers) if len(selected_demos) > 0 else 0.0

    def get_reward(self, input_text: Dict, selected_demos: List[Dict]) -> float:
        """Return 1.0 when the model's one-token answer equals ``answerKey``, else 0.0."""
        prompt = self._construct_prompt(input_text, selected_demos)
        prediction = self._llm_predictor(prompt, input_text)
        return 1.0 if prediction == input_text['answerKey'] else 0.0

    @staticmethod
    def _format_choices(choices) -> str:
        """Render answer options one per line.

        A mapping with parallel ``label`` and ``text`` lists is rendered as
        ``(A) text`` lines, so the options cannot be confused with the
        ``A:`` answer prefix. Any other mapping is rendered as ``key: value``
        lines.
        """
        if 'label' in choices and 'text' in choices:
            return "\n".join(
                f"({label}) {text}"
                for label, text in zip(choices['label'], choices['text'])
            )
        return "\n".join(f"{k}: {v}" for k, v in choices.items())

    def _construct_prompt(self, input_text: Dict, demos: List[Dict]) -> str:
        """Build a few-shot ``Q:``/``A:`` prompt that ends with an open ``A:``."""
        parts = []
        for demo in demos:
            parts.append(f"Q: {demo['question']}\n")
            parts.append(self._format_choices(demo['choices']))
            parts.append(f"\nA: {demo['answerKey']}\n\n")
        parts.append(f"Q: {input_text['question']}\n")
        parts.append(self._format_choices(input_text['choices']))
        parts.append("\nA:")
        return "".join(parts)

    def _llm_predictor(self, prompt: str, input_text: Dict) -> str:
        """Generate one token after ``prompt`` and return it stripped of whitespace.

        ``input_text`` is accepted for interface compatibility and is not used.
        """
        inputs = self.tokenizer(
            prompt,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512
        ).to(self.device)

        outputs = self.model.generate(
            inputs.input_ids,
            # Pass the mask explicitly because the pad token equals EOS.
            attention_mask=inputs.attention_mask,
            max_new_tokens=1,
            pad_token_id=self.tokenizer.eos_token_id,
            return_dict_in_generate=True
        )

        answer_token = self.tokenizer.decode(outputs.sequences[0, -1])
        return answer_token.strip()

class RDESEvaluator:
    """Train a Q-learning agent and evaluate retrieval-based demonstration selection.

    Note that ``select_demonstrations`` and ``evaluate`` rely on TF-IDF
    retrieval and answer diversity only; they do not consult the agent's
    Q-table.
    """

    def __init__(self, agent: "QLearningAgent", env: "Environment"):
        self.agent = agent
        self.env = env
        # TF-IDF matrix of the knowledge-base questions, fitted on first use.
        self._kb_matrix = None

    def _kb_tfidf(self):
        """Fit the vectorizer on the knowledge base once and cache the matrix.

        The cache is not invalidated if ``env.kb`` is replaced afterwards.
        """
        if self._kb_matrix is None:
            self._kb_matrix = self.env.vectorizer.fit_transform(
                [d['question'] for d in self.env.kb]
            )
        return self._kb_matrix

    def _diversify(self, selected: List[Dict], candidate_indices,
                   min_diversity: float) -> Tuple[List[Dict], float]:
        """Swap duplicated-answer demos for candidates carrying unseen answers.

        Candidates are visited in the given order. Each accepted candidate
        replaces the lowest-ranked selected demo whose answer key occurs more
        than once, so the set size stays constant. Stops once the diversity
        reaches ``min_diversity``, when every answer is already unique, or
        when the candidates run out.
        """
        selected = list(selected)
        diversity = self.env.calculate_diversity(selected)
        for idx in candidate_indices:
            if diversity >= min_diversity:
                break
            candidate = self.env.kb[idx]
            answers = [d['answerKey'] for d in selected]
            if candidate['answerKey'] in answers:
                continue
            for pos in range(len(selected) - 1, -1, -1):
                if answers.count(answers[pos]) > 1:
                    selected[pos] = candidate
                    break
            else:
                # No duplicated answer left to replace.
                break
            diversity = self.env.calculate_diversity(selected)
        return selected, diversity

    def select_demonstrations(self, input_text: Dict, k: int,
                              min_diversity: float = 0.6) -> Tuple[List[Dict], float]:
        """Return the ``k`` most similar demos, rebalanced for answer diversity.

        Args:
            input_text: Example whose ``question`` is used as the query.
            k: Number of demonstrations to return.
            min_diversity: Target share of distinct answer keys in the set.

        Returns:
            ``(demonstrations, diversity)``.
        """
        kb_matrix = self._kb_tfidf()
        query_vec = self.env.vectorizer.transform([input_text['question']])
        similarities = cosine_similarity(query_vec, kb_matrix)[0]

        # Knowledge-base indices ordered from most to least similar.
        ranked = [int(i) for i in np.argsort(similarities)[::-1]]
        selected = [self.env.kb[i] for i in ranked[:k]]

        diversity = self.env.calculate_diversity(selected)
        if diversity < min_diversity:
            selected, diversity = self._diversify(selected, ranked[k:], min_diversity)

        return selected, diversity

    def train(self, episodes: int, demo_set_size: int = 5):
        """Run Q-learning episodes whose reward is zero-shot correctness on KB items.

        The language model is only queried, never optimised, so it is kept in
        evaluation mode; training mode would enable dropout and make the
        rewards noisy.
        """
        self.env.model.eval()
        valid_actions = list(range(len(self.env.kb)))
        for episode in range(episodes):
            state = str(np.random.choice(list(self.agent.q_table.keys()))) if self.agent.q_table else "init"
            for _ in range(demo_set_size):
                action = self.agent.get_action(state, valid_actions)

                next_state = f"state_{action}"
                reward = self.env.get_reward(self.env.kb[int(action)], [])

                self.agent.update_q_table(state, action, reward, next_state, valid_actions)
                state = next_state

    def evaluate(self, test_set: List[Dict], demo_set_size: int = 5) -> float:
        """Return the accuracy over ``test_set`` (``0.0`` for an empty set)."""
        total, correct = 0, 0
        for example in test_set:
            selected_demos, _ = self.select_demonstrations(example, demo_set_size)
            correct += self.env.get_reward(example, selected_demos)
            total += 1
        return correct / total if total else 0.0

dataset = load_dataset("tau/commonsense_qa")

# Shuffle both splits with a fixed seed; chain .select(range(n)) onto either
# one to work with a smaller subset.
train_data = dataset['train'].shuffle(seed=42)
print("train data size: ", train_data.shape)
test_data = dataset['validation'].shuffle(seed=42)
print("test data size: ", test_data.shape)

# The training split serves as the demonstration knowledge base.
agent = QLearningAgent()
env = Environment(train_data)
evaluator = RDESEvaluator(agent=agent, env=env)

evaluator.train(episodes=7)
accuracy = evaluator.evaluate(test_data)
print(f"RDES Evaluation Accuracy: {accuracy:.2f}")


In [None]:
# Gemma CSQA

import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, pipeline
import torch
from sentence_transformers import SentenceTransformer

class RDESelector:
    """Pick demonstrations one at a time with Q-learning over sentence embeddings.

    The reward of a partial selection is the mean of its answer-label entropy
    (normalised by ``log(num_classes)``) and the mean dot product between the
    selected demo embeddings and the input embedding.
    """

    def __init__(self, demo_pool, num_classes, q_table=None):
        """Embed every question in ``demo_pool``.

        Args:
            demo_pool: Indexable collection of examples with ``question`` and
                ``answerKey`` (a letter starting at ``'A'``).
            num_classes: Number of distinct answer labels.
            q_table: Optional existing Q-table to continue from; it is used
                as-is (even when empty) so callers can share it.
        """
        self.demo_pool = demo_pool
        self.num_classes = num_classes
        self.q_table = q_table if q_table is not None else {}
        self.alpha = 0.1
        self.gamma = 0.9
        self.epsilon = 0.2

        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.demo_embeddings = self.embedding_model.encode(
            [d['question'] for d in demo_pool],
            convert_to_numpy=True
        )

    def diversity_score(self, selected_indices):
        """Return the entropy of the answer labels among ``selected_indices``."""
        answer_counts = np.zeros(self.num_classes)
        for idx in selected_indices:
            answer_idx = ord(self.demo_pool[idx]['answerKey']) - ord('A')
            answer_counts[answer_idx] += 1
        total = np.sum(answer_counts)
        if total == 0:
            # Nothing selected: avoid dividing by zero.
            return 0.0
        probs = answer_counts / total
        # The epsilon keeps log() finite for labels that do not occur.
        return -np.sum(probs * np.log(probs + 1e-9))

    def get_state_key(self, current_state):
        """Return an order-independent, hashable key for the selected indices."""
        return tuple(sorted(current_state['selected']))

    def select_demos(self, input_sample, k=5):
        """Select up to ``k`` distinct demonstrations, updating the Q-table on the way.

        Each step chooses an unused pool index epsilon-greedily from the
        state *before* the choice and writes the Q-update under that same
        state, so the values learned here are the ones later lookups read.
        Fewer than ``k`` demos are returned when the pool is exhausted.
        """
        selected = []
        state = {'input': input_sample, 'selected': selected}

        input_embedding = self.embedding_model.encode(
            input_sample['question'],
            convert_to_numpy=True
        )

        pool_size = len(self.demo_pool)
        for _ in range(k):
            taken = set(selected)
            valid_demos = [i for i in range(pool_size) if i not in taken]
            if not valid_demos:
                break

            # Key of the state the action is taken from.
            current_state_key = self.get_state_key(state)

            if np.random.random() < self.epsilon:
                action = int(np.random.choice(valid_demos))
            else:
                q_values = [self.q_table.get((current_state_key, a), 0)
                          for a in valid_demos]
                action = int(valid_demos[np.argmax(q_values)])

            selected.append(action)
            reward = self.calculate_reward(input_embedding, selected)

            # Key of the successor state, which now contains the action.
            next_state_key = self.get_state_key(state)
            next_max = max((self.q_table.get((next_state_key, a), 0)
                            for a in valid_demos if a != action), default=0)

            self.q_table[(current_state_key, action)] = (
                (1 - self.alpha) * self.q_table.get((current_state_key, action), 0) +
                self.alpha * (reward + self.gamma * next_max)
            )

        return [self.demo_pool[i] for i in selected]

    def calculate_reward(self, input_embedding, selected_indices):
        """Return ``0.5 * normalised diversity + 0.5 * mean relevance``."""
        demo_embeddings = self.demo_embeddings[selected_indices]
        similarities = np.dot(demo_embeddings, input_embedding)
        relevance = np.mean(similarities)

        diversity = self.diversity_score(selected_indices)
        max_entropy = np.log(self.num_classes)
        # With a single class the maximum entropy is 0; treat diversity as 0.
        normalized_diversity = diversity / max_entropy if max_entropy > 0 else 0.0

        return 0.5 * normalized_diversity + 0.5 * relevance

# Demonstration pool and evaluation samples; chain .select(range(n)) onto
# either split to work with a smaller subset.
commonsense_qa = load_dataset("tau/commonsense_qa")
train_demos = commonsense_qa["train"]
test_samples = commonsense_qa["validation"]

# The model id and tokenizer were previously assigned twice in a row; one
# load is enough.
model_id = "google/gemma-2b-it"
tokenizer = AutoTokenizer.from_pretrained(model_id)
pipe = pipeline(
    "text-generation",
    model=model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    model_kwargs={"low_cpu_mem_usage": True}
)
# Use EOS as the padding token of the pipeline's own tokenizer.
pipe.tokenizer.pad_token_id = pipe.model.config.eos_token_id



rde_selector = RDESelector(train_demos, num_classes=5)

def format_prompt(demos, test_sample):
    """Build a few-shot prompt: instruction, solved demos, then the open question."""

    def render_options(example):
        # One "label: text" line per answer option, each newline-terminated.
        choices = example['choices']
        return "".join(
            f"{label}: {text}\n"
            for label, text in zip(choices['label'], choices['text'])
        )

    pieces = ["Answer these commonsense questions. Choose the best option from A-E.\n\n"]
    for demo in demos:
        pieces.append(f"Q: {demo['question']}\nOptions:\n")
        pieces.append(render_options(demo))
        pieces.append(f"Answer: {demo['answerKey']}\n\n")

    pieces.append(f"Q: {test_sample['question']}\nOptions:\n")
    pieces.append(render_options(test_sample))
    pieces.append("Answer:")
    return "".join(pieces)

# Evaluate Gemma on every test sample using the selector's demonstrations.
correct = 0
for idx, sample in enumerate(test_samples):
    selected_demos = rde_selector.select_demos(sample, k=5)
    prompt = format_prompt(selected_demos, sample)

    outputs = pipe(
        prompt,
        max_new_tokens=2,
        return_full_text=False
    )

    generated = outputs[0]['generated_text'].strip()
    # An empty generation yields a placeholder that never equals a label,
    # instead of raising IndexError on generated[0].
    predicted = generated[0].upper() if generated else "?"
    actual = sample['answerKey']

    # Decide correctness once so the tally and the printed result agree.
    is_correct = predicted == actual
    if is_correct:
        correct += 1

    color_code = "\033[92m" if is_correct else "\033[91m"
    print(f"Sample {idx+1}:")
    print(f"  Predicted: {predicted} | Actual: {actual}")
    print(f"{color_code}  Result: {'CORRECT' if is_correct else 'INCORRECT'}\033[0m")
    print("-" * 80)

num_samples = len(test_samples)
final_accuracy = correct / num_samples if num_samples else 0.0
print(f"\nFinal Accuracy: {final_accuracy:.2%}")
