In [None]:
import math
import numpy as np
import pandas as pd
import os
import gc
import torch
import random
from collections import defaultdict
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset

#==============================================================================
# DATA LOADING
#==============================================================================

def load_arc_datasets():
    """
    Fetch the test splits of ARC-Challenge and ARC-Easy as DataFrames.

    For each configuration, rows with a repeated ``question`` are dropped,
    the ``choices`` column is reduced to its ``"text"`` entry (the choice
    labels are discarded), and a constant ``subject`` column set to
    ``"science"`` is added.

    Returns:
        tuple: ``(arc_df, arc_easy_df)`` - Challenge first, Easy second.
    """

    def _fetch_test_split(config_name):
        # Shared load-and-clean sequence for one ai2_arc configuration.
        frame = load_dataset("allenai/ai2_arc", config_name, split="test").to_pandas()
        frame = frame.drop_duplicates(subset=['question'])
        frame["choices"] = frame["choices"].apply(lambda entry: entry["text"])
        frame["subject"] = "science"
        return frame

    print("Loading ARC Challenge dataset...")
    challenge_frame = _fetch_test_split("ARC-Challenge")
    print(f"ARC Challenge shape: {challenge_frame.shape}")

    print("Loading ARC Easy dataset...")
    easy_frame = _fetch_test_split("ARC-Easy")
    print(f"ARC Easy shape: {easy_frame.shape}")

    return challenge_frame, easy_frame

#==============================================================================
# PREPROCESSING SYSTEM
#==============================================================================

class PreprocessingSystem:
    """
    Use both models to analyze choices and filter out low-confidence options.

    Each question is scored by Phi-2 and Qwen2-1.5B-Instruct. The two
    per-choice probability vectors are averaged, choices whose averaged
    probability is below ``confidence_threshold`` are dropped, and the
    surviving choices are shuffled.
    """

    def __init__(self, confidence_threshold=0.10):
        """
        Args:
            confidence_threshold: Minimum combined probability a choice
                needs in order to survive filtering.
        """
        self.confidence_threshold = confidence_threshold
        # Populated by load_preprocessing_models(); None until then and
        # again after cleanup_preprocessing_models().
        self.phi2_model = None
        self.phi2_tokenizer = None
        self.qwen2_model = None
        self.qwen2_tokenizer = None

    @staticmethod
    def _load_model_and_tokenizer(model_name):
        """Load one causal LM and its tokenizer; returns (model, tokenizer)."""
        on_gpu = torch.cuda.is_available()
        # NOTE(review): trust_remote_code=True lets the model repository run
        # its own Python code at load time - confirm this is still required.
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16 if on_gpu else torch.float32,
            low_cpu_mem_usage=True,
            device_map="auto" if on_gpu else "cpu",
            trust_remote_code=True
        )
        tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            trust_remote_code=True
        )
        # The tokenizer is called with padding=True later, so it needs a pad
        # token; fall back to EOS when none is defined.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        return model, tokenizer

    def load_preprocessing_models(self):
        """Load both models for preprocessing."""
        print("Loading Phi-2 for preprocessing...")
        self.phi2_model, self.phi2_tokenizer = self._load_model_and_tokenizer(
            "microsoft/phi-2"
        )

        print("Loading Qwen2-1.5B for preprocessing...")
        self.qwen2_model, self.qwen2_tokenizer = self._load_model_and_tokenizer(
            "Qwen/Qwen2-1.5B-Instruct"
        )

        print("Preprocessing models loaded successfully!")

    def build_analysis_prompt(self, question, choices):
        """
        Build prompt for confidence analysis.

        Choices are lettered A, B, C, ... in the order given and the prompt
        ends with ``"Answer:"`` so the next token can be read as the answer.
        """
        lines = [
            "The following is a multiple choice science question. "
            "Analyze all choices and select the most likely correct answer.",
            "",
            f"Question: {question}",
        ]
        lines.extend(
            f"{chr(ord('A') + i)}. {choice}" for i, choice in enumerate(choices)
        )
        lines.append("")
        lines.append("Answer:")
        return "\n".join(lines)

    def get_model_confidence_scores(self, model, tokenizer, prompt, num_choices):
        """
        Get confidence scores from a model for answer choices.

        Runs one forward pass and applies a softmax over the next-token
        logits of the first ``num_choices`` capital letters.

        Returns:
            numpy array of length ``num_choices`` summing to 1. On any
            exception the error is printed and a uniform distribution is
            returned instead (deliberate best-effort fallback).
        """
        try:
            inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)

            if torch.cuda.is_available():
                inputs = {k: v.to(model.device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model(**inputs)
                # Logits for the token that would follow the prompt.
                logits = outputs.logits[0, -1]

            choice_logits = []
            for i in range(num_choices):
                letter = chr(ord('A') + i)
                # Last sub-token of the bare letter (no leading space).
                token_id = tokenizer.encode(letter, add_special_tokens=False)[-1]
                choice_logits.append(logits[token_id].item())

            # Softmax restricted to the answer letters only.
            choice_logits = torch.tensor(choice_logits)
            return torch.nn.functional.softmax(choice_logits, dim=0).numpy()

        except Exception as e:
            print(f"Error getting confidence scores: {e}")
            # Return uniform distribution as fallback
            return np.ones(num_choices) / num_choices

    def analyze_all_choices(self, question, choices):
        """
        Score all choices with both models and average the results.

        Returns:
            dict with ``'phi2_probs'``, ``'qwen2_probs'`` and
            ``'combined_probs'`` (equal-weight mean of the two).
        """
        prompt = self.build_analysis_prompt(question, choices)
        num_choices = len(choices)

        phi2_probs = self.get_model_confidence_scores(
            self.phi2_model, self.phi2_tokenizer, prompt, num_choices
        )
        qwen2_probs = self.get_model_confidence_scores(
            self.qwen2_model, self.qwen2_tokenizer, prompt, num_choices
        )

        # Combined with 50/50 weighting
        combined_probs = 0.5 * phi2_probs + 0.5 * qwen2_probs

        return {
            'phi2_probs': phi2_probs,
            'qwen2_probs': qwen2_probs,
            'combined_probs': combined_probs
        }

    def filter_low_confidence_choices(self, choices, combined_probs):
        """
        Filter out choices with prob < threshold.

        Returns:
            ``(filtered_choices, filtered_probs, kept_indices)`` where
            ``kept_indices`` are positions in the original ``choices``.
            If no choice reaches the threshold, every choice is kept.
        """
        high_confidence_indices = [
            i for i, prob in enumerate(combined_probs)
            if prob >= self.confidence_threshold
        ]

        # Fallback: keep all choices if none meet threshold
        if not high_confidence_indices:
            high_confidence_indices = list(range(len(choices)))

        filtered_choices = [choices[i] for i in high_confidence_indices]
        filtered_probs = [combined_probs[i] for i in high_confidence_indices]

        return filtered_choices, filtered_probs, high_confidence_indices

    def randomize_choice_order(self, choices, probs, original_indices):
        """
        Randomly shuffle choices to prevent positional bias.

        Returns:
            ``(shuffled_choices, shuffled_probs, position_mapping)`` where
            ``position_mapping[new_position]`` is the choice's index in the
            original, unfiltered list. Empty input yields ``([], [], {})``.
        """
        combined = list(zip(choices, probs, original_indices))
        if not combined:
            # zip(*[]) cannot be unpacked into three names.
            return [], [], {}
        random.shuffle(combined)

        shuffled_choices, shuffled_probs, shuffled_original_indices = zip(*combined)
        position_mapping = dict(enumerate(shuffled_original_indices))

        return list(shuffled_choices), list(shuffled_probs), position_mapping

    @staticmethod
    def _answer_key_to_index(answer_key):
        """
        Convert an answer key to a 0-based choice index, or None.

        Accepts a single letter ("A" -> 0) or a 1-based number ("1" -> 0),
        since ARC uses both labeling schemes.
        """
        if answer_key is None:
            return None
        key = str(answer_key).strip().upper()
        if key.isdecimal():
            idx = int(key) - 1
            return idx if idx >= 0 else None
        if len(key) == 1 and 'A' <= key <= 'Z':
            return ord(key) - ord('A')
        return None

    def preprocess_question(self, question, choices, answer_key=None):
        """
        Complete preprocessing pipeline: analyze, filter, shuffle, re-key.

        Args:
            question: Question text.
            choices: Sequence of choice texts in original order.
            answer_key: Original answer label (letter or 1-based number),
                or None when unknown.

        Returns:
            dict describing the outcome. ``'new_answer_key'`` is the letter
            of the correct choice within ``'filtered_choices'``, or None if
            no key was given or the correct choice was filtered out.
        """
        analysis_results = self.analyze_all_choices(question, choices)

        filtered_choices, filtered_probs, high_confidence_indices = self.filter_low_confidence_choices(
            choices, analysis_results['combined_probs']
        )

        final_choices, final_probs, position_mapping = self.randomize_choice_order(
            filtered_choices, filtered_probs, high_confidence_indices
        )

        # position_mapping only contains surviving choices, so a miss here
        # means the correct answer was filtered out.
        new_answer_key = None
        original_answer_idx = self._answer_key_to_index(answer_key)
        if original_answer_idx is not None:
            for new_pos, orig_pos in position_mapping.items():
                if orig_pos == original_answer_idx:
                    new_answer_key = chr(ord('A') + new_pos)
                    break

        return {
            'original_question': question,
            'original_choices': choices,
            'filtered_choices': final_choices,
            'analysis_results': analysis_results,
            'position_mapping': position_mapping,
            'high_confidence_indices': high_confidence_indices,
            'original_answer_key': answer_key,
            'new_answer_key': new_answer_key,
            'filtering_applied': len(final_choices) < len(choices)
        }

    def cleanup_preprocessing_models(self):
        """
        Drop this object's references to the models and tokenizers.

        The attributes are reset to None (not deleted) so the method can be
        called repeatedly and the attributes remain readable afterwards.
        """
        self.phi2_model = None
        self.phi2_tokenizer = None
        self.qwen2_model = None
        self.qwen2_tokenizer = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

#==============================================================================
# ADVERSARIAL FRAMEWORK
#==============================================================================

class AdversarialFramework:
    """
    Generator/discriminator game between Phi-2 and Qwen2-1.5B.

    The generator assigns probabilities to answer letters under a "correct"
    and an "incorrect" prompt; the discriminator judges each candidate
    answer as correct or incorrect. ``equilibrium_search`` then iteratively
    updates both sets of probabilities against each other.
    """

    def __init__(self):
        # All four are None until setup_adversarial_models() is called.
        self.generator_model = None      # Phi-2
        self.generator_tokenizer = None
        self.discriminator_model = None  # Qwen2-1.5B
        self.discriminator_tokenizer = None

    def setup_adversarial_models(self, phi2_model, phi2_tokenizer, qwen2_model, qwen2_tokenizer):
        """
        Set up adversarial roles.

        Generator: Phi-2 (tries to pick correct answers)
        Discriminator: Qwen2-1.5B (tries to evaluate correctness)

        The objects are stored by reference; nothing is copied or loaded.
        """
        self.generator_model = phi2_model
        self.generator_tokenizer = phi2_tokenizer
        self.discriminator_model = qwen2_model
        self.discriminator_tokenizer = qwen2_tokenizer

        print("Adversarial roles set:")
        print("  Generator: Phi-2")
        print("  Discriminator: Qwen2-1.5B-Instruct")

    def format_subject(self, subject):
        """Format subject string by turning underscores into spaces."""
        return " ".join(subject.split("_"))

    def build_generator_prompt(self, subject, question, choices, get_correct):
        """
        Build prompt for generator.

        Ends with ``"Answer:"`` when ``get_correct`` is true and with
        ``"Incorrect Answer:"`` otherwise.
        """
        header = f"The following are multiple choice questions (with answers) about {self.format_subject(subject)}.\n\n"
        options = "".join(
            f"\n{chr(ord('A') + i)}. {choice}" for i, choice in enumerate(choices)
        )
        suffix = "\nAnswer:" if get_correct else "\nIncorrect Answer:"
        return f"{header}{question}{options}{suffix}"

    @staticmethod
    def _letter_probabilities(model, tokenizer, prompt_text, letters):
        """
        Softmax over the next-token logits of ``letters`` after the prompt.

        Shared by generator and discriminator so both read logits the same
        way. Raises whatever the model/tokenizer raise; callers handle it.
        """
        inputs = tokenizer(prompt_text, return_tensors="pt", truncation=True, max_length=512)

        if torch.cuda.is_available():
            inputs = {k: v.to(model.device) for k, v in inputs.items()}

        with torch.no_grad():
            logits = model(**inputs).logits[0, -1]

        # .item() turns each logit into a Python float, so the softmax below
        # runs on a fresh CPU tensor regardless of the model's device/dtype.
        letter_logits = torch.tensor([
            logits[tokenizer.encode(letter, add_special_tokens=False)[-1]].item()
            for letter in letters
        ])
        return torch.nn.functional.softmax(letter_logits, dim=0).numpy()

    def get_generator_probabilities(self, prompt_text, num_choices):
        """
        Get generator answer probabilities.

        Returns:
            numpy array over letters A.. of length ``num_choices``; uniform
            if anything raises (the error is printed).
        """
        try:
            letters = [chr(ord('A') + i) for i in range(num_choices)]
            return self._letter_probabilities(
                self.generator_model, self.generator_tokenizer, prompt_text, letters
            )
        except Exception as e:
            print(f"Error in generator probabilities: {e}")
            return np.ones(num_choices) / num_choices

    def get_generator_initial_probs(self, question, choices, subject):
        """
        Get initial generator probabilities for correct/incorrect.

        Returns:
            ``{"correct": {letter: p}, "incorrect": {letter: p}}``.
        """
        candidates = [chr(ord('A') + i) for i in range(len(choices))]
        gen_init = {}

        for label, get_correct in (("correct", True), ("incorrect", False)):
            prompt = self.build_generator_prompt(subject, question, choices, get_correct)
            probs = self.get_generator_probabilities(prompt, len(choices))
            gen_init[label] = {
                candidate: probs[i] for i, candidate in enumerate(candidates)
            }

        return gen_init

    def build_discriminator_prompt(self, subject, question, proposed_answer):
        """Build prompt for discriminator (A = correct, B = incorrect)."""
        return "\n".join([
            f"You are an expert evaluator of questions about {self.format_subject(subject)}.",
            "Determine if the proposed answer is correct. Output ONLY 'A' or 'B'.",
            f"Question: {question}",
            f"Proposed Answer: {proposed_answer}",
            "",
            "Is this answer correct? Respond ONLY with:",
            "A. Correct",
            "B. Incorrect",
            "",
            "Answer:",
        ])

    def get_discriminator_probabilities(self, prompt_text):
        """
        Get discriminator probabilities.

        Returns:
            ``{"correct": p_A, "incorrect": p_B}`` from a softmax over the
            "A" and "B" next-token logits; 0.5/0.5 if anything raises (the
            error is printed).
        """
        try:
            probs = self._letter_probabilities(
                self.discriminator_model, self.discriminator_tokenizer,
                prompt_text, ["A", "B"]
            )
            return {"correct": float(probs[0]), "incorrect": float(probs[1])}

        except Exception as e:
            print(f"Error in discriminator probabilities: {e}")
            return {"correct": 0.5, "incorrect": 0.5}

    def get_discriminator_initial_probs(self, question, choices, subject):
        """
        Get initial discriminator probabilities for each choice.

        Returns:
            ``{letter: {"correct": p, "incorrect": p}}`` with one entry per
            choice; the discriminator sees the choice text, not its letter.
        """
        results = {}

        for idx, answer in enumerate(choices):
            prompt = self.build_discriminator_prompt(subject, question, answer)
            results[chr(ord('A') + idx)] = self.get_discriminator_probabilities(prompt)

        return results

    def softmax(self, arr):
        """Numerically stable softmax (subtracts the max before exp)."""
        shifted = np.exp(arr - np.max(arr))
        return shifted / np.sum(shifted)

    def equilibrium_search(self, gen_init, disc_init, candidates, T=20,
                          eta_G=0.1, eta_D=0.1, lam_G=0.1, lam_D=0.01):
        """
        Run T rounds of mutual policy updates between generator and
        discriminator.

        Args:
            gen_init: ``{"correct": {y: p}, "incorrect": {y: p}}``.
            disc_init: ``{y: {"correct": p, "incorrect": p}}``.
            candidates: Candidate labels ``y`` to iterate over.
            T: Number of update rounds.
            eta_G, eta_D, lam_G, lam_D: Coefficients of the update; each
                policy is ``softmax((Q + lam * log(init)) / (1/eta + lam))``.

        Returns:
            ``(gen, disc)`` in the same layouts as the inputs. The input
            dicts are copied and not modified.
        """
        verdicts = ("correct", "incorrect")

        gen = {v: dict(gen_init[v]) for v in verdicts}
        disc = {y: dict(disc_init[y]) for y in candidates}

        Qg = {v: {y: 0.0 for y in candidates} for v in verdicts}
        Qd = {y: {v: 0.0 for v in verdicts} for y in candidates}

        for t in range(1, T+1):
            # NOTE(review): Q is accumulated with a 1/(2t) weight and the
            # divisor uses 1/eta rather than a t-dependent term - confirm
            # this matches the intended update rule.
            for v in verdicts:
                for y in candidates:
                    Qg[v][y] += (1.0/(2.0*t)) * disc[y][v]

            # Uses the generator policy from before this round's update.
            for y in candidates:
                for v in verdicts:
                    Qd[y][v] += (1.0/(2.0*t)) * gen[v][y]

            # Update generator policy: one distribution over candidates
            # per verdict.
            for v in verdicts:
                logits = [
                    (Qg[v][y] + lam_G * math.log(gen_init[v][y] + 1e-12)) / (1/eta_G + lam_G)
                    for y in candidates
                ]
                new_probs = self.softmax(np.array(logits))
                for i, y in enumerate(candidates):
                    gen[v][y] = new_probs[i]

            # Update discriminator policy: one correct/incorrect
            # distribution per candidate.
            for y in candidates:
                logits = [
                    (Qd[y][v] + lam_D * math.log(disc_init[y][v] + 1e-12)) / (1/eta_D + lam_D)
                    for v in verdicts
                ]
                probs = self.softmax(np.array(logits))
                disc[y]["correct"] = probs[0]
                disc[y]["incorrect"] = probs[1]

        return gen, disc

    @staticmethod
    def _best_candidate(candidates, score_of):
        """First candidate with the strictly highest score; None if empty."""
        best, best_score = None, -1.0
        for y in candidates:
            score = score_of(y)
            if score > best_score:
                best, best_score = y, score
        return best

    def get_final_answers(self, gen_final, disc_final, candidates):
        """
        Get final answers from Generator and Discriminator.

        Returns:
            ``(gen_answer, disc_answer)``: the candidate with the highest
            generator "correct" probability and the candidate the
            discriminator rates most likely correct. Ties go to the earlier
            candidate.
        """
        gen_answer = self._best_candidate(
            candidates, lambda y: gen_final["correct"][y]
        )
        disc_answer = self._best_candidate(
            candidates, lambda y: disc_final[y]["correct"]
        )
        return gen_answer, disc_answer

#==============================================================================
# INTEGRATED PIPELINE
#==============================================================================

class IntegratedPipeline:
    """
    Complete pipeline combining preprocessing + adversarial training.

    Owns one ``PreprocessingSystem`` and one ``AdversarialFramework``; the
    adversarial framework reuses the models loaded for preprocessing.
    """

    def __init__(self, confidence_threshold=0.10):
        """
        Args:
            confidence_threshold: Passed to ``PreprocessingSystem``.
        """
        self.preprocessing = PreprocessingSystem(confidence_threshold)
        self.adversarial = AdversarialFramework()
        self.models_loaded = False

    def initialize_all_models(self):
        """Initialize all models; does nothing if they are already loaded."""
        if self.models_loaded:
            return

        print("Initializing complete pipeline...")

        self.preprocessing.load_preprocessing_models()

        # The adversarial framework shares the same model objects rather
        # than loading second copies.
        self.adversarial.setup_adversarial_models(
            self.preprocessing.phi2_model,
            self.preprocessing.phi2_tokenizer,
            self.preprocessing.qwen2_model,
            self.preprocessing.qwen2_tokenizer
        )

        self.models_loaded = True
        print("All models initialized successfully!")

    def process_single_question(self, question, choices, answer_key, subject):
        """
        Process a single question through the complete pipeline.

        Returns:
            The preprocessing result dict extended with
            ``'consensus_achieved'``, ``'gen_final_answer'``,
            ``'disc_final_answer'`` and ``'accuracy'`` (1.0 or 0.0, judged
            on the generator's answer against the re-keyed answer).
        """
        preprocessing_result = self.preprocessing.preprocess_question(question, choices, answer_key)

        filtered_choices = preprocessing_result["filtered_choices"]
        new_answer_key = preprocessing_result.get("new_answer_key")

        # Only one choice left: it is necessarily "A" and no game is needed.
        if len(filtered_choices) == 1:
            consensus_answer = "A"
            return {
                **preprocessing_result,
                'consensus_achieved': True,
                'gen_final_answer': consensus_answer,
                'disc_final_answer': consensus_answer,
                'accuracy': 1.0 if new_answer_key == consensus_answer else 0.0
            }

        candidates = [chr(ord('A') + i) for i in range(len(filtered_choices))]

        gen_init = self.adversarial.get_generator_initial_probs(question, filtered_choices, subject)
        disc_init = self.adversarial.get_discriminator_initial_probs(question, filtered_choices, subject)

        gen_final, disc_final = self.adversarial.equilibrium_search(
            gen_init, disc_init, candidates, T=20
        )

        gen_answer, disc_answer = self.adversarial.get_final_answers(gen_final, disc_final, candidates)

        # Generator answer is the primary prediction. A new_answer_key of
        # None never matches, so such questions score 0.
        accuracy = 1.0 if gen_answer == new_answer_key else 0.0

        return {
            **preprocessing_result,
            'consensus_achieved': False,
            'gen_final_answer': gen_answer,
            'disc_final_answer': disc_answer,
            'accuracy': accuracy
        }

    def process_dataset(self, df, dataset_name="Dataset"):
        """
        Process complete dataset through pipeline.

        Args:
            df: DataFrame with ``question``, ``choices`` and ``subject``
                columns, and optionally ``answerKey``.
            dataset_name: Label used in progress and summary output.

        Returns:
            DataFrame with one row per question: the pipeline result merged
            with the original row (original columns win on name clashes).
        """
        self.initialize_all_models()

        results = []

        total_questions = 0
        correct_predictions = 0
        consensus_questions = 0

        print(f"\nProcessing {dataset_name}...")
        print(f"Total questions: {len(df)}")

        for _, row in tqdm(df.iterrows(), total=len(df), desc=f"Processing {dataset_name}"):
            total_questions += 1

            result = self.process_single_question(
                row["question"],
                row["choices"],
                row.get("answerKey"),
                row["subject"]
            )

            result.update(row.to_dict())

            if result['consensus_achieved']:
                consensus_questions += 1

            if result.get('accuracy', 0) == 1.0:
                correct_predictions += 1

            results.append(result)

            # Periodically release cached memory during long runs.
            if total_questions % 100 == 0:
                gc.collect()
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

        # Guard the percentages so an empty dataset reports 0 instead of
        # raising ZeroDivisionError.
        if total_questions:
            accuracy = correct_predictions / total_questions * 100
            consensus_pct = consensus_questions / total_questions * 100
        else:
            accuracy = 0.0
            consensus_pct = 0.0

        print(f"\n{dataset_name} Results:")
        print(f"  Total Questions: {total_questions}")
        print(f"  Correct Predictions: {correct_predictions}")
        print(f"  Accuracy: {accuracy:.2f}%")
        print(f"  Preprocessing Consensus: {consensus_questions} ({consensus_pct:.1f}%)")

        return pd.DataFrame(results)

    def cleanup_all_models(self):
        """
        Clean up all models.

        The adversarial framework holds references to the same model
        objects, so those are cleared first; otherwise the preprocessing
        cleanup could not actually release them.
        """
        self.adversarial.generator_model = None
        self.adversarial.generator_tokenizer = None
        self.adversarial.discriminator_model = None
        self.adversarial.discriminator_tokenizer = None

        self.preprocessing.cleanup_preprocessing_models()
        self.models_loaded = False

#==============================================================================
# MAIN EXECUTION
#==============================================================================

def main():
    """
    Main execution for full dataset processing.

    Loads both ARC test sets, runs each through the integrated pipeline,
    and writes one CSV per dataset under ``results/``. The Challenge CSV is
    written before the Easy set is processed.

    Returns:
        tuple: ``(results_challenge, results_easy)`` as returned by
        ``IntegratedPipeline.process_dataset``.
    """
    rule = "="*70

    print(rule)
    print("CONSENSUS GAME WITH PREPROCESSING")
    print(rule)
    print("Pipeline:")
    print("  1. Load ARC datasets")
    print("  2. Preprocess: Filter low-confidence choices")
    print("  3. Consensus Game: Phi-2 (Generator) vs Qwen2-1.5B (Discriminator)")
    print("  4. Save results")
    print(rule)

    arc_df, arc_easy_df = load_arc_datasets()

    pipeline = IntegratedPipeline(confidence_threshold=0.10)

    # try/finally so the models are released even if processing or saving
    # fails partway through (matters in a long-lived notebook kernel).
    try:
        results_challenge = pipeline.process_dataset(arc_df, "ARC Challenge")

        os.makedirs("results", exist_ok=True)
        challenge_filename = "results/arc_challenge_preprocessing_consensus.csv"
        results_challenge.to_csv(challenge_filename, index=False)
        print(f"\nARC Challenge results saved to {challenge_filename}")

        results_easy = pipeline.process_dataset(arc_easy_df, "ARC Easy")

        easy_filename = "results/arc_easy_preprocessing_consensus.csv"
        results_easy.to_csv(easy_filename, index=False)
        print(f"ARC Easy results saved to {easy_filename}")
    finally:
        pipeline.cleanup_all_models()

    print("\n" + rule)
    print("PROCESSING COMPLETE")
    print(rule)

    return results_challenge, results_easy

# Run the full pipeline only when this file is executed directly (script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

CONSENSUS GAME WITH PREPROCESSING
Pipeline:
  1. Load ARC datasets
  2. Preprocess: Filter low-confidence choices
  3. Consensus Game: Phi-2 (Generator) vs Qwen2-1.5B (Discriminator)
  4. Save results
Loading ARC Challenge dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

ARC-Challenge/train-00000-of-00001.parqu(…):   0%|          | 0.00/190k [00:00<?, ?B/s]

ARC-Challenge/test-00000-of-00001.parque(…):   0%|          | 0.00/204k [00:00<?, ?B/s]

ARC-Challenge/validation-00000-of-00001.(…):   0%|          | 0.00/55.7k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1119 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1172 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/299 [00:00<?, ? examples/s]

ARC Challenge shape: (1170, 5)
Loading ARC Easy dataset...


ARC-Easy/train-00000-of-00001.parquet:   0%|          | 0.00/331k [00:00<?, ?B/s]

ARC-Easy/test-00000-of-00001.parquet:   0%|          | 0.00/346k [00:00<?, ?B/s]

ARC-Easy/validation-00000-of-00001.parqu(…):   0%|          | 0.00/86.1k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2251 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2376 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/570 [00:00<?, ? examples/s]

ARC Easy shape: (2371, 5)
Initializing complete pipeline...
Loading Phi-2 for preprocessing...


config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]