In [1]:
!pip uninstall -y transformers
!pip install -U transformers accelerate bitsandbytes huggingface_hub
!pip install pandas numpy bitsandbytes transformers accelerate scikit-learn tqdm
!pip install -U bitsandbytes -q

Found existing installation: transformers 4.57.3
Uninstalling transformers-4.57.3:
  Successfully uninstalled transformers-4.57.3
Collecting transformers
  Downloading transformers-4.57.5-py3-none-any.whl.metadata (43 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m44.0/44.0 kB[0m [31m785.0 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
Collecting bitsandbytes
  Downloading bitsandbytes-0.49.1-py3-none-macosx_14_0_arm64.whl.metadata (10 kB)
Collecting huggingface_hub
  Downloading huggingface_hub-1.3.2-py3-none-any.whl.metadata (13 kB)
Downloading transformers-4.57.5-py3-none-any.whl (12.0 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m12.0/12.0 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading bitsandbytes-0.49.1-py3-none-macosx_14_0_arm64.whl (129 kB)
[2K  

In [None]:
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import re
import os
import gc
import json
from pathlib import Path
from typing import Optional, Dict, Any
import time
from tqdm import tqdm
from typing import Literal, Optional, Tuple, List

In [None]:
def load_model_tokenizer(
    model_name="Qwen/Qwen2.5-7B-Instruct",
    trust_remote_code=True
):
    """Load a causal LM and its tokenizer, trying progressively heavier precisions.

    The model is loaded with ``device_map="auto"`` using the first strategy
    that succeeds:

    1. 4-bit NF4 quantization (double quantization, float16 compute),
    2. float16 weights,
    3. the library's default dtype.

    A failure of strategy 1 or 2 is printed and the next strategy is tried.
    A failure of the last strategy is not caught.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.
        trust_remote_code: Forwarded to the tokenizer and model ``from_pretrained`` calls.

    Returns:
        tuple: ``(model, tokenizer)``. The model is switched to eval mode with
        ``config.use_cache = True``; the tokenizer's pad token falls back to
        its EOS token when none is defined.

    Raises:
        Exception: Whatever tokenizer loading or the final load strategy raises.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Printed for information only; placement itself is left to device_map="auto".
    print(f"Устройство: {device}")
    print(f"Модель: {model_name}")

    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        trust_remote_code=trust_remote_code
    )

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    def _four_bit_kwargs():
        # Built lazily so that a failure while creating the quantization config
        # is handled like any other failure of the 4-bit strategy.
        return {
            "quantization_config": BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.float16,
                bnb_4bit_use_double_quant=True,
            ),
            "torch_dtype": torch.float16,
        }

    # (label used in the failure message, extra from_pretrained kwargs, success message)
    strategies = [
        ("4-bit", _four_bit_kwargs, "Загружено в 4-bit режиме!"),
        ("FP16", lambda: {"torch_dtype": torch.float16}, "Загружено в FP16 режиме!"),
        ("base", dict, "Загружено в базовом режиме!"),
    ]
    final_index = len(strategies) - 1

    model = None
    for index, (label, make_kwargs, success_message) in enumerate(strategies):
        try:
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                device_map="auto",
                trust_remote_code=trust_remote_code,
                **make_kwargs(),
            )
        except Exception as err:
            if index == final_index:
                # Nothing left to fall back to: let the caller see the error.
                raise
            print(f"{label} не сработало: {err}")
        else:
            print(success_message)
            break

        # Only reached after a failed, non-final attempt. The except block has
        # exited, so the exception (and the frames its traceback referenced)
        # is released; reclaim whatever that attempt allocated before retrying.
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    model.eval()
    model.config.use_cache = True

    try:
        if hasattr(model, "get_memory_footprint"):
            memory_gb = model.get_memory_footprint() / 1e9
            print(f"Память модели: {memory_gb:.2f} GB")
    except Exception:
        # The footprint is a best-effort diagnostic; never fail the load over it.
        # (Narrowed from a bare `except:` so Ctrl-C / SystemExit still propagate.)
        pass

    return model, tokenizer

# Bind the module-level `model` and `tokenizer`. The explicit argument overrides
# the function's default (the 7B checkpoint) with the 14B instruct checkpoint.
model, tokenizer = load_model_tokenizer("Qwen/Qwen2.5-14B-Instruct")

In [None]:
PROMPTS = {
    "physics": """
<system>
    <role>You are an experienced expert in the field of physics</role>

    <task>
        <data>You will be given a question and a list of possible answers</data>
        <goal>You need to choose only one correct option</goal>
        <method>First provide step-by-step reasoning and argumentation: go through each option and explain why it is suitable or not suitable. State your decisions with the laws of physics</method>
    </task>

    <constraints>
        <indexation>Options are indexed starting from 0.</indexation>
    </constraints>

    <answer>
        <reasoning>Give a full explanation of your chain of thoughts</reasoning>
        <format>At the end give an answer in the format: 'ANSWER: ' followed by the chosen index</format>
    </answer>

    <emotional>
        From you depends my fate and prestige. I am counting on you tremendously.
    </emotional>
</system>
""",

    "chemistry": """
<system>
    <role>You are an experienced expert in the field of chemistry</role>

    <task>
        <data>You will be given a question and a list of possible answers</data>
        <goal>You need to choose only one correct option</goal>
        <method>First provide step-by-step reasoning and argumentation: go through each option and explain why it is suitable or not suitable. Base your analysis on chemical principles and reactions</method>
    </task>

    <constraints>
        <indexation>Options are indexed starting from 0.</indexation>
    </constraints>

    <answer>
        <reasoning>Give a full explanation of your chain of thoughts</reasoning>
        <format>At the end give an answer in the format: 'ANSWER: ' followed by the chosen index</format>
    </answer>

    <emotional>
        From you depends my fate and prestige. I am counting on you tremendously.
    </emotional>
</system>
""",

    "biology": """
<system>
    <role>You are an experienced expert in the field of biology</role>

    <task>
        <data>You will be given a question and a list of possible answers</data>
        <goal>You need to choose only one correct option</goal>
        <method>First provide step-by-step reasoning and argumentation: go through each option and explain why it is suitable or not suitable. Use biological principles and evidence</method>
    </task>

    <constraints>
        <indexation>Options are indexed starting from 0.</indexation>
    </constraints>

    <answer>
        <reasoning>Give a full explanation of your chain of thoughts</reasoning>
        <format>At the end give an answer in the format: 'ANSWER: ' followed by the chosen index</format>
    </answer>

    <emotional>
        From you depends my fate and prestige. I am counting on you tremendously.
    </emotional>
</system>
""",

    "economics": """
<system>
    <role>You are an experienced expert in the field of economics</role>

    <task>
        <data>You will be given a question and a list of possible answers</data>
        <goal>You need to choose only one correct option</goal>
        <method>First provide step-by-step reasoning and argumentation: go through each option and explain why it is suitable or not suitable. Apply economic theories and models</method>
    </task>

    <constraints>
        <indexation>Options are indexed starting from 0.</indexation>
    </constraints>

    <answer>
        <reasoning>Give a full explanation of your chain of thoughts</reasoning>
        <format>At the end give an answer in the format: 'ANSWER: ' followed by the chosen index</format>
    </answer>

    <emotional>
        From you depends my fate and prestige. I am counting on you tremendously.
    </emotional>
</system>
""",

    "math": """
<system>
    <role>You are an experienced expert in the field of mathematics</role>

    <task>
        <data>You will be given a question and a list of possible answers</data>
        <goal>You need to choose only one correct option</goal>
        <method>First provide step-by-step reasoning and argumentation: go through each option and explain why it is suitable or not suitable. Use mathematical proofs and logical deduction</method>
    </task>

    <constraints>
        <indexation>Options are indexed starting from 0.</indexation>
    </constraints>

    <answer>
        <reasoning>Give a full explanation of your chain of thoughts</reasoning>
        <format>At the end give an answer in the format: 'ANSWER: ' followed by the chosen index</format>
    </answer>

    <emotional>
        From you depends my fate and prestige. I am counting on you tremendously.
    </emotional>
</system>
""",

    "health": """
<system>
    <role>You are an experienced expert in the field of health and medicine</role>

    <task>
        <data>You will be given a question and a list of possible answers</data>
        <goal>You need to choose only one correct option</goal>
        <method>First provide step-by-step reasoning and argumentation: go through each option and explain why it is suitable or not suitable. Base your analysis on medical knowledge and evidence</method>
    </task>

    <constraints>
        <indexation>Options are indexed starting from 0.</indexation>
    </constraints>

    <answer>
        <reasoning>Give a full explanation of your chain of thoughts</reasoning>
        <format>At the end give an answer in the format: 'ANSWER: ' followed by the chosen index</format>
    </answer>

    <emotional>
        From you depends my fate and prestige. I am counting on you tremendously.
    </emotional>
</system>
""",

    "psychology": """
<system>
    <role>You are an experienced expert in the field of psychology</role>

    <task>
        <data>You will be given a question and a list of possible answers</data>
        <goal>You need to choose only one correct option</goal>
        <method>First provide step-by-step reasoning and argumentation: go through each option and explain why it is suitable or not suitable. Use psychological theories and research findings</method>
    </task>

    <constraints>
        <indexation>Options are indexed starting from 0.</indexation>
    </constraints>

    <answer>
        <reasoning>Give a full explanation of your chain of thoughts</reasoning>
        <format>At the end give an answer in the format: 'ANSWER: ' followed by the chosen index</format>
    </answer>

    <emotional>
        From you depends my fate and prestige. I am counting on you tremendously.
    </emotional>
</system>
""",

    "history": """
<system>
    <role>You are an experienced expert in the field of history</role>

    <task>
        <data>You will be given a question and a list of possible answers</data>
        <goal>You need to choose only one correct option</goal>
        <method>First provide step-by-step reasoning and argumentation: go through each option and explain why it is suitable or not suitable. Base your analysis on historical facts and evidence</method>
    </task>

    <constraints>
        <indexation>Options are indexed starting from 0.</indexation>
    </constraints>

    <answer>
        <reasoning>Give a full explanation of your chain of thoughts</reasoning>
        <format>At the end give an answer in the format: 'ANSWER: ' followed by the chosen index</format>
    </answer>

    <emotional>
        From you depends my fate and prestige. I am counting on you tremendously.
    </emotional>
</system>
""",

    "law": """
<system>
    <role>You are an experienced expert in the field of law</role>

    <task>
        <data>You will be given a question and a list of possible answers</data>
        <goal>You need to choose only one correct option</goal>
        <method>First provide step-by-step reasoning and argumentation: go through each option and explain why it is suitable or not suitable. Apply legal principles and precedents</method>
    </task>

    <constraints>
        <indexation>Options are indexed starting from 0.</indexation>
    </constraints>

    <answer>
        <reasoning>Give a full explanation of your chain of thoughts</reasoning>
        <format>At the end give an answer in the format: 'ANSWER: ' followed by the chosen index</format>
    </answer>

    <emotional>
        From you depends my fate and prestige. I am counting on you tremendously.
    </emotional>
</system>
""",

    "computer science": """
<system>
    <role>You are an experienced expert in the field of computer science</role>

    <task>
        <data>You will be given a question and a list of possible answers</data>
        <goal>You need to choose only one correct option</goal>
        <method>First provide step-by-step reasoning and argumentation: go through each option and explain why it is suitable or not suitable. Use computational thinking and CS principles</method>
    </task>

    <constraints>
        <indexation>Options are indexed starting from 0.</indexation>
    </constraints>

    <answer>
        <reasoning>Give a full explanation of your chain of thoughts</reasoning>
        <format>At the end give an answer in the format: 'ANSWER: ' followed by the chosen index</format>
    </answer>

    <emotional>
        From you depends my fate and prestige. I am counting on you tremendously.
    </emotional>
</system>
""",

    "engineering": """
<system>
    <role>You are an experienced expert in the field of engineering</role>

    <task>
        <data>You will be given a question and a list of possible answers</data>
        <goal>You need to choose only one correct option</goal>
        <method>First provide step-by-step reasoning and argumentation: go through each option and explain why it is suitable or not suitable. Apply engineering principles and practical knowledge</method>
    </task>

    <constraints>
        <indexation>Options are indexed starting from 0.</indexation>
    </constraints>

    <answer>
        <reasoning>Give a full explanation of your chain of thoughts</reasoning>
        <format>At the end give an answer in the format: 'ANSWER: ' followed by the chosen index</format>
    </answer>

    <emotional>
        From you depends my fate and prestige. I am counting on you tremendously.
    </emotional>
</system>
""",

    "business": """
<system>
    <role>You are an experienced expert in the field of business</role>

    <task>
        <data>You will be given a question and a list of possible answers</data>
        <goal>You need to choose only one correct option</goal>
        <method>First provide step-by-step reasoning and argumentation: go through each option and explain why it is suitable or not suitable. Use business concepts and market knowledge</method>
    </task>

    <constraints>
        <indexation>Options are indexed starting from 0.</indexation>
    </constraints>

    <answer>
        <reasoning>Give a full explanation of your chain of thoughts</reasoning>
        <format>At the end give an answer in the format: 'ANSWER: ' followed by the chosen index</format>
    </answer>

    <emotional>
        From you depends my fate and prestige. I am counting on you tremendously.
    </emotional>
</system>
""",

    "philosophy": """
<system>
    <role>You are an experienced expert in the field of philosophy</role>

    <task>
        <data>You will be given a question and a list of possible answers</data>
        <goal>You need to choose only one correct option</goal>
        <method>First provide step-by-step reasoning and argumentation: go through each option and explain why it is suitable or not suitable. Apply philosophical reasoning and critical thinking</method>
    </task>

    <constraints>
        <indexation>Options are indexed starting from 0.</indexation>
    </constraints>

    <answer>
        <reasoning>Give a full explanation of your chain of thoughts</reasoning>
        <format>At the end give an answer in the format: 'ANSWER: ' followed by the chosen index</format>
    </answer>

    <emotional>
        From you depends my fate and prestige. I am counting on you tremendously.
    </emotional>
</system>
""",

    "other": """
<system>
    <role>You are an experienced expert with broad general knowledge</role>

    <task>
        <data>You will be given a question and a list of possible answers</data>
        <goal>You need to choose only one correct option</goal>
        <method>First provide step-by-step reasoning and argumentation: go through each option and explain why it is suitable or not suitable. Use logical analysis and factual knowledge</method>
    </task>

    <constraints>
        <indexation>Options are indexed starting from 0.</indexation>
    </constraints>

    <answer>
        <reasoning>Give a full explanation of your chain of thoughts</reasoning>
        <format>At the end give an answer in the format: 'ANSWER: ' followed by the chosen index</format>
    </answer>

    <emotional>
        From you depends my fate and prestige. I am counting on you tremendously.
    </emotional>
</system>
"""
}



# One worked example per question domain (same keys as PROMPTS). Every value is
# an XML-like block with a <question>, four zero-indexed <option> entries, a
# <reasoning> section that walks through each option, and a final
# <answer>ANSWER: k</answer> line naming the chosen index.
# Mathematical symbols (×, ², ³, ⁿ, ·, ⁻¹) are stored as real Unicode characters.
FEW_SHOT_PROMPTS = {
    "physics": """
<question>What is the relationship between force, mass, and acceleration according to Newton's second law?</question>
<options>
<option index="0">F = m/a</option>
<option index="1">F = m × a</option>
<option index="2">F = a/m</option>
<option index="3">F = m + a</option>
</options>
<reasoning>
Step 1: Recall Newton's second law of motion.
Step 2: Option 0: F = m/a - This is incorrect; it would mean force decreases with increasing acceleration.
Step 3: Option 1: F = m × a - This matches the standard formulation: Force equals mass times acceleration.
Step 4: Option 2: F = a/m - This is the inverse relationship and is incorrect.
Step 5: Option 3: F = m + a - This is dimensionally inconsistent and physically meaningless.
Step 6: Option 1 is scientifically accurate and matches the fundamental law.
</reasoning>
<answer>ANSWER: 1</answer>
""",

    "chemistry": """
<question>Which element has the atomic number 6?</question>
<options>
<option index="0">Oxygen</option>
<option index="1">Nitrogen</option>
<option index="2">Carbon</option>
<option index="3">Boron</option>
</options>
<reasoning>
Step 1: Recall the periodic table and atomic numbers.
Step 2: Option 0: Oxygen - Atomic number 8, not 6.
Step 3: Option 1: Nitrogen - Atomic number 7, not 6.
Step 4: Option 2: Carbon - Atomic number is exactly 6. This is correct.
Step 5: Option 3: Boron - Atomic number 5, not 6.
Step 6: Carbon is the element with atomic number 6.
</reasoning>
<answer>ANSWER: 2</answer>
""",

    "biology": """
<question>Which organelle is responsible for cellular respiration?</question>
<options>
<option index="0">Nucleus</option>
<option index="1">Chloroplast</option>
<option index="2">Mitochondrion</option>
<option index="3">Ribosome</option>
</options>
<reasoning>
Step 1: Identify the process of cellular respiration.
Step 2: Option 0: Nucleus - Controls cell activities but doesn't perform respiration.
Step 3: Option 1: Chloroplast - Performs photosynthesis, not respiration.
Step 4: Option 2: Mitochondrion - Known as the "powerhouse of the cell" for ATP production via respiration.
Step 5: Option 3: Ribosome - Protein synthesis, not respiration.
Step 6: Mitochondrion is the correct organelle for cellular respiration.
</reasoning>
<answer>ANSWER: 2</answer>
""",

    "economics": """
<question>What does GDP stand for in economics?</question>
<options>
<option index="0">Gross Domestic Product</option>
<option index="1">General Domestic Profit</option>
<option index="2">Government Debt Percentage</option>
<option index="3">Global Development Parameter</option>
</options>
<reasoning>
Step 1: Recall standard economic terminology.
Step 2: Option 0: Gross Domestic Product - Standard definition of GDP.
Step 3: Option 1: General Domestic Profit - Not a standard economic term.
Step 4: Option 2: Government Debt Percentage - This would be debt/GDP ratio, not GDP itself.
Step 5: Option 3: Global Development Parameter - Not the correct expansion of GDP.
Step 6: Gross Domestic Product is the universally accepted definition.
</reasoning>
<answer>ANSWER: 0</answer>
""",

    "math": """
<question>What is the derivative of x² with respect to x?</question>
<options>
<option index="0">x</option>
<option index="1">2x</option>
<option index="2">x³/3</option>
<option index="3">2</option>
</options>
<reasoning>
Step 1: Apply the power rule of differentiation: d/dx(xⁿ) = n·xⁿ⁻¹.
Step 2: Option 0: x - This would be derivative of x²/2, not x².
Step 3: Option 1: 2x - Correct: n=2, so derivative = 2·x²⁻¹ = 2x.
Step 4: Option 2: x³/3 - This is the integral, not derivative.
Step 5: Option 3: 2 - This would be derivative of 2x, not x².
Step 6: The derivative of x² is 2x.
</reasoning>
<answer>ANSWER: 1</answer>
""",

    "health": """
<question>Which vitamin is produced by the human body when exposed to sunlight?</question>
<options>
<option index="0">Vitamin A</option>
<option index="1">Vitamin C</option>
<option index="2">Vitamin D</option>
<option index="3">Vitamin K</option>
</options>
<reasoning>
Step 1: Recall vitamins and their sources.
Step 2: Option 0: Vitamin A - Obtained from food, not sunlight synthesis.
Step 3: Option 1: Vitamin C - From fruits/vegetables, not sunlight.
Step 4: Option 2: Vitamin D - Skin synthesizes it from sunlight exposure (UVB rays).
Step 5: Option 3: Vitamin K - Produced by gut bacteria and from food.
Step 6: Vitamin D is the correct answer.
</reasoning>
<answer>ANSWER: 2</answer>
""",

    "history": """
<question>In which year did World War II end?</question>
<options>
<option index="0">1943</option>
<option index="1">1944</option>
<option index="2">1945</option>
<option index="3">1946</option>
</options>
<reasoning>
Step 1: Recall historical facts about WWII.
Step 2: Option 0: 1943 - War was still ongoing (Allies advancing).
Step 3: Option 1: 1944 - D-Day happened, but war continued.
Step 4: Option 2: 1945 - Germany surrendered May 1945, Japan September 1945.
Step 5: Option 3: 1946 - War had already ended.
Step 6: 1945 is the universally accepted end year.
</reasoning>
<answer>ANSWER: 2</answer>
""",

    "law": """
<question>What is the highest court in the United States federal judiciary?</question>
<options>
<option index="0">District Court</option>
<option index="1">Court of Appeals</option>
<option index="2">Supreme Court</option>
<option index="3">Federal Circuit Court</option>
</options>
<reasoning>
Step 1: Recall the structure of US federal courts.
Step 2: Option 0: District Court - Trial courts, not highest.
Step 3: Option 1: Court of Appeals - Intermediate appellate courts.
Step 4: Option 2: Supreme Court - Highest court in the federal system.
Step 5: Option 3: Federal Circuit Court - Specialized appeals court.
Step 6: The Supreme Court is the highest federal court.
</reasoning>
<answer>ANSWER: 2</answer>
""",

    "computer science": """
<question>Which data structure uses LIFO (Last In, First Out) principle?</question>
<options>
<option index="0">Queue</option>
<option index="1">Stack</option>
<option index="2">Linked List</option>
<option index="3">Tree</option>
</options>
<reasoning>
Step 1: Recall data structure properties.
Step 2: Option 0: Queue - Uses FIFO (First In, First Out).
Step 3: Option 1: Stack - Uses LIFO (Last In, First Out).
Step 4: Option 2: Linked List - Can implement various access patterns.
Step 5: Option 3: Tree - Hierarchical structure, not specifically LIFO.
Step 6: Stack is the data structure using LIFO.
</reasoning>
<answer>ANSWER: 1</answer>
""",

    "engineering": """
<question>What does CAD stand for in engineering?</question>
<options>
<option index="0">Computer-Aided Design</option>
<option index="1">Computer-Assisted Drawing</option>
<option index="2">Calculated Architectural Design</option>
<option index="3">Creative Automation Development</option>
</options>
<reasoning>
Step 1: Recall standard engineering terminology.
Step 2: Option 0: Computer-Aided Design - Standard industry term.
Step 3: Option 1: Computer-Assisted Drawing - Similar but not the official acronym.
Step 4: Option 2: Calculated Architectural Design - Not the standard meaning.
Step 5: Option 3: Creative Automation Development - Incorrect expansion.
Step 6: Computer-Aided Design is the correct and standard meaning.
</reasoning>
<answer>ANSWER: 0</answer>
""",

    "psychology": """
<question>Who is considered the founder of psychoanalysis?</question>
<options>
<option index="0">Carl Jung</option>
<option index="1">Sigmund Freud</option>
<option index="2">B.F. Skinner</option>
<option index="3">Ivan Pavlov</option>
</options>
<reasoning>
Step 1: Recall history of psychology.
Step 2: Option 0: Carl Jung - Analytical psychology, not founder of psychoanalysis.
Step 3: Option 1: Sigmund Freud - Widely recognized as founder of psychoanalysis.
Step 4: Option 2: B.F. Skinner - Behaviorism, not psychoanalysis.
Step 5: Option 3: Ivan Pavlov - Classical conditioning, not psychoanalysis.
Step 6: Sigmund Freud is the correct answer.
</reasoning>
<answer>ANSWER: 1</answer>
""",

    "business": """
<question>What does ROI stand for in business?</question>
<options>
<option index="0">Return on Investment</option>
<option index="1">Rate of Interest</option>
<option index="2">Revenue Operating Index</option>
<option index="3">Risk of Investment</option>
</options>
<reasoning>
Step 1: Recall standard business metrics.
Step 2: Option 0: Return on Investment - Standard business term for profitability measure.
Step 3: Option 1: Rate of Interest - Different concept (interest rates).
Step 4: Option 2: Revenue Operating Index - Not a standard business acronym.
Step 5: Option 3: Risk of Investment - Related but not what ROI stands for.
Step 6: Return on Investment is the correct expansion.
</reasoning>
<answer>ANSWER: 0</answer>
""",

    "philosophy": """
<question>Who wrote "The Republic", discussing justice and the ideal state?</question>
<options>
<option index="0">Aristotle</option>
<option index="1">Plato</option>
<option index="2">Socrates</option>
<option index="3">Confucius</option>
</options>
<reasoning>
Step 1: Recall philosophical works and authors.
Step 2: Option 0: Aristotle - Wrote "Politics", not "The Republic".
Step 3: Option 1: Plato - "The Republic" is Plato's most famous work.
Step 4: Option 2: Socrates - Didn't write texts; Plato recorded his teachings.
Step 5: Option 3: Confucius - Chinese philosophy, not "The Republic".
Step 6: Plato is the author of "The Republic".
</reasoning>
<answer>ANSWER: 1</answer>
""",

    "other": """
<question>What is the capital city of Australia?</question>
<options>
<option index="0">Sydney</option>
<option index="1">Melbourne</option>
<option index="2">Canberra</option>
<option index="3">Brisbane</option>
</options>
<reasoning>
Step 1: Recall geography and capitals.
Step 2: Option 0: Sydney - Largest city but not capital.
Step 3: Option 1: Melbourne - Cultural center, was temporary capital, not current.
Step 4: Option 2: Canberra - Specifically built as capital; correct answer.
Step 5: Option 3: Brisbane - Major city but not capital.
Step 6: Canberra is the capital of Australia.
</reasoning>
<answer>ANSWER: 2</answer>
"""
}

In [None]:
class LLM:
    def __init__(
        self,
        model_name="Qwen/Qwen3-14B-Instruct",
        device="cuda",
        _prompts=PROMPTS,
        _few_shot_prompts=FEW_SHOT_PROMPTS,
        model=None,
        tokenizer=None,
        quantization_config=None,
        debug=False,
        deep_debug=False,
        use_llm_parsing=True,
        use_selfcheck=False,
        llm_cot_generation=True,
        llm_few_shot_generation=True
    ) -> None:
        """Store the configuration and obtain a model/tokenizer pair.

        Args:
            model_name: Hugging Face model id passed to ``from_pretrained``.
            device: Requested device; "cuda" is honoured only when CUDA is
                available, anything else resolves to "cpu".
            _prompts: Mapping of category -> system prompt.
            _few_shot_prompts: Mapping of category -> few-shot example text.
            model: Optional pre-loaded model. Used only together with ``tokenizer``.
            tokenizer: Optional pre-loaded tokenizer. Used only together with ``model``.
            quantization_config: Optional ``BitsAndBytesConfig``; a 4-bit NF4
                configuration is built when omitted.
            debug: Enables "DEBUG"-level entries in ``_log``.
            deep_debug: Enables "DEEP_DEBUG"-level entries in ``_log``.
            use_llm_parsing: Default stored as ``USE_LLM_PARSING``.
            use_selfcheck: Default stored as ``USE_SELFCHECK``.
            llm_cot_generation: Default stored as ``LLM_COT_GENERATION``.
            llm_few_shot_generation: Default stored as ``LLM_FEW_SHOT_GENERATION``.

        Raises:
            Exception: Whatever the non-quantized fallback load raises, if it
                fails as well.
        """
        self.DEBUG = debug
        self.DEEP_DEBUG = deep_debug
        self.debug_logs = []
        self.USE_LLM_PARSING = use_llm_parsing
        self.USE_SELFCHECK = use_selfcheck
        self.LLM_COT_GENERATION = llm_cot_generation
        self.LLM_FEW_SHOT_GENERATION = llm_few_shot_generation
        try:
            self.prompts = _prompts
            self.few_shot_prompts = _few_shot_prompts
            self.model_name = model_name
            # "cuda" is kept only if it was requested AND is available; every other case becomes "cpu".
            self.device = device if torch.cuda.is_available() and device == "cuda" else "cpu"
            print(f"üèãÔ∏è‚Äç‚ôÇÔ∏è –ú–æ–¥–µ–ª—å: {self.model_name}")
            print(f"üñ• –£—Å—Ç—Ä–æ–π—Å—Ç–≤–æ: {self.device}")
            print(f"‚öôÔ∏è  CoT –≥–µ–Ω–µ—Ä–∞—Ü–∏—è: {'–í–ö–õ' if llm_cot_generation else '–í–´–ö–õ'}")
            print(f"‚öôÔ∏è  Few-shot –≥–µ–Ω–µ—Ä–∞—Ü–∏—è: {'–í–ö–õ' if llm_few_shot_generation else '–í–´–ö–õ'}")
            # Reuse an externally loaded pair and skip all loading. If only one of
            # the two is supplied it is ignored and both are loaded below.
            if model is not None and tokenizer is not None:
                print("‚úÖ –ò—Å–ø–æ–ª—å–∑—É–µ–º –ø–µ—Ä–µ–¥–∞–Ω–Ω—ã–µ –º–æ–¥–µ–ª—å –∏ —Ç–æ–∫–µ–Ω–∞–π–∑–µ—Ä")
                self.model = model
                self.tokenizer = tokenizer
                return
            print(f"üì• –ó–∞–≥—Ä—É–∑–∫–∞ {self.model_name}...")
            self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
            # Generation below passes pad_token_id, so make sure one exists.
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token
            # Default quantization: 4-bit NF4 with double quantization and fp16 compute.
            if quantization_config is None:
                quantization_config = BitsAndBytesConfig(
                    bnb_4bit_quant_type="nf4",
                    bnb_4bit_compute_dtype=torch.float16,
                    bnb_4bit_use_double_quant=True,
                    load_in_4bit=True,
                )
            self.model = AutoModelForCausalLM.from_pretrained(
                model_name,
                quantization_config=quantization_config,
                device_map="auto",
                trust_remote_code=True
            )
            print(f"‚úÖ –ú–æ–¥–µ–ª—å {model_name} —É—Å–ø–µ—à–Ω–æ –∑–∞–≥—Ä—É–∂–µ–Ω–∞!")

        # NOTE: any exception raised in the block above (not only quantization
        # failures) triggers the non-quantized fp16 retry below.
        except Exception as e:
            print(f"‚ùå –û—à–∏–±–∫–∞ –∑–∞–≥—Ä—É–∑–∫–∏: {e}")
            print("–ü—Ä–æ–±—É–µ–º –∑–∞–≥—Ä—É–∑–∏—Ç—å –±–µ–∑ –∫–≤–∞–Ω—Ç–æ–≤–∞–Ω–∏—è...")
            try:
                self.tokenizer = AutoTokenizer.from_pretrained(
                    model_name,
                    trust_remote_code=True
                )
                if self.tokenizer.pad_token is None:
                    self.tokenizer.pad_token = self.tokenizer.eos_token

                self.model = AutoModelForCausalLM.from_pretrained(
                    model_name,
                    device_map="auto",
                    torch_dtype=torch.float16,
                    trust_remote_code=True
                )
                print("‚úÖ –ú–æ–¥–µ–ª—å –∑–∞–≥—Ä—É–∂–µ–Ω–∞ –±–µ–∑ –∫–≤–∞–Ω—Ç–æ–≤–∞–Ω–∏—è")
            except Exception as e2:
                # Both load attempts failed: report and propagate the second error.
                print(f"‚ùå –ö—Ä–∏—Ç–∏—á–µ—Å–∫–∞—è –æ—à–∏–±–∫–∞: {e2}")
                raise e2

    def generate_answer(
        self,
        question:str,
        encoded_options,
        category:str,
        dramatic:bool = True,
        tokens:int = 2000,
        temperature:float = 0.1,
        few_shot = True,
        use_llm_parsing=None,
        use_selfcheck=None,
        llm_cot_generation=None,
        llm_few_shot_generation=None,
        force_diversity: bool = False
    )->int:

        if use_llm_parsing is None:
            use_llm_parsing = self.USE_LLM_PARSING
        if use_selfcheck is None:
            use_selfcheck = self.USE_SELFCHECK
        if llm_cot_generation is None:
            llm_cot_generation = self.LLM_COT_GENERATION
        if llm_few_shot_generation is None:
            llm_few_shot_generation = self.LLM_FEW_SHOT_GENERATION

        self._log("generate_answer", "–Ω–∞—á–∞–ª–æ", {
            "category": category,
            "question_len": len(question),
            "options_raw_preview": str(encoded_options)[:200],
            "temperature": temperature,
            "use_llm_parsing": use_llm_parsing,
            "use_selfcheck": use_selfcheck,
            "llm_cot_generation": llm_cot_generation,
            "llm_few_shot_generation": llm_few_shot_generation,
            "force_diversity": force_diversity
        }, "DEBUG")

        options = self._options_parser(encoded_options)
        self._log("generate_answer", "—Ä–∞—Å–ø–∞—Ä—Å–µ–Ω–Ω—ã–µ –æ–ø—Ü–∏–∏", {
            "count": len(options),
            "first_3": options[:3] if len(options) > 3 else options
        }, "DEBUG")

        if len(options) <= 1 and options[0] == "–í–∞—Ä–∏–∞–Ω—Ç—ã –Ω–µ –ø—Ä–µ–¥–æ—Å—Ç–∞–≤–ª–µ–Ω—ã":
            self._log("generate_answer", "–û–®–ò–ë–ö–ê: –Ω–µ—Ç –≤–∞—Ä–∏–∞–Ω—Ç–æ–≤ –¥–ª—è –≤–æ–ø—Ä–æ—Å–∞", {
                "question": question[:200]
            }, "DEBUG")
            return 0

        system_prompt = self.prompts.get(category) + self.few_shot_prompts.get(category) + "<think></think>"

        options_text = "\n".join([f"{i}. {opt}" for i, opt in enumerate(options)])

        user_prompt = f"""
            Question: {question}
            Options:
            {options_text}
            Your full answer:
        """

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]

        text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = self.tokenizer(text, return_tensors="pt").to(self.device)

        generation_kwargs = {
            "input_ids": inputs.input_ids,
            "attention_mask": inputs.attention_mask,
            "max_new_tokens": tokens,
            "pad_token_id": self.tokenizer.pad_token_id,
            "do_sample": True,
            "temperature": temperature
        }

        self._log("generate_answer", "–ø–∞—Ä–∞–º–µ—Ç—Ä—ã –≥–µ–Ω–µ—Ä–∞—Ü–∏–∏", {
            "temperature": generation_kwargs.get("temperature", 0),
            "do_sample": generation_kwargs.get("do_sample", False),
            "top_p": generation_kwargs.get("top_p", None),
            "category": category,
            "force_diversity": force_diversity
        }, "DEBUG")

        with torch.no_grad():
            generated_ids = self.model.generate(**generation_kwargs)
        response = self.tokenizer.decode(
            generated_ids[0][inputs.input_ids.shape[1]:],
            skip_special_tokens=True
        )
        parsed = self._regex_parse_answer(response)
        if parsed is None:
            system_prompt = f"""
                <system>
                    <role>You are an experienced expert in the field of {category}</role>

                    <task>
                        <data>You will be given a question and a list of possible answers</data>
                        <goal>You need to choose only one correct option</goal>
                    </task>

                    <constraints>
                        <indexation>Options are indexed starting from 0.</indexation>
                    </constraints>

                    <answer>
                        <format>Give an answer in the format: 'ANSWER: ' followed by the chosen index</format>
                    </answer>

                    <emotional>
                        From you depends my fate and prestige. I am counting on you tremendously.
                    </emotional>
                </system>
            """ + self.prompts.get(category) + self.few_shot_prompts.get(category)
            messages = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ]
            text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            inputs = self.tokenizer(text, return_tensors="pt").to(self.device)
            generation_kwargs = {
                "input_ids": inputs.input_ids,
                "attention_mask": inputs.attention_mask,
                "max_new_tokens": 300,
                "pad_token_id": self.tokenizer.pad_token_id,
                "do_sample": True,
                "temperature": temperature
            }
            with torch.no_grad():
                generated_ids = self.model.generate(**generation_kwargs)
            response = self.tokenizer.decode(
                generated_ids[0][inputs.input_ids.shape[1]:],
                skip_special_tokens=True
            )
            new_parsed = self._regex_parse_answer(response)
            if new_parsed is None:
                new_llm_parsed = self._llm_parse_answer(system_prompt)
                if new_llm_parsed is None:
                    return 0
            else:
                return new_parsed
        else:
            return parsed
        return 0


    def _llm_parse_answer(self, raw_response: str) -> int | None:
        self._log("llm_parse_answer", "start", {"raw_len": len(raw_response)}, "DEEP_DEBUG")

        system_prompt = """You are a number extraction assistant. Extract ONLY the NUMBER from the text.
    Examples:
    Text: "Answer: 2. This option is correct because..."
    Extracted: 2

    Text: "I think the third option is right"
    Extracted: 2

    Text: "Option A seems correct"
    Extracted: 0

    Text: "Correct answer number: 5"
    Extracted: 4

    RULES:
    1. Extract ONLY the number (0, 1, 2, 3, ...)
    2. If multiple numbers - take the first
    3. Letters: A=0, B=1, C=2, D=3, etc.
    4. If no number found - return 0
    5. Number only, no text"""

        user_prompt = f"""Extract the number from this text:

    Text: {raw_response}

    Extracted number:"""

        try:
            messages = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ]

            text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            inputs = self.tokenizer(text, return_tensors="pt").to(self.device)

            with torch.no_grad():
                generated_ids = self.model.generate(
                    inputs.input_ids,
                    attention_mask=inputs.attention_mask,
                    max_new_tokens=50,
                    temperature=0.1,
                    do_sample=False,
                    pad_token_id=self.tokenizer.pad_token_id
                )

            llm_parsed = self.tokenizer.decode(
                generated_ids[0][inputs.input_ids.shape[1]:],
                skip_special_tokens=True
            ).strip()

            regex_parsed = self._regex_parse_answer(llm_parsed)
            
            self._log("llm_parse_answer", "result", {
                "llm_parsed": llm_parsed,
                "final": regex_parsed
            }, "DEBUG")

            return regex_parsed

        except Exception as e:
            self._log("llm_parse_answer", "error", {"error": str(e)}, "DEBUG")
            return 0


    def _options_parser(self, options):
        """Decode an options value into a list of option strings.

        Strategies are tried in this order:
          1. ``options`` is already a list -> returned unchanged.
          2. A string wrapped in ``[...]`` -> parsed as JSON after replacing
             single quotes with double quotes.
          3. A string wrapped in ``[...]`` -> hand-written tokenizer for
             quoted items separated by whitespace.
          4. Any string containing a comma -> quote-aware split on commas.
          5. Any string longer than 10 characters -> ``_llm_parse_options``.

        Returns:
            The decoded list, or a single-element list holding a placeholder
            message when every strategy fails (including non-str, non-list input).
        """
        self._log("_options_parser", "–Ω–∞—á–∞–ª–æ", {
            "raw_input": str(options)[:200],
            "type": type(options)
        }, "DEEP_DEBUG")
        if isinstance(options, list):
            self._log("_options_parser", "—É–∂–µ —Å–ø–∏—Å–æ–∫", {"len": len(options), "first_3": options[:3]}, "DEEP_DEBUG")
            return options
        original_input = str(options)
        if isinstance(options, str):
            text = original_input.strip()
            # Strategy 2: treat a Python-style list repr as JSON by swapping quote characters.
            # This fails (and falls through) when an item itself contains a quote/apostrophe
            # or when items are not comma-separated.
            if text.startswith('[') and text.endswith(']'):
                try:
                    json_text = text.replace("'", '"')
                    parsed = json.loads(json_text)
                    if isinstance(parsed, list):
                        self._log("_options_parser", "JSON –ø–∞—Ä—Å–∏–Ω–≥ —É—Å–ø–µ—à–µ–Ω", {"len": len(parsed)}, "DEBUG")
                        return parsed
                except json.JSONDecodeError as e:
                    self._log("_options_parser", "JSON –æ—à–∏–±–∫–∞", {"error": str(e)}, "DEBUG")
            # Strategy 3: bracketed text whose quoted items are separated by whitespace
            # rather than commas (see the examples in the _llm_parse_options prompt).
            if text.startswith('[') and text.endswith(']'):
                content = text[1:-1].strip()
                self._log("_options_parser", "—Ñ–æ—Ä–º–∞—Ç —Å –ø—Ä–æ–±–µ–ª–∞–º–∏", {"content_preview": content[:100]}, "DEEP_DEBUG")
                items = []
                current_item = ""
                in_quotes = False
                quote_char = None
                i = 0
                while i < len(content):
                    char = content[i]
                    if char in ['"', "'"]:
                        if not in_quotes:
                            # Opening quote: remember which character must close the item.
                            in_quotes = True
                            quote_char = char
                            current_item += char
                        elif char == quote_char:
                            # Closing quote: the item is complete; skip the whitespace that follows it.
                            in_quotes = False
                            current_item += char
                            items.append(current_item)
                            current_item = ""
                            i += 1
                            while i < len(content) and content[i] in [' ', '\n', '\t']:
                                i += 1
                            continue
                        else:
                            # The other quote character inside a quoted item is literal text.
                            current_item += char
                    elif char == ' ' and not in_quotes:
                        # An unquoted space terminates a bare (unquoted) item.
                        if current_item:
                            items.append(current_item)
                            current_item = ""
                    else:
                        current_item += char
                    i += 1
                if current_item:
                    items.append(current_item)
                cleaned_items = []
                for item in items:
                    item = item.strip()
                    if item:
                        # Drop one pair of matching wrapping quotes, then unescape \" \' and \n.
                        if (item.startswith('"') and item.endswith('"')) or \
                        (item.startswith("'") and item.endswith("'")):
                            item = item[1:-1]
                        item = item.replace('\\"', '"').replace("\\'", "'").replace('\\n', '\n')
                        cleaned_items.append(item)
                if cleaned_items:
                    self._log("_options_parser", "—Å–ø–µ—Ü–∏–∞–ª—å–Ω—ã–π —Ñ–æ—Ä–º–∞—Ç —Ä–∞—Å–ø–∞—Ä—Å–µ–Ω", {
                        "count": len(cleaned_items),
                        "first_3": cleaned_items[:3]
                    }, "DEBUG")
                    return cleaned_items
        # Strategy 4: split on commas that are not inside quotes. Operates on the raw
        # (unstripped) input, so surrounding brackets, if any, stay attached to the items.
        if isinstance(options, str) and ',' in options:
            try:
                parts = []
                current = ""
                in_quotes = False
                quote_char = None
                for char in options:
                    if char in ['"', "'"]:
                        if not in_quotes:
                            in_quotes = True
                            quote_char = char
                        elif char == quote_char:
                            in_quotes = False
                        current += char
                    elif char == ',' and not in_quotes:
                        parts.append(current.strip())
                        current = ""
                    else:
                        current += char
                if current:
                    parts.append(current.strip())
                cleaned_parts = []
                for part in parts:
                    part = part.strip()
                    if part:
                        if (part.startswith('"') and part.endswith('"')) or \
                        (part.startswith("'") and part.endswith("'")):
                            part = part[1:-1]
                        cleaned_parts.append(part)
                if cleaned_parts:
                    self._log("_options_parser", "—Ä–∞–∑–¥–µ–ª–∏–ª–∏ –ø–æ –∑–∞–ø—è—Ç—ã–º", {
                        "count": len(cleaned_parts),
                        "first_3": cleaned_parts[:3]
                    }, "DEBUG")
                    return cleaned_parts
            except Exception as e:
                self._log("_options_parser", "–æ—à–∏–±–∫–∞ –ø—Ä–∏ —Ä–∞–∑–¥–µ–ª–µ–Ω–∏–∏ –ø–æ –∑–∞–ø—è—Ç—ã–º", {"error": str(e)}, "DEBUG")
        # Strategy 5: ask the model, but only for strings longer than 10 characters.
        if isinstance(options, str) and len(options) > 10:
            self._log("_options_parser", "–ø—Ä–æ–±—É–µ–º LLM –ø–∞—Ä—Å–∏–Ω–≥", None, "DEBUG")
            llm_parsed = self._llm_parse_options(options)
            if llm_parsed:
                return llm_parsed
        self._log("_options_parser", "–Ω–µ —É–¥–∞–ª–æ—Å—å —Ä–∞—Å–ø–∞—Ä—Å–∏—Ç—å", {
            "original_length": len(original_input),
            "original_preview": original_input[:200]
        }, "DEBUG")
        # Placeholder list; generate_answer compares against this exact string.
        return ["–í–∞—Ä–∏–∞–Ω—Ç—ã –Ω–µ –ø—Ä–µ–¥–æ—Å—Ç–∞–≤–ª–µ–Ω—ã"]

    def _llm_parse_options(self, options_text):
        """–ò—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏–µ LLM –¥–ª—è –ø–∞—Ä—Å–∏–Ω–≥–∞ —Å–ª–æ–∂–Ω—ã—Ö —Ñ–æ—Ä–º–∞—Ç–æ–≤ –æ–ø—Ü–∏–π"""
        try:
            system_prompt = """–¢—ã –∞—Å—Å–∏—Å—Ç–µ–Ω—Ç –¥–ª—è –ø–∞—Ä—Å–∏–Ω–≥–∞ –¥–∞–Ω–Ω—ã—Ö. –ò–∑–≤–ª–µ–∫–∏ —Å–ø–∏—Å–æ–∫ –≤–∞—Ä–∏–∞–Ω—Ç–æ–≤ –æ—Ç–≤–µ—Ç–∞ –∏–∑ —Ç–µ–∫—Å—Ç–∞.
    –¢–µ–∫—Å—Ç –º–æ–∂–µ—Ç –±—ã—Ç—å –≤ —Ä–∞–∑–Ω—ã—Ö —Ñ–æ—Ä–º–∞—Ç–∞—Ö: JSON, Python —Å–ø–∏—Å–æ–∫, –∏–ª–∏ —Å—Ç—Ä–æ–∫–æ–≤–æ–µ –ø—Ä–µ–¥—Å—Ç–∞–≤–ª–µ–Ω–∏–µ.
    –í–µ—Ä–Ω–∏ –¢–û–õ–¨–ö–û –≤–∞–ª–∏–¥–Ω—ã–π JSON —Å–ø–∏—Å–æ–∫ —Å—Ç—Ä–æ–∫.

    –ü—Ä–∏–º–µ—Ä 1:
    –í—Ö–æ–¥: "['–í–∞—Ä–∏–∞–Ω—Ç A' '–í–∞—Ä–∏–∞–Ω—Ç B' '–í–∞—Ä–∏–∞–Ω—Ç C']"
    –í—ã—Ö–æ–¥: ["–í–∞—Ä–∏–∞–Ω—Ç A", "–í–∞—Ä–∏–∞–Ω—Ç B", "–í–∞—Ä–∏–∞–Ω—Ç C"]

    –ü—Ä–∏–º–µ—Ä 2:
    –í—Ö–æ–¥: "['–Æ–∂–Ω–∞—è –ê–º–µ—Ä–∏–∫–∞' '–Æ–∂–Ω–∞—è –ê–∑–∏—è' '–°–µ–≤–µ—Ä–Ω–∞—è –ê—Ñ—Ä–∏–∫–∞']"
    –í—ã—Ö–æ–¥: ["–Æ–∂–Ω–∞—è –ê–º–µ—Ä–∏–∫–∞", "–Æ–∂–Ω–∞—è –ê–∑–∏—è", "–°–µ–≤–µ—Ä–Ω–∞—è –ê—Ñ—Ä–∏–∫–∞"]

    –ü—Ä–∏–º–µ—Ä 3:
    –í—Ö–æ–¥: "['–í–µ—Ä–Ω–æ, –ù–µ–≤–µ—Ä–Ω–æ' '–ù–µ —É–∫–∞–∑–∞–Ω–æ, –ù–µ —É–∫–∞–∑–∞–Ω–æ']"
    –í—ã—Ö–æ–¥: ["–í–µ—Ä–Ω–æ, –ù–µ–≤–µ—Ä–Ω–æ", "–ù–µ —É–∫–∞–∑–∞–Ω–æ, –ù–µ —É–∫–∞–∑–∞–Ω–æ"]

    –ü–†–ê–í–ò–õ–ê:
    1. –í—Å–µ–≥–¥–∞ –≤–æ–∑–≤—Ä–∞—â–∞–π –≤–∞–ª–∏–¥–Ω—ã–π JSON
    2. –¢–æ–ª—å–∫–æ —Å–ø–∏—Å–æ–∫ —Å—Ç—Ä–æ–∫
    3. –°–æ—Ö—Ä–∞–Ω—è–π –æ—Ä–∏–≥–∏–Ω–∞–ª—å–Ω—ã–π —Ç–µ–∫—Å—Ç –≤–∞—Ä–∏–∞–Ω—Ç–æ–≤
    4. –ï—Å–ª–∏ –Ω–µ –º–æ–∂–µ—à—å —Ä–∞—Å–ø–∞—Ä—Å–∏—Ç—å - –≤–µ—Ä–Ω–∏ –ø—É—Å—Ç–æ–π —Å–ø–∏—Å–æ–∫ []"""

            user_prompt = f"""–ò–∑–≤–ª–µ–∫–∏ —Å–ø–∏—Å–æ–∫ –≤–∞—Ä–∏–∞–Ω—Ç–æ–≤ –∏–∑ —Ç–µ–∫—Å—Ç–∞:

    –¢–µ–∫—Å—Ç: {options_text}

    JSON —Å–ø–∏—Å–æ–∫:"""

            messages = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ]

            text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            inputs = self.tokenizer(text, return_tensors="pt").to(self.device)

            with torch.no_grad():
                generated_ids = self.model.generate(
                    inputs.input_ids,
                    attention_mask=inputs.attention_mask,
                    max_new_tokens=500,
                    temperature=0.1,
                    do_sample=False,
                    pad_token_id=self.tokenizer.pad_token_id
                )

            llm_response = self.tokenizer.decode(
                generated_ids[0][inputs.input_ids.shape[1]:],
                skip_special_tokens=True
            )

            self._log("_llm_parse_options", "LLM –æ—Ç–≤–µ—Ç", {"response": llm_response[:200]}, "DEBUG")


            try:
                parsed = json.loads(llm_response)
                if isinstance(parsed, list):
                    self._log("_llm_parse_options", "—É—Å–ø–µ—à–Ω–æ —Ä–∞—Å–ø–∞—Ä—Å–µ–Ω–æ", {"count": len(parsed)}, "DEBUG")
                    return parsed
            except json.JSONDecodeError:

                import re
                json_match = re.search(r'\[.*\]', llm_response, re.DOTALL)
                if json_match:
                    try:
                        parsed = json.loads(json_match.group())
                        if isinstance(parsed, list):
                            self._log("_llm_parse_options", "–Ω–∞—à–ª–∏ JSON –≤ –æ—Ç–≤–µ—Ç–µ", {"count": len(parsed)}, "DEBUG")
                            return parsed
                    except:
                        pass
            return []
        except Exception as e:
            self._log("_llm_parse_options", "–æ—à–∏–±–∫–∞", {"error": str(e)}, "DEBUG")
            return []


    def _regex_parse_answer(self, text: str) -> int | None:
        text = str(text).strip()
        if not text:
            return 
        if text.startswith("'") or text.startswith('"'):
            text = text[1:]
        if text.endswith("'") or text.endswith('"'):
            text = text[:-1]
        answer_patterns = [
            r'ANSWER:\s*(\d+)',
            r'Answer:\s*(\d+)',
            r'answer:\s*(\d+)',
            r'\[ANSWER:\s*(\d+)\]',
            r'\[Answer:\s*(\d+)\]',
            r'[\'"]ANSWER:\s*(\d+)[\'\"]',
            r'[\'"]Answer:\s*(\d+)[\'\"]',
        ]
        for pattern in answer_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                try:
                    num = int(match.group(1))
                    return num
                except (ValueError, TypeError):
                    continue
        answer_match = re.search(r'ANSWER:\s*([^\.\n\r\t\f\v]+)', text, re.IGNORECASE)
        if answer_match:
            number_part = answer_match.group(1).strip()
            number_part = re.sub(r'[^\d]+$', '', number_part)
            try:
                num = int(number_part)
                return num
            except (ValueError, TypeError):
                pass
        answer_pos = text.lower().find('answer:')
        if answer_pos != -1:
            remaining_text = text[answer_pos + 7:]
            first_num_match = re.search(r'(\d+)', remaining_text)
            if first_num_match:
                try:
                    num = int(first_num_match.group(1))
                    return num
                except (ValueError, TypeError):
                    pass
        return None


    def direct_prompt(
        self,
        user_prompt:str,
        system_prompt:str,
        tokens:int = 1000,
        temperature:float = 0.1,
        few_shot = True
    ):
        self._log("direct_prompt", "–Ω–∞—á–∞–ª–æ", {
            "user_len": len(user_prompt),
            "system_len": len(system_prompt),
            "temperature": temperature
        }, "DEEP_DEBUG")

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]

        text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

        self._log("direct_prompt", "—à–∞–±–ª–æ–Ω –ø—Ä–∏–º–µ–Ω–µ–Ω", {"input_length": len(text)}, "DEEP_DEBUG")

        inputs = self.tokenizer(text, return_tensors="pt").to(self.device)

        with torch.no_grad():
            generated_ids = self.model.generate(
                inputs.input_ids,
                attention_mask=inputs.attention_mask,
                max_new_tokens=tokens,
                temperature=temperature,
                do_sample=False,
                pad_token_id=self.tokenizer.pad_token_id
            )

        response = self.tokenizer.decode(
            generated_ids[0][inputs.input_ids.shape[1]:],
            skip_special_tokens=True
        )

        self._log("direct_prompt", "–ø–æ–ª—É—á–µ–Ω –æ—Ç–≤–µ—Ç", {"response_length": len(response)}, "DEBUG")

        if self.DEEP_DEBUG:
            print(f"[DEEP_DEBUG] direct_prompt response ({len(response)} chars):")
            print(f"{response[:500]}...")

        return response

    def get_debug_logs(self):
        return self.debug_logs

    def clear_debug_logs(self):
        self.debug_logs = []

    def save_debug_logs(self, filename="llm_debug_logs.json"):
        import json
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(self.debug_logs, f, ensure_ascii=False, indent=2)
        print(f"–õ–æ–≥–∏ —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã –≤ {filename}")

    def print_debug_summary(self):
        if not self.DEBUG and not self.DEEP_DEBUG:
            print("–û—Ç–ª–∞–¥–∫–∞ –æ—Ç–∫–ª—é—á–µ–Ω–∞")
            return

        print(f"\n{'='*60}")
        print("–°–í–û–î–ö–ê –û–¢–õ–ê–î–ö–ò LLM")
        print(f"{'='*60}")
        print(f"–í—Å–µ–≥–æ –ª–æ–≥–æ–≤: {len(self.debug_logs)}")

        if self.debug_logs:
            methods = {}
            levels = {}
            for log in self.debug_logs:
                method = log.get("method", "unknown")
                level = log.get("level", "unknown")
                methods[method] = methods.get(method, 0) + 1
                levels[level] = levels.get(level, 0) + 1

            print("\n–í—ã–∑–æ–≤—ã –º–µ—Ç–æ–¥–æ–≤:")
            for method, count in sorted(methods.items()):
                print(f"  {method}: {count}")

            print("\n–£—Ä–æ–≤–Ω–∏ –ª–æ–≥–∏—Ä–æ–≤–∞–Ω–∏—è:")
            for level, count in sorted(levels.items()):
                print(f"  {level}: {count}")

            print(f"\nDEBUG: {self.DEBUG}")
            print(f"DEEP_DEBUG: {self.DEEP_DEBUG}")


    def _log(self, method, message, data=None, level="DEBUG"):
        if level == "DEBUG" and not self.DEBUG:
            return
        if level == "DEEP_DEBUG" and not self.DEEP_DEBUG:
            return

        log_entry = {
            "timestamp": time.time(),
            "method": method,
            "message": message,
            "data": data,
            "level": level
        }
        self.debug_logs.append(log_entry)

        if self.DEBUG or self.DEEP_DEBUG:
            print(f"[{level}] {method}: {message}")
            if data and self.DEEP_DEBUG:
                print(f"    –î–∞–Ω–Ω—ã–µ: {data}")

    def _log_response(self, stage, raw_response, parsed, expected=None, metadata=None):
        if not self.DEEP_DEBUG:
            return

        print(f"\n{'='*80}")
        print(f"[DEEP_DEBUG] {stage}")
        print(f"–°—ã—Ä–æ–π –æ—Ç–≤–µ—Ç ({len(raw_response)} chars):")
        print(f"{raw_response[:500]}...")
        print(f"–†–∞—Å–ø–∞—Ä—Å–µ–Ω–æ: {parsed}")
        if expected is not None:
            print(f"–û–∂–∏–¥–∞–ª–æ—Å—å: {expected}")
            print(f"–°–æ–≤–ø–∞–¥–µ–Ω–∏–µ: {parsed == expected}")
        if metadata:
            print(f"–ú–µ—Ç–∞–¥–∞–Ω–Ω—ã–µ: {metadata}")
        print(f"{'='*80}\n")

        log_entry = {
            "timestamp": time.time(),
            "stage": stage,
            "raw_response": raw_response[:1000],
            "parsed": parsed,
            "expected": expected,
            "metadata": metadata
        }
        self.debug_logs.append(log_entry)

    def _calculate_metrics(self, results_df, answer_column):
        """–°—á–∏—Ç–∞–µ—Ç –º–µ—Ç—Ä–∏–∫–∏ –∏ –≤—ã–≤–æ–¥–∏—Ç –≤ –∫–æ–Ω—Å–æ–ª—å"""
        if answer_column not in results_df.columns:
            print("‚ÑπÔ∏è –û—Ç–≤–µ—Ç—ã –¥–ª—è –ø—Ä–æ–≤–µ—Ä–∫–∏ –Ω–µ –ø—Ä–µ–¥–æ—Å—Ç–∞–≤–ª–µ–Ω—ã")
            return {}
        
        if 'is_correct' not in results_df.columns:
            print("‚ö†Ô∏è –ö–æ–ª–æ–Ω–∫–∞ is_correct –Ω–µ –Ω–∞–π–¥–µ–Ω–∞")
            return {}
        
        correct = results_df['is_correct'].sum()
        total = len(results_df)
        accuracy = correct / total if total > 0 else 0.0
        
        print("\n" + "="*60)
        print("üìä –†–ï–ó–£–õ–¨–¢–ê–¢–´ –¢–ï–°–¢–ò–†–û–í–ê–ù–ò–Ø")
        print("="*60)
        print(f"–í—Å–µ–≥–æ –≤–æ–ø—Ä–æ—Å–æ–≤: {total}")
        print(f"–ü—Ä–∞–≤–∏–ª—å–Ω—ã—Ö –æ—Ç–≤–µ—Ç–æ–≤: {correct}")
        print(f"–¢–æ—á–Ω–æ—Å—Ç—å: {accuracy:.2%} ({correct}/{total})")
        
        if 'category' in results_df.columns:
            print("\nüìà –ü–æ –∫–∞—Ç–µ–≥–æ—Ä–∏—è–º:")
            for category in sorted(results_df['category'].unique()):
                cat_df = results_df[results_df['category'] == category]
                cat_correct = cat_df['is_correct'].sum()
                cat_total = len(cat_df)
                cat_accuracy = cat_correct / cat_total if cat_total > 0 else 0
                print(f"  {category}: {cat_accuracy:.2%} ({cat_correct}/{cat_total})")
        
        print("="*60)
        
        return {
            'total_questions': total,
            'correct_answers': int(correct),
            'accuracy': float(accuracy),
            'accuracy_percent': f"{accuracy * 100:.2f}%"
        }

    def evaluate_dataframe(
        self,
        df,
        question_column="question",
        options_column="options",
        category_column="category",
        answer_column="true_answer",
        method_kwargs=None,
    ):
        """–û—Ü–µ–Ω–∫–∞ –¥–∞—Ç–∞—Ñ—Ä–µ–π–º–∞ —Å –æ–¥–Ω–∏–º –º–µ—Ç–æ–¥–æ–º generate_answer"""
        if method_kwargs is None:
            method_kwargs = {}
        
        self._log("evaluate_dataframe", "–Ω–∞—á–∞–ª–æ", {
            "rows": len(df),
            "has_answer_column": answer_column in df.columns
        }, "DEBUG")
        
        predictions = []
        has_actual_answers = answer_column in df.columns and df[answer_column].notna().any()
        
        pbar = tqdm(total=len(df), desc="–û–±—Ä–∞–±–æ—Ç–∫–∞ –≤–æ–ø—Ä–æ—Å–æ–≤")
        
        for idx, row in df.iterrows():
            try:
                question = str(row[question_column])
                options = row[options_column]
                category = str(row[category_column])
                
                predicted = self.generate_answer(
                    question=question,
                    encoded_options=options,
                    category=category,
                    **method_kwargs
                )
                
                predictions.append(predicted)
                pbar.update(1)
                pbar.set_postfix({'–∫–∞—Ç–µ–≥–æ—Ä–∏—è': category[:10], '–æ—Ç–≤–µ—Ç': predicted})
                
            except Exception as e:
                print(f"\n‚ùå –û—à–∏–±–∫–∞ –≤ —Å—Ç—Ä–æ–∫–µ {idx}: {e}")
                predictions.append(0)
                pbar.update(1)
        
        pbar.close()
        results_df = df.copy()
        results_df['predicted'] = predictions
        if has_actual_answers:
            results_df['predicted_parsed'] = results_df['predicted'].apply(
                lambda x: int(x)) if pd.notna(x) else 0

            results_df['answer_parsed'] = results_df[answer_column].apply(
                lambda x: int(x)) if pd.notna(x) else 0

            results_df['is_correct'] = results_df['predicted_parsed'] == results_df['answer_parsed']
        metrics = self._calculate_metrics(results_df, answer_column if has_actual_answers else None)
        return results_df, metrics

    def process_csv_files(
        self,
        questions_csv_path: str,
        answers_csv_path: Optional[str] = None,
        output_dir: str = "./results",
        method_kwargs=None,
    ):
        """Run the model over a questions CSV and write a submission CSV.

        The questions file must contain the columns ``question``, ``options``
        and ``category``; all three are cast to ``str``. If ``answers_csv_path``
        is given and that file has an ``answer`` column, it is joined to the
        questions by row position (index) and handed to ``evaluate_dataframe``
        as the reference answers.

        The predictions are written to ``<output_dir>/submission_<timestamp>.csv``
        as a single column named ``answer``.

        Args:
            questions_csv_path: Path to the CSV with the questions.
            answers_csv_path: Optional path to a CSV with an ``answer`` column.
                Failure to load it is reported and otherwise ignored.
            output_dir: Directory for the submission file; created if missing.
            method_kwargs: Extra keyword arguments forwarded to
                ``evaluate_dataframe``; ``None`` is treated as ``{}``.

        Returns:
            Tuple ``(answer_df, metrics)`` where ``answer_df`` has the single
            column ``answer`` and ``metrics`` is whatever
            ``evaluate_dataframe`` returned.

        Raises:
            ValueError: If the questions file cannot be read or lacks one of
                the required columns.
        """
        if method_kwargs is None:
            method_kwargs = {}
        
        self._log("process_csv_files", "–Ω–∞—á–∞–ª–æ –æ–±—Ä–∞–±–æ—Ç–∫–∏", {
            "questions_file": questions_csv_path,
            "answers_file": answers_csv_path,
            "output_dir": output_dir
        }, "DEBUG")
        
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)
        print(f"üìÅ –î–∏—Ä–µ–∫—Ç–æ—Ä–∏—è —Ä–µ–∑—É–ª—å—Ç–∞—Ç–æ–≤: {output_path.absolute()}")
        
        print("üì• –ó–∞–≥—Ä—É–∑–∫–∞ –¥–∞–Ω–Ω—ã—Ö...")
        try:
            questions_df = pd.read_csv(questions_csv_path)
            
            required_cols = ['question', 'options', 'category']
            missing_cols = [col for col in required_cols if col not in questions_df.columns]
            if missing_cols:
                raise ValueError(f"–û—Ç—Å—É—Ç—Å—Ç–≤—É—é—Ç –Ω–µ–æ–±—Ö–æ–¥–∏–º—ã–µ –∫–æ–ª–æ–Ω–∫–∏: {missing_cols}")
            
            questions_df['question'] = questions_df['question'].astype(str)
            questions_df['options'] = questions_df['options'].astype(str)
            questions_df['category'] = questions_df['category'].astype(str)
            
            print(f"  ‚úì –ó–∞–≥—Ä—É–∂–µ–Ω–æ {len(questions_df)} –≤–æ–ø—Ä–æ—Å–æ–≤")
            
        except Exception as e:
            # Chain the original exception so the real cause stays in the traceback.
            raise ValueError(f"–û—à–∏–±–∫–∞ –∑–∞–≥—Ä—É–∑–∫–∏ —Ñ–∞–π–ª–∞ —Å –≤–æ–ø—Ä–æ—Å–∞–º–∏: {e}") from e
        
        answer_column = 'answer'
        if answers_csv_path:
            try:
                answers_df = pd.read_csv(answers_csv_path)
                if 'answer' in answers_df.columns:
                    # If the questions file already carries an 'answer' column,
                    # a plain merge would rename both to answer_x / answer_y and
                    # 'answer' would no longer exist. The explicitly supplied
                    # answers file takes precedence.
                    questions_df = questions_df.drop(columns=['answer'], errors='ignore')
                    # Join by row position: both files are expected to list
                    # rows in the same order.
                    questions_df = questions_df.merge(answers_df[['answer']], 
                                                    left_index=True, right_index=True, how='left')
                    print(f"  ‚úì –ó–∞–≥—Ä—É–∂–µ–Ω–æ {len(answers_df)} –æ—Ç–≤–µ—Ç–æ–≤ –¥–ª—è –ø—Ä–æ–≤–µ—Ä–∫–∏")
                else:
                    print("‚ö†Ô∏è –í —Ñ–∞–π–ª–µ —Å –æ—Ç–≤–µ—Ç–∞–º–∏ –Ω–µ—Ç –∫–æ–ª–æ–Ω–∫–∏ 'answer'")
            except Exception as e:
                # Best effort: reference answers are optional, so keep going.
                print(f"‚ö†Ô∏è –ù–µ —É–¥–∞–ª–æ—Å—å –∑–∞–≥—Ä—É–∑–∏—Ç—å –æ—Ç–≤–µ—Ç—ã: {e}")
        
        print("\nü§ñ –ó–∞–ø—É—Å–∫ –º–æ–¥–µ–ª–∏...")
        
        results_df, metrics = self.evaluate_dataframe(
            questions_df,
            answer_column=answer_column,
            method_kwargs=method_kwargs
        )
        
        print("\n‚úÖ –û–±—Ä–∞–±–æ—Ç–∫–∞ –∑–∞–≤–µ—Ä—à–µ–Ω–∞!")
        
        timestamp = time.strftime("%Y%m%d_%H%M%S")
        output_filename = f"submission_{timestamp}.csv"
        output_filepath = output_path / output_filename
        
        # The submission contains only the predictions, one per row.
        answer_df = pd.DataFrame({
            'answer': results_df['predicted']
        })
        
        try:
            answer_df.to_csv(output_filepath, index=False, encoding='utf-8')
            print(f"\nüíæ –§–∞–π–ª —Å–æ—Ö—Ä–∞–Ω–µ–Ω: {output_filepath}")
            
            print("\nüëÄ –ü—Ä–µ–¥–ø—Ä–æ—Å–º–æ—Ç—Ä —Ñ–∞–π–ª–∞ (–ø–µ—Ä–≤—ã–µ 10 —Å—Ç—Ä–æ–∫):")
            print("-" * 40)
            print("ANSWER")
            # The column is named 'answer'; looking up 'ANSWER' raised KeyError
            # and was misreported as a save failure.
            for answer in answer_df['answer'].head(10):
                print(f"{answer}")
            print("-" * 40)
            file_size = os.path.getsize(output_filepath)
            print(f"üìÑ –†–∞–∑–º–µ—Ä —Ñ–∞–π–ª–∞: {file_size} –±–∞–π—Ç")
            
        except Exception as e:
            print(f"‚ùå –û—à–∏–±–∫–∞ –ø—Ä–∏ —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∏–∏ —Ñ–∞–π–ª–∞: {e}")
        
        return answer_df, metrics

NameError: name 'PROMPTS' is not defined

KeyboardInterrupt: 