# Self-Refine Test 

## 1. Load environment

In [1]:
%pip install dashscope python-dotenv urllib3

Defaulting to user installation because normal site-packages is not writeable
Collecting dashscope
  Using cached dashscope-1.22.2-py3-none-any.whl.metadata (6.8 kB)
Using cached dashscope-1.22.2-py3-none-any.whl (1.3 MB)
Installing collected packages: dashscope
Successfully installed dashscope-1.22.2
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [39]:
import os
import dashscope
import sys
import platform
import dashscope
import time
import re
from dotenv import load_dotenv
from typing import Tuple, Optional, Dict, Union
from dashscope.api_entities.dashscope_response import Message

In [None]:
# Step 1: read DASHSCOPE_API_KEY from the local .env file into the process environment.
load_dotenv("dashscope_api_key.env")
api_key = os.getenv("DASHSCOPE_API_KEY")
if api_key is None or api_key == "":
    print("DASHSCOPE_API_KEY not found!")

# Summarise the runtime so the captured output documents where the notebook was run.
key_status = "✅ Yes" if api_key else "❌ No"
env_report = (
    "Current environment information",
    "-" * 40,
    f"Python version: {sys.version}",
    f"Platform system: {platform.system()} {platform.release()}",
    f"DASHSCOPE_API_KEY read successfully: {key_status}",
)
for report_line in env_report:
    print(report_line)

✅ Current environment information
----------------------------------------
Python version: 3.10.16 | packaged by Anaconda, Inc. | (main, Dec 11 2024, 16:19:12) [MSC v.1929 64 bit (AMD64)]
Platform system: Windows 10
DASHSCOPE_API_KEY read successfully: ✅ Yes


## 2. Call Function

In [46]:
def call_model(
    prompt: str,
    model_name: str = "qwen2.5-math-1.5b-instruct",
    system_prompt: str = (
        "You are a helpful and precise math tutor. "
        "Always solve the problem step-by-step. "
        "Then, at the very end of your answer, write on a new line:\n"
        "**Final Answer: <your numeric result>**\n"
        "Do not forget this final answer format. It is required."
    ),
    temperature: float = 0.7,
    max_retries: int = 3,
    retry_delay: float = 1.0,
    verbose: bool = False
) -> Tuple[Optional[str], Optional[dict]]:
    """
    Call a Qwen model via DashScope with retry support and a structured prompt.

    Args:
        prompt: User message sent to the model.
        model_name: DashScope model identifier.
        system_prompt: System message that precedes the user message.
        temperature: Sampling temperature forwarded to the API.
        max_retries: Maximum number of attempts before giving up.
        retry_delay: Seconds to wait between two consecutive attempts.
        verbose: Accepted for interface compatibility; this function does not
            print the prompt or response itself (callers print what they need).

    Returns:
        (content, usage) on success, where `content` is the text of the first
        choice and `usage` is whatever the response carries under "usage"
        ({} when absent). (None, None) if the API key is missing or every
        attempt failed.
    """
    api_key = os.getenv("DASHSCOPE_API_KEY")
    if not api_key:
        print("DASHSCOPE_API_KEY not found!")
        return None, None

    messages = [
        Message(role="system", content=system_prompt),
        Message(role="user", content=prompt)
    ]

    for attempt in range(1, max_retries + 1):
        try:
            response = dashscope.Generation.call(
                api_key=api_key,
                model=model_name,
                messages=messages,
                temperature=temperature,
                result_format="message",
                timeout=60
            )

            status_code = response["status_code"]  # type: ignore
            if status_code == 200:
                content = response["output"]["choices"][0]["message"]["content"]  # type: ignore
                usage = response.get("usage", {})  # type: ignore
                return content, usage

            if status_code == 429:
                print(f"⚠️[Attempt {attempt}/{max_retries}] Rate limit! Retrying in {retry_delay}s...")
            else:
                # Report the API error explicitly instead of letting the missing
                # "output" payload surface as an opaque TypeError below.
                print(
                    f"⚠️[Attempt {attempt}/{max_retries}] Request failed with status {status_code}: "
                    f"{response.get('code')} - {response.get('message')}"  # type: ignore
                )

        except Exception as e:
            # Best-effort: any client/network/parsing error counts as a failed attempt.
            print(f"⚠️[Attempt {attempt}/{max_retries}] Exception occurred: {e}")

        # Only wait when another attempt will actually follow.
        if attempt < max_retries:
            time.sleep(retry_delay)

    print(f"All {max_retries} attempts failed.")
    return None, None

In [45]:
# Smoke test: send one simple word problem through call_model and show the raw result.
question = (
    "A baker makes 20 cupcakes. He eats 2 and gives away 3. "
    "How many does he have left?"
)

content, usage = call_model(
    verbose=True,
    prompt="Solve the following problem step-by-step:\n" + question,
)
# Bare tuple as the last expression so the notebook displays both values.
(content, usage)

("To determine how many cupcakes the baker has left, we need to follow these steps:\n\n1. Start with the initial number of cupcakes the baker has, which is 20.\n2. Subtract the number of cupcakes the baker eats, which is 2.\n3. Subtract the number of cupcakes the baker gives away, which is 3.\n\nLet's perform the calculations step-by-step:\n\n1. Initial number of cupcakes: 20\n2. After eating 2 cupcakes: \\(20 - 2 = 18\\)\n3. After giving away 3 cupcakes: \\(18 - 3 = 15\\)\n\nSo, the number of cupcakes the baker has left is \\(\\boxed{15}\\).",
 GenerationUsage(input_tokens=102, output_tokens=149))

## 3. Extract Answer

In [None]:
def extract_final_answer(response_text: str) -> str:
    r"""
    Extract the final numeric or concise answer from the given response text.

    Strategies, in order of priority:
    1. The last "Final Answer: ..." line (case-insensitive). Markdown/LaTeX
       wrappers such as ``**``, backticks, ``$`` and a trailing period are
       stripped, and a ``\boxed{...}`` inside the line is unwrapped.
    2. The last ``\boxed{...}`` block (nested braces supported). The last
       recognised numeric form inside it is returned, or the raw box content
       if it contains none.
    3. The last recognised numeric form anywhere in the text.
    4. The whole response, stripped.

    Args:
        response_text: Full model response.

    Returns:
        The extracted answer string.
    """
    # 1) "Final Answer: ..." has the highest priority. Use the LAST occurrence,
    #    consistent with the "last box" / "last number" rules below.
    fa_candidates = re.findall(r"(?i)final answer\s*:\s*([^\n]+)", response_text)
    if fa_candidates:
        candidate = _clean_answer_candidate(fa_candidates[-1])
        if candidate:
            return candidate
        # Nothing but formatting characters after the label: try the other strategies.

    # 2) Last \boxed{...}; prefer the last numeric form inside it.
    all_boxes = _find_boxed_contents(response_text)
    if all_boxes:
        box_content = all_boxes[-1]
        numbers_in_box = parse_numeric_expressions(box_content)
        if numbers_in_box:
            return numbers_in_box[-1].strip()
        return box_content.strip()

    # 3) Fallback: last recognised numeric form in the entire text.
    all_numbers = parse_numeric_expressions(response_text)
    if all_numbers:
        return all_numbers[-1].strip()

    # 4) Final fallback: the entire trimmed response.
    return response_text.strip()


# Characters that merely wrap an answer (markdown emphasis, code ticks, LaTeX/currency "$").
_ANSWER_WRAPPER_CHARS = " \t\r*_`$"


def _clean_answer_candidate(candidate: str) -> str:
    r"""Strip formatting wrappers from a "Final Answer" capture and unwrap a \boxed{...}."""
    cleaned = candidate.strip(_ANSWER_WRAPPER_CHARS)
    # A trailing sentence period is never part of the answer ("18." -> "18").
    cleaned = cleaned.rstrip(".").strip(_ANSWER_WRAPPER_CHARS)
    boxes = _find_boxed_contents(cleaned)
    if boxes:
        cleaned = boxes[-1].strip()
    return cleaned


def _find_boxed_contents(text: str) -> list:
    r"""Return the contents of every brace-balanced \boxed{...} in `text`, in order of appearance."""
    marker = "\\boxed{"
    contents = []
    start = text.find(marker)
    while start != -1:
        body_start = start + len(marker)
        depth = 1
        pos = body_start
        # Walk forward until the brace opened by \boxed{ is closed again.
        while pos < len(text) and depth:
            if text[pos] == "{":
                depth += 1
            elif text[pos] == "}":
                depth -= 1
            pos += 1
        if depth == 0:
            contents.append(text[body_start:pos - 1])
            resume_at = pos
        else:
            # Unbalanced box: skip this marker but keep looking for later ones.
            resume_at = body_start
        start = text.find(marker, resume_at)
    return contents


def parse_numeric_expressions(text: str) -> list:
    r"""
    Parse numeric expressions from the text, including:
    - Optional sign: [+/-]
    - Integers or decimals (e.g., 123, -3.14)
    - Simple LaTeX fractions: \frac{number}{number} (also \frac{number}/{number},
      braces optional), with an optional leading sign

    Returns a list of all matched strings in order of appearance.
    """
    # The fraction alternative comes first so a fraction is captured as one
    # token instead of being split into its numerator and denominator.
    # The "/" between the two groups is optional: standard LaTeX has none.
    pattern = r"""
        [+-]?\\frac\s*\{?\s*[+-]?\d+(?:\.\d+)?\s*\}?\s*/?\s*\{?\s*[+-]?\d+(?:\.\d+)?\s*\}?   # e.g. \frac{3}{5}, -\frac{1}{2}
        |
        [+-]?\d+(?:\.\d+)?              # e.g. -3, 4.5, +2.0
    """

    # VERBOSE allows the whitespace and inline comments in the pattern above.
    return re.findall(pattern, text, flags=re.VERBOSE)

## 4. Self-Refine

### Self-Refine: Iterative Feedback and Refinement

The **`self_refine`** function is designed to generate and refine a solution for a mathematical problem using multiple rounds of self-critique and improvement. It carefully orchestrates how the model:

1. **Produces an Initial Draft**  
   - The function prompts the model to solve the math problem in a rigorous, step-by-step manner.
   - This initial draft serves as the baseline that will be critiqued in the subsequent steps.

2. **Generates Feedback**  
   - In each round, the model is presented with its own prior output (the “draft answer”) along with the original problem.
   - It is asked to analyze correctness, clarity, and completeness, suggesting specific improvements if errors are found.  
   - If the model indicates “no improvement needed,” and at least one round of refinement has already occurred, the process may stop early.

3. **Refines the Answer**  
   - Following the feedback, the model refines its previous draft to address the identified gaps or errors.
   - This new refined answer replaces the old draft and can be further iterated upon if additional rounds are available or required.

4. **At-Least-One-Round Condition**  
   - The function ensures that the model goes through at least one refinement cycle. If the model declares “everything is correct” on the very first round, the function still carries out that round to solidify an improved version of the draft (if needed) before checking for early termination.

5. **Early Stopping**  
   - In subsequent rounds (after the first has completed), if the model’s feedback indicates that no more changes are necessary or that the draft is already correct, the iteration halts immediately. This “early stopping” heuristic reduces unnecessary computation.

6. **Usage and Round Tracking**  
   - Every call to the model (for the initial draft, feedback, or refinement) adds its usage data—such as the total token count—to a running total.
   - The function also records which round triggered the exit, whether due to reaching the maximum number of allowed rounds or stopping early after an “all good” declaration.

7. **Extracting the Final Answer**  
   - Once the iterations conclude, the final, refined text is passed through a function (e.g., `extract_final_answer`) to parse out the numeric result or short answer if desired.
   - This extracted value, along with the full refined text, the total token usage, and the index of the last refinement round, is returned to the caller.

### Key Points & Considerations

- **Rigorous System Prompt**  
  - The model is cast in the role of a “rigorous math tutor,” ensuring detailed reasoning and clarity in every generation.
  - > Since both temperature and top_p can control the diversity of generated text, it is recommended that you only set one of them.
    - Accordingly, only the temperature is set here, and it is lowered (default 0.2) to reduce diversity and favor rigorous mathematical results.

- **Feedback-Refine Loop**  
  The model alternates between critiquing its own output and improving it, thereby aiming to catch logical or computational mistakes in a structured manner.

- **Mandatory Improvement Round**  
  The function will not accept an immediate “no improvement” response for the first refinement attempt. This ensures that, at a minimum, the model has made one pass to reevaluate its initial draft.

- **Early Stop Heuristic**  
  After at least one refinement, if the model states that no further changes are needed, the loop terminates early. This method is a convenience to reduce overhead when the model is sufficiently confident.

- **No Absolute Guarantee of Correctness**  
  While the approach often enhances solution quality, the model’s own judgment may still fail to catch subtler errors. For high-stakes scenarios, external validation (e.g., test suites or domain experts) is recommended.

- **Return Structure**  
  The final output includes:
  1. **`final_text`**: The model’s last refined draft (full textual answer).  
  2. **`final_answer`**: The numeric or short result extracted from `final_text`.  
  3. **`ended_on_round`**: Which round triggered the exit, either because of reaching the maximum iteration or early stopping.  
  4. **`total_usage`**: A cumulative usage metric (e.g., total tokens).

This procedure illustrates a practical way to nudge large language models toward more self-critical and iteratively improved answers, particularly for math-oriented tasks.

In [None]:
def _usage_total_tokens(usage) -> int:
    """
    Return the token count carried by a usage record from `call_model`.

    Uses "total_tokens" when present; otherwise falls back to
    "input_tokens" + "output_tokens". Missing or empty usage counts as 0.
    """
    if not usage:
        return 0

    def _field(name):
        try:
            return usage[name]
        except (KeyError, TypeError):
            return None

    total = _field("total_tokens")
    if total is None:
        total = (_field("input_tokens") or 0) + (_field("output_tokens") or 0)
    return total


def self_refine(
    question: str,
    rounds: int = 3,
    verbose: bool = True,
    model_name: str = "qwen2.5-math-1.5b-instruct",
    temperature: float = 0.2,
) -> Dict:
    """
    Conduct multiple rounds of 'Self-Refine' for a math problem, returning the final refined answer.

    The model first writes a draft, then alternates between critiquing the
    current answer and rewriting it. From the second round on, feedback that
    contains "no improvement", "solution is correct" or "already correct"
    stops the loop early.

    Args:
        question (str): The math problem to be solved.
        rounds (int): The maximum number of refinement iterations.
        verbose (bool): Whether to print intermediate steps.
        model_name (str): The name of the language model to call.
        temperature (float): Sampling temperature for controlling randomness in the output.

    Returns:
        dict with keys:
          {
              "final_text"      : str or None (None if the initial draft failed),
              "final_answer"    : str or None (None if the initial draft failed),
              "ended_on_round"  : int (0 if no refinement round was started),
              "total_usage"     : int, tokens summed over every model call
          }
    """

    # -----------------------------
    # System prompt (for rigorous step-by-step math reasoning)
    # -----------------------------
    system_prompt = (
        "You are a rigorous math tutor. "
        "You solve problems step by step in detail and ensure mathematical correctness. "
        "Then, at the end, on a new line, write:\n"
        "Final Answer: <numeric result>\n"
        "Do not include any additional text after that line."
    )

    # -----------------------------
    # Template for the initial draft generation
    # -----------------------------
    first_prompt_template = (
        "Solve the following math problem with detailed reasoning. "
        "Explain your steps clearly and logically. "
        "At the end, please write:\n"
        "Final Answer: <number>\n\n"
        "{question}"
    )

    # -----------------------------
    # Template for generating feedback about the draft answer
    # -----------------------------
    feedback_prompt_template = (
        "Below is a math problem and a draft solution:\n\n"
        "Problem:\n{question}\n\n"
        "Draft Answer:\n{draft}\n\n"
        "Please analyze the draft solution for correctness, clarity, and completeness. "
        "Point out any errors or omissions, and suggest specific improvements. "
        "If the solution is already correct and needs no improvement, state that explicitly."
    )

    # -----------------------------
    # Template for refining the draft answer based on feedback
    # -----------------------------
    refine_prompt_template = (
        "Here is a math problem, a draft solution, and some feedback:\n\n"
        "Problem:\n{question}\n\n"
        "Draft Answer:\n{draft}\n\n"
        "Feedback:\n{feedback}\n\n"
        "Please refine the draft solution to correct errors and address the feedback. "
        "At the end, write 'Final Answer: <number>'. "
        "Ensure the steps are logically sound and the numeric result is correct."
    )

    # Feedback phrases treated as "nothing left to fix" (matched case-insensitively).
    no_change_markers = ("no improvement", "solution is correct", "already correct")

    total_usage_tokens = 0
    ended_on_round = 0

    def _ask(prompt: str) -> Optional[str]:
        """Send one prompt with the shared settings and add its token usage to the total."""
        nonlocal total_usage_tokens
        text, usage = call_model(
            prompt=prompt,
            model_name=model_name,
            system_prompt=system_prompt,
            temperature=temperature,
            verbose=verbose
        )
        total_usage_tokens += _usage_total_tokens(usage)
        return text

    # -----------------------------
    # 1) Generate the initial draft
    # -----------------------------
    draft = _ask(first_prompt_template.format(question=question))

    if draft is None:
        if verbose:
            print("❌[Error] Initial draft generation returned None.")
        return {
            "final_text": None,
            "final_answer": None,
            "ended_on_round": 0,
            "total_usage": total_usage_tokens
        }

    if verbose:
        print("\n✅━━ [Initial Draft Generated] ━━✅")
        print(draft)
        print("")  # Extra line

    current_answer = draft

    # -----------------------------
    # 2) Iterative refinement
    # -----------------------------
    for round_index in range(1, rounds + 1):
        # Every exit path below (break or normal completion) ends on this round.
        ended_on_round = round_index
        if verbose:
            print(f"[Refinement Round {round_index} of {rounds}]")

        # 2a) Generate feedback on the current answer
        feedback = _ask(
            feedback_prompt_template.format(question=question, draft=current_answer)
        )
        if feedback is None:
            if verbose:
                print("❌[Error] Feedback generation returned None. Stopping refinement.")
            break

        if verbose:
            print("✅ [Feedback Received]")
            print(feedback)
            print("")

        # 2b) Early stop, but never on the first round: at least one refinement always runs.
        lower_fb = feedback.lower()
        if round_index > 1 and any(marker in lower_fb for marker in no_change_markers):
            if verbose:
                print("✅[Notice] Feedback indicates no further improvement is needed. Stopping early.")
            break

        # 2c) Refine the answer using the feedback
        refined = _ask(
            refine_prompt_template.format(
                question=question,
                draft=current_answer,
                feedback=feedback
            )
        )
        if refined is None:
            if verbose:
                print("⚠️[Warning] Refinement returned None. Using current version and stopping.")
            break

        current_answer = refined
        if verbose:
            print("✅ [Refined Answer]")
            print(current_answer)
            print("")

    # -----------------------------
    # 3) Extract the final answer and return the collected info
    # -----------------------------
    return {
        "final_text": current_answer,
        "final_answer": extract_final_answer(current_answer),
        "ended_on_round": ended_on_round,
        "total_usage": total_usage_tokens
    }

## 5. Single Demo Question Test

In [48]:
if __name__ == "__main__":
    question = "Janet's ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?"

    # Gold answer
    gold_answer = "18"

    # Run self-refine on the single demo question
    result = self_refine(
        question=question,
        model_name="qwen2.5-math-1.5b-instruct",
        rounds=3
    )
    final_text = result["final_text"]
    final_answer = result["final_answer"]
    ended_on_round = result["ended_on_round"]
    total_usage = result["total_usage"]

    # Compare with gold. self_refine returns None for both text fields when the
    # initial draft could not be generated, so guard before calling str methods.
    is_correct = final_answer is not None and final_answer.strip() == gold_answer.strip()
    response_length = len(final_text) if final_text is not None else 0

    # Print the info
    print("\n━━━ ★  SINGLE QUESTION TEST  ★ ━━━")
    print(f"📌  Question:\n{question}\n")
    print(f"💡  Predicted Final Answer: {final_answer}")
    print(f"🎯  Gold Answer: {gold_answer}")
    print(f"✅  Correct? {is_correct}")
    print(f"🔎  Response Length (chars): {response_length}")
    print(f"🌀  Ended on Round: {ended_on_round}")
    print(f"📊  Total Usage (tokens): {total_usage}")
    print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n")


✅━━ [Initial Draft Generated] ━━✅
To determine how much Janet makes every day at the farmers' market, we need to follow these steps:

1. Calculate the total number of eggs laid by the ducks each day.
2. Determine how many eggs Janet eats for breakfast and bakes for her friends.
3. Find out how many eggs are left for sale at the farmers' market.
4. Calculate the revenue from selling these eggs at $2 per egg.

Let's start with the first step:

1. The total number of eggs laid by the ducks each day is 16.

2. Janet eats 3 eggs for breakfast every morning and bakes 4 eggs for her friends every day. So, the total number of eggs she uses or sells immediately is:
   \[
   3 + 4 = 7 \text{ eggs}
   \]

3. The number of eggs left for sale at the farmers' market is:
   \[
   16 - 7 = 9 \text{ eggs}
   \]

4. Since each egg is sold for $2, the revenue from selling 9 eggs is:
   \[
   9 \times 2 = 18 \text{ dollars}
   \]

Therefore, the amount Janet makes every day at the farmers' market is:
\[
