In [2]:
import pandas as pd
import numpy as np
from datasets import load_dataset, DatasetDict, Dataset
import sys
sys.path.append("src/utils/")
import text_processing_utils
import component_formatting_utils
import evaluate_utils
from concurrent.futures import ThreadPoolExecutor, as_completed
from engine import get_engine, invoke_engine
import os
import time, re, csv
import langchain_core
from tqdm import tqdm

  from pandas.core import (


In [3]:
# df = pd.read_csv("/Users/emilbiju/Downloads/base_model_output_gsm8k_DeepSeek-R1-Distill-Qwen-7B.csv")
# accuracy, num_problems_attempted, num_problems_incomplete, res_df_complete = evaluate_utils.get_eval_metrics(df)

In [4]:
def last_boxed_only_string(string: str):
    idx = string.rfind("\\boxed")
    if "\\boxed " in string:
        return "\\boxed " + string.split("\\boxed ")[-1].split("$")[0]
    if idx < 0:
        idx = string.rfind("\\fbox")
        if idx < 0:
            return None

    i = idx
    right_brace_idx = None
    num_left_braces_open = 0
    while i < len(string):
        if string[i] == "{":
            num_left_braces_open += 1
        if string[i] == "}":
            num_left_braces_open -= 1
            if num_left_braces_open == 0:
                right_brace_idx = i
                break
        i += 1

    if right_brace_idx is None:
        retval = None
    else:
        retval = string[idx : right_brace_idx + 1]

    return retval

def remove_boxed(s: str) -> str:
    if "\\boxed " in s:
        left = "\\boxed "
        assert s[: len(left)] == left
        return s[len(left) :]

    left = "\\boxed{"

    assert s[: len(left)] == left
    assert s[-1] == "}"

    return s[len(left) : -1]

In [5]:
model = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
inference_server_url = "http://34.173.72.140:8000/v1/"
dataset_name = "emilbiju/OpenThoughts-correct"
split = "train"
test_subset = "math"
ds = load_dataset(dataset_name, test_subset)
ds = ds[split]

In [6]:
ds = ds.map(lambda x: {'final_answer': evaluate_utils.remove_boxed(evaluate_utils.last_boxed_only_string(x['solution']))})


In [7]:
engine = get_engine(model, temperature=0.6, inference_server_url=inference_server_url)
# ds = load_dataset(dataset_name, test_subset)
# ds = ds[split]
num_samples = len(ds)

ds_cols = ds.column_names
question_col = [col for col in ds_cols if col in ["Question", "question", "Problem", "problem"]]
assert len(question_col) == 1, f"Found multiple question columns: {question_col}"
question_col = question_col[0]

final_answer_col = [col for col in ds_cols if col in ["FinalAnswer", "final_answer", "Answer", "answer"]]
assert len(final_answer_col) == 1, f"Found multiple final answer columns: {final_answer_col}"
final_answer_col = final_answer_col[0]

# problem_idx_col = [col for col in ds_cols if col in ["ProblemIdx", "problemidx", "unique_id"]]
# assert len(problem_idx_col) == 1, f"Found multiple problem index columns: {problem_idx_col}"
# problem_idx_col = problem_idx_col[0]
# if "/" in ds[problem_idx_col][0]:
#     ds = ds.map(lambda x: {'unique_id' : os.path.splitext(os.path.basename(x['unique_id']))[0]})

In [8]:
prompt = """Your role as an assistant involves thoroughly exploring questions through a systematic long thinking process before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle of analysis, summarizing, exploration, reassessment, reflection, backtracing, and iteration to develop well-considered thinking process. Please structure your response into two main sections: Thought and Solution. In the Thought section, detail your reasoning process using the specified format: <think> \{thought with steps separated with '\n\n'\} </think> Each step should include detailed considerations such as analisying questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, explorations, and reflections from the Thought section, systematically present the final solution that you deem correct. The solution should remain a logical, accurate, concise expression style and detail necessary step needed to reach the conclusion. Now, try to solve the following question through the above guidelines:

Return your final response within \\boxed{}.
"""

In [9]:
logs_dir = os.getenv("LOGS_DIR")                                                
base_model_output_dir = os.path.join(logs_dir, f"base_model_output_{model.replace('/', '_')}/{os.path.basename(dataset_name)}_{os.path.basename(test_subset)}")

In [10]:
os.makedirs(base_model_output_dir, exist_ok=True)

In [11]:
def process_question(i, engine):
    for retry in range(3):
        try:
            question = ds[question_col][i]
            # problem_idx = ds[problem_idx_col][i]
            true_final_answer = ds[final_answer_col][i]
            
            start_time = time.time()
            response = invoke_engine(engine, prompt+" "+question+"\n\n<think>\n")
            if type(response)==langchain_core.messages.ai.AIMessage:
                response = response.content
            end_time = time.time()
            elapsed_time = end_time - start_time
            
            try:
                predicted_final_answer = remove_boxed(last_boxed_only_string(response))
                if predicted_final_answer is not None and predicted_final_answer.strip() == "":
                    predicted_final_answer = None
            except:
                predicted_final_answer = None
            
            with open(os.path.join(base_model_output_dir, "base_model_output.csv"), mode='a', newline='') as file:
                writer = csv.writer(file)
                if file.tell() == 0:  # Check if file is empty to write the header
                    writer.writerow(["ProblemIdx", "PredictedFinalAnswer", "TrueFinalAnswer", "ElapsedTime", "Question", "ModelResponse"])
                writer.writerow([i, predicted_final_answer, true_final_answer, elapsed_time, question, response])
                
            break
        except Exception as e:
            import traceback
            print(f"Failed to process question with problem_idx={i} with error: {e}")
            traceback.print_exc()
            print(f"Using retry {retry+1}/3")
    return

In [12]:
with ThreadPoolExecutor(max_workers=20) as executor:
    futures = [executor.submit(process_question, j, engine) for j in tqdm(range(5110, num_samples))]
    try:
        for future in as_completed(futures, timeout=200):
            future.result()
    except Exception as e:
        print(f"Failed to process with error: {e}")

100%|██████████| 1153/1153 [00:00<00:00, 6612.55it/s]


Failed to process with error: 970 (of 1153) futures unfinished
