In [None]:
# credits:
# https://www.kaggle.com/code/olyatsimboy/aimo-openmath-mistral-baseline
# https://www.kaggle.com/code/aatiffraz/prompt-prediction-w-mixtral-mistral7b-gemma-llama
# https://www.kaggle.com/code/thedrcat/aimo-mixtral-baseline

In [2]:
import pandas as pd
import numpy as np
import re
import random
from transformers import pipeline
from tqdm import tqdm

2024-04-18 01:23:30.476015: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-18 01:23:30.476299: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-18 01:23:30.757764: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Zero-shot MMOS-DeepSeekMath-7B with self-consistency and generated code reasoning evaluation

Self-consistency is a modification of the standard greedy decoding in reasoning pipelines via sampling several diverse answers followed by aggregation, e.g., most common answer ([SC-CoT paper](https://arxiv.org/pdf/2203.11171.pdf)).

In this kernel, we will consider MMOS-DeepSeekMath-7B RL-tuned backbone; in my experiments, this model produces more consistent code reasoning and the code block execution will allow us to decrease arithmetic hallucinations.

In [3]:
!pip install -U /kaggle/input/bitsandbytes-0-42-0-py3-none-any-whl/bitsandbytes-0.42.0-py3-none-any.whl -qq

In [4]:
import torch
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer, 
    BitsAndBytesConfig, 
    AutoConfig,
    set_seed
)

set_seed(42)

MODEL_PATH = "/kaggle/input/open-math-mistral"

quantization_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

config = AutoConfig.from_pretrained(MODEL_PATH)
config.gradient_checkpointing = True


tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True,
#     quantization_config=quantization_config,
    config=config
)
pipeline= pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype='auto',
    device_map='auto',

)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
import torch
torch.backends.cuda.enable_mem_efficient_sdp(False) # this script processes data, solves problems, handles exceptions

In [6]:
train = pd.read_csv("/kaggle/input/ai-mathematical-olympiad-prize/train.csv")
train.head()

test = pd.read_csv("/kaggle/input/ai-mathematical-olympiad-prize/test.csv")
test.head()

Unnamed: 0,id,problem
0,000aaa,What is $1-1$?
1,111bbb,What is $0\times10$?
2,222ccc,Solve $4+x=4$ for $x$.


In [7]:
prompt = """You are Math Professor, an exceptionally intelligent Professor tasked with solving intricate mathematical problems.

Math Professor will be given a mathematical problem written in LaTeX, and will provide a precise answer, adhering to the following rules:
- Math Professor guarantees a correct answer, always within the range of 0 to 999.
- Answers will be concise and limited to a single number.
- Math Professor will respond even when unsure or when the question is not fully understood.
- Only the final result will be provided, no additional information.
- The answer written in the "Answer" section must be concise and only include the final result.
- Math Professor will always follow these rules.

# Question
Let $D(n)$ denote the number of ways of writing the positive integer $n$ as a product\[n = f_1\cdot f_2\cdots f_k,\]where $k\ge1$, the $f_i$ are integers strictly greater than $1$, and the order in which the factors are listed matters (that is, two representations that differ only in the order of the factors are counted as distinct). For example, the number $6$ can be written as $6$, $2\cdot 3$, and $3\cdot2$, so $D(6) = 3$. What is $D(96)$? 

# Answer (only one number between 0 and 999)
112

{examples}


# Question
{question}

# Answer (only one number between 0 and 999)
"""

In [8]:
def out(examples_df: pd.DataFrame | None , df, template):
    submission = {"id": [], "answer": []}

    examples = ""
    if examples_df is not None and not examples_df.empty:
        examples = []
        for idx, row in examples_df.iterrows():
            examples.append("# Question")
            examples.append(str(row["problem"]))
            examples.append("# Answer (only one number between 0 and 999)")
            examples.append(str(row["answer"]))
        examples = "\n".join(examples)
    
    for idx, row in tqdm(df.iterrows()):
        try:
            model_input = template.format(examples=examples, question=row["problem"])
            response = pipeline(model_input, do_sample=False, max_new_tokens=3)
            output = response[0]['generated_text']
            
            output = int(re.sub(r"[^0-9]", "", output))

            submission["id"].append(row["id"])
            submission["answer"].append(output)
            torch.cuda.empty_cache()
            gc.collect()
        except Exception as e:
            print(f"Exception: {e}")
            submission["id"].append(row["id"])
            submission["answer"].append(random.randint(0, 999))
            
    submission_df = pd.DataFrame(submission)
    submission_df["answer"] = submission_df["answer"].apply(lambda x: abs(x) % 1000)
    return submission_df

linkcode
The out function generates submissions based on provided examples, a main DataFrame, a template, and a machine learning model. Here's how it works:

Input Parameters:

model: The machine learning model used to generate answers.
examples_df: A DataFrame containing example questions and answers (optional).
df: The main DataFrame containing questions.
template: A template string used to format questions and generate answers.
Example Formatting:

If examples_df is provided and not empty, the function formats it into a string.
Submission Generation:

For each row in the main DataFrame (df):
It attempts to generate an answer using the provided template and model.
If successful, it appends the answer to the submission dictionary.
If an exception occurs during answer generation, it prints the exception and adds a random answer instead.
Final Submission DataFrame:

The function converts the submission dictionary into a DataFrame.
It ensures that the answer values are within the range of 0 to 999.
Output:

The function returns the submission DataFrame containing question IDs and their respective answers.

In [9]:
output = out(train, test, prompt)
output

0it [00:00, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
1it [00:10, 10.57s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Exception: name 'gc' is not defined


2it [00:18,  8.85s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Exception: name 'gc' is not defined


3it [00:25,  8.63s/it]

Exception: name 'gc' is not defined





Unnamed: 0,id,answer
0,000aaa,990
1,000aaa,654
2,111bbb,990
3,111bbb,114
4,222ccc,990
5,222ccc,25
