# Import Libraries 

In [2]:
import os
import re
import torch

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from tqdm.notebook import tqdm
tqdm.pandas()

import plotly.graph_objs as go
import plotly.express as px
from IPython.display import display, Markdown

import torch
import transformers
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer, 
    BitsAndBytesConfig, 
    AutoConfig,
    set_seed
)

# Configuration

In [22]:
!pip install -U /kaggle/input/bitsandbytes-0-42-0-py3-none-any-whl/bitsandbytes-0.42.0-py3-none-any.whl -qq

  pid, fd = os.forkpty()


In [4]:
set_seed(42)

2024-07-07 04:03:10.716628: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-07 04:03:10.716727: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-07 04:03:10.850063: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Data

In [32]:
df1 = pd.read_csv("/kaggle/input/all-data/combined_dataset.csv")

In [35]:
df2 = pd.read_csv("/kaggle/input/ai-mathematical-olympiad-prize/train.csv")

In [36]:
def clean_output(txt):
    try:
        txt = txt[-30:]
        pattern = r"(\d+)$"
        ans_cln = re.sub(r"\D", " ", txt).strip()
        matches = re.findall(pattern, ans_cln)
        return int(matches[0])
    except:
        return np.NaN

In [37]:
ans = []
for i, sol in df1[df1['answers'].isna()][['solutions']].iterrows():
    df1.loc[i, 'answers'] = clean_output(sol['solutions'])

In [10]:
df1 = df1.rename(columns={"questions":"problem", "answers":"answer", "solutions":"solution"})

In [11]:
def clean_sol(text):
    return re.sub(r'[\-~][^\s]*$', '', text).strip()


In [12]:
df1['solution'] = df1['solution'].fillna("").apply(clean_sol).values

In [13]:
df1 = df1[~df1['problem'].isna()].copy()

In [14]:
df1.shape

(13502, 3)

In [15]:
df = df1.copy()

In [16]:
def is_integer(text):
    try:
        if int(text) >= 0:
            return True
        else:
            return False
    except ValueError:
        return False
    
df["is_integer"] = df.answer.map(is_integer)
df = df[df.is_integer].reset_index(drop=True)
df.head(2)

Unnamed: 0,problem,solution,answer,is_integer
0,Every morning Aya goes for a $9$ -kilometer-lo...,$\frac{9}{s} + t = 4$ in hours and $\frac{9}{s...,204,True
1,Alice and Bob play the following game. A stack...,Let's first try some experimentation. Alice ob...,809,True


In [17]:
template = """Role:\nYou are an advanced AI system with exceptional mathematical reasoning and problem-solving capabilities, specifically designed to solve tricky math problems (whose answer is a non-negative integer) written in LaTeX format from the AI Mathematical Olympiad (AIMO) competition. Your task is to accurately analyze and solve intricate mathematical problems, demonstrating a deep understanding of mathematical concepts and a strong ability to apply logical reasoning strategies.\n\nInstruction:
1. Carefully read and comprehend the problem statement provided in the "Problem" section.
2. In the "Solution" section, provide a solution of the problem with detailed explanation of your logical reasoning process. Keep in mind that answer must be a non-negative integer number.
3. At the end, create a "Answer" section where you will state only the final numerical or algebraic answer, without any additional text or narrative."""

In [18]:
df.shape

(8300, 4)

In [19]:
df['solution'].values

array(['$\\frac{9}{s} + t = 4$ in hours and $\\frac{9}{s+2} + t = 2.4$ in hours. Subtracting the second equation from the first, we get, $\\frac{9}{s} - \\frac{9}{s+2} = 1.6$  Multiplying by $(s)(s+2)$ , we get $9s+18-9s=18=1.6s^{2} + 3.2s$  Multiplying by 5/2 on both sides, we get $0 = 4s^{2} + 8s - 45$  Factoring gives us $(2s-5)(2s+9) = 0$ , of which the solution we want is $s=2.5$ . Substituting this back to the first equation, we can find that $t = 0.4$ hours. Lastly, $s + \\frac{1}{2} = 3$ kilometers per hour, so $\\frac{9}{3} + 0.4 = 3.4$ hours, or $\\framebox{204}$ minutes',
       "Let's first try some experimentation. Alice obviously wins if there is one coin. She will just take it and win. If there are 2 remaining, then Alice will take one and then Bob will take one, so Bob wins. If there are $3$ , Alice will take $1$ , Bob will take one, and Alice will take the final one. If there are $4$ , Alice will just remove all $4$ at once. If there are $5$ , no matter what Alice does

In [20]:
df["prompt"] = df.progress_apply(lambda row: template.format(problem=row.problem,
                                                             solution=f"{row.solution}\n\nAnswer:\n{row.answer}"),
                                                             axis=1)
data = df.prompt.tolist()

  0%|          | 0/8300 [00:00<?, ?it/s]

In [23]:
MODEL_PATH = "deepseek-ai/deepseek-math-7b-rl"

quantization_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

config = AutoConfig.from_pretrained(MODEL_PATH)
config.gradient_checkpointing = True


tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True,
    config=config
)



config.json:   0%|          | 0.00/626 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/4.61M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


model.safetensors.index.json:   0%|          | 0.00/23.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-000002.safetensors:   0%|          | 0.00/8.59G [00:00<?, ?B/s]

model-00002-of-000002.safetensors:   0%|          | 0.00/5.23G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

In [24]:
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype='auto',
    device_map="auto",
)

In [25]:
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)

In [26]:
def extract_output(ans):
    pattern = r"(\d+)$"
    ans_cln = re.sub(r"\D", " ", ans).strip()
    matches = re.findall(pattern, ans_cln)
    return int(matches[0])

In [27]:
df2.iloc[0]['problem']

'The United States Postal Service charges an extra $\\$0.11$ in postage if the length of an envelope, in inches, divided by its height, in inches, is less than $1.3$ or greater than $2.5.$ For how many of these four envelopes must the extra $\\$0.11$ in postage be paid? \\begin{tabular}[t]{ccc}\nEnvelope & Length in inches & Height in inches\\\\\\hline\nA &6 &4\\\\\nB &9 &3\\\\\nC &6 &6\\\\\nD &11 &4\n\\end{tabular}'

## Zero Shot

In [28]:
def solve(problem, template):
    prompt = template + f"""\n\nProblem:\n{problem}\n\nSolution:\nAnswer:"""

    messages = [
        {"role": "user", "content": prompt}
    ]

    query_prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False
        )

    raw_output = pipeline(
                    query_prompt, 
                    max_new_tokens=10000, 
                    do_sample=True, 
                    temperature=0.8,
                    return_full_text=False
                )
    raw_output = raw_output[0]['generated_text']
    torch.cuda.empty_cache()
    return extract_output(raw_output)

In [38]:
actual_ans = []
pred_ans = []

for i, row in tqdm(df2.iterrows()):
    actual_ans.append(row.answer)
    pred_ans.append(solve(row.problem, template))

0it [00:00, ?it/s]

Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (4096). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.


In [39]:
actual_ans, pred_ans

([52, 250, 702, 800, 211, 199, 185, 320, 480, 199],
 [2, 12, 19, 0, 496, 99, 1921, 100, 479, 3999])

accuracy = 0

## Few Shot

In [81]:
context = df.sample(3)

In [82]:
context

Unnamed: 0,problem,solution,answer,is_integer,prompt
2007,What is the remainder when $2001 \cdot 2002 \c...,"Reducing each factor modulo 19 first, we see t...",11,True,Role:\nYou are an advanced AI system with exce...
5472,Solve for $x$: $\frac{x}2 + \frac{x}3 = 5$,Writing the left-hand side with a common denom...,6,True,Role:\nYou are an advanced AI system with exce...
4044,A line with slope 3 intersects a line with slo...,The two lines have equations \[\ny -15=3(x-10)...,2,True,Role:\nYou are an advanced AI system with exce...


In [83]:
template_fs = """Role:
You are an advanced AI system with exceptional mathematical reasoning and problem-solving capabilities, specifically designed to solve tricky math problems (whose answer is a non-negative integer) written in LaTeX format from the AI Mathematical Olympiad (AIMO) competition. Your task is to accurately analyze and solve intricate mathematical problems, demonstrating a deep understanding of mathematical concepts and a strong ability to apply logical reasoning strategies.

Instruction:
1. Carefully read and comprehend the problem statement provided in the "Problem" section.
2. In the "Solution" section, provide a solution of the problem with detailed explanation of your logical reasoning process. Keep in mind that answer must be a non-negative integer number.
3. At the end, create a "Answer" section where you will state only the final numerical or algebraic answer, without any additional text or narrative.

Below are few eamples, 
"""

for i, (_, row) in enumerate(context.iterrows()):
    template_fs += f"Example {i+1}:\n"
    template_fs += f"Problem:\n{row['problem']}\n"
    template_fs += f"Soution:\n{row['solution']}\n"
    template_fs += f"Answer:\n{row['answer']}\n\n"
template_fs += "Now your turn"

In [84]:
print(template_fs)

Role:
You are an advanced AI system with exceptional mathematical reasoning and problem-solving capabilities, specifically designed to solve tricky math problems (whose answer is a non-negative integer) written in LaTeX format from the AI Mathematical Olympiad (AIMO) competition. Your task is to accurately analyze and solve intricate mathematical problems, demonstrating a deep understanding of mathematical concepts and a strong ability to apply logical reasoning strategies.

Instruction:
1. Carefully read and comprehend the problem statement provided in the "Problem" section.
2. In the "Solution" section, provide a solution of the problem with detailed explanation of your logical reasoning process. Keep in mind that answer must be a non-negative integer number.
3. At the end, create a "Answer" section where you will state only the final numerical or algebraic answer, without any additional text or narrative.

Below are few eamples, 
Example 1:
Problem:
What is the remainder when $2001 

In [85]:
torch.cuda.empty_cache()

In [86]:
actual_ans = []
pred_ans = []

for i, row in tqdm(df2.iterrows()):
    actual_ans.append(row.answer)
    pred_ans.append(solve(row.problem, template_fs))

0it [00:00, ?it/s]

Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.


In [87]:
actual_ans, pred_ans

([52, 250, 702, 800, 211, 199, 185, 320, 480, 199],
 [5151, 12, 999, 2800, 1321, 100, 449, 320, 400, 72])

accuracy = 10

## RAG

In [45]:
from transformers import AutoTokenizer, AutoModel

In [46]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [47]:
bert_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
bert_model = AutoModel.from_pretrained('bert-base-uncased').to(device)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [48]:
df

Unnamed: 0,problem,solution,answer,is_integer,prompt
0,Every morning Aya goes for a $9$ -kilometer-lo...,$\frac{9}{s} + t = 4$ in hours and $\frac{9}{s...,204,True,Role:\nYou are an advanced AI system with exce...
1,Alice and Bob play the following game. A stack...,Let's first try some experimentation. Alice ob...,809,True,Role:\nYou are an advanced AI system with exce...
2,Jen enters a lottery by picking $4$ distinct n...,This is a conditional probability problem. Bay...,116,True,Role:\nYou are an advanced AI system with exce...
3,Rectangles $ABCD$ and $EFGH$ are drawn such th...,We use simple geometry to solve this problem. ...,104,True,Role:\nYou are an advanced AI system with exce...
4,Consider the paths of length $16$ that follow ...,We divide the path into eight “ $R$ ” movement...,294,True,Role:\nYou are an advanced AI system with exce...
...,...,...,...,...,...
8295,Find the number of real roots of\n\[2x^{2001} ...,We can factor the given equation as\n\[(2x + 3...,1,True,Role:\nYou are an advanced AI system with exce...
8296,"For a complex number $z,$ compute the minimum ...","Geometrically, $|z + 5 - 3i|$ is the distance ...",13,True,Role:\nYou are an advanced AI system with exce...
8297,Compute the smallest positive integer $x$ grea...,Let $q$ and $r$ be the remainder when $x$ is d...,1700,True,Role:\nYou are an advanced AI system with exce...
8298,"For positive real numbers $a,$ $b,$ $c,$ and $...","Let $S$ denote the given sum. First, we apply...",9,True,Role:\nYou are an advanced AI system with exce...


In [49]:
!pip install faiss-gpu

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [50]:
import faiss

In [51]:
encoded_questions = []
for i, item in tqdm(df.iterrows()):
    inputs = bert_tokenizer(item['problem'], return_tensors='pt', max_length=512).to(device)
    outputs = bert_model(**inputs)
    encoded_questions.append(outputs.last_hidden_state.mean(dim=1).cpu().detach().numpy())

encoded_questions = np.vstack(encoded_questions)

index = faiss.IndexFlatL2(768)
index.add(encoded_questions)

0it [00:00, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [52]:
def retrieve_relevant_questions(query, index, dataset, tokenizer, model, top_k=3):
    inputs = tokenizer(query, return_tensors='pt', max_length=512, truncation=True).to(device)
    outputs = model(**inputs)
    query_vector = outputs.last_hidden_state.mean(dim=1).cpu().detach().numpy()

    D, I = index.search(query_vector, k=top_k)
    retrieved_docs = [dataset.iloc[i] for i in I[0]]
    return retrieved_docs

In [53]:
def generate_answer(query, index, dataset, tokenizer, model, template):
    top_k = 3
    retrieved_docs = retrieve_relevant_questions(query, index, dataset, tokenizer, model, top_k)
    
    prompt = template
    for i in range (1, top_k+1):
        prompt += f"\nExample {i}:"
        prompt += f"\nProblem:\n{retrieved_docs[i-1]['problem']}\n"
        prompt += f"\nSolution:\n{retrieved_docs[i-1]['solution']}\n"
        prompt += f"\Answer:\n{retrieved_docs[i-1]['answer']}\n"

    prompt += f"\nNow for you:\nProblem: \n{query}\nSolution:\nAnswer:\n"
    
    messages = [
        {"role": "user", "content": prompt}
    ]

    query_prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False
        )

    raw_output = pipeline(
                    query_prompt, 
                    max_new_tokens=10000, 
                    do_sample=True, 
                    temperature=0.8,
                    return_full_text=False
                )
    raw_output = raw_output[0]['generated_text']
    torch.cuda.empty_cache()
    return extract_output(raw_output)

In [56]:
actual_ans = []
pred_ans = []

for i, row in tqdm(df2.iterrows()):
    actual_ans.append(row.answer)
    pred_ans.append(generate_answer(row.problem, index, df, bert_tokenizer, bert_model, template))

0it [00:00, ?it/s]

Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.


In [57]:
actual_ans, pred_ans

([52, 250, 702, 800, 211, 199, 185, 320, 480, 199],
 [36, 614, 14, 0, 511, 4, 11, 2, 40, 199])

accuracy = 10%