In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd 

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
os.environ["KERAS_BACKEND"] = "torch" # you can also use tensorflow or torch
os.environ["XLA_PYTHON_CLIENT_MEM_FRACTION"] = "0.9" # avoid memory fragmentation on JAX backend.

import keras
import keras_nlp

from tqdm.notebook import tqdm
tqdm.pandas() # progress bar for pandas

import plotly.graph_objs as go
import plotly.express as px
from IPython.display import display, Markdown
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/mammoth-7b-mistral/mammoth_7B_mistral_pretrained_model_pytorch/model.safetensors.index.json
/kaggle/input/mammoth-7b-mistral/mammoth_7B_mistral_pretrained_model_pytorch/model-00003-of-00003.safetensors
/kaggle/input/mammoth-7b-mistral/mammoth_7B_mistral_pretrained_model_pytorch/config.json
/kaggle/input/mammoth-7b-mistral/mammoth_7B_mistral_pretrained_model_pytorch/tokenizer.json
/kaggle/input/mammoth-7b-mistral/mammoth_7B_mistral_pretrained_model_pytorch/model-00001-of-00003.safetensors
/kaggle/input/mammoth-7b-mistral/mammoth_7B_mistral_pretrained_model_pytorch/tokenizer_config.json
/kaggle/input/mammoth-7b-mistral/mammoth_7B_mistral_pretrained_model_pytorch/model-00002-of-00003.safetensors
/kaggle/input/mammoth-7b-mistral/mammoth_7B_mistral_pretrained_model_pytorch/special_tokens_map.json
/kaggle/input/mammoth-7b-mistral/mammoth_7B_mistral_pretrained_model_pytorch/tokenizer.model
/kaggle/input/mammoth-7b-mistral/mammoth_7B_mistral_pretrained_model_pytorch/added_tokens.

2024-06-10 13:32:37.641379: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-10 13:32:37.641554: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-10 13:32:37.784657: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
!pip install pylatexenc

  pid, fd = os.forkpty()




In [4]:
class CFG:
    seed = 42
    dataset_path = "/kaggle/input/ai-mathematical-olympiad-prize"
    preset = "gemma_1.1_instruct_2b_en" # name of pretrained Gemma
    sequence_length = 512 # max size of input sequence for training
    batch_size = 1 # size of the input batch in training
    epochs = 1 # number of epochs to train
keras.utils.set_random_seed(CFG.seed)

In [6]:
df1 = pd.read_csv("/kaggle/input/math-qsa-dataset/train.csv")
df2 = pd.read_csv("/kaggle/input/math-qsa-dataset/test.csv")
df = pd.concat([df1, df2], axis=0)

In [7]:
def is_integer(text):
    try:
        if int(text) >= 0:
            return True
        else:
            return False
    except ValueError:
        return False
    
df["is_integer"] = df.answer.map(is_integer)
df = df[df.is_integer].reset_index(drop=True)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7356 entries, 0 to 7355
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   problem     7356 non-null   object
 1   level       7356 non-null   object
 2   type        7356 non-null   object
 3   solution    7356 non-null   object
 4   answer      7356 non-null   object
 5   is_integer  7356 non-null   bool  
dtypes: bool(1), object(5)
memory usage: 294.7+ KB


# Preprocessing Pipeline

In [9]:
import re
import nltk
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

# Ensure the necessary NLTK data files are downloaded
nltk.download('punkt')
nltk.download('stopwords')

class Preprocessing:
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))

    def convert_draw_command(self, draw_command):
        pattern_pentagon = re.compile(r'draw\(\((.*?)\)--\((.*?)\)--\((.*?)\)--\((.*?)\)--\((.*?)\)--\((.*?)\)--cycle.*?\);')
        match_pentagon = pattern_pentagon.match(draw_command)
        if match_pentagon:
            coords = match_pentagon.groups()
            return f"A regular pentagon with vertices at {coords[0]}, {coords[1]}, {coords[2]}, {coords[3]}, and {coords[4]}."
        
        pattern_hexagon = re.compile(r'draw\(\((.*?)\)--\((.*?)\)--\((.*?)\)--\((.*?)\)--\((.*?)\),.*?\);')
        match_hexagon = pattern_hexagon.match(draw_command)
        if match_hexagon:
            coords = match_hexagon.groups()
            return f"A regular hexagon with vertices at {coords[0]}, {coords[1]}, {coords[2]}, {coords[3]}, and {coords[4]}."
        
        return ""

    def convert_dot_label_commands(self, text):
        pattern_dot = re.compile(r'dot\(\((.*?)\)\);')
        text = pattern_dot.sub(r'A point at \1.', text)
        
        pattern_label = re.compile(r'label\("(.*?)",\((.*?)\),.*?\);')
        text = pattern_label.sub(r'The point \1 is at coordinates \2.', text)
        
        return text

    def preprocess_text(self, text):
        # Remove the [asy] tags
        text = re.sub(r'\[asy\]', '', text)
        text = re.sub(r'\[\/asy\]', '', text)

        # Split the text into commands
        commands = text.split('\n')

        readable_text = []
        for command in commands:
            if 'draw' in command:
                readable_text.append(self.convert_draw_command(command))
            else:
                readable_text.append(self.convert_dot_label_commands(command))

        readable_text = ' '.join(readable_text)

        # Tokenize into sentences
        sentences = sent_tokenize(readable_text)

        # Remove stop words and tokenize the remaining words
        filtered_sentences = []
        for sentence in sentences:
            word_tokens = word_tokenize(sentence)
            filtered_sentence = [word for word in word_tokens if word.lower() not in self.stop_words]
            filtered_sentences.append(' '.join(filtered_sentence))

        filtered_text = ' '.join(filtered_sentences)
        return filtered_text

    def process_dataframe(self, df, text_column):
        df[f'{text_column}'] = df[text_column].apply(self.preprocess_text)
        return df



[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
pipeline = Preprocessing()
a = pipeline.process_dataframe(df, "problem")

a['problem'][4]

'regular pentagon vertices 0,2.5 , 0,7.5 , 4,10 , 8,7.5 , 8,2.5 . regular hexagon vertices 8,2.5 , 11.5 , -1 , 9 , -5 , 5 , -4.5 , 4,0 . point 0,2.5 . point 0,7.5. dot ( ( 4,10 ) ) ; point 8,7.5 . point 8,2.5 . point 4,0 . point coordinates 0,2.5 . point H coordinates 0,7.5 . point G coordinates 4,10 . point F coordinates 8,7.5 . point E coordinates 8,2.5 . point coordinates 4,0 . point 11.5 , -1 . point 9 , -5 . point 5 , -4.5 . point coordinates 11.5 , -1 . point C coordinates 9 , -5 . point B coordinates 5 , -4.5 .'

In [12]:
a.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7356 entries, 0 to 7355
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   problem     7356 non-null   object
 1   level       7356 non-null   object
 2   type        7356 non-null   object
 3   solution    7356 non-null   object
 4   answer      7356 non-null   object
 5   is_integer  7356 non-null   bool  
dtypes: bool(1), object(5)
memory usage: 294.7+ KB


In [9]:
!pip install antlr4-python3-runtime

  pid, fd = os.forkpty()




In [10]:
import sympy as sp
from sympy.parsing.latex import parse_latex
!pip install antlr4-python3-runtime
def latex_to_math(latex_str):
    """
    Convert a LaTeX string to a SymPy expression.

    Parameters:
    latex_str (str): The LaTeX string to convert.

    Returns:
    sympy.Expr: The corresponding SymPy expression.
    """
    try:
        # Parse the LaTeX string
        sympy_expr = parse_latex(latex_str)
        return sympy_expr
    except Exception as e:
        print(f"Error parsing LaTeX: {e}")
        return None

# Example usage
latex_str = r"\frac{d}{dx} \left( x^2 + 2x + 1 \right)"
math_expr = latex_to_math(latex_str)
print(math_expr)


  pid, fd = os.forkpty()


^C
[31mERROR: Operation cancelled by user[0m[31m
[0mError parsing LaTeX: LaTeX parsing requires the antlr4 Python package, provided by pip (antlr4-python3-runtime) or conda (antlr-python-runtime), version 4.11
None


# Prompt Engineering

In [18]:
template = """Role:\nYou are an advanced AI system with exceptional mathematical reasoning and problem-solving capabilities, specifically designed to solve tricky math problems (whose answer is a non-negative integer) written in LaTeX format from the AI Mathematical Olympiad (AIMO) competition. Your task is to accurately analyze and solve intricate mathematical problems, demonstrating a deep understanding of mathematical concepts and a strong ability to apply logical reasoning strategies.\n\nInstruction:
1. Carefully read and comprehend the problem statement provided in the "Problem" section.
2. In the "Solution" section, provide a solution of the problem with detailed explanation of your logical reasoning process. Keep in mind that answer must be a non-negative integer number.
3. At the end, create a "Answer" section where you will state only the final numerical or algebraic answer, without any additional text or narrative.\n\nProblem:\n{problem}\n\nSolution:\n{solution}"""

In [19]:
df["prompt"] = df.progress_apply(lambda row: template.format(problem=row.problem,
                                                             solution=f"{row.solution}\n\nAnswer:\n{row.answer}"),
                                                             axis=1)
data = df.prompt.tolist()

  0%|          | 0/7356 [00:00<?, ?it/s]

In [20]:
def colorize_text(text):
    for word, color in zip(["Role", "Instruction", "Problem", "Solution", "Answer"],
                           ["blue", "yellow", "red", "cyan", "green"]):
        text = text.replace(f"{word}:", f"\n\n**<font color='{color}'>{word}:</font>**")
    return text

In [21]:
# Take a random sample
sample = data[12]

# Give colors to Instruction, Response and Category
sample = colorize_text(sample)

# Show sample in markdown
display(Markdown(sample))



**<font color='blue'>Role:</font>**
You are an advanced AI system with exceptional mathematical reasoning and problem-solving capabilities, specifically designed to solve tricky math problems (whose answer is a non-negative integer) written in LaTeX format from the AI Mathematical Olympiad (AIMO) competition. Your task is to accurately analyze and solve intricate mathematical problems, demonstrating a deep understanding of mathematical concepts and a strong ability to apply logical reasoning strategies.



**<font color='yellow'>Instruction:</font>**
1. Carefully read and comprehend the problem statement provided in the "Problem" section.
2. In the "Solution" section, provide a solution of the problem with detailed explanation of your logical reasoning process. Keep in mind that answer must be a non-negative integer number.
3. At the end, create a "Answer" section where you will state only the final numerical or algebraic answer, without any additional text or narrative.



**<font color='red'>Problem:</font>**
largest positive multiple $ 12 $ less $ 350 ? $



**<font color='cyan'>Solution:</font>**
Dividing $350$ by $12$ gives a quotient $29$ with a remainder of $2$. In other words, \[350=12\cdot29+2.\]Thus, $29\cdot12=\boxed{348}$ is the largest multiple of $12$ which is less than $350.$



**<font color='green'>Answer:</font>**
348

## Model Training

In [13]:
!pip install transformers
!pip install datasets

  pid, fd = os.forkpty()




In [15]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset

In [14]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # Suppresses all logs below ERROR level

import tensorflow as tf
# Your TensorFlow code goes here


In [None]:
model_name = "/kaggle/input/mammoth-7b-mistral/mammoth_7B_mistral_pretrained_model_pytorch"  
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [14]:
from datasets import Dataset

In [16]:
dataset = Dataset.from_dict(a)

In [17]:
def tokenize_function(examples):
    return tokenizer(examples["problem"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/7356 [00:00<?, ? examples/s]

NameError: name 'tokenizer' is not defined