In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found in it.
for current_dir, _, file_names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(current_dir, file_name) for file_name in file_names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install datasets bitsandbytes peft trl

In [2]:
import os
# Set before any training object is created. The flag is presumably read by the
# Hugging Face trainer integration to switch off Weights & Biases reporting —
# NOTE(review): confirm it is still honoured by the installed library versions.
os.environ["WANDB_DISABLED"] = "true"

In [13]:
from datasets import load_dataset, concatenate_datasets
from huggingface_hub import login
from matplotlib import pyplot as plt
import numpy as np
from collections import Counter
from peft import LoraConfig, get_peft_model, AutoPeftModelForCausalLM
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer, SFTConfig
import torch
import re
from tqdm import tqdm
import gc
from sklearn.metrics import f1_score
from dataclasses import dataclass

In [None]:
# Read the Hugging Face token from the HF_TOKEN environment variable instead of
# pasting it into the notebook, so the secret never ends up in a saved or shared copy.
# When the variable is not set, `token` is None and `login` asks for the token interactively.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
!nvidia-smi

Loading the datasets. For this work, we used the Bac subset and the Comps subset from RoMath

In [None]:
# Load the two RoMath configurations used in this notebook. The 'train' and 'test'
# splits of each returned object are the ones read by the cells below.
ro_math_bac_set = load_dataset('cosmadrian/romath', 'bac')
ro_math_comp_set = load_dataset('cosmadrian/romath', 'comps')

Defining some global variables:
- Common mathematical LaTeX tokens, which are counted during the data analysis and later added to the tokenizer
- The prompts for the model corresponding to their subsets


In [11]:
# LaTeX commands and operator characters of interest. The list is used twice:
# the EDA class counts how often each entry occurs in the problem statements,
# and later the entries missing from the tokenizer vocabulary are added to it.
MATHEMATICAL_TOKENS = [
      "\\frac", "\\sqrt", "\\lim", "\\int", "\\sum", "\\prod",
      "\\sin", "\\cos", "\\tan", "\\log", "\\ln", "\\cdot",
      "\\rightarrow", "\\to", "\\leq", "\\geq", "\\neq",
      "\\in", "\\approx", "\\mathbf{R}", "\\mathbb{R}", "\\pi",
      "^", "_", "\\left", "\\right", "\\operatorname", "="
  ]

# System prompts (in Romanian), one per subset. Both ask the model to give only the
# final answer, without further explanations, using LaTeX notation where appropriate.
baccalaureate_prompt = "Ești un elev în clasa a 12-a care se pregătește pentru examenul de Bacalaureat la matematică. Analizează foarte bine întrebarea și răspunde la exercițiile următoare oferind doar răspunsul final, fără explicații suplimentare. Dacă este cazul, folosește notații matematice în LaTeX pentru a scrie corect rezultatul."
competition_prompt = "Ești un elev pasionat de matematică, antrenat pentru olimpiade și concursuri naționale. Analizează foarte bine întrebarea și răspunde la exercițiile următoare oferind doar răspunsul final, fără explicații suplimentare. Dacă este cazul, folosește notații matematice în LaTeX pentru a scrie corect rezultatul."

Doing some basic exploratory data analysis: 
- Visualizing the problems domain distribution across each subset
- Visualizing the distribution of the common mathematical tokens across each subset


To do that, we combined the training and test splits to get a better understanding of the distributions

In [7]:
# Merge the train and test splits of each subset; the combined sets are only used
# for the exploratory plots, training and evaluation keep the original splits.
ro_math_bac_set_combined = concatenate_datasets([ro_math_bac_set['train'], ro_math_bac_set['test']])
ro_math_comp_set_combined = concatenate_datasets([ro_math_comp_set['train'], ro_math_comp_set['test']])

In [8]:
class DatasetAnalyzer:
  """
    A class that helps with EDA.
    Currently supports:
    - Computing domain frequency distributions for each dataset subset (bac, comps).
    - Computing the distribution of special LaTeX mathematical tokens.

    Args:
      ro_math_bac_set: data of the 'bac' subset; must support column access by
        name (``data['domain']`` and ``data['problem']``).
      ro_math_comp_set: data of the 'comps' subset, accessed the same way.
      math_latex_tokens: tokens whose occurrences are counted. When omitted, the
        notebook-level ``MATHEMATICAL_TOKENS`` list is used.
  """

  # A LaTeX command (backslash followed by letters) or one of the characters = _ ^
  MATHEMATICAL_PATTERN_LATEX = re.compile(r"(\\[a-zA-Z]+|[=_^])")
  # A single, non-nested brace group such as "{R}" that directly follows a symbol.
  BRACE_ARGUMENT_PATTERN = re.compile(r"\{[^{}]*\}")

  DOMAIN_COLUMN_NAME = 'domain'
  PROBLEM_COLUMN_NAME = 'problem'


  def __init__(self, ro_math_bac_set, ro_math_comp_set, math_latex_tokens=None):
    self.data_subsets = {
        'bac': ro_math_bac_set,
        'comps': ro_math_comp_set
    }

    if math_latex_tokens is None:
      math_latex_tokens = MATHEMATICAL_TOKENS
    self.math_latex_tokens = math_latex_tokens


  def _get_subset(self, subset: str):
    """Return the data registered under `subset`; raise ValueError for an unknown name."""
    if subset not in self.data_subsets:
      raise ValueError("Invalid data subset")

    return self.data_subsets[subset]


  def get_domain_distribution(self, subset: str):
    """
      Count how many problems belong to each domain.

      Args:
        subset: 'bac' or 'comps'.

      Returns:
        A dict mapping every value of the domain column to its number of occurrences.

      Raises:
        ValueError: if `subset` is not a known subset name.
    """
    domains = self._get_subset(subset)[self.DOMAIN_COLUMN_NAME]
    return dict(Counter(domains))


  def get_mathematical_tokens_distribution(self, subset: str):
    """
      Count the occurrences of the tracked mathematical tokens in the problem statements.

      Args:
        subset: 'bac' or 'comps'.

      Returns:
        A dict with one entry per tracked token (0 when the token never occurs).

      Raises:
        ValueError: if `subset` is not a known subset name.
    """
    problems = self._get_subset(subset)[self.PROBLEM_COLUMN_NAME]
    math_tokens_frequency = {token: 0 for token in self.math_latex_tokens}

    for problem in problems:
      for match in self.MATHEMATICAL_PATTERN_LATEX.finditer(problem):
        symbol = match.group(0)

        # Tokens such as "\mathbb{R}" consist of a command plus its brace argument.
        # The pattern above only yields the bare command, so the text right after
        # the match is inspected and the longer form is preferred when it is tracked.
        argument = self.BRACE_ARGUMENT_PATTERN.match(problem, match.end())
        if argument is not None:
          symbol_with_argument = symbol + argument.group(0)
          if symbol_with_argument in math_tokens_frequency:
            math_tokens_frequency[symbol_with_argument] += 1
            continue

        if symbol in math_tokens_frequency:
          math_tokens_frequency[symbol] += 1

    return math_tokens_frequency

In [9]:
class DatasetVisualizer(DatasetAnalyzer):

  """
    Inherits from DatasetAnalyzer. Adds visualization methods for domain and mathematical tokens distributions
    across BAC and COMP subsets.

    The constructor is inherited unchanged from DatasetAnalyzer.
  """

  def _plot_distributions(self, distributions, title_suffix, y_label, figure_title):
    """
      Draw one bar chart per subset, side by side, and show the figure.

      Args:
        distributions: one {label: count} dict per subset, in the order of `self.data_subsets`.
        title_suffix: text appended to the upper-cased subset name in each panel title.
        y_label: label of the y axis of every panel.
        figure_title: title of the whole figure.
    """
    fig, axes = plt.subplots(1, 2, figsize=(18, 5))
    bar_width = 0.6
    plot_colors = ['b', 'g']

    for axis, subset, color, distribution in zip(axes, self.data_subsets, plot_colors, distributions):
      bar_positions = np.arange(len(distribution))
      highest_count = max(distribution.values())

      axis.bar(bar_positions, distribution.values(), color = color, width = bar_width)
      axis.set_title(subset.upper() + title_suffix)
      axis.set_xticks(bar_positions)
      axis.set_xticklabels(list(distribution.keys()), rotation = 70)
      # Leave 10% of headroom above the tallest bar.
      axis.set_ylim(0, highest_count * 1.1)
      axis.set_ylabel(y_label)

    fig.suptitle(figure_title, fontsize=16)
    plt.show()


  def visualize_domain_distributions(self):
    """Plot, for each subset, how many problems fall into each domain."""
    self._plot_distributions(
        [self.get_domain_distribution(subset) for subset in self.data_subsets],
        title_suffix = ' domain distribution',
        y_label = 'Domain counter',
        figure_title = "Domain Distribution Across Subsets",
    )


  def visualize_token_distributions(self):
    """Plot, for each subset, how often each tracked mathematical token occurs."""
    self._plot_distributions(
        [self.get_mathematical_tokens_distribution(subset) for subset in self.data_subsets],
        title_suffix = ' mathematical tokens distribution',
        y_label = 'Mathematical tokens counter',
        figure_title = "Mathematical tokens distribution across subsets",
    )



In [None]:
# Build the visualizer on the combined (train + test) data and draw both figures:
# first the domain counts, then the mathematical token counts.
dataset_visualizer = DatasetVisualizer(ro_math_bac_set_combined, ro_math_comp_set_combined)
dataset_visualizer.visualize_domain_distributions()
dataset_visualizer.visualize_token_distributions()

Loading the model and the tokenizer.

To reduce memory usage, we load the model quantized to 4-bit.

We narrowed the LLM choice down to Ro-Llama and Ro-Mistral in the instruct format

In [None]:
def load_model_and_tokenizer(model_name):
  """
    Load a causal language model with 4-bit quantization together with its tokenizer.

    Args:
      model_name: model identifier, passed unchanged to both `from_pretrained` calls.

    Returns:
      A `(model, tokenizer)` tuple.
  """
  quantization_settings = BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_use_double_quant=True,
      bnb_4bit_quant_type="nf4",
      bnb_4bit_compute_dtype=torch.bfloat16,
  )

  return (
      AutoModelForCausalLM.from_pretrained(model_name, quantization_config=quantization_settings),
      AutoTokenizer.from_pretrained(model_name),
  )

# Two instruct checkpoints were considered; the Ro-Mistral one is active.
# To try Ro-Llama instead, swap which of the two lines below is commented out.
# model, tokenizer = load_model_and_tokenizer("OpenLLM-Ro/RoLlama2-7b-Instruct")
model, tokenizer = load_model_and_tokenizer("OpenLLM-Ro/RoMistral-7b-Instruct")

Now this is where the mathematical tokens come into play. Our idea is to add those mathematical tokens into the pre-trained tokenizer and then extend the model token embeddings

In [None]:
# Fetch the vocabulary once instead of rebuilding it for every candidate token.
existing_vocab = tokenizer.get_vocab()
tokens_to_add = [token for token in MATHEMATICAL_TOKENS if token not in existing_vocab]
tokenizer.add_tokens(tokens_to_add)
# Grow the embedding matrix so that every id of the extended tokenizer has a row.
# NOTE(review): the rows created for the new tokens are presumably freshly
# initialised, and the LoRA configuration used for training targets only the
# attention/MLP projections — confirm that the new embeddings actually get trained.
model.resize_token_embeddings(len(tokenizer))

An example of how it works

In [None]:
# Raw string: the backslashes belong to the LaTeX source and must not be read as
# Python escape sequences (\( \s \p are invalid escapes in a normal string literal).
tokenizer.tokenize(r'Să se calculeze \(\sin 2 \pi+\sin 4 \pi\).')

Formatting the dataset in the proper question-answering format. As the answer, we use only the short final answer to each problem, because producing reliable step-by-step reasoning is currently hard to do without human supervision.

In [26]:
def format_questions_answers(data_samples, llm_prompt, chat_tokenizer=None):
  """
    Turn a batch of problems and answers into chat-formatted training texts.

    Args:
      data_samples: batch with a 'problem' column and an 'answer' column of equal length.
      llm_prompt: system prompt placed at the start of every conversation.
      chat_tokenizer: tokenizer providing `eos_token` and `apply_chat_template`.
        When omitted, the notebook-level `tokenizer` is used.

    Returns:
      A dict with a single 'text' key holding one formatted conversation per sample.
  """
  if chat_tokenizer is None:
    chat_tokenizer = tokenizer

  eos_token = chat_tokenizer.eos_token
  formatted_texts = []

  for question, answer in zip(data_samples['problem'], data_samples['answer']):
    full_formatted_answer = f'Răspunsul este {answer.strip()}'

    # NOTE(review): EOS is appended by hand here; if the chat template also closes
    # assistant turns with EOS, the text ends with two of them — confirm.
    messages_ro_llm = [
      {"role": "system", "content": llm_prompt},
      {"role": "user", "content": question},
      {"role": "assistant", "content": full_formatted_answer + eos_token}
    ]

    formatted_texts.append(
        chat_tokenizer.apply_chat_template(
            messages_ro_llm,
            tokenize = False,
            add_generation_prompt = False
        )
    )

  return {"text": formatted_texts}

In [None]:
# Baccalaureate subset: keep the original splits and add a 'text' column with the
# chat-formatted conversation (built with the baccalaureate system prompt) to each.
ro_math_bac_train = ro_math_bac_set['train']
ro_math_bac_eval = ro_math_bac_set['test']

ro_math_bac_train_updated = ro_math_bac_train.map(lambda x: format_questions_answers(x, baccalaureate_prompt), batched = True)
ro_math_bac_eval_updated = ro_math_bac_eval.map(lambda x: format_questions_answers(x, baccalaureate_prompt), batched = True)

In [None]:
# Competition subset: same processing as for the baccalaureate subset, but the
# conversations are built with the competition system prompt.
ro_math_comp_train = ro_math_comp_set['train']
ro_math_comp_eval = ro_math_comp_set['test']

ro_math_comp_train_updated = ro_math_comp_train.map(lambda x: format_questions_answers(x, competition_prompt), batched = True)
ro_math_comp_eval_updated = ro_math_comp_eval.map(lambda x: format_questions_answers(x, competition_prompt), batched = True)

In [29]:
# Shuffle the training rows with a fixed seed.
# NOTE: the result is assigned back to the same name, so running this cell a second
# time shuffles the already shuffled data again; re-run the formatting cells first
# to get back to the same order.
ro_math_bac_train_updated = ro_math_bac_train_updated.shuffle(seed = 42)
ro_math_comp_train_updated = ro_math_comp_train_updated.shuffle(seed = 42)

Using Low Rank Adaptation for fine tuning the models

We made two changes to the configuration: we set the bias to `lora_only`, which allows the biases to adapt during fine-tuning, and we use a dropout of 0.2 to introduce regularization and prevent overfitting.

In [30]:
# LoRA settings: rank-32 adapters with alpha 16 and 20% dropout on the listed
# projection layers; bias="lora_only" additionally trains the biases of the adapted layers.
# NOTE(review): no embedding or output layers are listed here, although the tokenizer
# vocabulary is extended before training — confirm the new token embeddings get trained.
peft_config = LoraConfig(
                          lora_alpha=16,
                          lora_dropout=0.2,
                          r=32,
                          bias="lora_only",
                          task_type="CAUSAL_LM",
                          target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ]
  )

In [None]:
# Supervised fine-tuning on the baccalaureate training set with the LoRA configuration.
# Each optimizer step accumulates 4 batches of 1 sequence; training stops after 600 steps.
# NOTE(review): no evaluation schedule is set in these arguments, so eval_dataset may
# never be used while training — confirm against the SFTConfig defaults.
trainer = SFTTrainer(
    model = model,
    processing_class = tokenizer,
    train_dataset = ro_math_bac_train_updated,
    eval_dataset = ro_math_bac_eval_updated,
    peft_config = peft_config,
    args = SFTConfig(
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 600,
        max_seq_length = 384, # sized for the comp dataset, whose problem statements are quite long; this run trains on the bac set
        learning_rate = 2e-4,
        # NOTE(review): fp16 is enabled here while the quantization config uses
        # bfloat16 as compute dtype — confirm that mixing the two is intended.
        fp16 = True,
        bf16 = False,
        logging_steps = 100,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "output_math_qa_results",
    ),
)

In [None]:
trainer.train()

Step,Training Loss
100,0.3828


In [57]:
def cleanup():
    """Run Python's garbage collector, then ask PyTorch to empty its CUDA cache."""
    # Collect first so that the cache is emptied only after unreachable objects are gone.
    gc.collect()
    torch.cuda.empty_cache()

In [58]:
cleanup()

In [None]:
def compute_llm_eval_answers(eval_set, chat_instruction, model_ftuned, tokenizer_ftuned, max_new_tokens = 64):
    """
    Generate one answer per problem with the fine-tuned model.

    Args:
        eval_set: iterable of problem statements (strings).
        chat_instruction: system prompt placed before every problem.
        model_ftuned: model used for generation.
        tokenizer_ftuned: tokenizer matching `model_ftuned`.
        max_new_tokens: upper bound on the number of generated tokens per answer.

    Returns:
        A list with the generated answer text for every problem, stripped of
        special tokens and surrounding whitespace.
    """
    llm_clean_answers = []

    # Generation has to run in evaluation mode: a model that has just been trained
    # is typically still in training mode, which keeps dropout (including the LoRA
    # dropout) active and makes the outputs noisy. The previous mode is restored below.
    was_training = model_ftuned.training
    model_ftuned.eval()

    try:
        for problem_instruction in tqdm(eval_set):
            chat = [
                {"role": "system", "content": chat_instruction},
                {"role": "user", "content": problem_instruction}
            ]

            prompt = tokenizer_ftuned.apply_chat_template(chat, tokenize = False)
            encoded = tokenizer_ftuned(prompt, add_special_tokens = False, return_tensors = "pt").to(model_ftuned.device)
            outputs = model_ftuned.generate(
                input_ids = encoded["input_ids"],
                attention_mask = encoded["attention_mask"],
                max_new_tokens = max_new_tokens,
            )

            # Decode only what was generated after the prompt. This does not depend
            # on a template-specific marker such as '[/INST]' being present in the output.
            prompt_length = encoded["input_ids"].shape[-1]
            generated_tokens = outputs[0][prompt_length:]
            clean_answer = tokenizer_ftuned.decode(generated_tokens, skip_special_tokens = True).strip()

            llm_clean_answers.append(clean_answer)
    finally:
        model_ftuned.train(was_training)

    return llm_clean_answers

llm_clean = compute_llm_eval_answers(ro_math_bac_eval_updated['problem'], baccalaureate_prompt, model, tokenizer)

In [None]:
def compute_predictions_encoding(llm_ans, ground_truth_ans):
    """
    Mark every model answer as an exact match (1) or a miss (0).

    Args:
        llm_ans: model answers, one string per problem.
        ground_truth_ans: chat-formatted reference texts in the same order. The
            reference answer is the part after the first '[/INST]' marker; a text
            without the marker is used as a whole.

    Returns:
        A list of 0/1 integers, one per compared pair.
    """
    predictions_encoding = []

    for llm_answer, ground_truth_answer in zip(llm_ans, ground_truth_ans):
        segments = ground_truth_answer.split('[/INST]')
        # Fall back to the full text instead of raising IndexError when the marker is missing.
        reference_answer = segments[1] if len(segments) > 1 else segments[0]
        reference_answer = reference_answer.replace('</s>', '').strip()

        predictions_encoding.append(int(llm_answer == reference_answer))

    return predictions_encoding



In [None]:
# 1 where the model answer equals the reference answer exactly, 0 otherwise.
encoded_predictions = compute_predictions_encoding(llm_clean, ro_math_bac_eval_updated['text'])
# The "ground truth" is a vector of ones, i.e. every prediction is expected to match.
encoded_ground_truth = [1] * len(ro_math_bac_eval_updated)

# NOTE(review): with all-ones labels there are no false positives, so this F1 equals
# 2a / (1 + a), where a is the exact-match accuracy — it is always at least a.
# Consider reporting the plain accuracy alongside it.
f1_score(encoded_ground_truth, encoded_predictions)