In [1]:
!git clone https://github.com/huggingface/evaluate.git /kaggle/working/evaluate
!pip install evaluate

Cloning into '/kaggle/working/evaluate'...
remote: Enumerating objects: 8404, done.[K
remote: Counting objects: 100% (798/798), done.[K
remote: Compressing objects: 100% (162/162), done.[K
remote: Total 8404 (delta 669), reused 651 (delta 631), pack-reused 7606[K
Receiving objects: 100% (8404/8404), 2.19 MiB | 9.81 MiB/s, done.
Resolving deltas: 100% (5131/5131), done.
Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.2


In [2]:
import pandas as pd
import evaluate

from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq, models, BartForConditionalGeneration, BartConfig, PreTrainedTokenizerFast

2024-06-28 00:46:02.067335: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-28 00:46:02.067457: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-28 00:46:02.201137: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Inspect Data

Let $k, l > 0$ be parameters. The parabola $y = kx^2 - 2kx + l$ intersects the line $y = 4$ at two points $A$ and $B$. These points are distance 6 apart. What is the sum of the squares of the distances from $A$ and $B$ to the origin?

In [3]:
train = pd.read_csv("/kaggle/input/dataset-for-solving-math-word-problems/train.csv")[["Problem", "linear_formula"]]
val = pd.read_csv("/kaggle/input/dataset-for-solving-math-word-problems/test.csv")[["Problem", "linear_formula"]]
test = pd.read_csv("/kaggle/input/ai-mathematical-olympiad-prize/test.csv")
# train["answer"] = train.answer.astype(str)
# val = train.loc[8:9,:]
# train = train.loc[0:7,:]
# val
train = train.rename({"linear_formula":"Solution"}, axis=1)
val = val.rename({"linear_formula":"Solution"}, axis=1)
test_problems = test[["problem"]].rename({"problem":"Problem"}, axis=1)
test_problems

Unnamed: 0,Problem
0,What is $1-1$?
1,What is $0\times10$?
2,Solve $4+x=4$ for $x$.


In [4]:
soln_vocab = []
solns = val["Solution"].map(lambda x: x.replace("(", " ").replace(",", " ").replace(")", " ").replace("|", "| ").split())
for soln in solns:
    for word in soln:
        if not word in soln_vocab:
            soln_vocab.append(word)
len(soln_vocab)
with open("/kaggle/working/vocab.txt", "w") as f:
    for v in soln_vocab:
        f.write(v+" ")

# Process Data

In [5]:
from tokenizers import Tokenizer, models, pre_tokenizers, trainers, processors

def generate_decode_tokenizer(vocab):
    unk_token = "<unk>"

    # Initialize a Byte-level BPE Tokenizer for decoding outputs
    decoder_tokenizer = Tokenizer(models.BPE())

    # Customize pre-tokenization to split on whitespace
    decoder_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    
    # Set post-processor
    decoder_tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
    
    # Define the special tokens
    special_tokens = ["<s>", "</s>", "<pad>", "<unk>"]

    # Train the tokenizer on your vocabulary
    #trainer = trainers.BpeTrainer(vocab_size=len(vocab)+len(special_tokens), special_tokens=special_tokens)
    #decoder_tokenizer.train(["/kaggle/working/vocab.txt"], trainer)
    decoder_tokenizer.add_tokens(vocab)
    decoder_tokenizer.add_special_tokens(special_tokens)

    # Save the decoder tokenizer
    decoder_tokenizer.save("/kaggle/working/decoder_math_tokenizer.json")

    return decoder_tokenizer

In [6]:
batch_size=32

# Convert to huggingface dataset
train_ds = Dataset.from_pandas(train)
val_ds = Dataset.from_pandas(val)
test_ds = Dataset.from_pandas(test_problems)

# Create tokenizers
tokenizer = AutoTokenizer.from_pretrained("/kaggle/input/transformers/facebook-bart-base")
generate_decode_tokenizer(soln_vocab)
decode_tokenizer = PreTrainedTokenizerFast(tokenizer_file="/kaggle/working/decoder_math_tokenizer.json")
decode_tokenizer.add_special_tokens({'pad_token': '<pad>', 'eos_token': '</s>', 'bos_token': '<s>', 'unk_token': '<unk>'})

# Create a new BART configuration with the new vocabulary size
config = BartConfig(
    vocab_size=len(soln_vocab)+4,
    max_position_embeddings=1024,
    d_model=768,
    encoder_layers=6,
    encoder_ffn_dim=3072,
    encoder_attention_heads=12,
    decoder_layers=6,
    decoder_ffn_dim=3072,
    decoder_attention_heads=12,
    activation_function="gelu",
    dropout=0.1,
    attention_dropout=0.1,
    classifier_dropout=0.0,
    init_std=0.02,
    encoder_layerdrop=0.0,
    decoder_layerdrop=0.0,
    scale_embedding=True,
    use_cache=True,
    pad_token_id=decode_tokenizer.pad_token_id,
    bos_token_id=decode_tokenizer.bos_token_id,
    eos_token_id=decode_tokenizer.eos_token_id,
)

# Load a pre-trained BART model with the new configuration
model = BartForConditionalGeneration(config)
model.resize_token_embeddings(len(soln_vocab)+4)

Embedding(113, 768, padding_idx=111)

In [7]:
decode_tokenizer("Hello world")

{'input_ids': [], 'token_type_ids': [], 'attention_mask': []}

In [8]:
type(decode_tokenizer)

transformers.tokenization_utils_fast.PreTrainedTokenizerFast

In [9]:
solns = solns.map(lambda x: " ".join(x))

In [10]:
def process_data_to_model_inputs(batch):
    '''
    Tokenize inputs and outputs for each batch
    '''
    inputs = tokenizer(batch["Problem"], padding=True, truncation=True)
    solns = [x.replace("(", " ").replace(",", " ").replace(")", " ").replace("|", "| ")
             for x in batch["Solution"]]
    outputs = decode_tokenizer(solns, padding=True, truncation=True)
    
    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask
    batch["labels"] = outputs.input_ids
    
    batch["labels"] = [[-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in batch["labels"]]
    
    return batch

In [11]:
def process_test_data_to_model_inputs(batch):
    inputs = tokenizer(batch["Problem"], padding=True, truncation=True, return_tensors="pt")
    
    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask
    
    return batch

In [12]:
# Tokenize train data
train_data = train_ds.map(
    process_data_to_model_inputs,
    batched=True,
    batch_size=batch_size,
    remove_columns=["Problem", "Solution"]
)
train_data.set_format(type="torch")

# Tokenize validation data
val_data = val_ds.map(
    process_data_to_model_inputs,
    batched=True,
    batch_size=batch_size,
    remove_columns=["Problem", "Solution"]
)
val_data.set_format(type="torch")

# Tokenize validation data
test_data = test_ds.map(
    process_test_data_to_model_inputs,
    batched=True,
    batch_size=batch_size,
    remove_columns=["Problem"]
)
test_data.set_format(type="torch")

Map:   0%|          | 0/29837 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/2985 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

# Create Model

In [13]:
model = AutoModelForSeq2SeqLM.from_pretrained("/kaggle/input/transformers/facebook-bart-base")

  return self.fget.__get__(instance, owner)()


# Train Model

In [14]:
metric = evaluate.load("squad_v2")
#metric = evaluate.load("/kaggle/working/evaluate/metrics/accuracy/accuracy.py")
def compute_metrics(eval_pred):
    preds, labels = eval_pred
    labels = labels[:, 1:],reshape(-1)
    preds = preds[:, :-1].reshape(-1)
    return metric.compute(predictions=preds, references=labels)

Downloading builder script:   0%|          | 0.00/6.47k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/11.3k [00:00<?, ?B/s]

In [15]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [16]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(output_dir="/kaggle/working",
                                         evaluation_strategy="epoch",
                                         num_train_epochs=3,
                                         push_to_hub=False,
                                         report_to="none",
                                         save_strategy="no")
trainer = Trainer(model,
                         training_args,
                         train_dataset=train_data,
                         eval_dataset=val_data,
                         data_collator=DataCollatorForSeq2Seq(tokenizer))
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,0.4133,0.338499
2,0.3054,0.255534
3,0.2397,0.223144


TrainOutput(global_step=5595, training_loss=0.35356394688501436, metrics={'train_runtime': 2264.247, 'train_samples_per_second': 39.532, 'train_steps_per_second': 2.471, 'total_flos': 7585484166973440.0, 'train_loss': 0.35356394688501436, 'epoch': 3.0})

In [17]:
import shutil
trainer.save_model("/kaggle/working/model")
shutil.make_archive("/kaggle/working/saved/model", "zip", "/kaggle/working/model")

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}


'/kaggle/working/saved/model.zip'

In [18]:
model = AutoModelForSeq2SeqLM.from_pretrained("/kaggle/input/mwp-model")

In [19]:
import torch
from torch import device
import numpy as np
td = tokenizer(val_ds["Problem"][0:10], truncation=True, padding=True, return_tensors="pt")
td.to(device("cuda"))
cuda_model = model.to(device("cuda"))
with torch.no_grad():
    out = cuda_model(**td)

In [20]:
val_ds[0]

{'Problem': 'a shopkeeper sold an article offering a discount of 5 % and earned a profit of 31.1 % . what would have been the percentage of profit earned if no discount had been offered ?',
 'Solution': 'add(n1,const_100)|subtract(const_100,n0)|multiply(#0,const_100)|divide(#2,#1)|subtract(#3,const_100)|'}

In [21]:
for i in range(len(out.logits.cpu().detach().numpy()[0])):
    print(np.argmax(out.logits.cpu().detach().numpy()[0][i]))

0
4917
20836
1215
1640
20836
1566
1640
10
6720
1640
20836
6
6
20836
282
282
49275
282
49275
288
49275
49275
42527
3249
33
1102
4917
34545
1640
5
1640
114
10936
10936
1640
57
1661
1640
42527
2
1640
1640
1640
1640
1640
1640
1640
1640
1640
1640
1640
1640
1640
1640
1640
1640
1640
1640
1640
1640
1640
1640
1640
1640
1640
1640
1640
1640
1640
1640
1640
1640
4791
1640
1640
1640
1640
1640
4791
4791
1640
1640
1640
1640
4791
4791
1640
4791
1640
4791
1640


In [22]:
answer = ""
for this in out.logits.cpu().detach().numpy()[0]:
    answer += decode_tokenizer.decode(np.argmax(this)) + " "
answer

'add        #1    multiply multiply            gcd     n0      speed    const_100                                                    '

In [23]:
decode_tokenizer.get_vocab()

{'volume_cube': 70,
 'quadrilateral_area': 88,
 'n8': 79,
 'n16': 82,
 'n5': 32,
 'n14': 98,
 'lcm': 67,
 '#5': 16,
 'power': 14,
 '|': 3,
 'const_360': 78,
 'surface_cylinder': 81,
 '#7': 34,
 'n23': 83,
 'const_3_6': 60,
 'negate_prob': 87,
 '#17': 101,
 'inverse': 46,
 'max': 84,
 'reminder': 63,
 'n2': 13,
 'floor': 25,
 'const_52': 72,
 'gcd': 33,
 'rhombus_area': 106,
 'const_0_33': 61,
 'const_0_6': 107,
 '#13': 42,
 'const_2.0': 18,
 'rectangle_area': 65,
 'n6': 35,
 'multiply': 6,
 'n1': 1,
 'const_10': 27,
 'divide': 8,
 '<unk>': 112,
 'cube_edge_by_volume': 69,
 'n7': 51,
 'const_5': 50,
 'const_6': 59,
 '#21': 104,
 'log': 58,
 'const_1000': 29,
 'const_3.0': 19,
 'const_1': 12,
 '<s>': 109,
 'volume_sphere': 95,
 'circle_area': 21,
 'const_180': 94,
 'speed': 57,
 'const_60': 49,
 'square_area': 74,
 '#15': 44,
 '#1': 10,
 '#10': 37,
 'rectangle_perimeter': 91,
 'min': 96,
 'triangle_area': 93,
 '#18': 105,
 'square_edge_by_perimeter': 108,
 'const_0.5': 90,
 'circumface':