In [1]:
# Install packages (this takes ~2 minutes)
!pip install -q transformers datasets torch accelerate sympy evaluate

[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m84.1/84.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import torch
import numpy as np
import re
from datasets import load_dataset, DatasetDict
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq
)
import time


‚úÖ Using device: cuda


In [None]:

# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"‚úÖ Using device: {device}")

In [3]:
# Load MAWPS dataset from Hugging Face
ds = load_dataset("mwpt5/MAWPS")

print(f"Dataset structure: {ds}")
print(f"\nüìù Sample example:")
print(ds['train'][0])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


MAWPS.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/1772 [00:00<?, ? examples/s]

Dataset structure: DatasetDict({
    train: Dataset({
        features: ['Question', 'Equation', 'Answer', 'Numbers'],
        num_rows: 1772
    })
})

üìù Sample example:
{'Question': 'Mary is baking a cake . The recipe wants N_00 cups of flour . She already put in N_01 cups . How many cups does she need to add ?', 'Equation': 'N_00 - N_01', 'Answer': 6.0, 'Numbers': '8.0 2.0'}


In [4]:
def clean_equation(equation):
    """Clean and normalize equation strings"""
    if equation is None:
        return None

    equation = str(equation).strip()
    equation = equation.replace('=', '')
    equation = equation.replace('√ó', '*')
    equation = equation.replace('√∑', '/')
    equation = equation.replace('^', '**')
    equation = ' '.join(equation.split())

    return equation

# Helper functions defined

In [5]:
def prepare_dataset(raw_dataset, train_split=0.9):
    """
    Prepare MAWPS dataset for T5 fine-tuning
    """
    # Get the training data
    if 'train' in raw_dataset:
        data = raw_dataset['train']
    else:
        data = raw_dataset[list(raw_dataset.keys())[0]]

    print(f"\nOriginal dataset size: {len(data)}")
    print(f"Column names: {data.column_names}")

    # Identify column names
    question_field = None
    equation_field = None

    # Update the lists to include the actual column names in the dataset
    for field in ['Question', 'question', 'text', 'input', 'problem', 'sQuestion']:
        if field in data.column_names:
            question_field = field
            break

    for field in ['Equation', 'equation', 'target', 'lEquations', 'lSolutions', 'answer']:
        if field in data.column_names:
            equation_field = field
            break


    if not question_field or not equation_field:
        raise ValueError(f"Could not find question/equation fields. Available: {data.column_names}")

    print(f"‚úÖ Using fields: question='{question_field}', equation='{equation_field}'")

    # Process data
    train_inputs = []
    train_targets = []

    for i in range(len(data)):
        question = data[i][question_field]
        equation = data[i][equation_field]

        # Handle list of equations
        if isinstance(equation, list):
            equation = equation[0] if len(equation) > 0 else None

        # Clean equation
        equation = clean_equation(equation)

        # Validate
        if question and equation and len(equation) > 0:
            train_inputs.append(f"solve: {question}")
            train_targets.append(equation)

    print(f"‚úÖ Processed {len(train_inputs)} valid examples")

    # Split train/validation
    split_idx = int(len(train_inputs) * train_split)

    from datasets import Dataset, DatasetDict

    train_dataset = Dataset.from_dict({
        'input_text': train_inputs[:split_idx],
        'target_text': train_targets[:split_idx]
    })

    val_dataset = Dataset.from_dict({
        'input_text': train_inputs[split_idx:],
        'target_text': train_targets[split_idx:]
    })

    dataset = DatasetDict({
        'train': train_dataset,
        'validation': val_dataset
    })

    print(f"\nüìä Final splits:")
    print(f"  Train: {len(dataset['train'])} examples")
    print(f"  Validation: {len(dataset['validation'])} examples")

    # Show sample
    print(f"\nüìù Sample training example:")
    print(f"  Input: {dataset['train'][0]['input_text']}")
    print(f"  Target: {dataset['train'][0]['target_text']}")

    return dataset

In [6]:
# Prepare dataset
dataset = prepare_dataset(ds, train_split=0.9)


Original dataset size: 1772
Column names: ['Question', 'Equation', 'Answer', 'Numbers']
‚úÖ Using fields: question='Question', equation='Equation'
‚úÖ Processed 1772 valid examples

üìä Final splits:
  Train: 1594 examples
  Validation: 178 examples

üìù Sample training example:
  Input: solve: Mary is baking a cake . The recipe wants N_00 cups of flour . She already put in N_01 cups . How many cups does she need to add ?
  Target: N_00 - N_01


In [7]:
# Initialize model
model_name = "t5-small"
print(f"Loading {model_name}...")

tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
model.to(device)

print(f"‚úÖ Model loaded on {device}")
print(f"üìä Model parameters: {model.num_parameters():,}")

Loading t5-small...


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

‚úÖ Model loaded on cuda
üìä Model parameters: 60,506,624


In [8]:
def preprocess_function(examples):
    """Tokenize inputs and targets"""
    model_inputs = tokenizer(
        examples['input_text'],
        max_length=512,
        truncation=True,
        padding='max_length'
    )

    labels = tokenizer(
        examples['target_text'],
        max_length=128,
        truncation=True,
        padding='max_length'
    )

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Tokenize dataset
print("Tokenizing dataset...")
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset['train'].column_names,
    desc="Tokenizing"
)

print("‚úÖ Tokenization complete")
print(f"  Train: {len(tokenized_dataset['train'])} examples")
print(f"  Validation: {len(tokenized_dataset['validation'])} examples")

Tokenizing dataset...


Tokenizing:   0%|          | 0/1594 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/178 [00:00<?, ? examples/s]

‚úÖ Tokenization complete
  Train: 1594 examples
  Validation: 178 examples


In [9]:
# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True
)

# Training arguments
training_args = TrainingArguments(
    output_dir="./hybridmath-checkpoints",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=8,  # Reduce to 4 if OOM
    per_device_eval_batch_size=8,
    num_train_epochs=10,  # Increase to 15-20 for better results
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    logging_steps=50,
    warmup_steps=100,
    fp16=torch.cuda.is_available(),
    gradient_accumulation_steps=2,
    report_to="none",
)
print(f"  Batch size: {training_args.per_device_train_batch_size}")
print(f"  Epochs: {training_args.num_train_epochs}")
print(f"  Learning rate: {training_args.learning_rate}")

  Batch size: 8
  Epochs: 10
  Learning rate: 0.0003


In [10]:
# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
)


  trainer = Trainer(


In [11]:
print("="*70)
print("üöÄ STARTING TRAINING")
print("="*70)
start_time = time.time()

# Train the model
train_result = trainer.train()

training_time = time.time() - start_time

print("\n" + "="*70)
print("‚úÖ TRAINING COMPLETE!")
print("="*70)
print(f"‚è±Ô∏è  Training time: {training_time/60:.2f} minutes")
print(f"üìâ Final train loss: {train_result.training_loss:.4f}")

üöÄ STARTING TRAINING


Epoch,Training Loss,Validation Loss
1,0.1169,0.03567
2,0.0315,0.021965
3,0.0243,0.018034
4,0.0191,0.014948
5,0.0176,0.013586
6,0.0152,0.012008
7,0.0146,0.011377
8,0.0129,0.011281
9,0.0118,0.011053
10,0.0118,0.010802


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].



‚úÖ TRAINING COMPLETE!
‚è±Ô∏è  Training time: 12.27 minutes
üìâ Final train loss: 0.3770


In [12]:
print("="*70)
print("üìä EVALUATING MODEL")
print("="*70)

eval_results = trainer.evaluate()

print(f"\n‚úÖ Evaluation complete!")
print(f"üìâ Validation loss: {eval_results['eval_loss']:.4f}")

üìä EVALUATING MODEL



‚úÖ Evaluation complete!
üìâ Validation loss: 0.0108


In [13]:
print("="*70)
print("üß™ TESTING MODEL")
print("="*70)

model.eval()

test_problems = [
    "Joan found 70 seashells on the beach. She gave Sam 18 of her seashells. How many seashells does she have now?",
    "There are 96 oranges. 17 are rotten. How many good oranges are there?",
    "Sara has 31 red and 15 green marbles. Sandy has 12 red marbles. How many red marbles do they have in total?",
    "Tom had 27 pennies and 15 dimes in his bank. His dad gave him 33 dimes. How many dimes does he have now?"
]

for i, problem in enumerate(test_problems, 1):
    input_text = f"solve: {problem}"
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        max_length=512,
        truncation=True
    ).to(device)

    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            max_length=128,
            num_beams=5,
            early_stopping=True
        )

    equation = tokenizer.decode(outputs[0], skip_special_tokens=True)

    print(f"\n{i}. Problem: {problem[:70]}...")
    print(f"   Generated Equation: {equation}")

    try:
        answer = eval(equation)
        print(f"   ‚úÖ Answer: {answer}")
    except Exception as e:
        print(f"   ‚ö†Ô∏è  Could not evaluate: {e}")

üß™ TESTING MODEL

1. Problem: Joan found 70 seashells on the beach. She gave Sam 18 of her seashells...
   Generated Equation: 70 - 18
   ‚úÖ Answer: 52

2. Problem: There are 96 oranges. 17 are rotten. How many good oranges are there?...
   Generated Equation: 96 - 17
   ‚úÖ Answer: 79

3. Problem: Sara has 31 red and 15 green marbles. Sandy has 12 red marbles. How ma...
   Generated Equation: 31 + 12
   ‚úÖ Answer: 43

4. Problem: Tom had 27 pennies and 15 dimes in his bank. His dad gave him 33 dimes...
   Generated Equation: 27 pennies - 33
   ‚ö†Ô∏è  Could not evaluate: invalid syntax (<string>, line 1)


In [14]:
print("="*70)
print("üíæ SAVING FINAL MODEL")
print("="*70)

save_path = "./hybridmath-final"

model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print(f"‚úÖ Model saved successfully to: {save_path}")
print("\nüìÅ Files saved:")
print("  - config.json")
print("  - pytorch_model.bin")
print("  - tokenizer files")

üíæ SAVING FINAL MODEL
‚úÖ Model saved successfully to: ./hybridmath-final

üìÅ Files saved:
  - config.json
  - pytorch_model.bin
  - tokenizer files


In [15]:
# Download the model to your computer
print("="*70)
print("üì• DOWNLOAD MODEL")
print("="*70)
print("\nTo download the trained model:")
print("1. Go to Files panel (left sidebar)")
print("2. Find 'hybridmath-final' folder")
print("3. Right-click ‚Üí Download")
print("\nOr run this cell to create a zip file:")

# Create zip file
!zip -r hybridmath-final.zip hybridmath-final/

print("\n‚úÖ Created hybridmath-final.zip")
print("üì• Download it from the Files panel")

üì• DOWNLOAD MODEL

To download the trained model:
1. Go to Files panel (left sidebar)
2. Find 'hybridmath-final' folder
3. Right-click ‚Üí Download

Or run this cell to create a zip file:
  adding: hybridmath-final/ (stored 0%)
  adding: hybridmath-final/tokenizer_config.json (deflated 94%)
  adding: hybridmath-final/added_tokens.json (deflated 83%)
  adding: hybridmath-final/spiece.model (deflated 48%)
  adding: hybridmath-final/generation_config.json (deflated 27%)
  adding: hybridmath-final/special_tokens_map.json (deflated 85%)
  adding: hybridmath-final/model.safetensors (deflated 10%)
  adding: hybridmath-final/config.json (deflated 63%)

‚úÖ Created hybridmath-final.zip
üì• Download it from the Files panel


In [16]:
model.eval()

test_problems = [
    "Joan found 70 seashells on the beach. She gave Sam 18 of her seashells. How many seashells does she have now?",
    "There are 96 oranges. 17 are rotten. How many good oranges are there?",
    "Sara has 31 red and 15 green marbles. Sandy has 12 red marbles. How many red marbles do they have in total?",
    "Tom had 27 pennies and 15 dimes in his bank. His dad gave him 33 dimes. How many dimes does he have now?"
]

for i, problem in enumerate(test_problems, 1):
    input_text = f"solve: {problem}"
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        max_length=512,
        truncation=True
    ).to(device)

    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            max_length=128,
            num_beams=5,
            early_stopping=True
        )

    equation = tokenizer.decode(outputs[0], skip_special_tokens=True)

    print(f"\n{i}. Problem: {problem[:70]}...")
    print(f"   Generated Equation: {equation}")

    try:
        answer = eval(equation)
        print(f"   ‚úÖ Answer: {answer}")
    except Exception as e:
        print(f"   ‚ö†Ô∏è  Could not evaluate: {e}")


1. Problem: Joan found 70 seashells on the beach. She gave Sam 18 of her seashells...
   Generated Equation: 70 - 18
   ‚úÖ Answer: 52

2. Problem: There are 96 oranges. 17 are rotten. How many good oranges are there?...
   Generated Equation: 96 - 17
   ‚úÖ Answer: 79

3. Problem: Sara has 31 red and 15 green marbles. Sandy has 12 red marbles. How ma...
   Generated Equation: 31 + 12
   ‚úÖ Answer: 43

4. Problem: Tom had 27 pennies and 15 dimes in his bank. His dad gave him 33 dimes...
   Generated Equation: 27 pennies - 33
   ‚ö†Ô∏è  Could not evaluate: invalid syntax (<string>, line 1)


In [17]:
print("="*70)
print("üíæ SAVING FINAL MODEL")
print("="*70)

save_path = "./hybridmath-final"

model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print(f"‚úÖ Model saved successfully to: {save_path}")
print("\nüìÅ Files saved:")
print("  - config.json")
print("  - pytorch_model.bin")
print("  - tokenizer files")

üíæ SAVING FINAL MODEL
‚úÖ Model saved successfully to: ./hybridmath-final

üìÅ Files saved:
  - config.json
  - pytorch_model.bin
  - tokenizer files


In [18]:
# Download the model to your computer
print("="*70)
print("üì• DOWNLOAD MODEL")
print("="*70)
print("\nTo download the trained model:")
print("1. Go to Files panel (left sidebar)")
print("2. Find 'hybridmath-final' folder")
print("3. Right-click ‚Üí Download")
print("\nOr run this cell to create a zip file:")

# Create zip file
!zip -r hybridmath-final.zip hybridmath-final/

print("\n‚úÖ Created hybridmath-final.zip")
print("üì• Download it from the Files panel")

üì• DOWNLOAD MODEL

To download the trained model:
1. Go to Files panel (left sidebar)
2. Find 'hybridmath-final' folder
3. Right-click ‚Üí Download

Or run this cell to create a zip file:
updating: hybridmath-final/ (stored 0%)
updating: hybridmath-final/tokenizer_config.json (deflated 94%)
updating: hybridmath-final/added_tokens.json (deflated 83%)
updating: hybridmath-final/spiece.model (deflated 48%)
updating: hybridmath-final/generation_config.json (deflated 27%)
updating: hybridmath-final/special_tokens_map.json (deflated 85%)
updating: hybridmath-final/model.safetensors (deflated 10%)
updating: hybridmath-final/config.json (deflated 63%)

‚úÖ Created hybridmath-final.zip
üì• Download it from the Files panel
