In [1]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
    pipeline
)
from datasets import load_dataset, DatasetDict
import numpy as np
from evaluate import load

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
bleu = load("bleu")

# Load the dataset directly from a CSV file
dataset = load_dataset('csv', data_files='data/dataset.csv')

# Ensure the dataset has 'code' and 'equation' columns
print("Dataset columns:", dataset['train'].column_names)

# Split the dataset into train, validation, and test sets
# First, split into train and test sets
dataset = dataset['train'].train_test_split(test_size=0.1)

# Then, split the test set equally into validation and test sets
test_valid = dataset['test'].train_test_split(test_size=0.5)

# Create a DatasetDict to hold the splits
dataset = DatasetDict({
    'train': dataset['train'],
    'validation': test_valid['train'],
    'test': test_valid['test'],
})

Generating train split: 73 examples [00:00, 14051.59 examples/s]

Dataset columns: ['code', 'equation']





In [43]:

# Initialize the tokenizer and model
model_name = "t5-small"  # You can replace this with another model like 't5-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Tokenization function using 'text_target' for target sequences
def tokenize_function(examples):
    model_inputs = tokenizer(
        examples['code'],
        max_length=512,
        truncation=True,
        padding=False,  # Padding is handled by the data collator
    )
    labels = tokenizer(
        text_target=examples['equation'],
        max_length=256,
        truncation=True,
        padding=False,
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply the tokenization function to the entire dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)



Map: 100%|██████████| 65/65 [00:00<00:00, 5475.26 examples/s]

Map: 100%|██████████| 4/4 [00:00<00:00, 614.17 examples/s]

Map: 100%|██████████| 4/4 [00:00<00:00, 618.86 examples/s]


In [44]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # Decode the predictions and labels
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Post-process
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]
    decoded_labels = [[label] for label in decoded_labels]
    result = bleu.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["bleu"]}


In [45]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    logging_dir="./logs",
    logging_steps=10,
    predict_with_generate=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
    compute_metrics=compute_metrics,
)

trainer.train()

trainer.save_model("t5-code-to-math")
tokenizer.save_pretrained("t5-code-to-math")

  trainer = Seq2SeqTrainer(
 10%|█         | 9/90 [30:55<4:38:19, 206.17s/it]

 10%|█         | 9/90 [00:10<01:07,  1.20it/s]

{'eval_loss': 1.665670394897461, 'eval_bleu': 0.3207024162225735, 'eval_runtime': 0.8512, 'eval_samples_per_second': 4.7, 'eval_steps_per_second': 1.175, 'epoch': 1.0}


 11%|█         | 10/90 [00:11<01:27,  1.09s/it]

{'loss': 2.0574, 'grad_norm': 7.57383394241333, 'learning_rate': 1.7777777777777777e-05, 'epoch': 1.11}


 20%|██        | 18/90 [00:18<00:55,  1.31it/s]
 20%|██        | 18/90 [00:19<00:55,  1.31it/s]

{'eval_loss': 1.538629412651062, 'eval_bleu': 0.3207024162225735, 'eval_runtime': 0.8254, 'eval_samples_per_second': 4.846, 'eval_steps_per_second': 1.212, 'epoch': 2.0}


 22%|██▏       | 20/90 [00:20<01:08,  1.02it/s]

{'loss': 2.046, 'grad_norm': 6.774435520172119, 'learning_rate': 1.555555555555556e-05, 'epoch': 2.22}


 30%|███       | 27/90 [00:26<00:48,  1.30it/s]
 30%|███       | 27/90 [00:27<00:48,  1.30it/s]

{'eval_loss': 1.4278111457824707, 'eval_bleu': 0.3207024162225735, 'eval_runtime': 0.862, 'eval_samples_per_second': 4.64, 'eval_steps_per_second': 1.16, 'epoch': 3.0}


 33%|███▎      | 30/90 [00:30<00:56,  1.06it/s]

{'loss': 1.6193, 'grad_norm': 6.778203010559082, 'learning_rate': 1.3333333333333333e-05, 'epoch': 3.33}


 40%|████      | 36/90 [00:35<00:43,  1.23it/s]
 40%|████      | 36/90 [00:36<00:43,  1.23it/s]

{'eval_loss': 1.344699501991272, 'eval_bleu': 0.3207024162225735, 'eval_runtime': 0.8676, 'eval_samples_per_second': 4.61, 'eval_steps_per_second': 1.153, 'epoch': 4.0}


 44%|████▍     | 40/90 [00:39<00:45,  1.09it/s]

{'loss': 1.6512, 'grad_norm': 4.185177326202393, 'learning_rate': 1.1111111111111113e-05, 'epoch': 4.44}


 50%|█████     | 45/90 [00:43<00:38,  1.17it/s]
 50%|█████     | 45/90 [00:44<00:38,  1.17it/s]

{'eval_loss': 1.2810853719711304, 'eval_bleu': 0.2984942907959129, 'eval_runtime': 0.8326, 'eval_samples_per_second': 4.804, 'eval_steps_per_second': 1.201, 'epoch': 5.0}


 56%|█████▌    | 50/90 [00:49<00:41,  1.03s/it]

{'loss': 1.7885, 'grad_norm': 5.181885242462158, 'learning_rate': 8.888888888888888e-06, 'epoch': 5.56}


 60%|██████    | 54/90 [00:53<00:30,  1.16it/s]
 60%|██████    | 54/90 [00:54<00:30,  1.16it/s]

{'eval_loss': 1.239700198173523, 'eval_bleu': 0.2984942907959129, 'eval_runtime': 0.9128, 'eval_samples_per_second': 4.382, 'eval_steps_per_second': 1.096, 'epoch': 6.0}


 67%|██████▋   | 60/90 [01:00<00:31,  1.04s/it]

{'loss': 1.5811, 'grad_norm': 5.7886128425598145, 'learning_rate': 6.666666666666667e-06, 'epoch': 6.67}


 70%|███████   | 63/90 [01:02<00:22,  1.22it/s]
 70%|███████   | 63/90 [01:03<00:22,  1.22it/s]

{'eval_loss': 1.2039539813995361, 'eval_bleu': 0.25185193909177644, 'eval_runtime': 0.8521, 'eval_samples_per_second': 4.694, 'eval_steps_per_second': 1.174, 'epoch': 7.0}


 78%|███████▊  | 70/90 [01:09<00:17,  1.12it/s]

{'loss': 1.4856, 'grad_norm': 4.890051364898682, 'learning_rate': 4.444444444444444e-06, 'epoch': 7.78}


 80%|████████  | 72/90 [01:10<00:13,  1.34it/s]
 80%|████████  | 72/90 [01:11<00:13,  1.34it/s]

{'eval_loss': 1.1835471391677856, 'eval_bleu': 0.2458423950688514, 'eval_runtime': 0.7949, 'eval_samples_per_second': 5.032, 'eval_steps_per_second': 1.258, 'epoch': 8.0}


 89%|████████▉ | 80/90 [01:19<00:10,  1.01s/it]

{'loss': 1.6199, 'grad_norm': 12.634102821350098, 'learning_rate': 2.222222222222222e-06, 'epoch': 8.89}


 90%|█████████ | 81/90 [01:19<00:07,  1.15it/s]
 90%|█████████ | 81/90 [01:20<00:07,  1.15it/s]

{'eval_loss': 1.1701003313064575, 'eval_bleu': 0.2458423950688514, 'eval_runtime': 0.9728, 'eval_samples_per_second': 4.112, 'eval_steps_per_second': 1.028, 'epoch': 9.0}


100%|██████████| 90/90 [01:28<00:00,  1.18it/s]

{'loss': 1.539, 'grad_norm': 7.847878932952881, 'learning_rate': 0.0, 'epoch': 10.0}



100%|██████████| 90/90 [01:31<00:00,  1.02s/it]


{'eval_loss': 1.1647580862045288, 'eval_bleu': 0.2458423950688514, 'eval_runtime': 1.0262, 'eval_samples_per_second': 3.898, 'eval_steps_per_second': 0.975, 'epoch': 10.0}
{'train_runtime': 91.4725, 'train_samples_per_second': 7.106, 'train_steps_per_second': 0.984, 'train_loss': 1.7097603585984973, 'epoch': 10.0}


('t5-code-to-math/tokenizer_config.json',
 't5-code-to-math/special_tokens_map.json',
 't5-code-to-math/tokenizer.json')

In [46]:
# Evaluate the model on the test set
results = trainer.evaluate(tokenized_datasets["test"])
print("Evaluation results:", results)

# Prepare the model for inference
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()

100%|██████████| 1/1 [00:00<00:00, 129.56it/s]

Evaluation results: {'eval_loss': 1.0083293914794922, 'eval_bleu': 0.38694621926276307, 'eval_runtime': 0.9791, 'eval_samples_per_second': 4.086, 'eval_steps_per_second': 1.021, 'epoch': 10.0}





T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [57]:
# Create a text generation pipeline
equation_generator = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1  # 0 for GPU, -1 for CPU
)

In [59]:


def generate_equations(code_snippets, max_length=256, num_beams=4, early_stopping=True):
    """
    Generates equations from a list of code snippets.

    Args:
        code_snippets (list of str): List of code snippets to convert.
        max_length (int, optional): Maximum length of the generated equation. Defaults to 256.
        num_beams (int, optional): Number of beams for beam search. Defaults to 4.
        early_stopping (bool, optional): Whether to stop the beam search when at least `num_beams` sentences are finished. Defaults to True.

    Returns:
        list of str: Generated equations corresponding to the input code snippets.
    """
    try:
        if not isinstance(code_snippets, list):
            raise ValueError("Input should be a list of code snippets.")

        # Generate equations using the pipeline
        generated = equation_generator(
            code_snippets,
            max_length=max_length,
            num_beams=num_beams,
            early_stopping=early_stopping,
            truncation=True  # Ensure inputs are truncated to the model's max input length
        )

        # Extract the generated text
        equations = [item['generated_text'] for item in generated]
        return equations

    except Exception as e:
        print(f"An error occurred during equation generation: {e}")
        return []
    
code_examples = [
    "def add(a, b): return a + b",
    "def multiply(x, y):\n    return x * y",
    "def divide(numerator, denominator):\n    return numerator / denominator"
]

generated_equations = generate_equations(code_examples)
for code, eq in zip(code_examples, generated_equations):
    print(f"Code:\n{code}\nGenerated Equation: {eq}\n")

Code:
def add(a, b): return a + b
Generated Equation: def add(a, b)

Code:
def multiply(x, y):
    return x * y
Generated Equation: multiply(x, y)

Code:
def divide(numerator, denominator):
    return numerator / denominator
Generated Equation: def divide(numerator, denominator): return numerator / denominator

