In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load pretrained T5 model and tokenizer
model_path = "./pretrained_t5_model/"  # Replace with your pretrained model directory
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

2024-12-23 19:22:14.233473: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
import pandas as pd
from datasets import Dataset

# Load dataset
df = pd.read_csv("dataset.csv", names=['buggy', 'fixed']) 

In [4]:
dataset = Dataset.from_pandas(df)

In [5]:
# Tokenize the dataset
def preprocess_function(examples):
    inputs = tokenizer(
        examples["buggy"], 
        max_length=512, 
        padding="max_length", 
        truncation=True
    )
    targets = tokenizer(
        examples["fixed"], 
        max_length=512, 
        padding="max_length", 
        truncation=True
    )
    inputs["labels"] = targets["input_ids"]
    return inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Split into train and validation sets
train_test_split = tokenized_dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [13]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./fine_tuned_model", # Directory to save the fine-tuned model
    overwrite_output_dir=True,       # Evaluate at the end of each epoch
    num_train_epochs=2,              # Number of training epochs
    per_device_train_batch_size=2,   # Training batch size
    per_device_eval_batch_size=2,    # Evaluation batch size
    logging_dir="./logs",            # Directory to store logs
    logging_steps=50,                # Log every 10 steps
    warmup_steps=200,                # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # Strength of weight decay
)

In [15]:
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [16]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer
)

  trainer = Trainer(


In [34]:
PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0

In [18]:
# Train the model
trainer.train()

# Save the fine-tuned model
trainer.save_model("./fine_tuned_bug_fix_model")
tokenizer.save_pretrained("./fine_tuned_bug_fix_model")

Step,Training Loss
50,0.5571
100,0.6423
150,0.4706
200,0.4173
250,0.1927
300,0.1514
350,0.1584
400,0.1115
450,0.1072
500,0.107


('./fine_tuned_bug_fix_model/tokenizer_config.json',
 './fine_tuned_bug_fix_model/special_tokens_map.json',
 './fine_tuned_bug_fix_model/spiece.model',
 './fine_tuned_bug_fix_model/added_tokens.json')