In [None]:
# Uninstall any existing transformers package (-y skips the confirmation
# prompt); the following cell installs a pinned version.
!pip uninstall -y transformers

In [None]:
# Install transformers pinned to version 4.30.0.
!pip install transformers==4.30.0

In [None]:
# Install the datasets package (unpinned); it provides load_dataset, which the
# cells below import and call.
!pip install datasets

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import load_dataset
import torch
import pandas as pd

In [None]:
# Load the "small" configuration of the CodeXGLUE code-refinement dataset.
dataset = load_dataset(path="code_x_glue_cc_code_refinement", name="small")

In [None]:
from transformers import RobertaTokenizer

# Tokenizer and model weights are both loaded from the same checkpoint name.
model_name = "Salesforce/codet5-base"
tokenizer, model = (
    RobertaTokenizer.from_pretrained(model_name),
    T5ForConditionalGeneration.from_pretrained(model_name),
)

In [None]:
def preprocess(example, max_length=128):
    """Tokenize one buggy/fixed pair into model inputs and training labels.

    Args:
        example: Mapping with a 'buggy' source string and a 'fixed' target string.
        max_length: Length that both the input and the target are truncated
            and padded to. Defaults to 128.

    Returns:
        The tokenizer output for "fix: " + example['buggy'], with an added
        "labels" entry holding the token ids of example['fixed']. Padding
        positions in the labels are set to -100.
    """
    input_text = "fix: " + example['buggy']
    target_text = example['fixed']
    model_input = tokenizer(input_text, max_length=max_length, padding="max_length", truncation=True)
    label = tokenizer(target_text, max_length=max_length, padding="max_length", truncation=True)

    # Labels equal to -100 are ignored by the model's cross-entropy loss.
    # Without this, the loss is also computed on every padding position and
    # the model is trained to emit pad tokens.
    pad_id = tokenizer.pad_token_id
    model_input["labels"] = [
        -100 if token_id == pad_id else token_id for token_id in label["input_ids"]
    ]
    return model_input

In [None]:
# Apply preprocess to every split and drop the original columns, leaving only
# what preprocess returns.
tokenized_dataset = dataset.map(
    function=preprocess,
    remove_columns=dataset["train"].column_names,
)

In [None]:
# Keep the first 1000 training rows and the first 200 validation rows.
tokenized_train, tokenized_eval = (
    tokenized_dataset[split_name].select(range(row_count))
    for split_name, row_count in (("train", 1000), ("validation", 200))
)

In [None]:
import inspect

# The evaluation-schedule argument is named differently across transformers
# releases: older ones (including the 4.30.0 pinned at the top of this
# notebook) accept `evaluation_strategy`, newer ones accept `eval_strategy`.
# Use whichever name the installed TrainingArguments actually accepts, so the
# cell does not fail with an unexpected-keyword TypeError.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_kwarg = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./codet5-fix-model",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=1,
    logging_steps=100,
    save_strategy="epoch",
    logging_dir="./logs",
    # Mixed precision only when a CUDA device is available.
    fp16=torch.cuda.is_available(),
    # The string "none" turns off all reporting integrations. Python None does
    # not: on the pinned release it falls back to every installed integration.
    report_to="none",
    remove_unused_columns=False,
    # Gradients are accumulated over 4 batches of 2 before each optimizer step.
    gradient_accumulation_steps=4,
    # Evaluate once per epoch, matching save_strategy above.
    **{_eval_strategy_kwarg: "epoch"},
)


In [None]:
# Train and evaluate on the subsets selected earlier (tokenized_train and
# tokenized_eval). Previously those subsets were never used: the Trainer ran
# on the full train split and evaluated on the held-out "test" split, which
# should be kept for final scoring rather than per-epoch evaluation.
# To train on everything, pass tokenized_dataset["train"] and
# tokenized_dataset["validation"] instead.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
)

In [None]:
# Run fine-tuning with the Trainer built in the previous cell; the value
# returned by train() is shown as this cell's output.
trainer.train()

In [None]:
# Write the fine-tuned model and the tokenizer into the same directory so both
# can be reloaded from a single path.
trainer.save_model("./codet5-fix-model")
tokenizer.save_pretrained("./codet5-fix-model")


In [None]:
from transformers import RobertaTokenizer, T5ForConditionalGeneration

# Reload the fine-tuned weights and tokenizer from the directory they were
# saved to.
model_path = "./codet5-fix-model"

model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = RobertaTokenizer.from_pretrained(model_path)

# Switch to evaluation mode for inference.
model.eval()


In [None]:
def fix_code(buggy_code):
    """Generate a corrected version of a code snippet with the loaded model.

    Args:
        buggy_code: Source text to repair; "fix: " is prepended before
            tokenization.

    Returns:
        The first generated sequence decoded to text, with special tokens
        removed.
    """
    prompt = "fix: " + buggy_code
    encoded = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
    # Put the input tensors on the same device as the model.
    encoded = encoded.to(model.device)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        generated = model.generate(**encoded, max_length=512)

    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Try fix_code on a small snippet whose `return` line is not indented, then
# print the model's output.
buggy = "def add(a,b):\nreturn a+b"
fixed = fix_code(buggy)
print("Fixed Code:\n", fixed)


In [None]:
from transformers import T5ForConditionalGeneration, RobertaTokenizer

# Load both the model and the tokenizer from the fine-tuned checkpoint
# directory. The tokenizer was saved there with the model, so using it keeps
# the pair consistent and avoids a separate download from the hub.
model_path = "./codet5-fix-model"
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = RobertaTokenizer.from_pretrained(model_path)

buggy_code = "public static boolean isEmpty(String str) { return str.length() == 0; }"
input_text = "fix: " + buggy_code

# A single sequence needs no padding; padding it out to 512 tokens only makes
# the encoder process hundreds of masked positions. Truncation still caps
# over-long inputs at 512 tokens.
inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)

# Beam search with 5 beams, returning the single best sequence of at most 128
# tokens.
outputs = model.generate(
    **inputs,
    max_length=128,
    num_return_sequences=1,
    num_beams=5,
    early_stopping=True,
)

fixed_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("🔧 Fixed Code:\n" + fixed_code.strip())

In [None]:
import shutil

from google.colab import files

# Build the zip archive before downloading it. Previously this cell ran ahead
# of the cell that creates /content/codet5_model.zip, so on a fresh
# top-to-bottom run the file did not exist yet.
# make_archive returns the full path of the archive it wrote.
archive_path = shutil.make_archive("/content/codet5_model", "zip", "./codet5-fix-model")
files.download(archive_path)


In [None]:
import shutil

# Zip the saved model directory into /content/codet5_model.zip; the archive
# path returned by make_archive is shown as the cell output.
shutil.make_archive(
    base_name="/content/codet5_model",
    format="zip",
    root_dir="./codet5-fix-model",
)


In [None]:
from datasets import load_dataset

# Reload the dataset, then list its splits and show the first record of each.
dataset = load_dataset("code_x_glue_cc_code_refinement", "small")

print("Available splits:", dataset.keys())

for split_name in ("train", "validation", "test"):
    print("\nExample from " + split_name + " split:")
    print(dataset[split_name][0])


In [None]:
# NOTE(review): this repeats the unpinned datasets install from near the top
# of the notebook and sits after every cell that uses the package, so it looks
# redundant. Confirm and remove.
!pip install datasets