In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
import pandas as pd
import torch

In [None]:
# Load the query / code-snippet table from a CSV file; the relative path is
# resolved against the notebook's current working directory.
df = pd.read_csv('programming_code_snippets.csv')

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Normalise the two text columns in one step: both are coerced to `str`, and
# queries are additionally lower-cased (code snippets keep their casing).
df = df.assign(
    Query=df["Query"].astype(str).str.lower(),
    Code_Snippet=df["Code_Snippet"].astype(str),
)

In [None]:
# Re-inspect the first rows after the text columns were cast and lower-cased.
df.head()

### Tokenization

In [None]:
# Load the tokenizer from the same checkpoint that is fine-tuned later in this
# notebook ("t5-small"), so tokenizer and model share a single source.
tokenizer = T5Tokenizer.from_pretrained('t5-small')

In [None]:
def preprocess_data(data, max_input_length=128, max_target_length=256, tok=None):
    """Tokenize queries and their formatted targets for seq2seq fine-tuning.

    Every query is prefixed with ``"generate code: "``. Every target is one
    string that combines the row's language, tags and code snippet.

    Args:
        data: Mapping-like object (e.g. a DataFrame) exposing the columns
            ``"Query"``, ``"Language"``, ``"Tags"`` and ``"Code_Snippet"``.
        max_input_length: Length each encoded query is padded/truncated to.
        max_target_length: Length each encoded target is padded/truncated to.
        tok: Tokenizer callable to use. When omitted, the notebook-level
            ``tokenizer`` is used, which matches the previous behaviour.

    Returns:
        tuple: ``(input_encodings, target_encodings)`` exactly as returned by
        the tokenizer (requested with ``return_tensors="pt"``).

    Note:
        The target encodings are returned unmodified, i.e. padded positions
        still hold whatever id the tokenizer pads with.
    """
    # Resolve the tokenizer at call time so a later rebinding of the global
    # name cannot silently change which tokenizer an explicit caller gets.
    encode = tokenizer if tok is None else tok

    inputs = ["generate code: " + query for query in data["Query"]]

    targets = [
        f"Language: {lang}\nTags: {tags}\nCode:\n{code}"
        for lang, tags, code in zip(
            data["Language"],
            data["Tags"],
            data["Code_Snippet"],
        )
    ]

    # Options shared by both tokenizer calls; only the length budget differs.
    shared_options = {
        "padding": "max_length",
        "truncation": True,
        "return_tensors": "pt",
    }

    input_encodings = encode(inputs, max_length=max_input_length, **shared_options)
    target_encodings = encode(targets, max_length=max_target_length, **shared_options)

    return input_encodings, target_encodings

In [None]:
# Tokenize the whole frame once; the two results feed the dataset built below.
input_encodings, target_encodings = preprocess_data(df)

In [None]:
# Display the tokenized targets for a quick visual sanity check.
target_encodings

In [None]:
# Display the tokenized queries for a quick visual sanity check.
input_encodings

### Data Preparation

In [None]:
class CodeSnippetDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing encoded queries with encoded targets.

    Args:
        inputs: Mapping with ``"input_ids"`` and ``"attention_mask"`` entries,
            each indexable by example position.
        targets: Mapping with an ``"input_ids"`` entry and, optionally, an
            ``"attention_mask"`` entry marking real (1) vs. padded (0) tokens.

    Each item is a dict with ``"input_ids"``, ``"attention_mask"`` and
    ``"labels"``. When the targets carry an attention mask, padded label
    positions are replaced by ``IGNORE_INDEX`` so they do not contribute to
    the loss; without a mask the target ids are returned unchanged.
    """

    # -100 is the default `ignore_index` of torch.nn.CrossEntropyLoss.
    IGNORE_INDEX = -100

    def __init__(self, inputs, targets):
        self.inputs = inputs
        self.targets = targets

    def __len__(self):
        """Return the number of examples (rows of the input ids)."""
        return len(self.inputs["input_ids"])

    def __getitem__(self, idx):
        """Return the model-ready dict for the example at position ``idx``."""
        labels = self.targets["input_ids"][idx]
        if "attention_mask" in self.targets:
            target_mask = torch.as_tensor(self.targets["attention_mask"][idx])
            # masked_fill returns a new tensor, so the stored encodings stay
            # untouched and repeated indexing yields the same result.
            labels = torch.as_tensor(labels).masked_fill(
                target_mask == 0, self.IGNORE_INDEX
            )

        return {
            "input_ids": self.inputs["input_ids"][idx],
            "attention_mask": self.inputs["attention_mask"][idx],
            "labels": labels,
        }

# Wrap the tokenized queries and targets so they can be indexed per example.
dataset = CodeSnippetDataset(input_encodings, target_encodings)

### Fine-Tune the Model

In [None]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Load the pretrained "t5-small" checkpoint and move it to the selected device.
model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)

In [None]:
# Hold out a fixed, seeded slice of the data for evaluation. Evaluating on the
# training set itself would make `load_best_model_at_end` pick the checkpoint
# with the lowest *training* loss, which says nothing about generalisation.
if len(dataset) < 2:
    raise ValueError("Need at least 2 examples to build train/validation splits.")
num_eval = max(1, len(dataset) // 10)
train_split, eval_split = torch.utils.data.random_split(
    dataset,
    [len(dataset) - num_eval, num_eval],
    generator=torch.Generator().manual_seed(42),  # reproducible split
)

# Training arguments
training_args = TrainingArguments(
    output_dir="./t5_finetuned",
    eval_strategy="epoch",           # Evaluate after each epoch
    save_strategy="epoch",           # Save after each epoch (must match eval for best-model loading)
    learning_rate=5e-5,
    per_device_train_batch_size=4,   # Small batch size to limit memory use
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=500,               # Log less frequently
    save_total_limit=1,              # Limit the number of saved checkpoints
    load_best_model_at_end=True,
    # Mixed precision is only usable on a CUDA device; enabling it
    # unconditionally breaks the CPU fallback chosen for `device`.
    fp16=torch.cuda.is_available(),
    report_to="wandb",
)

# Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
)

# Train the model
trainer.train()

### Saving the Fine-Tuned Model

In [None]:
# Write the model weights and the tokenizer files into the same directory so
# it can later be reloaded as a single checkpoint.
save_dir = "/content/drive/MyDrive/t5_finetuned_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)

### Code Snippet Generation

In [None]:
# Reload the fine-tuned checkpoint from disk; this rebinds the notebook-level
# `model` and `tokenizer` names to the reloaded objects.
finetuned_dir = "/content/drive/MyDrive/t5_finetuned_model"
model = T5ForConditionalGeneration.from_pretrained(finetuned_dir)
tokenizer = T5Tokenizer.from_pretrained(finetuned_dir)

In [None]:
def generate_code(query, max_length=512, num_beams=4):
    """Generate text for a natural-language query with the notebook's model.

    The query is lower-cased and prefixed with ``"generate code: "`` (the same
    prefix used when building the training inputs), encoded with the
    notebook-level ``tokenizer`` and decoded with beam search by the
    notebook-level ``model``.

    Args:
        query: The natural-language request as a string.
        max_length: Maximum length passed to ``model.generate``.
        num_beams: Number of beams used for beam search.

    Returns:
        str: The first generated sequence, decoded with special tokens removed.
    """
    input_text = "generate code: " + query.lower()
    input_ids = tokenizer.encode(input_text, return_tensors="pt", truncation=True)

    # The encoded ids are created on the CPU; move them to wherever the model
    # currently lives so generation also works when `model` is on a GPU.
    input_ids = input_ids.to(model.device)

    outputs = model.generate(
        input_ids,
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=True,
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# Smoke-test the generation helper with a sample query.
generate_code("How to use a lambda function in Python?")