In [1]:
import pandas as pd

df = pd.read_csv("query&target.csv")
df

Unnamed: 0,Query,Code_Snippet,Language,Tags
0,How to create a list comprehension in Python?,[x**2 for x in range(10)],Python,tutorial
1,How to handle missing data in pandas?,"df.fillna(0, inplace=True)",Python,example
2,How to use a lambda function in Python?,lambda x: x + 2,Python,advanced
3,How to create a REST API in Flask?,from flask import Flask\napp = Flask(__name__)...,Python,tutorial
4,How to perform matrix multiplication in numpy?,"import numpy as np\nnp.dot(A, B)",Python,advanced
...,...,...,...,...
3015,How to implement a class in C++?,class MyClass {\npublic:\n void myMethod() ...,C++,tutorial
3016,How to use pointers in C++?,int x = 10;\nint* ptr = &x;,C++,advanced
3017,How to read a file in C++?,#include <fstream>\nstd::ifstream file('file.t...,C++,common-issues
3018,How to create a vector in C++?,"#include <vector>\nstd::vector<int> v = {1, 2,...",C++,tutorial


In [2]:
df['Language'].value_counts()

Language
SQL           627
Java          618
Shell         601
JavaScript    595
Python        574
C++             5
Name: count, dtype: int64

In [3]:
# Convert all text columns to lowercase
df['Query'] = df['Query'].str.lower()
df['Code_Snippet'] = df['Code_Snippet'].str.lower()
df['Tags'] = df['Tags'].str.lower()

# Tokenization

In [4]:
from transformers import T5Tokenizer

# Load the T5 tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Tokenize the dataset
def preprocess_data(data):
    inputs = ["generate code: " + query for query in data["Query"]]
    targets = data["Code_Snippet"].tolist()
    input_encodings = tokenizer(inputs, max_length=128, truncation=True, padding="max_length", return_tensors="pt")
    target_encodings = tokenizer(targets, max_length=128, truncation=True, padding="max_length", return_tensors="pt")
    return input_encodings, target_encodings

input_encodings, target_encodings = preprocess_data(df)


  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


# Dataset Preparation

In [5]:
import torch

class CodeSnippetDataset(torch.utils.data.Dataset):
    def __init__(self, inputs, targets):
        self.inputs = inputs
        self.targets = targets

    def __len__(self):
        return len(self.inputs["input_ids"])

    def __getitem__(self, idx):
        return {
            "input_ids": self.inputs["input_ids"][idx],
            "attention_mask": self.inputs["attention_mask"][idx],
            "labels": self.targets["input_ids"][idx],
        }

dataset = CodeSnippetDataset(input_encodings, target_encodings)


# fine Tune Model

In [6]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments
import torch

# Check if CUDA (GPU) is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load T5 model and move it to the device (GPU or CPU)
model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)

# Define training arguments with optimized settings
training_args = TrainingArguments(
    output_dir="./t5_finetuned",
    evaluation_strategy="epoch",  # Evaluation after each epoch
    save_strategy="epoch",        # Save after each epoch
    learning_rate=5e-5,
    per_device_train_batch_size=4,  # Reduce batch size to 4
    per_device_eval_batch_size=4,   # Reduce batch size to 4
    num_train_epochs=5,            # Train for fewer epochs (
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=500,             # Log less frequently
    save_total_limit=1,            # Limit the number of saved models
    load_best_model_at_end=True,
    fp16=True,                     # Enable mixed precision to speed up training
)

# Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,
)

# Train the model
trainer.train()


  0%|          | 0/3775 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
 13%|█▎        | 500/3775 [19:35<2:08:58,  2.36s/it]

{'loss': 1.0757, 'grad_norm': 3.024446487426758, 'learning_rate': 4.337748344370861e-05, 'epoch': 0.66}


                                                       
 20%|██        | 755/3775 [1:13:07<1:48:03,  2.15s/it]

{'eval_loss': 0.06580698490142822, 'eval_runtime': 2075.7564, 'eval_samples_per_second': 1.455, 'eval_steps_per_second': 0.364, 'epoch': 1.0}


 26%|██▋       | 1000/3775 [1:22:16<1:49:48,  2.37s/it]  

{'loss': 0.2072, 'grad_norm': 0.4297788441181183, 'learning_rate': 3.675496688741722e-05, 'epoch': 1.32}


 40%|███▉      | 1500/3775 [1:47:07<1:45:36,  2.79s/it] 

{'loss': 0.0792, 'grad_norm': 0.5843868851661682, 'learning_rate': 3.0132450331125826e-05, 'epoch': 1.99}


                                                       
 40%|████      | 1510/3775 [1:56:19<1:40:54,  2.67s/it]

{'eval_loss': 0.011912385933101177, 'eval_runtime': 523.6683, 'eval_samples_per_second': 5.767, 'eval_steps_per_second': 1.442, 'epoch': 2.0}


 53%|█████▎    | 2000/3775 [4:27:14<1:30:37,  3.06s/it]     

{'loss': 0.0464, 'grad_norm': 0.2229871302843094, 'learning_rate': 2.3509933774834437e-05, 'epoch': 2.65}


                                                       
 60%|██████    | 2265/3775 [4:48:49<1:09:26,  2.76s/it]

{'eval_loss': 0.00926603190600872, 'eval_runtime': 530.365, 'eval_samples_per_second': 5.694, 'eval_steps_per_second': 1.424, 'epoch': 3.0}


 66%|██████▌   | 2500/3775 [5:00:15<1:03:07,  2.97s/it]  

{'loss': 0.0293, 'grad_norm': 0.33369773626327515, 'learning_rate': 1.688741721854305e-05, 'epoch': 3.31}


 79%|███████▉  | 3000/3775 [5:23:21<33:44,  2.61s/it]  

{'loss': 0.0266, 'grad_norm': 0.25985684990882874, 'learning_rate': 1.0264900662251655e-05, 'epoch': 3.97}


                                                         
 80%|████████  | 3020/3775 [5:43:56<7:34:07, 36.09s/it]

{'eval_loss': 0.008878319524228573, 'eval_runtime': 527.4107, 'eval_samples_per_second': 5.726, 'eval_steps_per_second': 1.432, 'epoch': 4.0}


 93%|█████████▎| 3500/3775 [14:50:09<32:57,  7.19s/it]      

{'loss': 0.0235, 'grad_norm': 0.1550307273864746, 'learning_rate': 3.642384105960265e-06, 'epoch': 4.64}


                                                           
100%|██████████| 3775/3775 [16:49:26<00:00,  2.04s/it]

{'eval_loss': 0.008746174164116383, 'eval_runtime': 397.7146, 'eval_samples_per_second': 7.593, 'eval_steps_per_second': 1.898, 'epoch': 5.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 3775/3775 [16:49:27<00:00, 16.04s/it]

{'train_runtime': 60567.7384, 'train_samples_per_second': 0.249, 'train_steps_per_second': 0.062, 'train_loss': 0.19856325579005363, 'epoch': 5.0}





TrainOutput(global_step=3775, training_loss=0.19856325579005363, metrics={'train_runtime': 60567.7384, 'train_samples_per_second': 0.249, 'train_steps_per_second': 0.062, 'total_flos': 510915300556800.0, 'train_loss': 0.19856325579005363, 'epoch': 5.0})

# Save Fine Tuned Model

In [8]:
model.save_pretrained(r"C:\Users\Satchal Patil\DATASCIPRAC\DSP4codesnipp")

tokenizer.save_pretrained(r"C:\Users\Satchal Patil\DATASCIPRAC\DSP4codesnipp")

('C:\\Users\\Satchal Patil\\DATASCIPRAC\\DSP4codesnipp\\tokenizer_config.json',
 'C:\\Users\\Satchal Patil\\DATASCIPRAC\\DSP4codesnipp\\special_tokens_map.json',
 'C:\\Users\\Satchal Patil\\DATASCIPRAC\\DSP4codesnipp\\spiece.model',
 'C:\\Users\\Satchal Patil\\DATASCIPRAC\\DSP4codesnipp\\added_tokens.json')

# Searh Engine (Code Snipet Generation System)

In [12]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the fine-tuned model
model = T5ForConditionalGeneration.from_pretrained(r"C:\Users\Satchal Patil\DATASCIPRAC\DSP4codesnipp\t5_finetuned\checkpoint-3775")
tokenizer = T5Tokenizer.from_pretrained(r"C:\Users\Satchal Patil\DATASCIPRAC\DSP4codesnipp\t5_finetuned\checkpoint-3775")

# Define a function for inference
def generate_code(query):
    query = query.lower()
    input_text = "generate code: " + query
    input_ids = tokenizer.encode(input_text, return_tensors="pt", truncation=True)
    outputs = model.generate(input_ids, max_length=512, num_beams=4, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example Usage
while True:
  query = input("\n\nYour Query or type (exit) to leave : \n  ")
  if query == "exit":
    print("\nGood Bye...")
    break
  code = generate_code(query)
  print("\n Code Snippet : \n", code)


 Code Snipet : 
 async function fetchdata()  const response = await fetch(url); return await response.json();

 Code Snipet : 
 df.fillna(0, inplace=true)

Good Bye...
