# Load Dataset



In [15]:
import pandas as pd

df = pd.read_csv("query&target.csv")
df

Unnamed: 0,Query,Code_Snippet,Language,Tags
0,How to create a list comprehension in Python?,[x**2 for x in range(10)],Python,tutorial
1,How to handle missing data in pandas?,"df.fillna(0, inplace=True)",Python,example
2,How to use a lambda function in Python?,lambda x: x + 2,Python,advanced
3,How to create a REST API in Flask?,from flask import Flask\napp = Flask(__name__)...,Python,tutorial
4,How to perform matrix multiplication in numpy?,"import numpy as np\nnp.dot(A, B)",Python,advanced
...,...,...,...,...
3015,How to implement a class in C++?,class MyClass {\npublic:\n void myMethod() ...,C++,tutorial
3016,How to use pointers in C++?,int x = 10;\nint* ptr = &x;,C++,advanced
3017,How to read a file in C++?,#include <fstream>\nstd::ifstream file('file.t...,C++,common-issues
3018,How to create a vector in C++?,"#include <vector>\nstd::vector<int> v = {1, 2,...",C++,tutorial


In [16]:
df['Language'].value_counts()

Unnamed: 0_level_0,count
Language,Unnamed: 1_level_1
SQL,627
Java,618
Shell,601
JavaScript,595
Python,574
C++,5


In [17]:
# Convert all text columns to lowercase
df['Query'] = df['Query'].str.lower()
df['Code_Snippet'] = df['Code_Snippet'].str.lower()
df['Tags'] = df['Tags'].str.lower()

# Tokenization

In [18]:
from transformers import T5Tokenizer

# Load the T5 tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Tokenize the dataset
def preprocess_data(data):
    inputs = ["generate code: " + query for query in data["Query"]]
    targets = data["Code_Snippet"].tolist()
    input_encodings = tokenizer(inputs, max_length=128, truncation=True, padding="max_length", return_tensors="pt")
    target_encodings = tokenizer(targets, max_length=128, truncation=True, padding="max_length", return_tensors="pt")
    return input_encodings, target_encodings

input_encodings, target_encodings = preprocess_data(df)


# Dataset Preparation

In [19]:
import torch

class CodeSnippetDataset(torch.utils.data.Dataset):
    def __init__(self, inputs, targets):
        self.inputs = inputs
        self.targets = targets

    def __len__(self):
        return len(self.inputs["input_ids"])

    def __getitem__(self, idx):
        return {
            "input_ids": self.inputs["input_ids"][idx],
            "attention_mask": self.inputs["attention_mask"][idx],
            "labels": self.targets["input_ids"][idx],
        }

dataset = CodeSnippetDataset(input_encodings, target_encodings)


# fine Tune Model

In [20]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)

# Define training arguments with optimized settings
training_args = TrainingArguments(
    output_dir="./t5_finetuned",
    evaluation_strategy="epoch",  # Evaluation after each epoch
    save_strategy="epoch",        # Save after each epoch
    learning_rate=5e-5,
    per_device_train_batch_size=4,  # Reduce batch size to 4
    per_device_eval_batch_size=4,   # Reduce batch size to 4
    num_train_epochs=5,            # Train for fewer epochs (for testing)
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=500,             # Log less frequently
    save_total_limit=1,            # Limit the number of saved models
    load_best_model_at_end=True,
    fp16=True,                     # Enable mixed precision to speed up training
)

# Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,
)

# Train the model
trainer.train()




Epoch,Training Loss,Validation Loss
1,1.2088,0.071258
2,0.0787,0.011764
3,0.0454,0.009069
4,0.026,0.008584
5,0.023,0.008444


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=3775, training_loss=0.21656152434696424, metrics={'train_runtime': 481.3833, 'train_samples_per_second': 31.368, 'train_steps_per_second': 7.842, 'total_flos': 510915300556800.0, 'train_loss': 0.21656152434696424, 'epoch': 5.0})

# Save Fine Tuned Model

In [21]:
model.save_pretrained("C:\Users\Satchal Patil\DATASCIPRAC\DSP4codesnipp")
tokenizer.save_pretrained("C:\Users\Satchal Patil\DATASCIPRAC\DSP4codesnipp")

('/content/drive/MyDrive/t5_finetuned_model/tokenizer_config.json',
 '/content/drive/MyDrive/t5_finetuned_model/special_tokens_map.json',
 '/content/drive/MyDrive/t5_finetuned_model/spiece.model',
 '/content/drive/MyDrive/t5_finetuned_model/added_tokens.json')

# Searh Engine (Code Snipet Generation System)

In [22]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the fine-tuned model
model = T5ForConditionalGeneration.from_pretrained("C:\Users\Satchal Patil\DATASCIPRAC\DSP4codesnipp\t5_finetuned_model")
tokenizer = T5Tokenizer.from_pretrained("C:\Users\Satchal Patil\DATASCIPRAC\DSP4codesnipp\t5_finetuned_model")

# Define a function for inference
def generate_code(query):
    query = query.lower()
    input_text = "generate code: " + query
    input_ids = tokenizer.encode(input_text, return_tensors="pt", truncation=True)
    outputs = model.generate(input_ids, max_length=512, num_beams=4, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example Usage
while True:
  query = input("\n\nYour Query or type (exit) to leave : \n  ")
  if query == "exit":
    print("\nGood Bye...")
    break
  code = generate_code(query)
  print("\n Code Snipet : \n", code)



Your Query or type (exit) to leave : 
  how to create a list comprehension in python?	

 Code Snipet : 
 [x**2 for x in range(10)]


Your Query or type (exit) to leave : 
  	how to perform matrix multiplication in numpy?	

 Code Snipet : 
 import numpy as np np.dot(a, b)


Your Query or type (exit) to leave : 
  how to overload operators in c++?

 Code Snipet : 
 operator overload = overload => console.log('click', () => console.log('clicked!'));


Your Query or type (exit) to leave : 
  how to create a rest api in flask?

 Code Snipet : 
 from flask import flask app = flask(__name__) @app.route('/') def home(): return 'hello, api!'


Your Query or type (exit) to leave : 
  exit

Good Bye...
