In [3]:
!pip install datasets



#Load Dataset

In [4]:

import pandas as pd

df = pd.read_csv("/content/code program csv generator.csv")
df

Unnamed: 0,Query,Code_Snippet,Language,Tags
0,How to create a list comprehension in Python?,[x**2 for x in range(10)],Python,tutorial
1,How to handle missing data in pandas?,"df.fillna(0, inplace=True)",Python,example
2,How to use a lambda function in Python?,lambda x: x + 2,Python,advanced
3,How to create a REST API in Flask?,from flask import Flask\napp = Flask(__name__)...,Python,tutorial
4,How to perform matrix multiplication in numpy?,"import numpy as np\nnp.dot(A, B)",Python,advanced
...,...,...,...,...
3015,How to implement a class in C++?,class MyClass {\npublic:\n void myMethod() ...,C++,tutorial
3016,How to use pointers in C++?,int x = 10;\nint* ptr = &x;,C++,advanced
3017,How to read a file in C++?,#include <fstream>\nstd::ifstream file('file.t...,C++,common-issues
3018,How to create a vector in C++?,"#include <vector>\nstd::vector<int> v = {1, 2,...",C++,tutorial


In [5]:
df.shape

(3020, 4)

In [6]:
df = df.sample(n=1000, random_state=42).reset_index(drop=True)
df.shape

(1000, 4)

In [7]:

df['Language'].value_counts()

Unnamed: 0_level_0,count
Language,Unnamed: 1_level_1
SQL,205
Shell,200
Java,200
JavaScript,199
Python,193
C++,3


In [8]:
# Convert all text columns to lowercase
df['Query'] = df['Query'].str.lower()
df['Code_Snippet'] = df['Code_Snippet'].str.lower()
df['Tags'] = df['Tags'].str.lower()

In [9]:
# Convert all entries to strings
df["Query"] = df["Query"].astype(str)
df["Code_Snippet"] = df["Code_Snippet"].astype(str)

# Splits

In [10]:
# Split dataset into training and validation sets
from sklearn.model_selection import train_test_split
from datasets import Dataset
train_data, val_data = train_test_split(df, test_size=0.1, random_state=42)

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)

In [11]:
train_dataset

Dataset({
    features: ['Query', 'Code_Snippet', 'Language', 'Tags', '__index_level_0__'],
    num_rows: 900
})

# Tokenization

In [26]:
from transformers import AutoTokenizer

# Load a tokenizer for the chosen model (e.g., mT5 or mBART)
tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50")


In [27]:
input_max_len = max([len(tokenizer.encode(text)) for text in df['Query']])
input_max_len

15

In [28]:
output_max_len = max([len(tokenizer.encode(text)) for text in df['Code_Snippet']])
output_max_len

59

In [29]:
# Tokenization function
def preprocess_function(examples):
    model_inputs = tokenizer(
        examples['Query'],
        max_length=64,
        truncation=True,
        padding="max_length"
    )
    labels = tokenizer(
        examples['Code_Snippet'],
        max_length=100,
        truncation=True,
        padding="max_length"
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply preprocessing
train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)

# Set format for PyTorch tensors
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

Map:   0%|          | 0/900 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [30]:
train_dataset[0]

{'input_ids': tensor([250004,   3642,     47,  54529, 124519,     23,     10,  11435,     32,
              2,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'labels': tensor([250004,  92922,     20,    141,  11435,      5, 124326,      2,      1,
              1,      1,      1,      1,      1,      1,      1,      1

# Load Pretrained Sequence-to-Sequence Model

In [31]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [32]:
import os

results_dir = "/content/drive/MyDrive/xxx/rst"
model_dir = "/content/drive/MyDrive/xxx/mdl"

# Create the results directory if it doesn't exist
os.makedirs(results_dir, exist_ok=True)
os.makedirs(model_dir, exist_ok=True)

# Fine Tuning Model

In [33]:
from transformers import Seq2SeqTrainingArguments

from transformers import AutoModelForSeq2SeqLM

# AutoModelForSeq2SeqLM is a class in the Hugging Face Transformers library that automatically loads a pre-trained sequence-to-sequence model.
# It is used for tasks like machine translation, summarization, and other text generation tasks, where both the encoder and decoder are trained to process input and generate output sequences.

# Load a pre-trained sequence-to-sequence model
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50")

# # Set language-specific tokens if using mBART
# model.config.decoder_start_token_id = tokenizer.lang_code_to_id["ur_PK"]  # Urdu token
# tokenizer.src_lang = "en_XX"
# tokenizer.tgt_lang = "ur_PK"


# Seq2SeqTrainingArguments is a class in Hugging Face's Transformers library designed specifically for training sequence-to-sequence models.
# It provides various training configurations such as batch size, number of epochs, evaluation strategy, and output directory, optimized for tasks like translation, summarization, or text generation.

training_args = Seq2SeqTrainingArguments(
    output_dir=results_dir,
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="steps",           # Change from "epoch" to "steps"
    save_steps=50,                  # Save checkpoint every 100 steps
    eval_steps=50,                  # Evaluate every 100 steps (optional)
    logging_steps=50,              # Log training info every 100 steps (optional)
    logging_dir="./logs",
    predict_with_generate=True,
    generation_max_length=128,
)

from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()


  trainer = Seq2SeqTrainer(


Step,Training Loss
50,5.9616
100,0.4364
150,0.0991
200,0.0654
250,0.0401
300,0.0899




TrainOutput(global_step=339, training_loss=0.9913225631094964, metrics={'train_runtime': 1140.3102, 'train_samples_per_second': 2.368, 'train_steps_per_second': 0.297, 'total_flos': 365703148339200.0, 'train_loss': 0.9913225631094964, 'epoch': 3.0})

#Save Fine Tuned Model

In [37]:
model.save_pretrained("/content/drive/MyDrive/xxx/model")
tokenizer.save_pretrained("/content/drive/MyDrive/xxx/token")


('/content/drive/MyDrive/xxx/token/tokenizer_config.json',
 '/content/drive/MyDrive/xxx/token/special_tokens_map.json',
 '/content/drive/MyDrive/xxx/token/sentencepiece.bpe.model',
 '/content/drive/MyDrive/xxx/token/added_tokens.json',
 '/content/drive/MyDrive/xxx/token/tokenizer.json')

#Code Snipet Generation System

In [36]:
# Load the fine-tuned model
model = AutoModelForSeq2SeqLM.from_pretrained("/content/drive/MyDrive/xxx/model")
tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/xxx/token")


def Generate_code(text):
    # Tokenize input
    inputs = tokenizer(text, return_tensors="pt", max_length=64, truncation=True)
    # Generate translation
    outputs = model.generate(inputs["input_ids"], max_length=100, num_beams=4, early_stopping=True)
    # Decode the translation
    code = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return code

# Example usage
text_to_code = "How to create a table in SQL?"
code = Generate_code(text_to_code)
print("Generated Code:", code)


Generated Code: create table users (id int, name varchar(100));


In [42]:
print("Code Generator Chatbot (type 'exit' to quit)")
while True:
    user_input = input("You: ")
    if user_input.lower() == "exit":
        print("Exiting chatbot. Goodbye!")
        break
    generated_code = Generate_code(user_input)
    print("Generated Code:/n", generated_code)

Code Generator Chatbot (type 'exit' to quit)
You: How to create a table in SQL?
Generated Code: create table users (id int, name varchar(100));
You: exit
Exiting chatbot. Goodbye!
