# **SEP 775 Assignment - 4**
                        
**Student: Rutvik Roy**\
**Student ID: 400490159**


In [None]:
# Install the third-party packages used by the cells below
# (one pip invocation per package; run once per runtime).
!pip install datasets
!pip install pandas
!pip install peft
!pip install tensorboard
!pip install torch
!pip install transformers

In [None]:
!pip install accelerate>=0.21.0
!pip install rouge

In [None]:
# import all necessary libraries

import torch
from datasets import load_dataset,load_metric,Dataset
import pandas as pd
from sklearn.model_selection import train_test_split

# Data Cleaning

In [None]:
# load dataset: the split expression "train[:100]" requests only the first
# 100 rows of the train split of flytech/python-codes-25k

dataset=load_dataset("flytech/python-codes-25k",split="train[:100]")

In [None]:
# convert the loaded dataset to a pandas DataFrame so it can be cleaned with
# pandas operations, then preview the first rows

dataset_df = pd.DataFrame(dataset)
dataset_df.head()

In [None]:
# Clean the raw frame: keep only the prompt/answer columns, strip the Markdown
# code-fence markers from the answers, and drop duplicate or empty rows.

# keep only the necessary columns; .copy() gives an independent frame so the
# in-place calls below do not operate on a slice of the original frame (which
# can raise pandas' SettingWithCopyWarning)
dataset_df = dataset_df[['instruction', 'output']].copy()

# remove the "python\n" language tag and the ``` fence markers from 'output';
# regex=False makes both replacements literal regardless of the pandas version
dataset_df['output'] = (
    dataset_df['output']
    .str.replace("python\n", "", regex=False)
    .str.replace('```', '', regex=False)
)

# remove duplicate entries
dataset_df.drop_duplicates(['instruction', 'output'], inplace=True)

# delete rows that contain missing values
dataset_df.dropna(inplace=True)

# keep at most 100 rows to limit training time
dataset_df = dataset_df[:100]

dataset_df.head()

In [None]:
# raw instruction strings and their reference code, as arrays
input_text = dataset_df["instruction"].values
output_code = dataset_df["output"].values

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

# the tokenizer of the pretrained CodeT5-base checkpoint (Salesforce/codet5-base)
model_name = 'Salesforce/codet5-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
def tokenize_text(text):
  """Split ``text`` into tokens with the notebook-level ``tokenizer``."""
  tokens = tokenizer.tokenize(text)
  return tokens

def tokenized_text_id(tokenized_text):
  """Map a list of tokens to their vocabulary ids with the notebook-level ``tokenizer``."""
  token_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
  return token_ids

def add_special_tokens(input_text, start_token=None, end_token=None):
  """Wrap a token sequence in sequence-start and sequence-end markers.

  Args:
    input_text: Sequence of token strings (e.g. the output of ``tokenize_text``).
    start_token: Marker placed before the tokens. Defaults to the loaded
      tokenizer's ``cls_token``.
    end_token: Marker placed after the tokens. Defaults to the loaded
      tokenizer's ``sep_token``.

  Returns:
    A new list ``[start_token, *input_text, end_token]``.
  """
  # Use the tokenizer's own markers rather than hard-coded '<CLS>'/'<SEP>'
  # strings: markers that are not in the tokenizer's vocabulary are looked up
  # as ordinary tokens and would typically resolve to the unknown-token id.
  if start_token is None:
    start_token = tokenizer.cls_token
  if end_token is None:
    end_token = tokenizer.sep_token
  return [start_token] + list(input_text) + [end_token]

In [None]:
# show the first instruction/solution pair before and after tokenization
first_input_tokens = tokenizer.tokenize(input_text[0])
print("Original:", input_text[0])
print("Tokenized input:", first_input_tokens)
print("Tokenized input ids:", tokenizer.convert_tokens_to_ids(first_input_tokens))
print(" ")
first_output_tokens = tokenizer.tokenize(output_code[0])
print("Original:", output_code[0])
print("Tokenized output:", first_output_tokens)
print("Tokenized output ids:", tokenizer.convert_tokens_to_ids(first_output_tokens))

In [None]:
# token ids (including the start/end markers) for every instruction
input_ids = [
    tokenized_text_id(add_special_tokens(tokenize_text(instruction)))
    for instruction in input_text
]

# token ids (including the start/end markers) for every reference solution
output_ids = [
    tokenized_text_id(add_special_tokens(tokenize_text(solution)))
    for solution in output_code
]

In [None]:
# longest tokenized instruction
input_legnths = [len(sequence) for sequence in input_ids]
input_maximum_length = max(input_legnths)
print("Maximum input length:", input_maximum_length)


# longest tokenized reference solution
output_legnths = [len(sequence) for sequence in output_ids]
output_maximum_length = max(output_legnths)
print("Maximum output length:", output_maximum_length)

In [None]:
# function to tokenize the data and pad/truncate it to fixed lengths

def tokenize_function(examples, max_input_length=19, max_output_length=166):
    """Tokenize a batch of instruction/output pairs for seq2seq training.

    Args:
        examples: Batch mapping with 'instruction' and 'output' entries, as
            passed in by ``Dataset.map(..., batched=True)``.
        max_input_length: Fixed length the instructions are padded/truncated to.
        max_output_length: Fixed length the target code is padded/truncated to.

    Returns:
        The tokenizer output for the instructions with an added 'labels' entry
        holding the target token ids; padding positions in the labels are -100.
    """
    inputs = tokenizer(examples['instruction'], padding='max_length', truncation=True, max_length=max_input_length, return_tensors='pt')
    labels = tokenizer(examples['output'], padding='max_length', truncation=True, max_length=max_output_length, return_tensors='pt')['input_ids']
    # -100 is the label value the loss ignores; leaving the pad id in place would
    # also train (and evaluate) the model on the padding that fills every target.
    labels[labels == tokenizer.pad_token_id] = -100
    inputs['labels'] = labels
    return inputs

In [None]:
# Split the dataset into training, validation, and test sets:
# 80% goes to training; the remaining 20% is halved into validation and test.
# random_state=42 fixes the shuffling so the split is reproducible.
# NOTE: test_df is not referenced again in this notebook.

train_df, temp_df = train_test_split(dataset_df, train_size=0.8, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# **Implement Baseline Model and LoRA**

In [None]:
from peft import PeftConfig, get_peft_model, LoraConfig, TaskType
from transformers import AutoModelForSeq2SeqLM

# pretrained CodeT5 checkpoint that the LoRA adapter will be attached to
model_name = "Salesforce/codet5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# freeze every weight of the base model
for base_param in model.parameters():
    base_param.requires_grad = False


In [None]:
# LoRA adapter settings
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=4,  # dimension (rank) of the low-rank matrices
    lora_alpha=32,  # scaling factor for the low-rank matrices
    lora_dropout=0.01,  # dropout probability of the LoRA layers
    target_modules=["k", "q", "v", "o"],  # module names that receive adapters
)

# wrap the base model with the adapters and report the trainable share
peft_model = get_peft_model(model, peft_config)
peft_model.print_trainable_parameters()

In [None]:
# cross-check the trainable-parameter count by reading requires_grad directly

# (element count, needs gradient) for every parameter tensor of the wrapped model
param_sizes = [(p.numel(), p.requires_grad) for _, p in peft_model.named_parameters()]

all_param = sum(size for size, _ in param_sizes)
trainable_params = sum(size for size, needs_grad in param_sizes if needs_grad)

print(f"trainable params: {trainable_params}")
print(f"all params: {all_param}")
print(f"trainable: {100 * trainable_params / all_param:.2f}%")

In [None]:
# wrap the train/validation frames as Dataset objects
training_data = Dataset.from_pandas(train_df)
validation_data = Dataset.from_pandas(val_df)

# tokenize both sets in batches
tokenized_training_data = training_data.map(tokenize_function, batched=True)
tokenized_validation_data = validation_data.map(tokenize_function, batched=True)

# collator that assembles tokenized examples into batches for the PEFT model
data_collator = DataCollatorForSeq2Seq(tokenizer, model=peft_model)

In [None]:
# display the (untokenized) training Dataset object
training_data

In [None]:
# display the first tokenized training example
tokenized_training_data[:1]

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

output_dir = "./LORA_model"

# training configuration
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    # optimisation
    learning_rate=0.001,
    num_train_epochs=100,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # evaluate on the validation set every 100 steps
    evaluation_strategy="steps",
    eval_steps=100,
    # log to TensorBoard every 100 steps
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=100,
    report_to="tensorboard",
    # no checkpoints are written during training
    save_strategy="no",
    load_best_model_at_end=False,
)

# trainer that fine-tunes the PEFT-wrapped model on the tokenized splits
trainer = Seq2SeqTrainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_training_data,
    eval_dataset=tokenized_validation_data,
    data_collator=data_collator,
)

model.config.use_cache = False  # silence the warnings.

In [None]:
# start training: runs the fine-tuning loop of the trainer configured above

trainer.train()

In [None]:
# save the trained model and the tokenizer to the same folder

import os
output_dir="/content/LORA_model"

# exist_ok=True creates the folder only when it is missing, replacing the
# separate "check, then create" steps with a single race-free call
os.makedirs(output_dir, exist_ok=True)

trainer.model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# **Model Evaluation**

In [None]:
from peft import PeftModel, PeftConfig

# location of the saved adapter and its PEFT configuration
peft_model_id = "/content/LORA_model"
config = PeftConfig.from_pretrained(peft_model_id)

# the adapter config names the base checkpoint; load its tokenizer and weights
base_checkpoint = config.base_model_name_or_path
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)
trained_model = AutoModelForSeq2SeqLM.from_pretrained(base_checkpoint)

# attach the saved adapter to the base model and switch to evaluation mode
trained_model = PeftModel.from_pretrained(trained_model, peft_model_id)
trained_model.eval()

In [None]:
# Test prompts (these instructions are taken from the original flytech/python-codes-25k training dataset)
prompts = [
    'Help me set up my daily to-do list!',
    'Create a shopping list based on my inputs!',
    'Calculate how much time I spend on my phone per week!',
    "Help me split the bill among my friends!",
]
generated_code_list_1 = []

# generate code for each prompt with the fine-tuned model
for prompt in prompts:
    encoded_prompt = tokenizer(prompt, return_tensors="pt", padding=True, max_length=19, truncation=True)
    input_ids = encoded_prompt.input_ids.to(trained_model.device)
    generated_ids = trained_model.generate(input_ids=input_ids, max_length=166)
    generated_code_list_1.append(tokenizer.decode(generated_ids[0], skip_special_tokens=True))

In [None]:
# show the code generated by the fine-tuned model, one entry per prompt
print(generated_code_list_1)

# 1. BLEU Score

In [None]:
from nltk.translate.bleu_score import sentence_bleu,SmoothingFunction

# reference code or true code for the prompts
reference_code =["tasks = []\nwhile True:\n    task = input('Enter a task or type 'done' to finish: ')\n    if task == 'done': break\n    tasks.append(task)\nprint(f'Your to-do list for today: {tasks}')\n",
                 "shopping_list = {}\nwhile True:\n    item = input('Enter an item or type 'done' to finish: ')\n    if item == 'done': break\n    quantity = input(f'Enter the quantity for {item}: ')\n    shopping_list[item] = quantity\nprint(f'Your shopping list: {shopping_list}')\n",
                 "total_time = 0\nfor i in range(1, 8):\n    time = float(input(f'Enter phone usage in hours for day {i}: '))\n    total_time += time\nprint(f'You spend approximately {total_time} hours per week on your phone.')\n",
                 "total_bill = float(input('Enter the total bill amount: '))\nfriends_count = int(input('Enter the number of friends: '))\nper_person = total_bill / friends_count\nprint(f'Each person should pay {per_person}')\n"]
bleu_scores_1 = []

# Smooth the n-gram precisions. Unsmoothed sentence-level BLEU drops to
# effectively zero (with a warning) whenever one n-gram order has no overlap,
# which is common for short generated snippets and hides real differences.
# NOTE: any BLEU score this one is compared against must use the same smoothing.
smoothing = SmoothingFunction().method1

# calculate BLEU scores for each prompt (whitespace-tokenized)

for generated, reference in zip(generated_code_list_1, reference_code):
    generated_tokens = generated.split()
    reference_tokens = reference.split()
    bleu_score = sentence_bleu([reference_tokens], generated_tokens, smoothing_function=smoothing)
    bleu_scores_1.append(bleu_score)

# print BLEU scores for each prompt
for i, (prompt, bleu_score,code) in enumerate(zip(prompts, bleu_scores_1,generated_code_list_1)):
    print(f"Prompt {i+1}: {prompt}")
    print(f"Generated Code: {code}")
    print(f"BLEU Score: {bleu_score:.4f}")
    print("-" * 50)

# calculate the average BLEU score
average_bleu_score = sum(bleu_scores_1) / len(bleu_scores_1)
print("Average BLEU Score of Lora model:", average_bleu_score)

# 2. ROUGE Score

In [None]:
from rouge import Rouge

rouge = Rouge()

print("Rouge Scores of Lora Model:")
print(" ")

# one ROUGE result per (generated, reference) pair
rouge_scores_1 = [
    rouge.get_scores([generated], [reference])
    for generated, reference in zip(generated_code_list_1, reference_code)
]

# report the scores prompt by prompt
for number, (prompt, rouge_score) in enumerate(zip(prompts, rouge_scores_1), start=1):
    print(f"Prompt {number}: {prompt}")
    print("Rouge Score:", rouge_score)
    print("-" * 50)

# **Comparison with baseline model**

In [None]:
# load the baseline model: the unmodified pretrained checkpoint that was used
# as the starting point for training, without any adapter attached

model_name = "Salesforce/codet5-base"
baseline_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [None]:
generated_code_list_2 = []

# Generate code for each prompt using baseline model.
# The tokenization and generation limits mirror the evaluation loop of the
# fine-tuned model (prompt truncated to 19 tokens, output capped at
# max_length=166) so both models are scored under the same budget; a longer
# generation budget for only one model would skew the BLEU/ROUGE comparison.

for prompt in prompts:
    input_ids = tokenizer(prompt, return_tensors="pt", padding=True, max_length=19, truncation=True).input_ids.to(baseline_model.device)
    generated_ids = baseline_model.generate(input_ids=input_ids, max_length=166)
    generated_code = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    generated_code_list_2.append(generated_code)

# 1. Baseline model BLEU score

In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

bleu_scores_2 = []

# Smooth the n-gram precisions. Unsmoothed sentence-level BLEU drops to
# effectively zero (with a warning) whenever one n-gram order has no overlap,
# which is common for short generated snippets and hides real differences.
# NOTE: any BLEU score this one is compared against must use the same smoothing.
smoothing = SmoothingFunction().method1

# calculate BLEU scores for each prompt (whitespace-tokenized)
for generated, reference in zip(generated_code_list_2, reference_code):
    generated_tokens = generated.split()
    reference_tokens = reference.split()
    bleu_score = sentence_bleu([reference_tokens], generated_tokens, smoothing_function=smoothing)
    bleu_scores_2.append(bleu_score)

# print BLEU scores for each prompt
for i, (prompt, bleu_score,code) in enumerate(zip(prompts, bleu_scores_2,generated_code_list_2)):
    print(f"Prompt {i+1}: {prompt}")
    print(f"Generated Code: {code}")
    print(f"BLEU Score: {bleu_score:.4f}")
    print("-" * 50)

# calculate the average BLEU score
average_bleu_score = sum(bleu_scores_2) / len(bleu_scores_2)
print("Average BLEU Score of Baseline model:", average_bleu_score)

# 2. Baseline model Rouge score

In [None]:
from rouge import Rouge

rouge = Rouge()

print("Rouge Scores of Baseline Model:")
print(" ")

# one ROUGE result per (generated, reference) pair
rouge_scores_2 = [
    rouge.get_scores([generated], [reference])
    for generated, reference in zip(generated_code_list_2, reference_code)
]

# report the scores prompt by prompt
for number, (prompt, rouge_score) in enumerate(zip(prompts, rouge_scores_2), start=1):
    print(f"Prompt {number}: {prompt}")
    print("Rouge Score:", rouge_score)
    print("-" * 50)