# Preparing a Kotlin code-completion dataset and fine-tuning the Phi-1.5 model using PEFT (Parameter-Efficient Fine-Tuning)

### Import necessary libraries

In [None]:
import os
import re
import random

import torch
import pandas as pd
import numpy as np
from peft import PeftModel, LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling, BitsAndBytesConfig
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge

### Preparing Kotlin files

In [None]:
# Matches, in priority order, Kotlin string/character literals (captured, so
# they can be kept verbatim) and comments (not captured, so they are dropped).
# Scanning literals and comments in one pass prevents "//" or "/*" that occur
# inside a string from being mistaken for the start of a comment, and prevents
# "//" inside a block comment from swallowing its closing "*/".
_KOTLIN_LITERAL_OR_COMMENT = re.compile(
    r'(?P<literal>""".*?"""'      # raw (triple-quoted) string
    r'|"(?:\\.|[^"\\\n])*"'       # ordinary string literal
    r"|'(?:\\.|[^'\\\n])*')"      # character literal
    r'|//[^\n]*'                  # line comment, up to but excluding the newline
    r'|/\*.*?\*/',                # block comment
    re.DOTALL,
)


def remove_comments_and_format_new_lines(source_code):
    """Strip comments from Kotlin source and collapse runs of blank lines.

    Line comments (``// ...``) are removed up to, but not including, the
    newline that ends them; a line comment on the last line of the input is
    removed even when no trailing newline follows. Block comments
    (``/* ... */``) are removed entirely. Comment markers that appear inside
    string or character literals are left untouched.

    After comment removal, three or more consecutive newlines are reduced to
    exactly two.

    Limitations: nested block comments and quotes nested inside ``${...}``
    string templates are not parsed; this is a lexical approximation, not a
    Kotlin parser.

    Args:
        source_code: Kotlin source text.

    Returns:
        The source text without comments and with blank-line runs collapsed.
    """
    without_comments = _KOTLIN_LITERAL_OR_COMMENT.sub(
        # Literals are echoed back unchanged; comments have no 'literal' group.
        lambda match: match.group('literal') or '',
        source_code,
    )
    return re.sub(r'\n{3,}', '\n\n', without_comments)

def load_and_clean_files(directory_path):
    """Read every ``.kt`` file directly inside a directory and clean its text.

    Entries are visited in the order ``os.listdir`` reports them; names that
    do not end in ``.kt`` are ignored. Each file is decoded as UTF-8 and passed
    through ``remove_comments_and_format_new_lines``.

    Args:
        directory_path: Directory to scan (not recursive).

    Returns:
        A list with one cleaned source string per Kotlin file.
    """
    kotlin_names = (name for name in os.listdir(directory_path) if name.endswith('.kt'))
    cleaned_sources = []
    for name in kotlin_names:
        with open(os.path.join(directory_path, name), 'r', encoding='utf-8') as handle:
            raw_text = handle.read()
            cleaned_sources.append(remove_comments_and_format_new_lines(raw_text))
    return cleaned_sources

def split_dataset(data, train_ratio=0.6, val_ratio=0.2, seed=None):
    """Shuffle the items and split them into train, validation and test lists.

    The input is copied before shuffling, so the caller's sequence is left
    untouched. The test split receives whatever remains after the training
    and validation portions have been taken.

    Args:
        data: Sequence (or other iterable) of items to split.
        train_ratio: Fraction of items assigned to the training split.
        val_ratio: Fraction of items assigned to the validation split.
        seed: Optional seed for a private random generator, making the split
            reproducible. When ``None`` the module-level ``random`` state is
            used, exactly as before.

    Returns:
        A ``(train_data, val_data, test_data)`` tuple of lists.
    """
    shuffled = list(data)
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(shuffled)

    total = len(shuffled)
    train_end = int(total * train_ratio)
    val_end = int(total * (train_ratio + val_ratio))
    return shuffled[:train_end], shuffled[train_end:val_end], shuffled[val_end:]

# Read the Kotlin sources and strip their comments.
directory_path = 'kotlin_files'
data = load_and_clean_files(directory_path)

# Partition the cleaned sources into the three splits.
train_data, val_data, test_data = split_dataset(data)

# Persist each split to its own text file, one blank line after every source.
for split_path, split_items in (
    ('train_data.txt', train_data),
    ('val_data.txt', val_data),
    ('test_data.txt', test_data),
):
    with open(split_path, 'w', encoding='utf-8') as f:
        f.writelines(f"{item}\n\n" for item in split_items)

print("Data preparation complete. Data split into training, validation, and test sets.")

### Load a pretrained tokenizer

In [None]:
# Load the tokenizer published with the microsoft/phi-1_5 checkpoint.
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5")
# Reuse the end-of-sequence token as the padding token so that batches can be
# padded (presumably this tokenizer ships without a pad token — confirm).
tokenizer.pad_token = tokenizer.eos_token

### Configure model quantization and Parameter Efficient Fine-Tuning (PEFT)

In [None]:
# bitsandbytes quantization settings: load weights in 4-bit NF4 with double
# quantization enabled, and use float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

# Load the base causal-LM checkpoint with the quantization settings above.
# NOTE(review): device_map={"":0} appears to place the whole model on device 0,
# and trust_remote_code=True permits running code shipped in the model
# repository — confirm both are intended for the target environment.
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-1_5",
    device_map={"":0},
    trust_remote_code=True,
    quantization_config=bnb_config
)

In [None]:
# Print the loaded model, e.g. to look up the layer names that the LoRA
# configuration refers to in target_modules.
print(model)

In [None]:
# LoRA adapter settings for causal language modelling: rank-16 adapters with
# alpha 16 and 5% dropout on the q_proj/k_proj/v_proj modules; bias terms are
# not adapted.
lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

### Apply PEFT to the model

In [None]:
# Wrap the quantized base model with the LoRA adapters configured above;
# `model` now refers to the PEFT-wrapped model.
model = get_peft_model(model, lora_config)

# Report how many parameters are trainable after wrapping.
model.print_trainable_parameters()

### Load the dataset and prepare for training

In [None]:
# NOTE(review): the fine-tuning corpus is built from val_data.txt; the
# train_data.txt file written during data preparation is not read here —
# confirm this is intentional.
# The file was written as UTF-8, so read it as UTF-8 rather than relying on
# the platform's default encoding.
with open("val_data.txt", "r", encoding="utf-8") as f:
    val_data = f.read()

# Cut the corpus into fixed-size character windows. The first half of each
# window becomes the prompt and the remainder the expected completion.
chunk_size = 2000
prompt_size = chunk_size // 2
text_chunks = [val_data[i:i + chunk_size] for i in range(0, len(val_data), chunk_size)]

val_df = pd.DataFrame(text_chunks, columns=['text'])
val_df['Prompt'] = val_df['text'].str[:prompt_size]
# The last window may be shorter than prompt_size, in which case its
# completion is an empty string.
val_df['Completion'] = val_df['text'].str[prompt_size:]

### Create a combined text field for tokenization


In [None]:
# Build the training text column-wise instead of row by row with apply(axis=1):
# same result for string columns, faster, and well-defined on an empty frame.
val_df["text"] = "Prompt: " + val_df["Prompt"] + " Completion: " + val_df["Completion"]

### Tokenize the data

In [None]:
def tokenize(sample):
    """Tokenize the ``"text"`` field of a sample or batch of samples.

    Uses the module-level ``tokenizer`` with padding enabled and truncation
    to at most 512 tokens.

    Args:
        sample: Mapping with a ``"text"`` entry.

    Returns:
        Whatever the tokenizer call returns for that text.
    """
    return tokenizer(sample["text"], padding=True, truncation=True, max_length=512)

In [None]:
# Convert the DataFrame into a datasets.Dataset.
# NOTE: this rebinds `data`, which previously held the list of cleaned Kotlin
# sources.
data = Dataset.from_pandas(val_df)

# Tokenize in batches and drop the original columns so that only the
# tokenizer's output columns remain.
tokenized_data = data.map(tokenize, batched=True, desc="Tokenizing data", remove_columns=data.column_names)

# Split the tokenized data into training and test sets
# NOTE(review): no seed is passed, so the 80/20 split presumably differs from
# run to run — confirm whether reproducibility is needed.
dataset = tokenized_data.train_test_split(test_size=0.2)

train_dataset = dataset['train']
eval_dataset = dataset['test']

### Training configuration

In [None]:
# Trainer hyper-parameters.
# Training length is bounded by max_steps alone. The previous
# num_train_epochs=1 setting was removed because a positive max_steps
# overrides num_train_epochs, so it had no effect and was misleading.
# NOTE(review): output_dir still says "med-text" although this notebook
# fine-tunes on Kotlin code — rename if checkpoints need a clearer location.
training_arguments = TrainingArguments(
    output_dir="phi-1_5-finetuned-med-text",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=1,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    save_strategy="epoch",
    logging_steps=100,
    max_steps=1000,
)

### Function to compute metrics for evaluation

In [None]:
def compute_metrics(eval_pred):
    """Compute token accuracy, BLEU and ROUGE-L for a causal-LM evaluation pass.

    The previous version returned the metric *functions* themselves instead of
    numeric scores and never looked at its inputs.

    Memory note: by default the Trainer hands this function the logits of the
    whole evaluation set, which is what triggered "CUDA out of memory" before.
    To avoid that, pass
    ``preprocess_logits_for_metrics=lambda logits, labels: logits.argmax(dim=-1)``
    to ``Trainer`` (so only token ids are kept) and/or set
    ``eval_accumulation_steps`` in ``TrainingArguments``. Both raw logits and
    pre-computed token ids are accepted here.

    Args:
        eval_pred: ``(predictions, labels)`` pair as supplied by ``Trainer``.
            ``predictions`` is either logits ``(batch, seq, vocab)`` or token
            ids ``(batch, seq)``; ``labels`` is ``(batch, seq)`` with ``-100``
            at positions that must not be scored — assumes the standard
            Trainer/collator conventions, TODO confirm.

    Returns:
        Dict with float entries ``"token_accuracy"``, ``"BLEU"`` and
        ``"ROUGE-L"`` (F-measure), each averaged over the evaluation samples.
    """
    # Imported locally because the file only imports sentence_bleu at the top.
    from nltk.translate.bleu_score import SmoothingFunction

    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        # Some models return extra outputs next to the logits; keep the first.
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    if predictions.ndim == labels.ndim + 1:
        # Raw logits: reduce to the most likely token id per position.
        predictions = predictions.argmax(axis=-1)

    # In a causal LM the output at position i predicts the token at i + 1.
    predictions = predictions[:, :-1]
    labels = labels[:, 1:]
    scored = labels != -100

    if scored.any():
        token_accuracy = float((predictions[scored] == labels[scored]).mean())
    else:
        token_accuracy = 0.0

    rouge = Rouge()
    smoothing = SmoothingFunction().method1
    bleu_scores = []
    rouge_l_scores = []
    for predicted_row, label_row, keep in zip(predictions, labels, scored):
        reference = tokenizer.decode(label_row[keep], skip_special_tokens=True)
        hypothesis = tokenizer.decode(predicted_row[keep], skip_special_tokens=True)
        reference_words = reference.split()
        hypothesis_words = hypothesis.split()

        if not reference_words or not hypothesis_words:
            # Nothing to compare: count the sample as a complete miss.
            bleu_scores.append(0.0)
            rouge_l_scores.append(0.0)
            continue

        bleu_scores.append(
            sentence_bleu([reference_words], hypothesis_words, smoothing_function=smoothing)
        )
        try:
            rouge_l_scores.append(rouge.get_scores(hypothesis, reference)[0]["rouge-l"]["f"])
        except ValueError:
            # rouge rejects inputs that contain no usable sentence
            # (e.g. punctuation only); score those as 0 instead of aborting.
            rouge_l_scores.append(0.0)

    return {
        "token_accuracy": token_accuracy,
        "BLEU": float(np.mean(bleu_scores)) if bleu_scores else 0.0,
        "ROUGE-L": float(np.mean(rouge_l_scores)) if rouge_l_scores else 0.0,
    }

In [None]:
# Ask PyTorch to release cached, currently unused GPU memory before the
# trainer is created.
torch.cuda.empty_cache()

### Initialize and run the trainer

In [1]:
# Collator configured for causal language modelling (mlm=False).
collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=collator,
    # compute_metrics=compute_metrics  (left disabled)
)

# Evaluate once before training to obtain a baseline.
a = trainer.evaluate(eval_dataset)
print(a)

# Fine-tune, then evaluate again on the same held-out set for comparison.
trainer.train()
b = trainer.evaluate(eval_dataset)
print(b)

# The training loss decreases, but the remaining metrics could not be computed: running evaluation with compute_metrics raised "OutOfMemoryError: CUDA out of memory".

NameError: name 'Trainer' is not defined