# Model Fine-tuning

In [1]:
# Mount Google Drive in the Colab runtime; the dataset files read later in this
# notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Install the third-party packages this notebook depends on (Colab runtime setup).
# Versions are not pinned, so a fresh runtime may resolve different releases.
!pip install fuzzywuzzy
!pip install python-Levenshtein
!pip install tree-sitter
!pip install rouge-score
# Only this command passes --upgrade, so only these packages are forced to their latest release.
!pip install accelerate transformers einops datasets peft bitsandbytes --upgrade

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling, BitsAndBytesConfig
import datetime
import torch
import random
import numpy as np
import json
import re
import pandas as pd
from tqdm import tqdm
from fuzzywuzzy import fuzz
import Levenshtein
import nltk
from rouge_score import rouge_scorer
import tree_sitter
from peft import PeftModel, LoraConfig, get_peft_model
from datasets import load_dataset, Dataset
import os

In [5]:
# torch.set_default_device("cuda")

print("Loading model...")
# Named load_start/load_end rather than `time` so the stdlib module name is not shadowed.
load_start = datetime.datetime.now()

tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5")
# Pad with the end-of-sequence token so that batches can be padded
# (presumably the tokenizer ships without a dedicated pad token -- TODO confirm).
tokenizer.pad_token = tokenizer.eos_token

# Chosen quantization parameters:
# - load_in_4bit: load the model weights quantized to 4 bits.
# - bnb_4bit_use_double_quant: enables a second quantization after the first one
#   to save an additional 0.4 bits per parameter.
# - bnb_4bit_quant_type: "nf4" selects the 4-bit NormalFloat data type for the stored weights.
# - bnb_4bit_compute_dtype: we need to specify a computation type because while nf4 stores
#   weights in a 4-bit type, the computation still happens in 16/32 bits.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-1_5",
    device_map={"":0},  # device index of 0 refers to the first available GPU device
    trust_remote_code=True,
    quantization_config=bnb_config
)

load_end = datetime.datetime.now()
print(f"Model loaded. Time to load the model: {load_end - load_start}")

Loading model...
Model loaded. Time to load the model: 0:00:02.430820


In [6]:
# Display the module tree of the loaded model; the layer names printed here
# (q_proj, k_proj, v_proj, dense, fc2, ...) are the ones LoRA's target_modules refers to.
model

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2048)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x PhiDecoderLayer(
        (self_attn): PhiSdpaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=True)
          (k_proj): Linear4bit(in_features=2048, out_features=2048, bias=True)
          (v_proj): Linear4bit(in_features=2048, out_features=2048, bias=True)
          (dense): Linear4bit(in_features=2048, out_features=2048, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear4bit(in_features=2048, out_features=8192, bias=True)
          (fc2): Linear4bit(in_features=8192, out_features=2048, bias=True)
        )
        (input_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.0, inplace=False)
      )
    )
    (final_laye

In [7]:
# Chosen LoRA parameters:
# - r: rank of the low-rank update matrices that LoRA trains for each target module
#   (a higher rank means more trainable parameters).
# - lora_alpha: scaling factor applied to the LoRA update.
# - target_modules: names of the layers that receive LoRA adapters:
#   - 'dense': a fully connected layer located in PhiSdpaAttention.
#   - 'fc2': the second fully connected layer of the Phi multi-layer perceptron (PhiMLP).
#   - 'q_proj', 'k_proj', 'v_proj': projection layers used in the attention mechanism of
#     transformer models. They project input embeddings into query, key, and value vectors
#     for attention computation (located in PhiSdpaAttention).
# - lora_dropout: dropout probability applied on the LoRA layers.
# - bias: "none" means no bias parameters are trained.
# - task_type: "CAUSAL_LM" because the model is fine-tuned for next-token prediction.
lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["dense", "fc2", "q_proj", "k_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# `model` is rebound to the PEFT-wrapped model. Re-running this cell would pass the
# already-wrapped model back into get_peft_model, so reload the base model first.
model = get_peft_model(model, lora_config)

model.print_trainable_parameters()

trainable params: 10,223,616 || all params: 1,428,494,336 || trainable%: 0.7156917421617274


In [8]:
def replace_tags(code):
    """
    Substitute the placeholder tags found in CodeXGLUE-style code strings.

    Adapted from the CodeXGLUE line-completion evaluator:
    https://github.com/microsoft/CodeXGLUE/blob/main/Code-Code/CodeCompletion-line/evaluator/evaluator.py

    The substitution runs in three passes, in this order:
      1. bare literal placeholders: <NUM_LIT> -> "0", <STR_LIT> and <CHAR_LIT> -> "";
      2. valued literal tags <STR_LIT:x>, <NUM_LIT:x>, <CHAR_LIT:x> -> x;
      3. any remaining <TAG> whose first character is an uppercase letter -> a single space.

    Parameters:
        code (str): The input code containing special tags.

    Returns:
        str: The code with every tag substituted as described above.
    """
    # Pass 1: bare placeholders that carry no value.
    for placeholder, literal in (("<NUM_LIT>", "0"), ("<STR_LIT>", ""), ("<CHAR_LIT>", "")):
        code = code.replace(placeholder, literal)

    # Pass 2: collect every valued literal tag first, then swap each full tag
    # for the value it carries (every occurrence of that tag is replaced).
    literal_tag = re.compile(r"<(STR|NUM|CHAR)_LIT:(.*?)>", re.S)
    valued_literals = [(match.group(0), match.group(2)) for match in literal_tag.finditer(code)]
    for whole_tag, value in valued_literals:
        code = code.replace(whole_tag, value)

    # Pass 3: whatever uppercase-initial tags are still present become one space each.
    leftover_tags = [match.group(0) for match in re.finditer(r'<([A-Z][^<>]*)>', code)]
    for whole_tag in leftover_tags:
        code = code.replace(whole_tag, ' ')

    return code

def read_jsonl_file(file_path):
    """
    Reads a JSONL file and replaces special tags in the 'signature' and 'body' fields of each JSON object.

    The file is decoded as UTF-8, and lines containing only whitespace are skipped.

    Parameters:
        file_path (str): The path to the JSONL file.

    Returns:
        list: A list of dictionaries, one per non-blank line, with 'signature' and 'body'
        passed through replace_tags. All other fields are left untouched.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no 'signature' or 'body' field.
    """
    data = []
    # Decode explicitly as UTF-8 instead of relying on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at the end of the file),
            # which json.loads would otherwise reject.
            if not line.strip():
                continue
            json_obj = json.loads(line)
            json_obj['signature'] = replace_tags(json_obj['signature'])
            json_obj['body'] = replace_tags(json_obj['body'])
            data.append(json_obj)
    return data

# CodeXGLUE test split: a list of dicts whose 'signature' and 'body' fields have
# already been passed through replace_tags by read_jsonl_file.
file_path = '/content/drive/MyDrive/CodeCompletion/CodeXGlue/test.jsonl'
codexglue_test = read_jsonl_file(file_path)
print(f'{codexglue_test[0]}\n')

# Columns cast to str in both CSV-backed frames below.
# NOTE(review): the reason for the str cast is not visible in this notebook -- confirm it is still needed.
columns_to_convert = ['is_single_expression', 'is_test', '0-20', '100+', '20-50', '50-100']

# Function-level dataset; this frame is the one turned into training text further down.
file_path = '/content/drive/MyDrive/CodeCompletion/functions_df_inputs_outputs.csv'
functions_df = pd.read_csv(file_path)
functions_df[columns_to_convert] = functions_df[columns_to_convert].astype(str)
print(f'{functions_df.iloc[0]}\n')

# Second CSV-backed frame, prepared the same way. It is loaded here but not referenced
# again in the visible part of this notebook.
file_path = '/content/drive/MyDrive/CodeCompletion/context_functions_df.csv'
context_functions_df = pd.read_csv(file_path)
context_functions_df[columns_to_convert] = context_functions_df[columns_to_convert].astype(str)
print(f'{context_functions_df.iloc[0]}\n')

{'signature': 'def debug(user, message):', 'body': 'message_user(user, message, constants.DEBUG) ', 'docstring': 'Adds a message with the ``DEBUG`` level.\n\n:param user: User instance\n:param message: Message to show', 'id': 'f4:m0'}

Unnamed: 0                                                              0
function_id                                                         27692
signature               private fun bitIndex(elementIndex: Int, bitOff...
body                    =\n        elementIndex * ELEMENT_SIZE + bitOf...
is_single_expression                                                 True
is_test                                                             False
0-20                                                                False
100+                                                                False
20-50                                                               False
50-100                                                               True
Name: 0, dtype: object



In [9]:
def tokenize(sample):
    """
    Tokenize the "text" field of a sample (or of a batch of samples).

    Relies on the notebook-level ``tokenizer``. Sequences longer than 256 tokens are
    truncated, and padding is enabled.

    Parameters:
        sample: A mapping with a "text" entry holding the string(s) to tokenize.

    Returns:
        The tokenizer's output for the given text.
    """
    return tokenizer(sample["text"], max_length=256, truncation=True, padding=True)

# Build the training text for every row as "Prompt: <signature> Completion: <body>".
# This adds a "text" column to functions_df in place; tokenize() reads that column.
functions_df["text"] = functions_df[["signature", "body"]].apply(lambda x: "Prompt: " + x["signature"] + " Completion: " + x["body"], axis=1)
print(functions_df.iloc[0])

Unnamed: 0                                                              0
function_id                                                         27692
signature               private fun bitIndex(elementIndex: Int, bitOff...
body                    =\n        elementIndex * ELEMENT_SIZE + bitOf...
is_single_expression                                                 True
is_test                                                             False
0-20                                                                False
100+                                                                False
20-50                                                               False
50-100                                                               True
text                    Prompt: private fun bitIndex(elementIndex: Int...
Name: 0, dtype: object


In [None]:
# Wrap the DataFrame in a Hugging Face Dataset and tokenize it in batches.
# remove_columns drops every original column, so only the fields returned by
# tokenize() remain in tokenized_data.
data = Dataset.from_pandas(functions_df)
tokenized_data = data.map(tokenize, batched=True, desc="Tokenizing data", remove_columns=data.column_names)

# Inspect the first tokenized example.
tokenized_data[0]

In [11]:
training_arguments = TrainingArguments(
    # Directory for Trainer checkpoints; the same name is used when the adapter is saved.
    output_dir="phi-1_5-finetuned-kotlin",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    # NOTE(review): with a step-limited run, an epoch-based save only happens if an epoch
    # boundary is reached within max_steps -- confirm this is the intended checkpointing.
    save_strategy="epoch",
    logging_steps=100,
    # The run length is defined by max_steps alone. A positive max_steps overrides
    # num_train_epochs, so the former num_train_epochs=1 had no effect and was removed.
    max_steps=1000,
)

In [None]:
# Fine-tune the PEFT-wrapped model on the tokenized dataset.
# mlm=False selects the causal language-modelling objective (no masked-token prediction).
trainer = Trainer(
    model=model,
    train_dataset=tokenized_data,
    args=training_arguments,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
trainer.train()

In [24]:
# Save the fine-tuned model under the same directory name used as the Trainer output_dir.
# `model` is the PEFT-wrapped model here, so this presumably writes the LoRA adapter
# rather than the full base weights -- TODO confirm.
model.save_pretrained("phi-1_5-finetuned-kotlin")

In [None]:
# Reload the base model in float32 (no quantization config is passed here); `model` is rebound.
model = AutoModelForCausalLM.from_pretrained("microsoft/phi-1_5", trust_remote_code=True, torch_dtype=torch.float32)

# Attach the adapter saved in "phi-1_5-finetuned-kotlin" to the freshly loaded base model.
# NOTE(review): from_transformers does not appear to be a documented argument of
# PeftModel.from_pretrained -- confirm it is needed.
peft_model = PeftModel.from_pretrained(model, "phi-1_5-finetuned-kotlin", from_transformers=True)

# Fold the adapter weights into the base model; `model` is rebound to the merged model.
model = peft_model.merge_and_unload()

# The fine-tuned model can now be loaded and evaluated in evaluate_pretrained.ipynb.
# NOTE(review): the merged model is not written to disk in this cell; only the earlier
# save_pretrained call persists anything -- confirm what the evaluation notebook loads.