# Install Needed Libraries

In [None]:
!pip install transformers
!pip install wandb
!pip install python-dotenv
!pip install datasets
!pip install bitsandbytes

# Add Needed Import Statements

In [None]:
import json
import os
import random
from os.path import join

import wandb
from datasets import load_dataset
from dotenv import load_dotenv
from huggingface_hub import HfApi
from huggingface_hub import login
from huggingface_hub import snapshot_download
from transformers import AutoModelForCausalLM, AutoTokenizer


# Load variables from a .env file into the process environment; the
# authentication cell reads the "WANDB" and "HF_KEY" entries via os.getenv.
load_dotenv()
# Hugging Face id of the base model. The fine-tuning YAML hard-codes the same
# value in `model_name_or_path` rather than reading this variable.
base_model_id = "Qwen/Qwen2.5-14B-Instruct"
# NOTE(review): `device` and `torch_dtype` are not referenced by any visible
# cell — confirm they are still needed.
device = "cuda"
torch_dtype = None

# Authenticate Accounts For HF & WANDB

In [None]:
# Authenticate against Weights & Biases and the Hugging Face Hub using secrets
# read from the environment (populated by load_dotenv() in the config cell).
wandb_key = os.getenv("WANDB")
hf_token = os.getenv("HF_KEY")

# Fail fast with a clear message: shell interpolation of a missing token would
# otherwise send the literal string "None" to the Hub and the notebook would
# carry on unauthenticated until the push step.
if not hf_token:
    raise RuntimeError("HF_KEY is not set; add it to the environment or the .env file.")

wandb.login(key=wandb_key)

# Use the Python API instead of `!huggingface-cli login --token {hf_token}` so
# the secret is never placed on a shell command line.
login(token=hf_token)

# Set Up LLaMA-Factory

In [None]:
!git clone --depth 1 https://github.com/hiyouga/LLaMA-Factory.git
!cd LLaMA-Factory && pip install -e .

# Prepare Finetuning Dataset

In [9]:
# System prompt attached to every fine-tuning record.
system_message = (
    "You are a senior software engineer.\n"
    "Follow the provided `Task` by the user and the `Output Scheme` to generate the `Output JSON`.\n"
    "Do not generate any introduction or conclusion."
)


def _render_instruction(record):
    """Build the user prompt: JSON-encoded code, the task text and the output scheme."""
    sections = [
        "## Code:",
        json.dumps(record["prompt"]),
        "",
        "# Task:",
        record["task"],
        "# Output Scheme:",
        record["output_schema"],
        "",
        "# Output  :",
        "```json",
    ]
    return "\n".join(sections)


def _render_answer(record):
    """Build the target completion: the coupling smells serialized as JSON inside a fenced block."""
    smells_json = json.dumps(record["couplingSmells"], ensure_ascii=False, default=str)
    return "\n".join(["```json", smells_json, "```"])


llm_finetuning_data = []

dataset = load_dataset("CodeAid/CouplingDetectionData", data_files="CouplingDetection.jsonl")

# One record per dataset row, with the keys "system", "instruction", "input",
# "output" and "history".
# NOTE(review): the instruction ends with an opening "```json" fence and the
# answer starts with another one — confirm the doubled fence is intentional.
for rec in dataset['train']:
    llm_finetuning_data.append({
        "system": system_message,
        "instruction": _render_instruction(rec),
        "input": "",
        "output": _render_answer(rec),
        "history": [],
    })

# Shuffle and Split

In [10]:
# Deterministic shuffle (fixed seed 101) followed by an 80/20 train/test split.
# NOTE: the shuffle mutates `llm_finetuning_data` in place, so re-running only
# this cell reshuffles an already-shuffled list; re-run the dataset cell first
# to reproduce the same split.
random.Random(101).shuffle(llm_finetuning_data)

split_index = int(len(llm_finetuning_data) * 0.8)
train_data = llm_finetuning_data[:split_index]
test_data = llm_finetuning_data[split_index:]

finetune_data_dir = join("", "", "couplingD-finetune-data")
os.makedirs(finetune_data_dir, exist_ok=True)

# Both files are written with an explicit UTF-8 encoding. The records are
# dumped with ensure_ascii=False, so they may contain non-ASCII characters, and
# the platform default encoding (previously used for train.json only) is not
# guaranteed to be able to encode them.
for file_name, records in (("train.json", train_data), ("test.json", test_data)):
    with open(join(finetune_data_dir, file_name), "w", encoding="utf8") as dest:
        json.dump(records, dest, ensure_ascii=False, default=str)

## Clone Checkpoint from HF

In [None]:
# Optional step: when resuming training, fetch the saved checkpoint files from
# the Hub into the local training output directory first.
# NOTE(review): "checkpoint-" in the pattern looks like a placeholder with the
# step number removed — presumably it must be completed before use; confirm.
checkpoint_repo_id = "CodeAid/coupling_smells_detection_model"
checkpoint_local_dir = "/teamspace/studios/this_studio/llm-finetuning/coupling_model"
snapshot_download(
    repo_id=checkpoint_repo_id,
    local_dir=checkpoint_local_dir,
    allow_patterns="checkpoint-/*",
)

# Finetune

In [None]:
%%writefile LLaMA-Factory/examples/train_qlora/codeAid_finetune.yaml

# QLoRA supervised fine-tuning config for the coupling-smell detection model.

### model
model_name_or_path: Qwen/Qwen2.5-14B-Instruct
# 4-bit bitsandbytes quantization of the base model.
quantization_bit: 4
quantization_method: bnb
double_quantization: false
trust_remote_code: true

### method
stage: sft
do_train: true
finetuning_type: lora
lora_rank: 64
lora_target: all

### dataset
# NOTE(review): these dataset names presumably have to be registered in
# LLaMA-Factory's data/dataset_info.json; no visible notebook cell does that — confirm.
dataset: couplingDetection_finetune_train
eval_dataset: couplingDetection_finetune_test
template: qwen
overwrite_cache: true
preprocessing_num_workers: 16
dataloader_num_workers: 4

### output
output_dir: /teamspace/studios/this_studio/llm-finetuning/coupling_model/
logging_dir: /teamspace/studios/this_studio/llm-finetuning/logs
logging_steps: 10
save_strategy: "steps"
save_steps: 200
save_total_limit: 3
plot_loss: true
save_only_model: false
# Uncomment only when resuming, and complete the path with the checkpoint step number.
# resume_from_checkpoint: "/teamspace/studios/this_studio/llm-finetuning/coupling_model/checkpoint-"

### train
# Effective batch size per device = 1 x 8 accumulation steps.
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000

### eval
per_device_eval_batch_size: 1
eval_strategy: steps
eval_steps: 100

### experiment tracking
report_to: wandb
run_name: codeaid-coupling-llm-llamafactory

### hub
push_to_hub: true
# `hub_model_id` is the Trainer argument that selects the repo used by
# `push_to_hub`; without it the repo name falls back to the output_dir folder name.
hub_model_id: "CodeAid/coupling_smells_detection_model"
# Kept for backward compatibility; presumably only read by `llamafactory-cli export` — confirm.
export_hub_model_id: "CodeAid/coupling_smells_detection_model"
hub_strategy: checkpoint

In [None]:
# Launch training from inside the LLaMA-Factory checkout; the YAML path is
# relative to that directory and points at the config written by the
# %%writefile cell.
!cd LLaMA-Factory/ && llamafactory-cli train examples/train_qlora/codeAid_finetune.yaml

# Upload Checkpoints & Weight Files to HF

In [None]:
# Push the whole local training output folder to the Hub model repository,
# authenticating with the HF_KEY token from the environment.
# NOTE(review): the commit message hard-codes "checkpoint 1400" — update it to
# match the checkpoint actually being uploaded.
api = HfApi(token=os.getenv("HF_KEY"))
upload_settings = {
    "folder_path": "/teamspace/studios/this_studio/llm-finetuning/coupling_model/",
    "repo_id": "CodeAid/coupling_model_v1",
    "repo_type": "model",
    "commit_message": "Upload checkpoint 1400",
}
api.upload_folder(**upload_settings)