<a href="https://colab.research.google.com/github/VasanthiSivasankar/qwen_code_reviewer_finetuning_llm/blob/main/qwen_code_reviewer_finetune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install -q transformers datasets bitsandbytes accelerate trl

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m532.5/532.5 kB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
!ls


code_reviewer_train.jsonl  sample_data


In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from transformers import DataCollatorForLanguageModeling


In [4]:
model_name = "Qwen/Qwen2.5-Coder-7B"

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_name,trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,

    device_map="auto",
    load_in_4bit=True,
    trust_remote_code=True,
)
model.config.use_cache=False

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/668 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.09G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.33G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.88G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

In [6]:
from peft import LoraConfig, get_peft_model, TaskType

In [7]:
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj"
    ]
)


In [8]:
model = get_peft_model(model, lora_config)


In [9]:
model.print_trainable_parameters()


trainable params: 10,092,544 || all params: 7,625,709,056 || trainable%: 0.1323


In [10]:
with open("code_reviewer_train.jsonl", "r") as f:
    print(f.readline())


{"instruction":"Review the following code and identify issues and improvements.","input":"def average(nums):\n    return sum(nums) / len(nums)","output":"Code Intent:\nCompute the average of a list of numbers.\n\nIssues Found:\n- Correctness: Does not handle empty lists.\n\nImpact:\nCalling this function with an empty list raises ZeroDivisionError.\n\nSuggested Improvements:\nAdd input validation.\n\nRevised Code:\ndef average(nums):\n    if not nums:\n        raise ValueError(\"Input list must not be empty\")\n    return sum(nums) / len(nums)"}



In [11]:
import json

with open("code_reviewer_train.jsonl") as f:
    for i, line in enumerate(f):
        try:
            json.loads(line)
        except Exception as e:
            print(f"❌ Error at line {i}: {e}")
            break
    else:
        print("✅ All lines are valid JSON")


✅ All lines are valid JSON


In [12]:
from datasets import load_dataset

dataset = load_dataset(
    "json",
    data_files="code_reviewer_train.jsonl",
    split="train"
)

print(dataset)


Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 28
})


In [13]:
def tokenize_fn(example):
    text = (
        f"### Instruction:\n{example['instruction']}\n\n"
        f"### Code:\n{example['input']}\n\n"
        f"### Review:\n{example['output']}"
    )

    tokenized = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors=None,
    )

    tokenized["labels"] = tokenized["input_ids"].copy()

    return tokenized


In [14]:
tokenized_dataset = dataset.map(
    tokenize_fn,
    remove_columns=dataset.column_names,
)


Map:   0%|          | 0/28 [00:00<?, ? examples/s]

In [15]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)


In [16]:
ex = tokenized_dataset[0]

print(type(ex["input_ids"]), type(ex["labels"]))
print(len(ex["input_ids"]), len(ex["labels"]))
print(isinstance(ex["labels"][0], int))


<class 'list'> <class 'list'>
512 512
True


In [17]:
print(tokenized_dataset[0].keys())


dict_keys(['input_ids', 'attention_mask', 'labels'])


In [18]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./qwen-code-reviewer",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=1,
    save_strategy="epoch",
    optim="paged_adamw_8bit",
    report_to="none",
    remove_unused_columns=False  # 🔴 THIS IS THE FIX
)


In [19]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)


In [20]:
trainer.train()


Step,Training Loss
1,1.9331
2,1.8644
3,1.8217
4,1.8179
5,1.4956
6,1.635
7,1.2526
8,1.0939
9,1.1122
10,1.0807


TrainOutput(global_step=12, training_loss=1.414301668604215, metrics={'train_runtime': 91.9029, 'train_samples_per_second': 0.914, 'train_steps_per_second': 0.131, 'total_flos': 1827163487600640.0, 'train_loss': 1.414301668604215, 'epoch': 3.0})

In [21]:
trainer.save_model("qwen_code_reviewer_lora")
tokenizer.save_pretrained("qwen_code_reviewer_lora")


('qwen_code_reviewer_lora/tokenizer_config.json',
 'qwen_code_reviewer_lora/special_tokens_map.json',
 'qwen_code_reviewer_lora/chat_template.jinja',
 'qwen_code_reviewer_lora/vocab.json',
 'qwen_code_reviewer_lora/merges.txt',
 'qwen_code_reviewer_lora/added_tokens.json',
 'qwen_code_reviewer_lora/tokenizer.json')

In [22]:
from peft import PeftModel

base_model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-Coder-7B",
    device_map="auto",
    load_in_4bit=True,
)

model = PeftModel.from_pretrained(
    base_model,
    "qwen_code_reviewer_lora"
)

model.eval()


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen2ForCausalLM(
      (model): Qwen2Model(
        (embed_tokens): Embedding(152064, 3584)
        (layers): ModuleList(
          (0-27): 28 x Qwen2DecoderLayer(
            (self_attn): Qwen2Attention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3584, out_features=3584, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3584, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=3584, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora

In [23]:
prompt = """### Instruction:
Review the following code and identify issues and improvements.

### Code:
def login(user, pwd):
    if user == "admin" and pwd == "1234":
        return True
    return False

### Review:
"""

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

output = model.generate(
    **inputs,
    max_new_tokens=200,
    temperature=0.3,
)

print(tokenizer.decode(output[0], skip_special_tokens=True))


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


### Instruction:
Review the following code and identify issues and improvements.

### Code:
def login(user, pwd):
    if user == "admin" and pwd == "1234":
        return True
    return False

### Review:
**Issues:**
- Hardcoded credentials.
- No error handling.

**Improvements:**
- Use environment variables for credentials.
- Add error handling.

**Revised Code:**
import os

def login(user, pwd):
    if user == os.environ.get("ADMIN_USER") and pwd == os.environ.get("ADMIN_PASSWORD"):
        return True
    return False


In [24]:
### “Fine-tuned Qwen2.5-Coder-7B using QLoRA for automated code review with security-focused reasoning.”