In [11]:
!pip install -q transformers langchain langchain-core langchain-community pandas accelerate

In [12]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain_community.llms import HuggingFacePipeline
from langchain_core.prompts import PromptTemplate
import pandas as pd
import json
import re

In [None]:
# Hugging Face Hub identifier of the checkpoint used for code review.
model_name = "Qwen/Qwen2.5-Coder-1.5B-Instruct"

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name)

print("Loading model on GPU...")
# Device placement and dtype selection are both delegated to the library ("auto").
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype="auto")

# Text-generation pipeline; each completion is capped at 512 newly generated tokens.
gen_pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_new_tokens=512)

# Wrap the pipeline so it can be driven through the LangChain LLM interface.
llm = HuggingFacePipeline(pipeline=gen_pipe)
print("Model ready!")

Loading tokenizer...


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Loading model on GPU...


config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

Device set to use cuda:0


Model ready!


In [13]:
# Prompt text sent to the model for every review.
# - `{code}` is the only placeholder; it is filled in by `prompt.format(code=...)`.
# - The doubled braces `{{` / `}}` are escapes: after formatting they become the
#   literal `{` / `}` of the example JSON object shown to the model.
template = """
You are a strict senior Python code reviewer.
Return ONLY a valid JSON array.

Each entry MUST follow this format:
{{
    "issue": "description",
    "line_start": 0,
    "line_end": 0,
    "suggestion": "fix",
    "severity": "Critical"
}}

Rules:
- NO markdown.
- NO explanations.
- NO placeholders like 'text'.
- JSON array ONLY.

Code:
{code}
"""

In [14]:
# Reusable prompt object: `code` is the single variable substituted into `template`.
prompt = PromptTemplate(template=template, input_variables=["code"])

In [15]:
def safe_json_parse(text):
    """Extract the JSON array of review entries from raw model output.

    The model output is free-form text: it may contain Markdown code fences,
    an echo of the prompt and of the reviewed code, or stray prose around the
    JSON answer. Every ``[`` in the text is therefore treated as a candidate
    start of a JSON array and decoded with the standard JSON decoder, which
    handles string escapes (including ``\\\\"``) correctly.

    Parameters
    ----------
    text : str or None
        Raw text returned by the model.

    Returns
    -------
    list
        The first decodable array that is non-empty and contains only JSON
        objects. If no such array exists, the first decodable array of any
        kind. If nothing can be decoded (or ``text`` is empty), ``[]``.
    """
    if not text:
        return []

    # Drop Markdown fences so they cannot sit between the brackets and the data.
    text = text.strip().replace("```json", "").replace("```", "")

    decoder = json.JSONDecoder()
    fallback = None    # first valid array that is not a list of objects
    last_error = None  # most recent decode failure, reported if nothing parses

    pos = text.find("[")
    while pos != -1:
        try:
            value, end = decoder.raw_decode(text, pos)
        except json.JSONDecodeError as exc:
            # Not valid JSON from this bracket (e.g. `items[0]` in echoed code
            # or a truncated answer): try the next candidate.
            last_error = exc
            pos = text.find("[", pos + 1)
            continue

        if isinstance(value, list):
            if value and all(isinstance(entry, dict) for entry in value):
                return value
            if fallback is None:
                fallback = value

        # Resume after the decoded value so nested arrays are not re-scanned.
        pos = text.find("[", end)

    if fallback is not None:
        return fallback

    if last_error is not None:
        print("JSON PARSE ERROR:", last_error)
    return []

In [16]:
def build_table(issues):
    """Turn parsed review entries into a DataFrame, dropping unusable ones.

    The entries come from a language model, so every field is validated
    before use; a malformed entry is skipped instead of aborting the table.

    Parameters
    ----------
    issues : iterable or None
        Parsed review entries. Entries that are not dicts are ignored.
        ``None`` is treated as "no entries".

    Returns
    -------
    pandas.DataFrame
        Columns ``issue``, ``line``, ``suggestion``, ``severity``. ``line`` is
        ``"start-end"`` when ``line_end`` is an integer, otherwise just the
        start line. An entry is kept only if ``issue`` and ``suggestion`` are
        non-empty strings other than the placeholder ``"text"`` and
        ``line_start`` is an integer.
    """
    columns = ["issue", "line", "suggestion", "severity"]

    def is_line_number(value):
        # bool is a subclass of int; True/False are not line numbers.
        return isinstance(value, int) and not isinstance(value, bool)

    rows = []
    for item in ([] if issues is None else issues):
        if not isinstance(item, dict):
            continue

        issue = item.get("issue")
        suggestion = item.get("suggestion")
        # The model may emit null or numbers here; calling .strip() on those
        # would raise and lose every other row.
        if not isinstance(issue, str) or not isinstance(suggestion, str):
            continue
        issue = issue.strip()
        suggestion = suggestion.strip()

        # skip entries with invalid content
        if not issue or issue.lower() == "text":
            continue
        if not suggestion or suggestion.lower() == "text":
            continue

        ls = item.get("line_start")
        le = item.get("line_end")
        if not is_line_number(ls):
            continue

        line = f"{ls}-{le}" if is_line_number(le) else str(ls)

        rows.append({
            "issue": issue,
            "line": line,
            "suggestion": suggestion,
            "severity": item.get("severity", "")
        })

    return pd.DataFrame(rows, columns=columns)

In [17]:
def review_code(code):
    """Run the LLM review on ``code`` and return the parsed issue entries.

    Parameters
    ----------
    code : str
        Source code to be reviewed; substituted into the prompt template.

    Returns
    -------
    list
        Whatever ``safe_json_parse`` extracts from the model's answer
        (an empty list when no JSON array can be recovered).

    Side effects
    ------------
    Prints the unmodified model output for debugging.
    """
    final_prompt = prompt.format(code=code)
    raw = llm.invoke(final_prompt)

    print("---- RAW MODEL OUTPUT ----")
    print(raw)

    # The returned text can start with an echo of the prompt, which contains
    # the reviewed code. Parse only what follows it, so that brackets in the
    # code under review are never mistaken for the model's JSON answer.
    if raw.startswith(final_prompt):
        completion = raw[len(final_prompt):]
    else:
        completion = raw

    return safe_json_parse(completion)

In [18]:
# Sample input for a smoke test of the reviewer. The snippet is deliberately
# broken: it calls divide(10, 0), prints the misspelled name `reslt` instead of
# `result`, and its `for` statement is missing the trailing colon.
test_code = """
def divide(a, b):
    return a / b

result = divide(10, 0)
print(reslt)

for i in range(10)
    print(i)
"""

# Ask the model for a review, then tabulate the entries that pass validation.
issues = review_code(test_code)
df = build_table(issues)
# Bare last expression: the notebook renders the DataFrame as the cell output.
df

---- RAW MODEL OUTPUT ----

You are a strict senior Python code reviewer.
Return ONLY a valid JSON array.

Each entry MUST follow this format:
{
    "issue": "description",
    "line_start": 0,
    "line_end": 0,
    "suggestion": "fix",
    "severity": "Critical"
}

Rules:
- NO markdown.
- NO explanations.
- NO placeholders like 'text'.
- JSON array ONLY.

Code:

def divide(a, b):
    return a / b

result = divide(10, 0)
print(reslt)

for i in range(10)
    print(i)

if result == 0.0:
    print("The division by zero error occurred.")
else:
    print(f"The result is {result}")

```json
[
    {
        "issue": "ZeroDivisionError: integer division or modulo by zero",
        "line_start": 2,
        "line_end": 2,
        "suggestion": "Check if the divisor is not zero before performing the division.",
        "severity": "Critical"
    },
    {
        "issue": "IndentationError: unexpected indent",
        "line_start": 4,
        "line_end": 6,
        "suggestion": "Add an indentati

Unnamed: 0,issue,line,suggestion,severity
0,ZeroDivisionError: integer division or modulo ...,2-2,Check if the divisor is not zero before perfor...,Critical
1,IndentationError: unexpected indent,4-6,Add an indentation to the print statement.,High
2,SyntaxError: invalid syntax,8-9,Remove the extra comma after the print statement.,Medium
