## Download Libraries Needed

In [None]:
!pip install transformers
!pip install unsloth

## Add Imports and Define Constants

In [2]:
import json
import unsloth
import torch
from pydantic import BaseModel, Field
from typing import List, Literal
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login
from unsloth import FastLanguageModel

import os
from getpass import getpass

# Never commit access tokens: read the Hugging Face token from the HF_TOKEN
# environment variable and fall back to an interactive (non-echoing) prompt.
# NOTE(review): the token that used to be hard-coded on this line has been
# exposed to everyone with access to the notebook and should be revoked.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token)

# Hub ids of the checkpoints that the model-loading cells below can load.
base_model_id = "Qwen/Qwen2.5-14B-Instruct"
finetuned_model_id1 = "CodeAid/solidV-Detection-model"
finetuned_model_id2 = "CodeAid/couplingSmells-detection-model"
finetuned_model_SRefactor = "CodeAid/SolidV-refactoring-model"


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


## Define Pydantic Models

In [None]:
# Closed set of principle labels accepted by ViolatedPrinciple.principle.
Principle = Literal[
    "Single Responsibility",
    "Open-Close",
    "Liskov",
    "Interface Segregation",
    "Dependency Inversion",
]


class ViolatedPrinciple(BaseModel):
    # One violated principle together with a short justification.
    # Documented with comments rather than a docstring so that the JSON schema
    # generated from this model (and embedded in prompts) stays unchanged.
    principle: Principle = Field(
        ...,
        description="The violated SOLID principle.",
    )
    justification: str = Field(
        ...,
        max_length=300,
        description="Explanation of why the principle was violated in 2 sentences only.",
    )


class Violation(BaseModel):
    # Violations reported for a single main file.
    # Documented with comments rather than a docstring so that the JSON schema
    # generated from this model stays unchanged.
    main_file_path: str = Field(
        ...,
        description="Path of the main file.",
    )
    violatedPrinciples: List[ViolatedPrinciple] = Field(
        ...,
        description="List of violated principles with justifications.",
    )


class SolidDetectionOutput(BaseModel):
    # Top-level response shape for SOLID detection; its JSON schema is embedded
    # in the detection prompt. Note that `violations` holds a single Violation
    # object, not a list. (Comments, not a docstring, keep the schema unchanged.)
    violations: Violation = Field(
        ...,
        description="Detected SOLID violations.",
    )


# Closed set of coupling-smell labels accepted by CouplingSmell.smell.
Smell = Literal[
    "Feature Envy",
    "Inappropriate Intimacy",
    "Message Chains",
    "Middle Man",
]


class CouplingSmell(BaseModel):
    # One detected coupling smell together with a short justification.
    # Documented with comments rather than a docstring so that the JSON schema
    # generated from this model stays unchanged.
    smell: Smell = Field(
        ...,
        description="Type of coupling smell detected.",
    )
    justification: str = Field(
        ...,
        max_length=300,
        description="Justification for the detected coupling smell in 2 sentences only.",
    )


class CouplingViolation(BaseModel):
    # A group of files and the coupling smells detected among them.
    # Documented with comments rather than a docstring so that the JSON schema
    # generated from this model stays unchanged.
    filesPaths: List[str] = Field(
        ...,
        description="Files involved in the coupling smell must include the main file.",
    )
    smells: List[CouplingSmell] = Field(
        ...,
        description="Details about the detected coupling smells.",
    )


class CouplingDetectionOutput(BaseModel):
    # Top-level response shape for coupling-smell detection; its JSON schema is
    # embedded in the detection prompt. (Comments, not a docstring, keep the
    # generated schema unchanged.)
    couplingSmells: List[CouplingViolation] = Field(
        ...,
        description="Detected coupling code smells.",
    )

* SOLID refactoring Pydantic models

In [3]:
class RefactoredFile(BaseModel):
    # One file produced by the refactoring step: its path and its content.
    # Documented with comments rather than a docstring so that the JSON schema
    # generated from this model stays unchanged.
    filePath: str = Field(
        ...,
        description="Path to the file either created or refactored.",
    )
    fileContent: str = Field(
        ...,
        description="The full content of the file",
    )

class RefactoringOutput(BaseModel):
    # Top-level response shape for refactoring; its JSON schema is embedded in
    # the refactoring prompt. (Comments, not a docstring, keep the generated
    # schema unchanged.)
    refactored_files: List[RefactoredFile] = Field(
        ...,
        description="List of all refactored files and their changes.",
    )



## Load Model

#### Run only one of the model-loading cells below

### Load Base Model

In [4]:
# Load the base (non-finetuned) checkpoint. This binds the global `model` and
# `tokenizer` that send_prompt() reads, replacing any previously loaded pair.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=base_model_id,
    max_seq_length=32768,
    dtype=torch.float16,
    load_in_4bit=True,
)

==((====))==  Unsloth 2025.6.5: Fast Qwen2 patching. Transformers: 4.52.4.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 21.964 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/271 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

### Load SOLID Violations Detection Model

In [None]:
# Load the finetuned SOLID-violation detection checkpoint. This binds the global
# `model` and `tokenizer` that send_prompt() reads, replacing any previously
# loaded pair.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=finetuned_model_id1,
    max_seq_length=32768,
    dtype=torch.float16,
    load_in_4bit=True,
)

### Load Coupling Smells Detection Model

In [None]:
# Load the finetuned coupling-smell detection checkpoint. This binds the global
# `model` and `tokenizer` that send_prompt() reads, replacing any previously
# loaded pair.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=finetuned_model_id2,
    max_seq_length=32768,
    dtype=torch.float16,
    load_in_4bit=True,
)

### Load SOLID Violations Refactoring Model

In [9]:
# Load the finetuned SOLID-violation refactoring checkpoint. This binds the
# global `model` and `tokenizer` that send_prompt() reads, replacing any
# previously loaded pair.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=finetuned_model_SRefactor,
    max_seq_length=32768,
    dtype=torch.float16,
    load_in_4bit=True,
)

## Send Prompt Function

In [7]:
def send_prompt(prompt_text: str, max_new_tokens: int = 8192) -> str:
    """Generate text for ``prompt_text`` with the currently loaded model.

    Relies on the module-level ``model`` and ``tokenizer`` bound by whichever
    model-loading cell was executed last.

    Args:
        prompt_text: Prompt passed to the tokenizer unchanged.
        max_new_tokens: Upper bound on the number of generated tokens. Defaults
            to 8192, the value that used to be hard-coded.

    Returns:
        The first sequence returned by ``model.generate``, decoded with special
        tokens skipped and surrounding whitespace stripped.
        NOTE(review): the whole returned sequence is decoded, so for a
        decoder-only model the result presumably starts with the prompt text
        itself - confirm that downstream parsing expects this.
    """
    # Move the encoded prompt to the device the model lives on.
    inputs = tokenizer(prompt_text, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return tokenizer.decode(outputs[0], skip_special_tokens=True).strip()


## Detection Functions

In [5]:
def detect_solid_violations(input_path, output_path1):
    """Run SOLID-violation detection over every record of a JSONL file.

    Each non-blank line of ``input_path`` is parsed as JSON. Its ``"content"``
    value is embedded in the detection prompt together with the JSON schema of
    ``SolidDetectionOutput``, and the prompt is sent through ``send_prompt``.
    One ``{"output": <response>}`` JSON line is appended to ``output_path1``
    per record.

    Args:
        input_path: Path of the input JSONL file; every record needs a
            ``"content"`` key.
        output_path1: Path of the output JSONL file. It is opened in append
            mode, so re-running adds to results that are already there.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``"content"`` key.
    """
    # The schema is identical for every record, so serialise it only once.
    schema_json = json.dumps(SolidDetectionOutput.model_json_schema(), ensure_ascii=False)

    # Explicit UTF-8: the prompt keeps non-ASCII characters (ensure_ascii=False),
    # so reading must not depend on the platform's default encoding.
    with open(input_path, "r", encoding="utf-8") as f_in, \
            open(output_path1, "a", encoding="utf-8") as f_out1:
        for line in f_in:
            if not line.strip():
                # Blank lines (e.g. a trailing newline) are not records.
                continue
            data = json.loads(line)

            solid_violations_detection_messages = "\n".join([
                "You are a senior software engineer.",
                "You will be given one Java file (main file) along with its file dependencies.",
                "Your task is to detect violations of SOLID principles *only in the main_file_content*: Single Responsibility, Open/Closed, Liskov Substitution, Interface Segregation, and Dependency Inversion.",
                "",
                "You can use the dependency files just for context, but only analyze and extract violations from the main_file_content only.",
                "",
                "Principle definitions (apply these strictly):",
                "SRP: A class has exactly one reason to change—only one responsibility.",
                "OCP: A class may be extended without modifying its existing code.",
                "LSP: Subtypes must behave interchangeably with their base types.",
                "ISP: Clients should only depend on the methods they actually use.",
                "DIP: High‑level (policy/business) modules must depend on abstractions (interfaces/abstract classes), not on concrete (implementation) classes. Low‑level modules must implement those abstractions; they should NOT be directly referenced by high‑level modules.",
                "Don't include the usage of built-in classes (e.g. java.util.Scanner, java.lang.String, List, Map), they don't break DIP.",
                "",
                "Apply a step-by-step reasoning process to identify any violations.",
                "Start by explaining what each principle means in the current context, and how the main file code complies or fails to comply with it.",
                "",
                "After providing your first assessment, re-evaluate your findings and refine your judgment if necessary.",
                "",
                "Finally, reflect on your answer: did you miss anything? Could your answer be improved? If so, revise accordingly.",
                "",
                "Always respond in a structured JSON format. Do not include any explanation outside the JSON.",
                "You have to extract SOLID Violations from the *main file code only* according to the following Pydantic schema.",
                "Be objective and thorough, even if no violations are found.",
                "Do not generate any introduction or conclusion.",
                "",
                "## Code:",
                json.dumps(data["content"], ensure_ascii=False),
                "",
                "## Pydantic Details:",
                schema_json,
                "",
                "## SOLID Violations:",
                "json"
            ])

            response1 = send_prompt(solid_violations_detection_messages)

            f_out1.write(json.dumps({"output": response1}) + "\n")
            # Generation is slow; flush so finished records survive an interrupted run.
            f_out1.flush()
            print("done")

def detect_coupling(input_path, output_path1):
    """Run coupling-smell detection over every record of a JSONL file.

    Each non-blank line of ``input_path`` is parsed as JSON. Its ``"content"``
    value is embedded in the detection prompt together with the JSON schema of
    ``CouplingDetectionOutput``, and the prompt is sent through ``send_prompt``.
    One ``{"output": <response>}`` JSON line is appended to ``output_path1``
    per record.

    Args:
        input_path: Path of the input JSONL file; every record needs a
            ``"content"`` key.
        output_path1: Path of the output JSONL file. It is opened in append
            mode, so re-running adds to results that are already there.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``"content"`` key.
    """
    # The schema is identical for every record, so serialise it only once.
    schema_json = json.dumps(CouplingDetectionOutput.model_json_schema(), ensure_ascii=False)

    # Explicit UTF-8: the prompt keeps non-ASCII characters (ensure_ascii=False),
    # so reading must not depend on the platform's default encoding.
    with open(input_path, "r", encoding="utf-8") as f_in, \
            open(output_path1, "a", encoding="utf-8") as f_out1:
        for line in f_in:
            if not line.strip():
                # Blank lines (e.g. a trailing newline) are not records.
                continue
            data = json.loads(line)

            # NOTE(review): the prompt describes "Incomplete Library Class", but that
            # label is not part of the `Smell` literal used by the output schema.
            # The prompt text is left untouched here - confirm which one is intended.
            coupling_smells_detection_messages = "\n".join([
                "You are a software engineer.",
                "You will be given one file with its file dependencies. Just extract coupling smells that is related to main_file_content",
                "Your task is to identify and explain any of the following coupling smells:",
                "",
                "- Feature Envy: A method that seems more interested in another class than the one it is in, accessing its data and methods frequently.",
                "- Inappropriate Intimacy: Two classes that share too much information or access each other's internal details excessively.",
                "- Incomplete Library Class: A library class is missing functionality that should be there, forcing users to add methods or subclasses that break encapsulation.",
                "- Message Chains: A client asks one object for another object, then that object for another, and so on, forming a long chain of calls.",
                "- Middle Man: A class that delegates almost everything to another class and does very little itself.",
                "",
                "Use a step-by-step reasoning process (Chain of Thought) to evaluate if any of these smells exist in the code.",
                "For each suspected smell, explain what triggered it, and which class/method is involved.",
                "",
                "After your first pass, review your analysis and refine it if necessary.",
                "Then, critically evaluate your final result.",
                "- Did you miss any smell?",
                "- Did you misclassify anything?",
                "- Could your reasoning be more precise?",
                "",
                "Always respond in a structured JSON format. Do not include any explanation outside the JSON.",
                "You have to extract Coupling code smells from Code according the Pydantic details.",
                "Be objective and thorough, even if no violations are found.",
                "Do not generate any introduction or conclusion.",
                "## Code:",
                json.dumps(data["content"], ensure_ascii=False),
                "",
                "## Pydantic Details:",
                schema_json,
                "",
                "## Coupling code smells:",
                "json"
            ])

            response1 = send_prompt(coupling_smells_detection_messages)

            f_out1.write(json.dumps({"output": response1}) + "\n")
            # Generation is slow; flush so finished records survive an interrupted run.
            f_out1.flush()
            print("done")

### Refactoring functions


In [5]:
def refactor_solid_violations(input_path, output_path):
    """Run SRP/OCP refactoring over every record of a JSONL file.

    Each non-blank line of ``input_path`` is parsed as JSON. Its ``"prompt"``
    and ``"violations"`` values are embedded in the refactoring prompt together
    with the JSON schema of ``RefactoringOutput``, and the prompt is sent
    through ``send_prompt``. One ``{"output": <response>}`` JSON line is
    appended to ``output_path`` per record.

    Args:
        input_path: Path of the input JSONL file; every record needs the keys
            ``"prompt"`` and ``"violations"``.
        output_path: Path of the output JSONL file. It is opened in append
            mode, so re-running adds to results that are already there.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``"prompt"`` or ``"violations"``.
    """
    # The schema is identical for every record, so serialise it only once.
    schema_json = json.dumps(RefactoringOutput.model_json_schema(), ensure_ascii=False)

    # Explicit UTF-8: the prompt keeps non-ASCII characters (ensure_ascii=False),
    # so reading must not depend on the platform's default encoding.
    with open(input_path, "r", encoding="utf-8") as f_in, \
            open(output_path, "a", encoding="utf-8") as f_out:
        for line in f_in:
            if not line.strip():
                # Blank lines (e.g. a trailing newline) are not records.
                continue
            data = json.loads(line)

            solid_violations_refactoring_messages = "\n".join([
                "You are an expert Java developer specialized in applying Single Responsibility and Open-Closed principles through code refactoring.",
                "You will be given one main Java file, with some dependencies (maybe none) along with a structured JSON detailing the detected Single Responsibility, Open-Closed violations in the main file.",
                "Your task is to refactor the code to eliminate these violations while maintaining and improving overall code clarity and design.",
                "",
                "For reference, here are brief descriptions of the SRP and OCP principles:",
                "- SRP (Single Responsibility): A class should have only one reason to change, i.e., one responsibility.",
                "- OCP (Open/Closed): Classes should be open for extension, but closed for modification.",
                "Apply a step-by-step reasoning process to identify the best approach for refactoring each violation.",
                "After making initial changes, re-evaluate the result and improve it further if needed.",
                "Then, reflect on the outcome: did you miss anything? Did your refactoring introduce new issues? If so, revise accordingly.",
                "You should return the main file in case of being updated with its updated content.",
                "You should return the created files with its content.",
                "Never add multiple classes/enums/interfaces in the same file; if needed, create a new file for each.",
                "After refactoring the main file and adding any new files, you must:",
                "- Review all dependency files for references to the main file’s class, methods, or fields.",
                "- Update those dependency files to reflect any renames, deletions, or new methods introduced in your refactor.",
                "- Ensure there are no invalid references in dependency files (such as calling a method that no longer exists).",
                "All updated dependency files should be included in your output alongside the main file and new files, following the Pydantic schema format.",
                "Don't return a file unless it is updated or created.",
                "",
                "## Critical Output and Formatting Rules:",
                "1. **Comment Formatting for Unfixable Dependencies:** This is a strict requirement. If a dependency cannot be updated due to missing context, you must leave a comment. IT IS CRITICAL that you add a line break (`\\n`) immediately after the comment. The code that follows the comment MUST start on a new line to avoid compilation errors.",
                "2. **No Extra Content:** Do not include any explanation, introduction, or conclusion outside the final JSON output.",
                "3. **Code Formatting:** Return the code in one line without extra spaces or break lines. Don't add any comments.",
                "4. **JSON Structure:** You must follow the format defined in the Pydantic schema for the refactoring output.",
                "",
                "Be precise, complete, and objective. If no changes are needed, reflect that in the response.",
                "## Code:",
                json.dumps(data["prompt"], ensure_ascii=False),
                "",
                "## SO Violations:",
                json.dumps(data["violations"], ensure_ascii=False),
                "",
                "## Pydantic Details:",
                schema_json,
                "",
                "## Refactored Code:",
                "```json"
            ])

            response = send_prompt(solid_violations_refactoring_messages)

            f_out.write(json.dumps({"output": response}) + "\n")
            # Generation is slow; flush so finished records survive an interrupted run.
            f_out.flush()
            print("Done")


## Execution For Testing Before & After Finetuning

In [None]:
# Run both detectors over the same input file with whichever model is currently
# loaded; each call appends its responses to its own JSONL output file.
detect_solid_violations(input_path="data.jsonl", output_path1="outputFile1.jsonl")
detect_coupling(input_path="data.jsonl", output_path1="outputFile2.jsonl")

In [None]:
# Run the refactoring prompt over every record of test.jsonl with whichever model
# is currently loaded; responses are appended to outRefactor.jsonl.
refactor_solid_violations(input_path="test.jsonl", output_path="outRefactor.jsonl")