In [1]:
import re
from datasets import Dataset







In [2]:
# Load hugging face personal access token from file
with open("hf_pat.txt", "r") as f:
    token = f.read().strip()

with open("oai_pat.txt", "r") as f:
    openai_key = f.read().strip()

# Set it for HuggingFaceHub
import os
os.environ["HUGGINGFACEHUB_API_TOKEN"] = token
os.environ["OPENAI_API_KEY"] = openai_key


In [None]:
def extract_cwe_id_only(raw_line: str) -> str:
    """Extracts numeric CWE ID from a line like 'CWE-502: Deserialization of Untrusted Data'"""
    match = re.search(r"CWE-(\d+)", raw_line)
    return match.group(1) if match else raw_line.strip()

def run_router(cwe_id: str) -> tuple[str, str]:
    """Calls Layer 1 agents via router to generate context and prompt"""
    from router import router  
    from llm_tools import zephyr_tool 
    result = router(cwe_id, zephyr_tool)
    return result.get("cwe_context", ""), result.get("agent_prompt", "")

def extract_cwe_context(context_line: str) -> str:
    """Extracts additional context from the CWE context input into the model"""
    parts = context_line.strip().split("||")
    return parts[1].strip() if len(parts) > 1 else ""

def extract_agent_prompt(context_line: str) -> str:
    """Extracts the prompt for the agent from router input"""
    parts = context_line.strip().split("||")
    return parts[2].strip() if len(parts) > 2 else ""


In [None]:
def load_data_from_folders(root_dir):
    """Loads input data from the local directory as defined."""
    fine_tuning_data = []

    for repo_folder in os.listdir(root_dir):
        repo_path = os.path.join(root_dir, repo_folder)
        file_groups = {}

        for filename in os.listdir(repo_path):
            match = re.match(r"(.+)_(\d+)\.txt$", filename)
            if match:
                prefix, number = match.groups()
                number = int(number)
                if number not in file_groups:
                    file_groups[number] = {}
                if "source" in prefix:
                    file_groups[number]["source"] = os.path.join(repo_path, filename)
                elif "target" in prefix:
                    file_groups[number]["target"] = os.path.join(repo_path, filename)
                elif "context" in prefix:
                    file_groups[number]["context"] = os.path.join(repo_path, filename)

        for number, files in sorted(file_groups.items()):
            source_file = files.get("source")
            target_file = files.get("target")
            context_file = files.get("context")

            if source_file and target_file and context_file:
                with open(source_file, "r", encoding="utf-8") as src, \
                     open(target_file, "r", encoding="utf-8") as tgt, \
                     open(context_file, "r", encoding="utf-8") as ctx:

                    sources = src.readlines()
                    targets = tgt.readlines()
                    contexts = ctx.readlines()

                    for s, t, c in zip(sources, targets, contexts):
                        cwe_id = extract_cwe_id_only(c)

                        # Calls updated router function that invokes LangChain agents
                        try:
                            cwe_context, agent_prompt = run_router(cwe_id)
                        except Exception as e:
                            print(f"Failed to route CWE-{cwe_id}: {e}")
                            continue

                        if cwe_id == "78":
                            combined_input = f"CWE-{cwe_id} Fix Request:\n{s.strip()}\nIssue: Command Injection (CWE-78)\nFix:"
                        else:
                            combined_input = f"CWE-{cwe_id}\n{cwe_context}\n{agent_prompt}\nCode: {s.strip()}"


                        fine_tuning_data.append({
                            "cwe_id": cwe_id,
                            "cwe_context": cwe_context,
                            "agent_prompt": agent_prompt,
                            "code": s.strip(),
                            "source": combined_input,
                            "target": t.strip()
                        })

    return fine_tuning_data

In [5]:
fine_tuning_data = load_data_from_folders("E:/Data Collection/Gold_Standard_for_tuning")



In [None]:
from transformers import RobertaTokenizer, T5ForConditionalGeneration
from transformers import AutoTokenizer

from codebleu import calc_codebleu
import os
import re
from datasets import Dataset
from tokenizers import Tokenizer

import torch
# Identifies and prints the name of available GPUs
print(torch.cuda.is_available())          
print(torch.cuda.get_device_name(0))      

model_name = "Salesforce/codet5p-770m"
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Converts list of data to a Hugging Face Dataset
dataset = Dataset.from_list(fine_tuning_data)

# Splits data into 80-20 train-test split
split_dataset = dataset.train_test_split(test_size=0.2)
def preprocess_function(examples):
    # Tokenize the source inputs
    model_inputs = tokenizer(
        examples["source"],
        padding="max_length",
        truncation=True,
        max_length=512
    )

    # Tokenizes the targets (labels)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["target"],
            padding="max_length",
            truncation=True,
            max_length=512
        )

    # Replaces padding token ids in labels with -100 so they are ignored the loss function
    labels_ids = labels["input_ids"]
    labels_ids = [
        [(token if token != tokenizer.pad_token_id else -100) for token in label]
        for label in labels_ids
    ]

    model_inputs["labels"] = labels_ids
    return model_inputs

# Preprocesses test and train datasets
train_dataset = split_dataset["train"].map(preprocess_function, batched=True)
test_dataset = split_dataset["test"].map(preprocess_function, batched=True)
'''
def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["source"], padding="max_length", truncation=True, max_length=256
    )
    #128 or 256
    
    labels = tokenizer(
        examples["target"], padding="max_length", truncation=True, max_length=256
    )

   
    # Ensure `labels` is a **flat list** of token IDs
    model_inputs["labels"] = labels["input_ids"]

    print("Fixed Labels: ", model_inputs["labels"][:2])  # Debugging print
    return model_inputs
'''


import evaluate 
from sacrebleu import corpus_bleu
import numpy as np
from pygments.lexers import guess_lexer
from pygments.util import ClassNotFound


def detect_language(code_snippet):
    """Detects the programming language and maps it to a valid CodeBLEU language."""
    from pygments.lexers import guess_lexer
    from pygments.util import ClassNotFound

    AVAILABLE_LANGS = {'java', 'javascript', 'c_sharp', 'php', 'c', 'cpp', 'python', 'go', 'ruby', 'rust'}
    
    try:
        lexer = guess_lexer(code_snippet)
        lang = lexer.name.lower()
    except ClassNotFound:
        lang = "unknown"

    # Maps detected language to CodeBLEU supported languages
    lang_map = {
        "c++": "cpp",
        "c#": "c_sharp",
        "javascript": "javascript",
        "java": "java",
        "python": "python",
        "php": "php",
        "go": "go",
        "ruby": "ruby",
        "rust": "rust",
        "c": "c"
    }
    
    mapped_lang = lang_map.get(lang, "python")  
    return mapped_lang if mapped_lang in AVAILABLE_LANGS else "python"

# Loads metrics
bleu_metric = evaluate.load("sacrebleu")
rouge_metric = evaluate.load("rouge")
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_preds):
    predictions, labels = eval_preds

    if isinstance(predictions, tuple):
        predictions = predictions[0]

    predictions = np.argmax(predictions, axis=-1)

    # Ensures labels are a clean list of lists containing only ints
    labels = [[token if token != -100 else tokenizer.pad_token_id for token in label] for label in labels]

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    lang = detect_language(decoded_preds[0]) or "python"

    bleu_score = corpus_bleu(decoded_preds, [decoded_labels]).score
    accuracy = sum(p == l for p, l in zip(decoded_preds, decoded_labels)) / len(decoded_preds)
    rouge_scores = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels)
    rouge_l = rouge_scores["rougeL"]

    codebleu_result = calc_codebleu(decoded_labels, decoded_preds, lang)
    codebleu_score = float(codebleu_result.get("codebleu", 0.0)) if isinstance(codebleu_result, dict) else float(codebleu_result)

    return {
        "bleu": bleu_score,
        "rougeL": rouge_l * 100,
        "accuracy": accuracy * 100,
        "codebleu": codebleu_score * 100
    }


from transformers import Seq2SeqTrainer, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments
from transformers import EarlyStoppingCallback

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="steps",   
    save_strategy="steps",        
    load_best_model_at_end=True,          
    metric_for_best_model="eval_codebleu",
    greater_is_better=True,
    save_total_limit=5,
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    num_train_epochs=5,
    logging_dir="./logs",
    logging_steps=10,
    fp16=True
)


data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)

trainer.train()
metrics = trainer.evaluate()
print(metrics)


True
NVIDIA GeForce RTX 4090


Map:   0%|          | 0/72 [00:00<?, ? examples/s]

Map:   0%|          | 0/18 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(


Step,Training Loss,Validation Loss,Bleu,Rougel,Accuracy,Codebleu
10,2.1122,1.900841,47.572816,54.221165,5.555556,24.408615
20,1.3882,1.748871,50.996993,62.502491,16.666667,29.593473
30,0.8404,1.764852,55.462921,64.9672,16.666667,32.260414
40,0.7456,1.758283,57.317605,67.387335,22.222222,38.038599
50,0.3863,1.921272,57.802095,66.809225,22.222222,33.813879
60,0.3155,1.878762,57.566852,66.300161,27.777778,40.201706
70,0.1858,1.887004,55.487267,64.986625,22.222222,29.125037
80,0.1438,1.927315,49.598683,64.675491,22.222222,29.090819
90,0.0975,1.941606,52.174322,67.305476,27.777778,28.929189


{'eval_loss': 1.941605806350708, 'eval_bleu': 52.1743223895892, 'eval_rougeL': 67.30547606656998, 'eval_accuracy': 27.77777777777778, 'eval_codebleu': 28.929188825401454, 'eval_runtime': 122.8682, 'eval_samples_per_second': 0.146, 'eval_steps_per_second': 0.041, 'epoch': 5.0}


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Loads local fine-tuned CodeT5+ model and tokenizer
model_path = "C:/Users/laure/OneDrive/Documents/Benchmark Experiments/.Salesforce/codet5-base-finetuned/checkpoint-180"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
model.eval()

def run_codet5_repair(fine_tuning_data, max_input_tokens=512, max_output_tokens=256):
    """Finetunes the codet5+ repair portion in the agentic workflow."""
    results = []

    for item in fine_tuning_data:
        input_text = item["source"]

        inputs = tokenizer(
            input_text,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=max_input_tokens
        )

        with torch.no_grad():
            outputs = model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_new_tokens=max_output_tokens,
                do_sample=False
            )

        prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
        results.append({
            "cwe_id": item["cwe_id"],
            "original_code": item["code"],
            "agent_prompt": item["agent_prompt"],
            "cwe_context": item["cwe_context"],
            "generated_fix": prediction,
            "target_fix": item["target"]
        })

    return results


In [None]:
results = run_codet5_repair(fine_tuning_data)

# Prints results for testing purposes
for res in results:
    print(f"CWE-{res['cwe_id']} | Prediction: {res['generated_fix']}")


CWE-502 | Prediction: model = buffer;
CWE-939 | Prediction: if fmt == "url":
CWE-78 | Prediction: output0 = subprocess.run(cmd0, shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
Issue: Command Injection (CWE-78)
Fix:
CWE-78 | Prediction: output1 = subprocess.run(cmd1, shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
Issue: Command Injection (CWE-78)
Fix:
CWE-78 | Prediction: if not (debug and os.path.exists('vdos.out')):
CWE-611 | Prediction: import defusedxml.etree.ElementTree as ET
CWE-269 | Prediction: CMD streamlit run main.py --server.port $PORT -u **INSERT USER NUMBER HERE**
CWE-89 | Prediction: values_x = ','.join(['%s'] * len(line[1]))
CWE-676 | Prediction: dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, num_workers=batch_size, collate_fn=collate, pin_memory=True)
CWE-502 | Prediction: void function() {
    strcpy(buffer, "attacker input");
    void* buffer_ptr = buffer;
    // continue using buffer here
Code: return
CWE-93

In [None]:
from llm_tools import zephyr_tool
from resource_lifecycle_agent import ResourceLifecycleAgent664

# Defines the agentic model chosen
agent = ResourceLifecycleAgent664(model=zephyr_tool)

# Runs the chosen agent with CWE-664 context
output = agent.run("664", "Pillar Weakness Ancestors of CWE-502:\\n- CWE-664: Improper Control of a Resource Through its Lifetime")

print("CWE Context:", output["cwe_context"])
print("Agent Prompt:", output["agent_prompt"])




CWE Context: Summary:
CWE-664 refers to weaknesses in software that result in improper control of a resource throughout its lifetime. This can lead to issues such as use-after-free, premature release, and resource leaks. Use-after-free occurs when a resource is freed but later accessed, potentially leading to arbitrary code execution or memory corruption. Premature release refers to a situation where a resource is released before it should be, leading to a resource leak or other unexpected behavior. Improper resource tracking involves failing to keep track of resources as they are created, modified, and destroyed, which can lead to issues such as resource exhaustion or race conditions.

Common Attack Patterns:
Attackers often exploit these issues by manipulating the lifetime of resources in unexpected ways. For example, they might free a resource and then access it later, or release a resource prematurely to cause a resource leak. In some cases, they might create multiple instances of 

In [None]:
from langchain.agents import initialize_agent, Tool
from langchain.agents.agent_types import AgentType
from langchain.llms import HuggingFaceHub  
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# Imports the CWE tool (additional code file created)
from cwe_tools import get_cwe_context 

# Defines the chosen CWE tool for agent
tools = [
    Tool(
        name="get_cwe_context",
        func=get_cwe_context,
        description="Returns pillar ancestors and description for a given CWE ID",
    )
]

llm = HuggingFaceHub(
    repo_id="google/flan-t5-xl",  # Or "Salesforce/codet5p-770m"
    model_kwargs={"temperature": 0.5, "max_length": 512}
)

# Prompt Template
prompt_template = PromptTemplate(
    input_variables=["cwe_id", "cwe_context"],
    template="""
You are a security assistant AI.

Given the CWE ID and its ancestor context, explain:
- What the CWE generally refers to
- Typical software mistakes that cause it
- Why it matters from a security perspective

CWE ID: {cwe_id}

Context:
{cwe_context}

Explanation:"""
)

# Wraps as an LLM chain
cwe_explainer_chain = LLMChain(
    llm=llm,
    prompt=prompt_template
)

# Runs Agent 
def run_cwe_explainer_agent(cwe_id: str):
    cwe_context = get_cwe_context.run(cwe_id)
    return cwe_explainer_chain.run({"cwe_id": cwe_id, "cwe_context": cwe_context})

# Test
if __name__ == "__main__":
    output = run_cwe_explainer_agent("119")
    print(output)
