In [None]:
!pip install jsonformer

Collecting jsonformer
  Downloading jsonformer-0.12.0-py3-none-any.whl.metadata (5.0 kB)
Downloading jsonformer-0.12.0-py3-none-any.whl (6.6 kB)
Installing collected packages: jsonformer
Successfully installed jsonformer-0.12.0


In [2]:
from google.colab import drive
import pandas as pd
# Mount Google Drive so the dataset CSVs are reachable under /content/drive.
drive.mount('/content/drive')

# Full paths to the input CSV files in Google Drive.
csv_file = "/content/drive/My Drive/expermintstarcoder/PuppetScripts_V2.csv"
# Not read in this cell; a later cell assigns the same path again before loading it.
csv_file_ansible = "/content/drive/My Drive/expermintstarcoder/merged_file_ansible_defect.csv"
# Load the Puppet scripts CSV into a DataFrame.
df = pd.read_csv(csv_file)

Mounted at /content/drive


In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never paste an access token into the notebook source: read it from the
# HF_TOKEN environment variable, or ask for it interactively (input is hidden
# and is not stored in the saved notebook).
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token)

In [None]:
from google.colab import drive
import pandas as pd
import torch
import json
import csv
from jsonformer import Jsonformer
from transformers import AutoModelForCausalLM, AutoTokenizer

# Mount Google Drive
drive.mount('/content/drive')

# Define CSV file path in Google Drive
csv_file = "/content/drive/My Drive/expermintstarcoder/PuppetScripts_V2.csv"  # Input CSV file path
output_file = "/content/drive/My Drive/expermintstarcoder/output_results.csv"  # Output CSV file path

# Load the CSV file from Google Drive
df = pd.read_csv(csv_file)

# Check if GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load model and tokenizer from Hugging Face model hub
model_name = "bigcode/starcoder2-7b"  # Replace with the correct model name

print("Loading model from Hugging Face model hub...")
# Weights are loaded in float16 and the whole model is moved to `device`.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)
print("Model loaded successfully!")

# Put model in evaluation mode
model.eval()

# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# JSON schema handed to Jsonformer: an object with one array, "issues_found",
# whose items each carry a single string field, "misconfigured_snippet".
# The schema only fixes the output structure; it has no "reason" field, even
# though the processing loop further down looks one up on every issue.
json_schema = {
    "type": "object",
    "properties": {
        "issues_found": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "misconfigured_snippet": {"type": "string"},

                }
            }
        }
    }
}

# Function to analyze Puppet misconfiguration
def analyze_misconfiguration(code_snippet):
    """Ask the model for security misconfigurations in one Puppet script.

    Args:
        code_snippet: Puppet source text, inserted verbatim into the prompt.

    Returns:
        The object produced by Jsonformer for ``json_schema``: a dict whose
        ``"issues_found"`` entry is a list of
        ``{"misconfigured_snippet": <str>}`` items.

    Relies on the module-level ``model``, ``tokenizer`` and ``json_schema``.
    """
    # Define structured prompt
    prompt = f"""
    You are a **security expert specializing in Puppet configuration security**. Your task is to analyze the following Puppet code and identify **all possible security risks**.

    **Puppet Script to Analyze**:
    ```puppet
    {code_snippet}
    ```

    ### **Response Format**
    Return the response in **valid JSON format**:
    ```json
    {{
      "issues_found": [
        {{
          "misconfigured_snippet": "<misconfigured_snippet_1>",

        }},
        {{
          "misconfigured_snippet": "<misconfigured_snippet_2>",

        }}
      ]
    }}
    ```

    ### **Instructions**
    - **Only report security risks**. Ignore syntax or general best practices.
    - If there are **multiple risks**, list **all** of them.
    - If the script **has no security misconfiguration**, return an **empty array**.
    - **Return ONLY JSON** (without extra text or explanations).
    """

    # Per-string generation limit: the prompt's own token count, capped at 6000
    # (same value the previous implementation used).
    max_length = min(len(tokenizer.encode(prompt)), 6000)

    # The previous version also tokenized the prompt with padding and moved the
    # tensors to the GPU, but never passed them to Jsonformer (which receives
    # the prompt text). That only cost GPU memory, so it is gone.
    generator = Jsonformer(
        model,
        tokenizer,
        json_schema=json_schema,
        prompt=prompt,
        max_string_token_length=max_length,
    )

    # The model is only run when the Jsonformer object is called, so the call
    # itself (not just the construction) has to sit inside no_grad.
    with torch.no_grad():
        return generator()

# Create the output CSV and write the header once; result rows are appended
# one at a time below so finished rows survive a crash or disconnect.
with open(output_file, mode="w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Script", "Label", "MisconfigurationFound", "Reason"])

failed_rows = 0  # rows whose analysis raised; they are not written to the CSV

# Process each row and save immediately after processing
for index, row in df.iterrows():
    # Missing CSV cells arrive as NaN; str(NaN) is the non-empty text "nan",
    # which used to slip past the empty-script check and get analyzed.
    raw_script = row.get("Script.Content", "")
    raw_label = row.get("Defect.Label", "")
    script_content = "" if pd.isna(raw_script) else str(raw_script).strip()
    defect_label = "" if pd.isna(raw_label) else str(raw_label).strip()

    if not script_content:
        print(f" Skipping Row {index + 1} (Empty Script)")
        continue

    try:
        # Analyze misconfiguration
        result = analyze_misconfiguration(script_content)
        issues = result.get("issues_found", [])

        # Collapse all findings of a script into single comma-separated cells.
        if issues:
            misconfigured_snippets = ", ".join(
                issue.get("misconfigured_snippet", "Unknown") for issue in issues
            )
            # The JSON schema defines no "reason" field, so this falls back to
            # the default text for every issue; kept for CSV-format stability.
            reasons = ", ".join(
                issue.get("reason", "No explanation provided") for issue in issues
            )
        else:
            misconfigured_snippets = "No issues detected"
            reasons = "None"

        # Save results directly into the CSV
        with open(output_file, mode="a", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerow([script_content, defect_label, misconfigured_snippets, reasons])

        # Print result
        print(f"\n🔹 **Processed Row {index + 1}/{len(df)}** 🔹\n")
        print(json.dumps(issues, indent=4))

    except Exception as e:
        failed_rows += 1
        print(f"Error processing Row {index + 1}: {e}")
        # After a failure such as CUDA out-of-memory, hand cached blocks back
        # to the driver so the following rows have a chance to fit.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

if failed_rows:
    print(f"\n**Finished with {failed_rows} failed row(s) that were NOT written. Results saved in 'output_results.csv'**")
else:
    print("\n**All scripts analyzed successfully! Results saved in 'output_results.csv'**")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Using device: cuda
Loading model from Hugging Face model hub...


model.safetensors:  54%|#####4    | 6.60G/12.1G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.88k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/777k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/442k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.06M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/958 [00:00<?, ?B/s]

Model loaded successfully!


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
    }
]

🔹 **Processed Row 1325/1958** 🔹

[
    {
        "misconfigured_snippet": "user { 'xvfb': ensure => present, gid => 'xvfb', shell => '/bin/false', home => '/nonexistent', system => true, }"
    }
]
Error processing Row 1326: CUDA out of memory. Tried to allocate 128.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 96.12 MiB is free. Process 7279 has 14.64 GiB memory in use. Of the allocated memory 14.45 GiB is allocated by PyTorch, and 66.33 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

🔹 **Processed Row 1327/1958** 🔹

[
    {
    }
]

🔹 **Processed Row 1328/1958** 🔹

[
    {
        "misconfigured_snippet": "misconfigured_snippet_1"
    }
]

🔹 **Processed Row 1329/19

In [None]:
#using starcoder2 for ansible dataset

In [None]:
from google.colab import drive
import pandas as pd
import torch
import json
import csv
from jsonformer import Jsonformer
from transformers import AutoModelForCausalLM, AutoTokenizer

# Mount Google Drive
drive.mount('/content/drive')

# Define CSV file path in Google Drive
csv_file_ansible = "/content/drive/My Drive/expermintstarcoder/merged_file_ansible_defect.csv"  # Input CSV file path
output_file = "/content/drive/My Drive/expermintstarcoder/merged_file_ansible_defect_output.csv"  # Output CSV file path

# Load the CSV file from Google Drive
df = pd.read_csv(csv_file_ansible)

# Check if GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load model and tokenizer from Hugging Face model hub
# (this cell uses the 3b checkpoint; the Puppet cell above uses the 7b one)
model_name = "bigcode/starcoder2-3b"  # Replace with the correct model name

print("Loading model from Hugging Face model hub...")
# Weights are loaded in float16 and the whole model is moved to `device`.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)
print("Model loaded successfully!")

# Put model in evaluation mode
model.eval()

# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# JSON schema handed to Jsonformer: an object with one array, "issues_found",
# whose items each carry a single string field, "misconfigured_snippet".
# The schema only fixes the output structure, not its length.
json_schema = {
    "type": "object",
    "properties": {
        "issues_found": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "misconfigured_snippet": {"type": "string"},
                }
            }
        }
    }
}

# Function to analyze Ansible misconfiguration
def analyze_misconfiguration(code_snippet):
    """Ask the model for security misconfigurations in one Ansible script.

    Args:
        code_snippet: Ansible source text, inserted verbatim into the prompt.

    Returns:
        The object produced by Jsonformer for ``json_schema``: a dict whose
        ``"issues_found"`` entry is a list of
        ``{"misconfigured_snippet": <str>}`` items.

    Relies on the module-level ``model``, ``tokenizer`` and ``json_schema``.
    """
    # Define structured prompt
    prompt = f"""
    You are a **security expert specializing in Ansible configuration security**. Your task is to analyze the following Ansible code and identify **all possible security risks**.

    **Ansible Script to Analyze**:
    ```Ansible
    {code_snippet}
    ```

    ### **Response Format if isuues found **
    Return the response in **valid JSON format**:
    ```json
    {{
      "issues_found": [
        {{
          "misconfigured_snippet": "<misconfigured_snippet_1>",

        }},
        {{
          "misconfigured_snippet": "<misconfigured_snippet_2>",

        }},
          {{
          "misconfigured_snippet": "No misconfigured_snippet Found ",

        }}

      ]
    }}

    ```

    ### **Instructions**
    - **Only report security risks**. Ignore syntax or general best practices.
    - If there are **multiple risks**, list **all** of them.








    - **Return ONLY JSON** (without extra text or explanations).
    """

    # Per-string generation limit: the prompt's own token count, capped at 6000
    # (same value the previous implementation used).
    max_length = min(len(tokenizer.encode(prompt)), 6000)

    # The previous version also tokenized the prompt with padding and moved the
    # tensors to the GPU, but never passed them to Jsonformer (which receives
    # the prompt text). That only cost GPU memory, so it is gone.
    generator = Jsonformer(
        model,
        tokenizer,
        json_schema=json_schema,
        prompt=prompt,
        max_string_token_length=max_length,
    )

    # The model is only run when the Jsonformer object is called, so the call
    # itself (not just the construction) has to sit inside no_grad.
    with torch.no_grad():
        return generator()

# Create the output CSV and write the header once; result rows are appended
# one at a time below so finished rows survive a crash or disconnect.
with open(output_file, mode="w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Added_lines_fixing_commit", "Added_lines_bug_inducing_commit", "Fixing_Script_MisconfigurationFoundbyLLM", "Bug_Inducing_Script_MisconfigurationFoundbyLLM"])

failed_rows = 0  # rows whose analysis raised; they are not written to the CSV

# Process each row and save immediately after processing
for index, row in df.iterrows():
    # Missing CSV cells arrive as NaN; str(NaN) is the non-empty text "nan",
    # which used to defeat the empty check and get analyzed as if it were code.
    raw_fixing = row.get("Added_lines_fixing_commit", "")
    raw_bug_inducing = row.get("Added_lines_bug_inducing_commit", "")
    fixing_commit_code = "" if pd.isna(raw_fixing) else str(raw_fixing).strip()
    bug_inducing_commit_code = "" if pd.isna(raw_bug_inducing) else str(raw_bug_inducing).strip()

    if not fixing_commit_code and not bug_inducing_commit_code:
        print(f"Skipping Row {index + 1} (Empty Scripts)")
        continue

    try:
        # A side with no code is not sent to the model (it would only produce
        # invented findings); it is recorded as "Not analyzed" instead.
        fixing_misconfigured_snippets = "Not analyzed"
        bug_inducing_misconfigured_snippets = "Not analyzed"

        if fixing_commit_code:
            # Analyze fixing commit (Added_lines_fixing_commit)
            result_fixing = analyze_misconfiguration(fixing_commit_code)
            issues_fixing = result_fixing.get("issues_found", [])
            fixing_misconfigured_snippets = (
                ", ".join(issue.get("misconfigured_snippet", "No issues detected") for issue in issues_fixing)
                if issues_fixing
                else "No issues detected"
            )

        if bug_inducing_commit_code:
            # Analyze bug-inducing commit (Added_lines_bug_inducing_commit)
            result_bug_inducing = analyze_misconfiguration(bug_inducing_commit_code)
            issues_bug_inducing = result_bug_inducing.get("issues_found", [])
            bug_inducing_misconfigured_snippets = (
                ", ".join(issue.get("misconfigured_snippet", "No issues detected") for issue in issues_bug_inducing)
                if issues_bug_inducing
                else "No issues detected"
            )

        # Write results for the same row with both fixing and bug-inducing commit analysis
        with open(output_file, mode="a", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerow([fixing_commit_code, bug_inducing_commit_code, fixing_misconfigured_snippets, bug_inducing_misconfigured_snippets])

        # Print result
        print(f"\n🔹 **Processed Row {index + 1}/{len(df)}** 🔹")
        print(f"Fixing commit misconfiguration (Fixing_Script_MisconfigurationFoundbyLLM): {fixing_misconfigured_snippets}")
        print(f"Bug-inducing commit misconfiguration (Bug_Inducing_Script_MisconfigurationFoundbyLLM): {bug_inducing_misconfigured_snippets}")

    except Exception as e:
        failed_rows += 1
        print(f"Error processing Row {index + 1}: {e}")
        # After a failure such as CUDA out-of-memory, hand cached blocks back
        # to the driver so the following rows have a chance to fit.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

# Report the file that was actually written (the old message named the Puppet
# cell's 'output_results.csv') and do not claim success when rows failed.
if failed_rows:
    print(f"\n**Finished with {failed_rows} failed row(s) that were NOT written. Results saved in '{output_file}'**")
else:
    print(f"\n**All scripts analyzed successfully! Results saved in '{output_file}'**")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Using device: cuda
Loading model from Hugging Face model hub...
Model loaded successfully!

🔹 **Processed Row 1/308** 🔹
Fixing commit misconfiguration (Fixing_Script_MisconfigurationFoundbyLLM): command: php occ config:system:set {{ item.name }} --value={{ item.value }}
Bug-inducing commit misconfiguration (Bug_Inducing_Script_MisconfigurationFoundbyLLM): No misconfigured_snippet Found

🔹 **Processed Row 2/308** 🔹
Fixing commit misconfiguration (Fixing_Script_MisconfigurationFoundbyLLM): when: (not _nextcloud_conf.stat.exists) or (_nextcloud_configured.rc is defined and _nextcloud_configured.rc!= 0)
Bug-inducing commit misconfiguration (Bug_Inducing_Script_MisconfigurationFoundbyLLM): check_mode: no\nwhen: _nextcloud_configured.rc!= 0\n

🔹 **Processed Row 3/308** 🔹
Fixing commit misconfiguration (Fixing_Script_MisconfigurationFoundbyLLM): when: (nc_sudo_insta

In [None]:

#Adding labels to Ansible script after llm prompting
import pandas as pd
import numpy as np

def create_misconfiguration_label(df, column_name, label_column_name, negative_markers=None):
    """Add a 0/1 label column derived from a misconfiguration text column.

    A cell is labelled 0 ("no misconfiguration found") when it is missing
    (NaN/None) or when its stripped text equals one of the negative markers;
    every other cell is labelled 1.

    Args:
        df: DataFrame to label. It is modified in place and also returned.
        column_name: Name of the column holding the LLM's findings.
        label_column_name: Name of the label column to create or overwrite.
        negative_markers: Optional iterable of texts that mean "nothing
            found". Defaults to the markers this notebook produces.

    Returns:
        The same ``df`` object, with ``label_column_name`` added.
    """
    if negative_markers is None:
        negative_markers = (
            "No misconfigured_snippet Found",  # example text from the prompt
            "misconfigured_snippet_1",         # prompt placeholder echoed back
            "0",
            "",
            "No issues detected",              # written when the model returns no issues
            "Not analyzed",                    # written when a script was empty
        )
    negatives = {str(marker).strip() for marker in negative_markers}

    def _label(value):
        # Empty CSV cells are read back as NaN; str(NaN) == "nan" would
        # otherwise be counted as a finding.
        if pd.isna(value):
            return 0
        return 0 if str(value).strip() in negatives else 1

    df[label_column_name] = df[column_name].apply(_label)
    return df

# Load the LLM output CSV to be labelled.
# NOTE(review): this path (imp/..._output_llm.csv) is not the file the Ansible
# analysis cell writes (merged_file_ansible_defect_output.csv in the parent
# folder) - presumably it was copied or renamed by hand; confirm.
df = pd.read_csv("/content/drive/My Drive/expermintstarcoder/imp/merged_file_ansible_defect_output_llm.csv")

# Create new label columns based on existing misconfiguration columns
df = create_misconfiguration_label(df, "Fixing_Script_MisconfigurationFoundbyLLM", "Fixing_Script_MisconfigurationFoundbyLLM_Label")
df = create_misconfiguration_label(df, "Bug_Inducing_Script_MisconfigurationFoundbyLLM", "Bug_Inducing_Script_MisconfigurationFoundbyLLM_Label")

# Save the DataFrame with the new label columns (row index is not written)
df.to_csv("/content/drive/My Drive/expermintstarcoder/imp/merged_file_ansible_llm_labeld.csv", index=False)

print("New label columns created and saved successfully!")


New label columns created and saved successfully!


In [None]:
def clean_code(code):
    """Drop every line that is empty or whitespace-only; keep the rest unchanged."""
    kept_lines = []
    for line in code.splitlines():
        if line.strip():
            kept_lines.append(line)
    return "\n".join(kept_lines)

# Iterate over the rows of `df` (defined by an earlier cell) and clean both
# code columns. NOTE(review): the cleaned values are computed but never used
# or saved - the loop body ends after the empty-row check, so this cell looks
# like an unfinished fragment.
for index, row in df.iterrows():
    # Clean the code snippets by removing empty lines
    fixing_commit_code = clean_code(str(row.get("Added_lines_fixing_commit", "")).strip())
    bug_inducing_commit_code = clean_code(str(row.get("Added_lines_bug_inducing_commit", "")).strip())

    # Rows where both cleaned snippets are empty are reported and skipped.
    if not fixing_commit_code and not bug_inducing_commit_code:
        print(f"Skipping Row {index + 1} (Empty Scripts)")
        continue

In [None]:
# metrics accuracy for puppet dataset
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Load your dataset
df = pd.read_csv("/content/drive/My Drive/expermintstarcoder/imp/puppet_labeld.csv")

# True labels (the actual labels from your dataset)
true_labels = df['Label']  # Adjust this to match the column name for true labels

# Predicted labels (from your model)
predicted_labels = df['Misconfiguration_Found_LLM_Label']  # Adjust this to match the column name for predicted labels

# Calculate accuracy
accuracy = accuracy_score(true_labels, predicted_labels)

# Calculate precision, recall, and F1-score
# NOTE(review): pos_label=0 makes class 0 the positive class, so the
# precision/recall/F1 printed below describe class 0, not class 1. If the
# scores for the misconfiguration class (presumably 1) are wanted, this
# should be pos_label=1 - confirm which is intended.
precision, recall, f1, _ = precision_recall_fscore_support(
    true_labels,  # True labels
    predicted_labels,  # Predicted labels
    average='binary',  # We are working with a binary classification (0 or 1)
    zero_division=0 , # To avoid division by zero errors if there are no predicted positives
     pos_label=0,
)

# Print out the results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")



Accuracy: 0.4374
Precision: 0.4254
Recall: 0.7062
F1-Score: 0.5310


In [None]:
#to be removed or reviewed

In [None]:
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Load your dataset
df = pd.read_csv("/content/drive/My Drive/expermintstarcoder/imp/puppet_labeld.csv")

# True labels (the actual labels from your dataset)
true_labels = df['Label']  # Adjust this to match the column name for true labels

# Predicted labels (from your model)
predicted_labels = df['Misconfiguration_Found_LLM_Label']  # Adjust this to match the column name for predicted labels

# Keep only rows where Label is 1 (misconfiguration); `df` itself is not modified
df_filtered = df[true_labels == 1]

# Re-calculate accuracy, precision, recall, and F1-score based only on rows with Label = 1
true_labels_filtered = df_filtered['Label']
predicted_labels_filtered = df_filtered['Misconfiguration_Found_LLM_Label']

# Calculate accuracy (for misconfigurations)
# With every true label equal to 1, this is the same number as recall below.
accuracy = accuracy_score(true_labels_filtered, predicted_labels_filtered)

# Calculate precision, recall, and F1-score for misconfigurations (1 as positive class)
# NOTE(review): after the filter there are no true negatives or false positives
# left, so precision is 1.0 whenever at least one positive is predicted (as in
# the recorded output). Only recall carries information here.
precision, recall, f1, _ = precision_recall_fscore_support(
    true_labels_filtered,  # True labels for misconfigurations
    predicted_labels_filtered,  # Predicted labels for misconfigurations
    average='binary',  # Binary classification (0 or 1)
    pos_label=1,  # Treat '1' (misconfiguration) as the positive class
    zero_division=0  # Handle division by zero errors if no predicted positives
)

# Print out the results
print(f"Accuracy (for misconfigurations): {accuracy:.4f}")
print(f"Precision (for misconfigurations): {precision:.4f}")
print(f"Recall (for misconfigurations): {recall:.4f}")
print(f"F1-Score (for misconfigurations): {f1:.4f}")



Accuracy (for misconfigurations): 0.2167
Precision (for misconfigurations): 1.0000
Recall (for misconfigurations): 0.2167
F1-Score (for misconfigurations): 0.3561


In [None]:
# These are the metrics for Ansible using starcoder2-3b

import pandas as pd
from sklearn.metrics import precision_recall_fscore_support

# Load the dataset
df = pd.read_csv('/content/drive/My Drive/expermintstarcoder/merged_with_labels.csv')

# Predicted labels for Fixing Script Misconfigurations
fixing_predicted_labels = df['Fixing_Script_MisconfigurationFoundbyLLM_Label']

# Predicted labels for Bug-Inducing Script Misconfigurations
bug_inducing_predicted_labels = df['Bug_Inducing_Script_MisconfigurationFoundbyLLM_Label']

# Ground truth is not read from the file: it is assumed constant per column.
# Every fixing-commit row is given true label 1 and every bug-inducing row
# true label 0.
# NOTE(review): confirm this assignment matches the dataset's meaning of
# "fixing" vs "bug-inducing" - it decides every number printed below.
true_fixing_labels = [1] * len(df)  # All are 1 for fixing misconfigurations
true_bug_inducing_labels = [0] * len(df)  # All are 0 for bug-inducing misconfigurations

# Calculate Precision, Recall, and F1-Score for Fixing Script Misconfigurations
# NOTE(review): because each ground-truth list holds a single class, there can
# be no false positives for that class, so precision is 1.0 whenever that class
# is predicted at least once, and accuracy equals recall (see recorded output).
fixing_precision, fixing_recall, fixing_f1, _ = precision_recall_fscore_support(
    true_fixing_labels,  # True labels for fixing misconfigurations (always 1)
    fixing_predicted_labels,
    average='binary',
    zero_division=0  # Handle division by zero if no true positives or false negatives
)

# Calculate Precision, Recall, and F1-Score for Bug-Inducing Script Misconfigurations
# Here, we set pos_label=0 to treat 0 as the positive class
bug_inducing_precision, bug_inducing_recall, bug_inducing_f1, _ = precision_recall_fscore_support(
    true_bug_inducing_labels,  # True labels for bug-inducing misconfigurations (always 0)
    bug_inducing_predicted_labels,
    average='binary',
    zero_division=0,  # Handle division by zero if no true positives or false negatives
    pos_label=0  # Treat 0 as the positive class for Bug-Inducing
)

# Overall F1-score (unweighted mean of the fixing and bug-inducing F1 scores)
overall_f1 = (fixing_f1 + bug_inducing_f1) / 2

# Overall Recall (unweighted mean of the fixing and bug-inducing recall scores)
overall_recall = (fixing_recall + bug_inducing_recall) / 2

# Accuracy for Fixing Script Misconfigurations
# (element-wise comparison of the predicted Series with the constant list)
fixing_accuracy = (fixing_predicted_labels == true_fixing_labels).sum() / len(df)

# Accuracy for Bug-Inducing Script Misconfigurations
bug_inducing_accuracy = (bug_inducing_predicted_labels == true_bug_inducing_labels).sum() / len(df)

# Print out the metrics for Fixing Script Misconfigurations
print(f"Precision for Fixing Script Misconfigurations: {fixing_precision:.4f}")
print(f"Recall for Fixing Script Misconfigurations: {fixing_recall:.4f}")
print(f"F1-Score for Fixing Script Misconfigurations: {fixing_f1:.4f}")
print(f"Accuracy for Fixing Script Misconfigurations: {fixing_accuracy:.4f}")

# Print out the metrics for Bug-Inducing Script Misconfigurations
print(f"Precision for Bug-Inducing Script Misconfigurations: {bug_inducing_precision:.4f}")
print(f"Recall for Bug-Inducing Script Misconfigurations: {bug_inducing_recall:.4f}")
print(f"F1-Score for Bug-Inducing Script Misconfigurations: {bug_inducing_f1:.4f}")
print(f"Accuracy for Bug-Inducing Script Misconfigurations: {bug_inducing_accuracy:.4f}")

# Print Overall Metrics
print(f"Overall F1-Score: {overall_f1:.4f}")
print(f"Overall Recall: {overall_recall:.4f}")



Precision for Fixing Script Misconfigurations: 1.0000
Recall for Fixing Script Misconfigurations: 0.5292
F1-Score for Fixing Script Misconfigurations: 0.6921
Accuracy for Fixing Script Misconfigurations: 0.5292
Precision for Bug-Inducing Script Misconfigurations: 1.0000
Recall for Bug-Inducing Script Misconfigurations: 0.8613
F1-Score for Bug-Inducing Script Misconfigurations: 0.9255
Accuracy for Bug-Inducing Script Misconfigurations: 0.8613
Overall F1-Score: 0.8088
Overall Recall: 0.6953


In [None]:
!pip install jsonformer

Collecting jsonformer
  Downloading jsonformer-0.12.0-py3-none-any.whl.metadata (5.0 kB)
Collecting termcolor<3.0.0,>=2.3.0 (from jsonformer)
  Downloading termcolor-2.5.0-py3-none-any.whl.metadata (6.1 kB)
Downloading jsonformer-0.12.0-py3-none-any.whl (6.6 kB)
Downloading termcolor-2.5.0-py3-none-any.whl (7.8 kB)
Installing collected packages: termcolor, jsonformer
  Attempting uninstall: termcolor
    Found existing installation: termcolor 3.1.0
    Uninstalling termcolor-3.1.0:
      Successfully uninstalled termcolor-3.1.0
Successfully installed jsonformer-0.12.0 termcolor-2.5.0


In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# SECURITY: a real access token used to be hardcoded in this cell. Anything
# committed or shared with the notebook must be treated as leaked - revoke that
# token in the Hugging Face account settings and create a new one.
# The token is now read from the HF_TOKEN environment variable, or asked for
# interactively (input is hidden and is not stored in the saved notebook).
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token)

In [None]:
!pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-

In [None]:
!pip install transformers==4.48.3

Collecting transformers==4.48.3
  Downloading transformers-4.48.3-py3-none-any.whl.metadata (44 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.48.3-py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m94.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.51.3
    Uninstalling transformers-4.51.3:
      Successfully uninstalled transformers-4.51.3
Successfully installed transformers-4.48.3


In [None]:
!pip install torch==2.5.1 torchvision==0.20.1

Collecting torch==2.5.1
  Downloading torch-2.5.1-cp311-cp311-manylinux1_x86_64.whl.metadata (28 kB)
Collecting torchvision==0.20.1
  Downloading torchvision-0.20.1-cp311-cp311-manylinux1_x86_64.whl.metadata (6.1 kB)
Collecting triton==3.1.0 (from torch==2.5.1)
  Downloading triton-3.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.3 kB)
Downloading torch-2.5.1-cp311-cp311-manylinux1_x86_64.whl (906.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m906.5/906.5 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading torchvision-0.20.1-cp311-cp311-manylinux1_x86_64.whl (7.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m52.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading triton-3.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (209.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.5/209.5 MB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected 

In [None]:
import pandas as pd
import torch
import json
import csv
from transformers import AutoModelForCausalLM, AutoTokenizer,BitsAndBytesConfig
from jsonformer import Jsonformer
from google.colab import drive
import torch
import sys
import os
# ========== 1. Mount Google Drive ==========
drive.mount('/content/drive')

# ========== 3. File Paths ==========
drive_folder = '/content/drive/MyDrive/expermintstarcoder'
# Input: the merged Ansible dataset under the 'imp' sub-folder.
input_file = os.path.join(drive_folder, 'imp/merged_file_ansible.csv')

# Output: results CSV written by the processing loop at the end of this cell.
output_file = os.path.join(drive_folder, 'final-ansible_dataset_befor_varification_starcoder2.csv')



# ========== 4. Load Model & Tokenizer ==========
model_name = "bigcode/starcoder2-7b"
# `device` is not referenced again in this cell; model placement is left to
# device_map="auto" below.
device = "cuda" if torch.cuda.is_available() else "cpu"

print("Loading model...")
# 4-bit NF4 quantization with double quantization (bitsandbytes).
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4"
    )

model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        quantization_config=bnb_config
    )
tokenizer = AutoTokenizer.from_pretrained(model_name)

# NOTE(review): whether setting add_eos_token after construction changes what
# this tokenizer emits is not visible here - confirm. If it does, the prompt
# that analyze_misconfiguration re-decodes will end with the EOS marker.
tokenizer.add_eos_token = True
# Register a dedicated "<pad>" token when the vocabulary has none, then size
# the embedding matrix to the (possibly grown) vocabulary.
if '<pad>' not in tokenizer.get_vocab():
        tokenizer.add_special_tokens({"pad_token": "<pad>"})
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id

# Fallback EOS token for tokenizers that define none.
if tokenizer.eos_token is None:
   tokenizer.eos_token = "[EOS]"

model.eval()
# ========== 5. Define JSON Schema ==========
# Structure Jsonformer must produce: a list of issues (snippet + CWE id, both
# strings) and an overall label.
# NOTE(review): "misconfiguration_label" is typed as a string here, while the
# prompt's examples show it as a bare 1 / 0 - confirm which is intended.
json_schema = {
    "type": "object",
    "properties": {
        "issues_found": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "misconfigured_snippet": {"type": "string"},
                    "related_cwe": {"type": "string"},
                }
            }
        },
        "misconfiguration_label": {"type": "string"}
    }
}

# ========== 6. Misconfiguration Analyzer ==========
def analyze_misconfiguration(code_snippet, max_prompt_tokens=2000, max_string_token_length=300):
    """Ask the model for critical, CWE-backed misconfigurations in Ansible code.

    Args:
        code_snippet: Ansible playbook/task text to analyze.
        max_prompt_tokens: Token budget for the whole prompt (default 2000,
            the limit used previously).
        max_string_token_length: Jsonformer's per-string generation limit
            (default 300, the value used previously).

    Returns:
        The object produced by Jsonformer for ``json_schema`` (keys
        ``"issues_found"`` and ``"misconfiguration_label"``).

    Relies on the module-level ``model``, ``tokenizer`` and ``json_schema``.

    Only the code snippet is truncated when the prompt is too long. The
    previous version truncated the tail of the complete prompt, which for long
    scripts cut away the response format and instructions, and re-decoded the
    token ids without skipping special tokens.
    """
    # Render the fixed prompt text around a placeholder so the text before and
    # after the snippet can be measured on its own.
    placeholder = "<<<CODE_SNIPPET_PLACEHOLDER>>>"
    template = f"""
    You are a security expert specializing in Ansible configuration security.
    Your task is to analyze the following Ansible playbook/task and identify only critical security misconfigurations that could lead to vulnerabilities and have a real CWE ID.
    Focus only on the available code and avoid making assumptions about unavailable details.

    Ansible Playbook to Analyze:
    ```yaml
    {placeholder}
    ```

    Response Format:
    Return the response in **valid JSON format**:

    - If there are critical issues:
    ```json
    {{
      "issues_found": [
        {{
          "misconfigured_snippet": "<actual misconfigured code snippet>",
          "related_cwe": "<CWE-ID>"
        }}
      ],
      "misconfiguration_label": 1
    }}
    ```

    - If there are NO critical issues:
    ```json
    {{
      "issues_found": [],
      "misconfiguration_label": 0
    }}
    ```

    Instructions:
    - Only list real misconfigurations tied to known CWE IDs.
    - Do NOT insert "N/A".
    - Do not include syntax errors.
    - If no critical misconfiguration is found, return an empty issues_found array.
    """
    head, _, tail = template.partition(placeholder)

    # Tokens left for the snippet once the fixed text is accounted for. Token
    # counts are not exactly additive across the seams, so this is a close
    # approximation of the overall budget rather than a hard guarantee.
    fixed_tokens = len(tokenizer.encode(head + tail, add_special_tokens=False))
    snippet_budget = max(max_prompt_tokens - fixed_tokens, 0)

    snippet_ids = tokenizer.encode(code_snippet, add_special_tokens=False)
    if len(snippet_ids) > snippet_budget:
        code_snippet = tokenizer.decode(snippet_ids[:snippet_budget], skip_special_tokens=True)

    prompt = head + code_snippet + tail

    structured_output = Jsonformer(
        model=model,
        tokenizer=tokenizer,
        json_schema=json_schema,
        prompt=prompt,
        max_string_token_length=max_string_token_length,
    )

    return structured_output()


# ========== 7. Write CSV ==========
# The output file is opened with mode='w', so any previous file is replaced.

chunksize = 5  # You can adjust this
csv_reader = pd.read_csv(input_file, chunksize=chunksize)


def _summarize_side(code):
    """Analyze one script and flatten the result into CSV cell values.

    Returns ``(label, snippets, cwes, raw_result)``. For an empty script no
    model call is made and the placeholders ("N/A", "Not analyzed",
    "Not analyzed", None) are returned.
    """
    if not code:
        return "N/A", "Not analyzed", "Not analyzed", None

    with torch.inference_mode():
        result = analyze_misconfiguration(code)
    torch.cuda.empty_cache()

    issues = result.get("issues_found", [])
    if issues:
        # De-duplicate and sort so the joined cell text is stable.
        snippets = ", ".join(sorted(set(issue.get("misconfigured_snippet", "") for issue in issues)))
        cwes = ", ".join(sorted(set(issue.get("related_cwe", "") for issue in issues)))
    else:
        snippets, cwes = "No issues detected", "None"
    return result.get("misconfiguration_label", "N/A"), snippets, cwes, result


with open(output_file, mode='w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow([
        "Added_lines_fixing_commit",
        "Added_lines_bug_inducing_commit",
        "Fixing_Script_Misconfiguration_Label",
        "Fixing_Script_MisconfigurationFoundbyLLM",
        "Fixing_Script_CWE",
        "Bug_Inducing_Script_Misconfiguration_Label",
        "Bug_Inducing_Script_MisconfigurationFoundbyLLM",
        "Bug_Inducing_Script_CWE"
    ])

    row_count = 0
    failed_rows = 0  # rows whose analysis raised; they are not written

    for chunk in csv_reader:
        for _, row in chunk.iterrows():
            row_count += 1
            # Missing CSV cells arrive as NaN; str(NaN) is the non-empty text
            # "nan", which used to be analyzed as if it were code.
            raw_fixing = row.get("Added_lines_fixing_commit", "")
            raw_bug = row.get("Added_lines_bug_inducing_commit", "")
            fixing_commit_code = "" if pd.isna(raw_fixing) else str(raw_fixing).strip()
            bug_inducing_commit_code = "" if pd.isna(raw_bug) else str(raw_bug).strip()

            if not fixing_commit_code and not bug_inducing_commit_code:
                print(f"⏭️ Skipping Row {row_count} (Empty Scripts)")
                continue

            try:
                fixing_label, fixing_snippets, fixing_cwes, result_fixing = _summarize_side(fixing_commit_code)
                bug_label, bug_snippets, bug_cwes, _ = _summarize_side(bug_inducing_commit_code)

                writer.writerow([
                    fixing_commit_code,
                    bug_inducing_commit_code,
                    fixing_label,
                    fixing_snippets,
                    fixing_cwes,
                    bug_label,
                    bug_snippets,
                    bug_cwes
                ])
                # Push the row to disk now so a crash or runtime disconnect
                # does not lose rows that are still sitting in the buffer.
                f.flush()

                print(f"✅ Row {row_count} processed")
                if result_fixing is not None:
                    print(json.dumps(result_fixing, indent=4))
                sys.stdout.flush()

            except Exception as e:
                failed_rows += 1
                print(f"❌ Error at Row {row_count}: {e}")

    if failed_rows:
        print(f"❌ {failed_rows} of {row_count} rows failed and were not written")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading model...


config.json:   0%|          | 0.00/893 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/41.6k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.89G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.51G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.88k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/777k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/442k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.06M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/958 [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
            "misconfigured_snippet": "name: \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\",
            "related_cwe": "CWE-125"
        },
        {
            "misconfigured_snippet": "name: \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\",
            "related_cwe": "CWE-125"
        },
        {
            "misconfigured_snippet": "name: \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\

In [None]:
# Required packages
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support

# Load the dataset from Google Drive (update the path as needed)
df = pd.read_csv('/content/drive/My Drive/expermintstarcoder/ansible_final/final-ansible_dataset_after_varification_starcoder.csv')

# All four columns are cast to int below, so each must be numeric and free of
# NaN first. Unparseable entries are coerced to NaN and those rows dropped;
# without this, astype(int) raises on a missing or non-numeric value.
metric_columns = [
    'Fixing_Script_Misconfiguration_Label',
    'Bug_Inducing_Script_Misconfiguration_Label',
    'verification_Fixing_Script',
    'verification_Bug_Inducing',
]
for column_name in metric_columns:
    df[column_name] = pd.to_numeric(df[column_name], errors='coerce')

# Reassign instead of mutating in place so re-running the cell is predictable.
df = df.dropna(subset=metric_columns)

# Predicted labels for Fixing Script Misconfigurations
fixing_predicted_labels = df['verification_Fixing_Script'].astype(int)

# Predicted labels for Bug-Inducing Script Misconfigurations
bug_inducing_predicted_labels = df['verification_Bug_Inducing'].astype(int)

# Actual labels for Fixing Script Misconfigurations
true_fixing_labels = df['Fixing_Script_Misconfiguration_Label'].astype(int)

# Actual labels for Bug-Inducing Script Misconfigurations
true_bug_inducing_labels = df['Bug_Inducing_Script_Misconfiguration_Label'].astype(int)

# Precision, Recall, and F1-Score for Fixing Script Misconfigurations.
# Class 1 is the positive class (pos_label=1 is also the library default).
fixing_precision, fixing_recall, fixing_f1, _ = precision_recall_fscore_support(
    true_fixing_labels,  # True labels for fixing misconfigurations
    fixing_predicted_labels,
    average='binary',
    zero_division=0,  # Report 0 instead of warning when a denominator is zero
    pos_label=1
)

# Precision, Recall, and F1-Score for Bug-Inducing Script Misconfigurations.
# Class 1 is the positive class here as well.
bug_inducing_precision, bug_inducing_recall, bug_inducing_f1, _ = precision_recall_fscore_support(
    true_bug_inducing_labels,  # True labels for bug-inducing misconfigurations
    bug_inducing_predicted_labels,
    average='binary',
    zero_division=0,  # Report 0 instead of warning when a denominator is zero
    pos_label=1
)

# Overall F1-score (average of both fixing and bug-inducing F1 scores)
overall_f1 = (fixing_f1 + bug_inducing_f1) / 2

# Overall Recall (average of both fixing and bug-inducing recall scores)
overall_recall = (fixing_recall + bug_inducing_recall) / 2

# Accuracy = fraction of rows whose predicted label equals the true label.
fixing_accuracy = (fixing_predicted_labels == true_fixing_labels).mean()
bug_inducing_accuracy = (bug_inducing_predicted_labels == true_bug_inducing_labels).mean()

# Print out the metrics for Fixing Script Misconfigurations
print(f"Precision for Fixing Script Misconfigurations: {fixing_precision:.4f}")
print(f"Recall for Fixing Script Misconfigurations: {fixing_recall:.4f}")
print(f"F1-Score for Fixing Script Misconfigurations: {fixing_f1:.4f}")
print(f"Accuracy for Fixing Script Misconfigurations: {fixing_accuracy:.4f}")

# Print out the metrics for Bug-Inducing Script Misconfigurations
print(f"Precision for Bug-Inducing Script Misconfigurations: {bug_inducing_precision:.4f}")
print(f"Recall for Bug-Inducing Script Misconfigurations: {bug_inducing_recall:.4f}")
print(f"F1-Score for Bug-Inducing Script Misconfigurations: {bug_inducing_f1:.4f}")
print(f"Accuracy for Bug-Inducing Script Misconfigurations: {bug_inducing_accuracy:.4f}")

# Print Overall Metrics
print(f"Overall F1-Score: {overall_f1:.4f}")
print(f"Overall Recall: {overall_recall:.4f}")

Precision for Fixing Script Misconfigurations: 0.9940
Recall for Fixing Script Misconfigurations: 0.6255
F1-Score for Fixing Script Misconfigurations: 0.7678
Accuracy for Fixing Script Misconfigurations: 0.6231
Precision for Bug-Inducing Script Misconfigurations: 1.0000
Recall for Bug-Inducing Script Misconfigurations: 0.4515
F1-Score for Bug-Inducing Script Misconfigurations: 0.6221
Accuracy for Bug-Inducing Script Misconfigurations: 0.4515
Overall F1-Score: 0.6950
Overall Recall: 0.5385


In [None]:
# Required packages
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support

# Read the verified results from Drive (adjust the path if yours differs).
df = pd.read_csv('/content/drive/My Drive/expermintstarcoder/ansible_final/final-ansible_dataset_after_varification_starcoder.csv')

# Coerce both ground-truth label columns to numbers (bad entries become NaN),
# then keep only the rows where both are present.
for _label_column in ('Fixing_Script_Misconfiguration_Label', 'Bug_Inducing_Script_Misconfiguration_Label'):
    df[_label_column] = pd.to_numeric(df[_label_column], errors='coerce')

df = df.dropna(subset=['Fixing_Script_Misconfiguration_Label', 'Bug_Inducing_Script_Misconfiguration_Label'])


def _score_against_all_positive(verification_labels):
    """Score verification labels against a truth vector made entirely of 1s.

    Because every truth label is 1 there can be no false positives, so
    precision is 1.0 whenever at least one 1 is predicted. Assuming the labels
    are only 0/1, recall coincides with their mean, which is what this cell
    reports as "accuracy".

    Returns:
        Tuple (precision, recall, f1, accuracy).
    """
    accuracy = verification_labels.mean()
    precision, recall, f1, _support = precision_recall_fscore_support(
        [1] * len(verification_labels),  # Truth label is always 1
        verification_labels,
        average='binary',
        zero_division=0,
        pos_label=1,
    )
    return precision, recall, f1, accuracy


# Verification columns act as the predictions for each script kind.
fixing_verification_labels = df['verification_Fixing_Script']
bug_inducing_verification_labels = df['verification_Bug_Inducing']

# Ground truth for the bug-inducing side; not used by the calculations in this
# cell, kept so the name stays defined.
true_bug_inducing_labels = df['Bug_Inducing_Script_Misconfiguration_Label']

# ----- Bug-inducing scripts -----
(
    bug_inducing_precision,
    bug_inducing_recall,
    bug_inducing_f1,
    bug_inducing_accuracy,
) = _score_against_all_positive(bug_inducing_verification_labels)

print(f"Precision for Bug-Inducing Script Misconfigurations: {bug_inducing_precision:.4f}")
print(f"Recall for Bug-Inducing Script Misconfigurations: {bug_inducing_recall:.4f}")
print(f"F1-Score for Bug-Inducing Script Misconfigurations: {bug_inducing_f1:.4f}")
print(f"Accuracy for Bug-Inducing Script Misconfigurations: {bug_inducing_accuracy:.4f}")

# ----- Fixing scripts -----
(
    fixing_precision,
    fixing_recall,
    fixing_f1,
    fixing_accuracy,
) = _score_against_all_positive(fixing_verification_labels)

print(f"Precision for Fixing Script Misconfigurations: {fixing_precision:.4f}")
print(f"Recall for Fixing Script Misconfigurations: {fixing_recall:.4f}")
print(f"F1-Score for Fixing Script Misconfigurations: {fixing_f1:.4f}")
print(f"Accuracy for Fixing Script Misconfigurations: {fixing_accuracy:.4f}")

# ----- Overall: unweighted mean of the two script kinds -----
overall_precision = (fixing_precision + bug_inducing_precision) / 2
overall_recall = (fixing_recall + bug_inducing_recall) / 2
overall_f1 = (fixing_f1 + bug_inducing_f1) / 2
overall_accuracy = (fixing_accuracy + bug_inducing_accuracy) / 2

print(f"Overall Precision: {overall_precision:.4f}")
print(f"Overall Recall: {overall_recall:.4f}")
print(f"Overall F1-Score: {overall_f1:.4f}")
print(f"Overall Accuracy: {overall_accuracy:.4f}")



Precision for Bug-Inducing Script Misconfigurations: 1.0000
Recall for Bug-Inducing Script Misconfigurations: 0.4515
F1-Score for Bug-Inducing Script Misconfigurations: 0.6221
Accuracy for Bug-Inducing Script Misconfigurations: 0.4515
Precision for Fixing Script Misconfigurations: 1.0000
Recall for Fixing Script Misconfigurations: 0.6269
F1-Score for Fixing Script Misconfigurations: 0.7706
Accuracy for Fixing Script Misconfigurations: 0.6269
Overall Precision: 1.0000
Overall Recall: 0.5392
Overall F1-Score: 0.6964
Overall Accuracy: 0.5392


In [3]:
import pandas as pd
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    precision_recall_curve,
    auc,
    matthews_corrcoef
)

# Read the verified results from Drive.
df = pd.read_csv('/content/drive/My Drive/expermintstarcoder/ansible_final/final-ansible_dataset_after_varification_starcoder.csv')

# Truth and prediction columns: coerce each to numeric (invalid -> NaN), then
# discard any row missing one of them so the int casts below cannot fail.
_required_columns = [
    'Fixing_Script_Misconfiguration_Label',
    'Bug_Inducing_Script_Misconfiguration_Label',
    'verification_Fixing_Script',
    'verification_Bug_Inducing'
]
for _column in _required_columns:
    df[_column] = pd.to_numeric(df[_column], errors='coerce')

df = df.dropna(subset=_required_columns)


def _binary_report(y_true, y_pred):
    """Compute the metric set reported by this cell for one label pair.

    Returns:
        Tuple (accuracy, precision, recall, f1, mcc, prec_curve, rec_curve,
        pr_auc), with class 1 treated as the positive class.

    NOTE(review): precision_recall_curve is given the integer predictions
    rather than continuous scores, so the curve (and the PR-AUC derived from
    it) is built from very few thresholds - confirm this is intended.
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f1, _support = precision_recall_fscore_support(
        y_true, y_pred, average='binary', pos_label=1, zero_division=0
    )
    mcc = matthews_corrcoef(y_true, y_pred)
    prec_curve, rec_curve, _thresholds = precision_recall_curve(y_true, y_pred)
    pr_auc = auc(rec_curve, prec_curve)
    return accuracy, precision, recall, f1, mcc, prec_curve, rec_curve, pr_auc


# Fixing scripts: ground truth vs. verification outcome
true_fixing = df['Fixing_Script_Misconfiguration_Label'].astype(int)
pred_fixing = df['verification_Fixing_Script'].astype(int)

# Bug-inducing scripts: ground truth vs. verification outcome
true_inducing = df['Bug_Inducing_Script_Misconfiguration_Label'].astype(int)
pred_inducing = df['verification_Bug_Inducing'].astype(int)

(
    fixing_accuracy,
    fixing_precision,
    fixing_recall,
    fixing_f1,
    fixing_mcc,
    fixing_prec_curve,
    fixing_rec_curve,
    fixing_pr_auc,
) = _binary_report(true_fixing, pred_fixing)

(
    inducing_accuracy,
    inducing_precision,
    inducing_recall,
    inducing_f1,
    inducing_mcc,
    inducing_prec_curve,
    inducing_rec_curve,
    inducing_pr_auc,
) = _binary_report(true_inducing, pred_inducing)

# Overall figures are the unweighted mean of the two script kinds.
overall_accuracy = (fixing_accuracy + inducing_accuracy) / 2
overall_precision = (fixing_precision + inducing_precision) / 2
overall_recall = (fixing_recall + inducing_recall) / 2
overall_f1 = (fixing_f1 + inducing_f1) / 2
overall_pr_auc = (fixing_pr_auc + inducing_pr_auc) / 2
overall_mcc = (fixing_mcc + inducing_mcc) / 2

# One summary line per script kind, then the combined line.
print(f"Fixing → Precision: {fixing_precision:.4f}, Recall: {fixing_recall:.4f}, F1: {fixing_f1:.4f}, Accuracy: {fixing_accuracy:.4f}, PR-AUC: {fixing_pr_auc:.4f}, MCC: {fixing_mcc:.4f}")

print(f"Inducing → Precision: {inducing_precision:.4f}, Recall: {inducing_recall:.4f}, F1: {inducing_f1:.4f}, Accuracy: {inducing_accuracy:.4f}, PR-AUC: {inducing_pr_auc:.4f}, MCC: {inducing_mcc:.4f}")

print(f"Overall → Precision: {overall_precision:.4f}, Recall: {overall_recall:.4f}, F1: {overall_f1:.4f}, Accuracy: {overall_accuracy:.4f}, PR-AUC: {overall_pr_auc:.4f}, MCC: {overall_mcc:.4f}")


Fixing → Precision: 0.9940, Recall: 0.6255, F1: 0.7678, Accuracy: 0.6231, PR-AUC: 0.9963, MCC: -0.0472
Inducing → Precision: 1.0000, Recall: 0.4515, F1: 0.6221, Accuracy: 0.4515, PR-AUC: 1.0000, MCC: 0.0000
Overall → Precision: 0.9970, Recall: 0.5385, F1: 0.6950, Accuracy: 0.5373, PR-AUC: 0.9982, MCC: -0.0236
