In [1]:
import json
import os
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import random
from datasets import Dataset
import pandas as pd

In [2]:
def load_knowledge_base(json_file_paths):
    """
    Loads Q&A data from a list of JSON files and flattens it into one list.

    Each file is expected to contain a JSON object that maps a category name
    to a list of Q&A items. Every item is turned into a dict with:
    - 'question': The question string (required in the source item)
    - 'answer': The answer string (required in the source item)
    - 'id': The ID of the Q&A pair (required in the source item)
    - 'intent': The intent of the question ('N/A' when absent)
    - 'type': The type of the question/answer ('N/A' when absent)
    - 'related_topics': The related topics of the item ([] when absent)
    - 'source_file': The filename it came from (for debugging/info)

    Files are loaded all-or-nothing: if a file is missing, is not valid JSON,
    or contains an item without a required key, an error is printed and none
    of that file's items are added. Remaining files are still processed.

    Args:
        json_file_paths: Iterable of paths to the JSON files to load.

    Returns:
        A list of Q&A dicts, in the order the files, categories and items
        were read. Empty when no file could be loaded.
    """
    knowledge_base = []
    for file_path in json_file_paths:
        # Records are staged per file so that a failure half-way through a
        # file cannot leave a partial set of its items in the result.
        file_records = []
        try:
            file_name = os.path.basename(file_path)
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            for items in data.values():
                for item in items:
                    file_records.append({
                        'question': item['question'],
                        'answer': item['answer'],
                        'id': item['id'],
                        'intent': item.get('intent', 'N/A'),
                        'type': item.get('type', 'N/A'),
                        'related_topics': item.get('related_topics', []),
                        'source_file': file_name
                    })
        except FileNotFoundError:
            print(f"Error: JSON file not found at {file_path}. Please ensure it's in the correct directory. Skipping.")
        except json.JSONDecodeError:
            print(f"Error: Could not decode JSON from {file_path}. Check for syntax errors. Skipping.")
        except Exception as e:
            # Deliberately broad: one malformed file must not abort the whole load.
            print(f"An unexpected error occurred while loading {file_path}: {e}")
        else:
            # Only commit the staged records once the whole file parsed cleanly.
            knowledge_base.extend(file_records)
            print(f"Successfully loaded {len(data)} categories from {file_name}")
    return knowledge_base

In [3]:
# Knowledge-base source files (A01 through A10, 2021 edition).
json_file_paths = [
    r'D:\OWASP_BERT\QA_Pairs\Enhanced_QA\A01_2021.json',
    r'D:\OWASP_BERT\QA_Pairs\Enhanced_QA\A02_2021.json',
    r'D:\OWASP_BERT\QA_Pairs\Enhanced_QA\A03_2021.json',
    r'D:\OWASP_BERT\QA_Pairs\Enhanced_QA\A04_2021.json',
    r'D:\OWASP_BERT\QA_Pairs\Enhanced_QA\A05_2021.json',
    r'D:\OWASP_BERT\QA_Pairs\Enhanced_QA\A06_2021.json',
    r'D:\OWASP_BERT\QA_Pairs\Enhanced_QA\A07_2021.json',
    r'D:\OWASP_BERT\QA_Pairs\Enhanced_QA\A08_2021.json',
    r'D:\OWASP_BERT\QA_Pairs\Enhanced_QA\A09_2021.json',
    r'D:\OWASP_BERT\QA_Pairs\Enhanced_QA\A10_2021.json',
]

print("Loading knowledge base for fine-tuning...")
kb_data = load_knowledge_base(json_file_paths)

# Nothing to train on if every file failed to load.
if len(kb_data) == 0:
    raise ValueError("Knowledge base is empty. Cannot fine-tune without data. Please check your JSON files and paths.")

# Hold out 10% of the Q&A pairs for validation; the fixed seed keeps the split stable.
train_kb_data, val_kb_data = train_test_split(kb_data, test_size=0.1, random_state=42)

# Training set: one (question, answer) text pair per example.
train_examples = [
    InputExample(texts=[record['question'], record['answer']])
    for record in train_kb_data
]

print(f"Prepared {len(train_examples)} training examples.")

# Validation set, expressed as three id-keyed mappings:
#   eval_queries:       query id  -> question text
#   eval_corpus:        corpus id -> answer text
#   eval_relevant_docs: query id  -> set of corpus ids that answer it
eval_queries = {}
eval_corpus = {}
eval_relevant_docs = {}

# Identical answer texts share a single corpus id ("ans_0", "ans_1", ...),
# assigned in first-seen order.
answer_id_counter = 0
answer_id_map = {}

for record in val_kb_data:
    answer_text = record['answer']
    if answer_text not in answer_id_map:
        answer_id_map[answer_text] = f"ans_{answer_id_counter}"
        answer_id_counter += 1
    corpus_id = answer_id_map[answer_text]
    query_id = record['id']

    eval_queries[query_id] = record['question']
    eval_corpus[corpus_id] = answer_text
    eval_relevant_docs.setdefault(query_id, set()).add(corpus_id)

print(f"Prepared {len(eval_queries)} validation queries and {len(eval_corpus)} validation corpus entries.")

# Shuffled mini-batches of 32 examples for training.
train_dataloader = DataLoader(train_examples, batch_size=32, shuffle=True)

Loading knowledge base for fine-tuning...
Successfully loaded 8 categories from A01_2021.json
Successfully loaded 8 categories from A02_2021.json
Successfully loaded 8 categories from A03_2021.json
Successfully loaded 8 categories from A04_2021.json
Successfully loaded 8 categories from A05_2021.json
Successfully loaded 8 categories from A06_2021.json
Successfully loaded 8 categories from A07_2021.json
Successfully loaded 8 categories from A08_2021.json
Successfully loaded 8 categories from A09_2021.json
Successfully loaded 8 categories from A10_2021.json
Prepared 3398 training examples.
Prepared 378 validation queries and 378 validation corpus entries.


In [4]:
# Cell 4: Load the pretrained sentence-embedding model and attach the training loss

# Identifier of the pretrained checkpoint that fine-tuning starts from.
base_model_name = 'all-mpnet-base-v2'

print("Initializing base SentenceTransformer model 'all-mpnet-base-v2' for fine-tuning...")
model = SentenceTransformer(model_name_or_path=base_model_name)
print("Base model initialized.")

# Ranking loss bound to the model; it is paired with the training DataLoader when fit() is called.
train_loss = losses.MultipleNegativesRankingLoss(model)

Initializing base SentenceTransformer model 'all-mpnet-base-v2' for fine-tuning...
Base model initialized.


In [5]:
# Define training parameters
num_epochs = 10  # Number of training epochs. Adjust based on dataset size and performance.
warmup_steps = int(len(train_dataloader) * num_epochs * 0.1) # 10% of total training steps
output_path = './fine_tuned_owasp_model_advanced' # Directory to save the fine-tuned model
# NOTE: fit() writes the best model directly into output_path; no 'best_model'
# subdirectory is created. The name is kept only so code that still reads it keeps working.
model_save_name = 'best_model'

print(f"Starting advanced fine-tuning for {num_epochs} epochs...")

# Create the InformationRetrievalEvaluator
# This evaluator calculates retrieval metrics (Accuracy@k, Precision@k, Recall@k,
# NDCG@10, MRR@10, MAP@100) on the validation queries/corpus during training.
ir_evaluator = evaluation.InformationRetrievalEvaluator(
    queries=eval_queries,
    corpus=eval_corpus,
    relevant_docs=eval_relevant_docs,
    show_progress_bar=True,
    corpus_chunk_size=500, # Adjust based on GPU memory. Larger chunk_size uses more memory but might be faster.
    name='owasp_validation' # Name for the evaluation log
)

# Fine-tune the model
# No evaluation_steps is passed, so the evaluator runs once at the end of each
# epoch (one row per len(train_dataloader) steps in the training log), and the
# model with the best evaluator score is saved to output_path.
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=ir_evaluator,
          epochs=num_epochs,
          warmup_steps=warmup_steps,
          output_path=output_path,
          save_best_model=True, # Save the model that achieves the best performance on the evaluator
          optimizer_params={'lr': 2e-5}, # Learning rate (common starting point)
          use_amp=True, # Use Automatic Mixed Precision for faster training on GPUs
          checkpoint_path=output_path, # Checkpoints ('checkpoint-<step>' folders) land next to the model files
          # Save a checkpoint twice per epoch; never let the interval drop to 0 for a one-batch loader.
          checkpoint_save_steps=max(1, len(train_dataloader) // 2),
          checkpoint_save_total_limit=3 # Keep only the last 3 checkpoints
         )

print("\nAdvanced fine-tuning complete!")
# Report the directory the model was actually written to (output_path itself).
print(f"Best fine-tuned model saved to: {output_path}")

Starting advanced fine-tuning for 10 epochs...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss,Owasp Validation Cosine Accuracy@1,Owasp Validation Cosine Accuracy@3,Owasp Validation Cosine Accuracy@5,Owasp Validation Cosine Accuracy@10,Owasp Validation Cosine Precision@1,Owasp Validation Cosine Precision@3,Owasp Validation Cosine Precision@5,Owasp Validation Cosine Precision@10,Owasp Validation Cosine Recall@1,Owasp Validation Cosine Recall@3,Owasp Validation Cosine Recall@5,Owasp Validation Cosine Recall@10,Owasp Validation Cosine Ndcg@10,Owasp Validation Cosine Mrr@10,Owasp Validation Cosine Map@100
107,No log,No log,0.896825,0.992063,0.997354,1.000000,0.896825,0.330688,0.199471,0.100000,0.896825,0.992063,0.997354,1.000000,0.957710,0.943122,0.943122
214,No log,No log,0.902116,0.989418,0.994709,0.997354,0.902116,0.329806,0.198942,0.099735,0.902116,0.989418,0.994709,0.997354,0.959726,0.946649,0.946838
321,No log,No log,0.912698,0.997354,1.000000,1.000000,0.912698,0.332451,0.200000,0.100000,0.912698,0.997354,1.000000,1.000000,0.965518,0.953483,0.953483
428,No log,No log,0.910053,0.994709,1.000000,1.000000,0.910053,0.331570,0.200000,0.100000,0.910053,0.994709,1.000000,1.000000,0.963665,0.951058,0.951058
500,0.030000,No Log,No Log,No Log,No Log,No Log,No Log,No Log,No Log,No Log,No Log,No Log,No Log,No Log,No Log,No Log,No Log
535,0.030000,No log,0.920635,0.997354,1.000000,1.000000,0.920635,0.332451,0.200000,0.100000,0.920635,0.997354,1.000000,1.000000,0.968101,0.957011,0.957011
642,0.030000,No log,0.923280,0.994709,1.000000,1.000000,0.923280,0.331570,0.200000,0.100000,0.923280,0.994709,1.000000,1.000000,0.968431,0.957540,0.957540
749,0.030000,No log,0.923280,0.992063,0.997354,1.000000,0.923280,0.330688,0.199471,0.100000,0.923280,0.992063,0.997354,1.000000,0.968397,0.957540,0.957540
856,0.030000,No log,0.920635,0.994709,0.997354,1.000000,0.920635,0.331570,0.199471,0.100000,0.920635,0.994709,0.997354,1.000000,0.967374,0.956129,0.956129
963,0.030000,No log,0.920635,0.994709,1.000000,1.000000,0.920635,0.331570,0.200000,0.100000,0.920635,0.994709,1.000000,1.000000,0.968148,0.957099,0.957099


Batches:   0%|          | 0/12 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/12 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:21<00:00, 21.32s/it]


Batches:   0%|          | 0/12 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/12 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:21<00:00, 21.34s/it]


Batches:   0%|          | 0/12 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/12 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:21<00:00, 21.43s/it]


Batches:   0%|          | 0/12 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/12 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:21<00:00, 21.39s/it]


Batches:   0%|          | 0/12 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/12 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:21<00:00, 21.57s/it]


Batches:   0%|          | 0/12 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/12 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:21<00:00, 21.47s/it]


Batches:   0%|          | 0/12 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/12 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:21<00:00, 21.51s/it]


Batches:   0%|          | 0/12 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/12 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:21<00:00, 21.41s/it]


Batches:   0%|          | 0/12 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/12 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:21<00:00, 21.46s/it]


Batches:   0%|          | 0/12 [00:00<?, ?it/s]

Corpus Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/12 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:21<00:00, 21.20s/it]



Advanced fine-tuning complete!
Best fine-tuned model saved to: ./fine_tuned_owasp_model_advanced/best_model


In [6]:
# Cell 6: Verify the fine-tuned model

import os
from sentence_transformers import SentenceTransformer, util

# Training wrote the model files straight into this directory (there is no
# 'best_model' subdirectory), so the load path is the output directory itself.
output_path = './fine_tuned_owasp_model_advanced'
full_model_path = output_path

print(f"\nAttempting to load model from: {full_model_path}")

# Sanity check on the path before trying to load anything from it.
model_dir_exists = os.path.exists(full_model_path)
print(f"Checking if '{full_model_path}' directory exists: {model_dir_exists}")

if not model_dir_exists:
    print(f"\nError: Model directory '{full_model_path}' still not found or empty!")
    print("This indicates a fundamental issue with the saving process in Cell 5.")
    print("Please ensure Cell 5 ran to completion without errors and that enough disk space is available.")
    print("If using Google Colab, ensure the session hasn't disconnected or reset and try saving to /content/ drive.")
else:
    print(f"Contents of '{full_model_path}': {os.listdir(full_model_path)}")
    try:
        best_fine_tuned_model = SentenceTransformer(full_model_path)
        print("Fine-tuned model loaded successfully.")

        print("\nTesting fine-tuned model with example embeddings:")

        # Two question/answer pairs on different topics, embedded one text at a time.
        sample_texts = {
            'q1': "What is SQL Injection?",
            'a1': "SQL injection is a web security vulnerability that allows an attacker to alter the SQL queries made by an application.",
            'q2': "Tell me about broken access control",
            'a2': "Broken access control refers to a failure in enforcing policies that restrict users from acting outside their intended permissions.",
        }
        sample_embeddings = {
            label: best_fine_tuned_model.encode(text, convert_to_tensor=True)
            for label, text in sample_texts.items()
        }

        # Matching pairs (q1/a1, q2/a2) versus a mismatched pair (q1/a2).
        similarity_q1_a1 = util.cos_sim(sample_embeddings['q1'], sample_embeddings['a1']).item()
        similarity_q2_a2 = util.cos_sim(sample_embeddings['q2'], sample_embeddings['a2']).item()
        similarity_q1_a2 = util.cos_sim(sample_embeddings['q1'], sample_embeddings['a2']).item()

        # A successful fine-tune should score matching pairs clearly above the mismatched one.
        print(f"Similarity (Q1, A1): {similarity_q1_a1:.4f} (Expected High)")
        print(f"Similarity (Q2, A2): {similarity_q2_a2:.4f} (Expected High)")
        print(f"Similarity (Q1, A2): {similarity_q1_a2:.4f} (Expected Low)")
    except Exception as e:
        # Loading or encoding failed even though the directory is present.
        print(f"An error occurred during model loading or testing: {e}")
        print("This might indicate an issue with the saved model files, even though the directory exists.")
        print("Please review the output of Cell 5 for any errors or warnings during training.")


Attempting to load model from: ./fine_tuned_owasp_model_advanced
Checking if './fine_tuned_owasp_model_advanced' directory exists: True
Contents of './fine_tuned_owasp_model_advanced': ['1_Pooling', '2_Normalize', 'checkpoint-1007', 'checkpoint-1060', 'checkpoint-1070', 'config.json', 'config_sentence_transformers.json', 'eval', 'model.safetensors', 'modules.json', 'README.md', 'sentence_bert_config.json', 'special_tokens_map.json', 'tokenizer.json', 'tokenizer_config.json', 'vocab.txt']
Fine-tuned model loaded successfully.

Testing fine-tuned model with example embeddings:
Similarity (Q1, A1): 0.8621 (Expected High)
Similarity (Q2, A2): 0.5542 (Expected High)
Similarity (Q1, A2): 0.1228 (Expected Low)
