In [35]:
import json
import os
from typing import Any

import numpy as np
import pandas as pd
import torch
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm

# Load environment variables (python-dotenv; presumably from a local .env file)
# so that os.getenv('PINECONE_API_KEY') in initialize_pinecone() can find the key.
load_dotenv()

# --- Configuration -----------------------------------------------------------
# Locations of the enhanced Q&A JSON files and of the fine-tuned encoder.
ENHANCED_QA_DIR = r'd:\OWASP_BERT\QA_Pairs\Enhanced_QA'
MODEL_PATH = r'd:\OWASP_BERT\fine_tuned_owasp_model_advanced'

# Run on the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

BATCH_SIZE = 32            # batch size passed to model.encode()
MAX_TEXT_LENGTH = 512      # assigned to the model's max_seq_length
PINECONE_INDEX_NAME = "owasp-qa"
EMBEDDING_DIM = 768        # vector size for BERT-based models

# --- OWASP category mapping --------------------------------------------------
# Maps each source file name ("A01_2021.json") to the namespace / category
# label it is stored under ("A01_2021_Broken_Access_Control"). Entries are
# generated in Top-10 order, so dict iteration order is A01 .. A10.
OWASP_CATEGORY_MAP = {
    f'A{rank:02d}_2021.json': f'A{rank:02d}_2021_{title}'
    for rank, title in enumerate(
        (
            'Broken_Access_Control',
            'Cryptographic_Failures',
            'Injection',
            'Insecure_Design',
            'Security_Misconfiguration',
            'Vulnerable_Components',
            'Identification_Failures',
            'Software_Integrity_Failures',
            'Logging_Monitoring_Failures',
            'Server_Side_Request_Forgery',
        ),
        start=1,
    )
}

In [36]:
def load_model(model_path: str) -> SentenceTransformer:
    """Instantiate the fine-tuned SentenceTransformer stored at ``model_path``.

    The model is placed on the module-level ``DEVICE`` and its
    ``max_seq_length`` is set to ``MAX_TEXT_LENGTH``.

    Args:
        model_path: Path handed to ``SentenceTransformer``.

    Returns:
        The configured ``SentenceTransformer`` instance.

    Raises:
        Exception: Whatever loading/configuring the model raised; the error is
            printed first and then re-raised unchanged.
    """
    print(f"Loading fine-tuned model from: {model_path}")
    print(f"Using device: {DEVICE.upper()}")

    try:
        encoder = SentenceTransformer(model_path, device=DEVICE)
        encoder.max_seq_length = MAX_TEXT_LENGTH
    except Exception as load_error:
        # Report the failure in the cell output, then let it propagate.
        print(f"Error loading model: {load_error}")
        raise
    else:
        return encoder

# Load the model once; later cells read this module-level `model`
# (process_and_upsert_file and query_pinecone both call model.encode).
model = load_model(MODEL_PATH)

Loading fine-tuned model from: d:\OWASP_BERT\fine_tuned_owasp_model_advanced
Using device: CUDA


In [37]:
def initialize_pinecone(index_name: str, dimension: int) -> Any:
    """Connect to a Pinecone index, creating it first when it does not exist.

    Args:
        index_name: Name of the Pinecone index to open (or create).
        dimension: Vector dimension used if the index has to be created.

    Returns:
        The index handle returned by ``pc.Index(index_name)``.

    Raises:
        ValueError: If the ``PINECONE_API_KEY`` environment variable is unset
            or empty. Raised before any Pinecone call so the cause is obvious.
    """
    # Fail fast with an actionable message instead of handing None to the client.
    api_key = os.getenv('PINECONE_API_KEY')
    if not api_key:
        raise ValueError(
            "PINECONE_API_KEY is not set; define it in the environment or in a .env file."
        )

    pc = Pinecone(api_key=api_key)

    # Create the index only when no existing index carries this name.
    existing_names = [existing.name for existing in pc.list_indexes()]
    if index_name not in existing_names:
        print(f"Creating new index: {index_name}")
        pc.create_index(
            name=index_name,
            dimension=dimension,
            metric="cosine",
            spec=ServerlessSpec(cloud='aws', region='us-east-1')
        )
        print(f"Index '{index_name}' created.")

    # Connect to the index and report its current contents.
    index = pc.Index(index_name)
    print(f"Connected to index: {index_name}")
    print(f"Index stats: {index.describe_index_stats()}")
    return index

# Initialize Pinecone once; later cells use this module-level `index`
# for upserts, queries and describe_index_stats().
index = initialize_pinecone(PINECONE_INDEX_NAME, EMBEDDING_DIM)

Creating new index: owasp-qa
Index 'owasp-qa' created.
Connected to index: owasp-qa
Index stats: {'dimension': 768,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}


In [38]:
def _extract_qa_pairs(data: dict, namespace: str) -> list:
    """Flatten every list-valued section of ``data`` into Q&A records ready for upsert."""
    qa_pairs = []
    for items in data.values():
        if not isinstance(items, list):
            continue
        for item in items:
            if not (isinstance(item, dict) and 'question' in item and 'answer' in item):
                continue

            # Pinecone ids must be strings; a missing or null id gets a
            # positional fallback that is unique within this file.
            raw_id = item.get('id')
            record = {
                'id': str(raw_id) if raw_id is not None else f"{namespace}_{len(qa_pairs)}",
                'question': item['question'],
                'answer': item['answer'],
            }
            # Pinecone rejects null metadata values, so a JSON null is treated
            # exactly like a missing key and replaced by the default.
            for key, default in (('intent', ''), ('type', ''), ('related_topics', [])):
                value = item.get(key)
                record[key] = default if value is None else value
            qa_pairs.append(record)
    return qa_pairs


def _chunked(items: list, size: int):
    """Yield consecutive slices of ``items`` holding at most ``size`` elements."""
    for start in range(0, len(items), size):
        yield items[start:start + size]


def process_and_upsert_file(file_path: str, namespace: str, upsert_batch_size: int = 100) -> dict:
    """Process a single JSON file and upsert its contents to Pinecone.

    The file is expected to be a JSON object whose list-valued entries hold
    dicts with at least ``question`` and ``answer`` keys; everything else is
    ignored. Each question is embedded with the module-level ``model`` and
    stored in the module-level ``index`` under ``namespace``, with the answer
    and the optional ``intent`` / ``type`` / ``related_topics`` as metadata.

    Args:
        file_path: Path of the JSON file to ingest.
        namespace: Pinecone namespace to write to; also stored as the
            ``owasp_category`` metadata field and used in fallback ids.
        upsert_batch_size: Maximum number of vectors per upsert request.
            Requests are chunked because Pinecone limits the size of a single
            upsert; values below 1 are treated as 1.

    Returns:
        ``{'success': True, 'file': ..., 'vectors_upserted': n}`` on success,
        otherwise ``{'success': False, 'file': ..., 'reason': ...}``. Errors
        are reported through the return value (and printed), never raised.
    """
    try:
        # Load the JSON file
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Extract Q&A pairs
        qa_pairs = _extract_qa_pairs(data, namespace)
        if not qa_pairs:
            print(f"No valid Q&A pairs found in {file_path}")
            return {'success': False, 'file': file_path, 'reason': 'No valid Q&A pairs'}

        # Generate embeddings (only the question text is embedded)
        print(f"Generating embeddings for {len(qa_pairs)} Q&A pairs from {file_path}")
        questions = [qa['question'] for qa in qa_pairs]
        embeddings = model.encode(questions, show_progress_bar=True, batch_size=BATCH_SIZE)

        # Prepare vectors for upsert
        vectors = [
            {
                'id': qa['id'],
                'values': embedding.tolist(),
                'metadata': {
                    'text': qa['question'],
                    'answer': qa['answer'],
                    'intent': qa['intent'],
                    'type': qa['type'],
                    'related_topics': qa['related_topics'],
                    'owasp_category': namespace
                }
            }
            for qa, embedding in zip(qa_pairs, embeddings)
        ]

        # Upsert to Pinecone in bounded batches; upserts are keyed by id, so a
        # re-run after a partial failure simply overwrites what was written.
        print(f"Upserting {len(vectors)} vectors to namespace: {namespace}")
        for batch in _chunked(vectors, max(1, upsert_batch_size)):
            index.upsert(vectors=batch, namespace=namespace)

        return {'success': True, 'file': file_path, 'vectors_upserted': len(vectors)}

    except Exception as e:
        # Deliberate best-effort: one bad file must not abort the whole run.
        print(f"Error processing {file_path}: {e}")
        return {'success': False, 'file': file_path, 'reason': str(e)}

# Ingest every category file named in OWASP_CATEGORY_MAP, collecting one
# result dict per file (missing files are recorded as failures, not skipped).
results = []
for filename, namespace in OWASP_CATEGORY_MAP.items():
    file_path = os.path.join(ENHANCED_QA_DIR, filename)

    if not os.path.exists(file_path):
        print(f"\nFile not found: {file_path}")
        results.append({'success': False, 'file': file_path, 'reason': 'File not found'})
        continue

    print("\n" + "=" * 80)
    print(f"Processing: {filename} -> {namespace}")
    print("=" * 80)
    results.append(process_and_upsert_file(file_path, namespace))

# Per-file summary: outcome plus either the failure reason or the vector count.
print(f"\n{'=' * 50}")
print("Processing Summary:")
print("=" * 50)
for i, result in enumerate(results, 1):
    if result.get('success', False):
        print(f"{i}. {result['file']} - SUCCESS")
        if 'vectors_upserted' in result:
            print(f"   Vectors upserted: {result['vectors_upserted']}")
    else:
        print(f"{i}. {result['file']} - FAILED")
        print(f"   Reason: {result.get('reason', 'Unknown error')}")

# Final index statistics; any failure here is reported but does not raise.
try:
    stats = index.describe_index_stats()
    print("\nFinal index statistics:")
    print("-" * 50)
    namespace_stats = stats.get('namespaces', {})
    print(f"Total namespaces: {len(namespace_stats)}")
    print(f"Total vectors: {stats.get('total_vector_count', 0)}")
    print("\nNamespace-wise vector counts:")
    for ns, ns_stats in namespace_stats.items():
        print(f"  - {ns}: {ns_stats.get('vector_count', 0)} vectors")
except Exception as e:
    print(f"\nCould not retrieve final index stats: {str(e)}")


Processing: A01_2021.json -> A01_2021_Broken_Access_Control
Generating embeddings for 350 Q&A pairs from d:\OWASP_BERT\QA_Pairs\Enhanced_QA\A01_2021.json


Batches:   0%|          | 0/11 [00:00<?, ?it/s]

Upserting 350 vectors to namespace: A01_2021_Broken_Access_Control

Processing: A02_2021.json -> A02_2021_Cryptographic_Failures
Generating embeddings for 372 Q&A pairs from d:\OWASP_BERT\QA_Pairs\Enhanced_QA\A02_2021.json


Batches:   0%|          | 0/12 [00:00<?, ?it/s]

Upserting 372 vectors to namespace: A02_2021_Cryptographic_Failures

Processing: A03_2021.json -> A03_2021_Injection
Generating embeddings for 333 Q&A pairs from d:\OWASP_BERT\QA_Pairs\Enhanced_QA\A03_2021.json


Batches:   0%|          | 0/11 [00:00<?, ?it/s]

Upserting 333 vectors to namespace: A03_2021_Injection

Processing: A04_2021.json -> A04_2021_Insecure_Design
Generating embeddings for 350 Q&A pairs from d:\OWASP_BERT\QA_Pairs\Enhanced_QA\A04_2021.json


Batches:   0%|          | 0/11 [00:00<?, ?it/s]

Upserting 350 vectors to namespace: A04_2021_Insecure_Design

Processing: A05_2021.json -> A05_2021_Security_Misconfiguration
Generating embeddings for 336 Q&A pairs from d:\OWASP_BERT\QA_Pairs\Enhanced_QA\A05_2021.json


Batches:   0%|          | 0/11 [00:00<?, ?it/s]

Upserting 336 vectors to namespace: A05_2021_Security_Misconfiguration

Processing: A06_2021.json -> A06_2021_Vulnerable_Components
Generating embeddings for 368 Q&A pairs from d:\OWASP_BERT\QA_Pairs\Enhanced_QA\A06_2021.json


Batches:   0%|          | 0/12 [00:00<?, ?it/s]

Upserting 368 vectors to namespace: A06_2021_Vulnerable_Components

Processing: A07_2021.json -> A07_2021_Identification_Failures
Generating embeddings for 400 Q&A pairs from d:\OWASP_BERT\QA_Pairs\Enhanced_QA\A07_2021.json


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Upserting 400 vectors to namespace: A07_2021_Identification_Failures

Processing: A08_2021.json -> A08_2021_Software_Integrity_Failures
Generating embeddings for 420 Q&A pairs from d:\OWASP_BERT\QA_Pairs\Enhanced_QA\A08_2021.json


Batches:   0%|          | 0/14 [00:00<?, ?it/s]

Upserting 420 vectors to namespace: A08_2021_Software_Integrity_Failures

Processing: A09_2021.json -> A09_2021_Logging_Monitoring_Failures
Generating embeddings for 408 Q&A pairs from d:\OWASP_BERT\QA_Pairs\Enhanced_QA\A09_2021.json


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Upserting 408 vectors to namespace: A09_2021_Logging_Monitoring_Failures

Processing: A10_2021.json -> A10_2021_Server_Side_Request_Forgery
Generating embeddings for 439 Q&A pairs from d:\OWASP_BERT\QA_Pairs\Enhanced_QA\A10_2021.json


Batches:   0%|          | 0/14 [00:00<?, ?it/s]

Upserting 439 vectors to namespace: A10_2021_Server_Side_Request_Forgery

Processing Summary:
1. d:\OWASP_BERT\QA_Pairs\Enhanced_QA\A01_2021.json - SUCCESS
   Vectors upserted: 350
2. d:\OWASP_BERT\QA_Pairs\Enhanced_QA\A02_2021.json - SUCCESS
   Vectors upserted: 372
3. d:\OWASP_BERT\QA_Pairs\Enhanced_QA\A03_2021.json - SUCCESS
   Vectors upserted: 333
4. d:\OWASP_BERT\QA_Pairs\Enhanced_QA\A04_2021.json - SUCCESS
   Vectors upserted: 350
5. d:\OWASP_BERT\QA_Pairs\Enhanced_QA\A05_2021.json - SUCCESS
   Vectors upserted: 336
6. d:\OWASP_BERT\QA_Pairs\Enhanced_QA\A06_2021.json - SUCCESS
   Vectors upserted: 368
7. d:\OWASP_BERT\QA_Pairs\Enhanced_QA\A07_2021.json - SUCCESS
   Vectors upserted: 400
8. d:\OWASP_BERT\QA_Pairs\Enhanced_QA\A08_2021.json - SUCCESS
   Vectors upserted: 420
9. d:\OWASP_BERT\QA_Pairs\Enhanced_QA\A09_2021.json - SUCCESS
   Vectors upserted: 408
10. d:\OWASP_BERT\QA_Pairs\Enhanced_QA\A10_2021.json - SUCCESS
   Vectors upserted: 439

Final index statistics:
----------

In [39]:
def query_pinecone(query: str, namespace: str = None, top_k: int = 3):
    """Query the Pinecone index and print the best matches.

    Note: a later cell defines ``query_pinecone`` again (with a ``verbose``
    flag and a return value); once that cell runs it replaces this version.

    Args:
        query: Natural-language question; embedded with the module-level ``model``.
        namespace: Namespace to search. When omitted, every namespace reported
            by ``index.describe_index_stats()`` is queried and the results are
            merged. A Pinecone query without a namespace only looks at the
            default namespace, so a single un-scoped query would not see data
            stored under named namespaces.
        top_k: Number of results to print.

    Returns:
        None. Results are printed only.
    """
    # Generate query embedding
    query_embedding = model.encode(query, convert_to_tensor=True).cpu().numpy().tolist()

    # Decide which namespaces to search
    if namespace:
        print(f"Searching in namespace: {namespace}")
        target_namespaces = [namespace]
    else:
        print("Searching across all namespaces")
        target_namespaces = list(index.describe_index_stats().get('namespaces', {}).keys())

    # Execute one query per target namespace
    matches = []
    for target in target_namespaces:
        query_params = {
            'vector': query_embedding,
            'top_k': top_k,
            'include_metadata': True,
            'namespace': target
        }
        matches.extend(index.query(**query_params).matches)

    # Merge per-namespace results. Sorting descending assumes a similarity
    # metric where higher is better (this notebook creates the index with cosine).
    if len(target_namespaces) > 1:
        matches = sorted(matches, key=lambda match: match.score, reverse=True)[:top_k]

    # Print results
    print(f"\nTop {top_k} results for query: '{query}'\n")
    for i, match in enumerate(matches, 1):
        metadata = match.metadata or {}  # tolerate matches without metadata
        print(f"Result {i}:")
        print(f"  ID: {match.id}")
        print(f"  Score: {match.score:.4f}")
        print(f"  Question: {metadata.get('text', 'N/A')}")
        print(f"  Answer: {metadata.get('answer', 'N/A')[:200]}...")
        print(f"  OWASP Category: {metadata.get('owasp_category', 'N/A')}")
        print("-" * 80)

# Smoke test: run two sample questions against the Injection namespace and
# print the top matches, to confirm the upserted data can be retrieved.
print("Verifying data in Pinecone...")
query_pinecone("What is SQL injection?", namespace="A03_2021_Injection")
query_pinecone("How to prevent XSS?", namespace="A03_2021_Injection")

Verifying data in Pinecone...
Searching in namespace: A03_2021_Injection

Top 3 results for query: 'What is SQL injection?'

Result 1:
  ID: A03-Q002
  Score: 0.8290
  Question: How does SQL Injection work in web applications?
  Answer: SQL Injection happens when an application builds SQL queries by directly including user input without proper validation or escaping. Attackers can inject malicious SQL code to manipulate queries, retr...
  OWASP Category: A03_2021_Injection
--------------------------------------------------------------------------------
Result 2:
  ID: A03-Q014
  Score: 0.7442
  Question: How does SQL injection affect a web application?
  Answer: SQL injection occurs when an attacker manipulates SQL queries by injecting malicious input, allowing them to read, modify, or delete database records, bypass authentication, or even execute administra...
  OWASP Category: A03_2021_Injection
--------------------------------------------------------------------------------
Result 

In [40]:
def query_all_namespaces():
    """Run sample queries against each OWASP namespace present in the index.

    For every namespace listed in ``OWASP_CATEGORY_MAP`` (in sorted order) the
    first two sample questions are sent through ``query_pinecone`` with
    ``top_k=2``. Namespaces missing from the index are reported and skipped.
    If the index statistics cannot be read, the error is printed and the
    function returns without running any query.
    """
    # Sample questions keyed by namespace; only the first two of each are used.
    test_queries = {
        "A01_2021_Broken_Access_Control": [
            "What is broken access control?",
            "How to prevent broken access control?",
            "Examples of broken access control vulnerabilities",
        ],
        "A02_2021_Cryptographic_Failures": [
            "What are cryptographic failures?",
            "How to prevent cryptographic failures?",
            "Examples of cryptographic failures",
        ],
        "A03_2021_Injection": [
            "What is SQL injection?",
            "How to prevent XSS?",
            "Examples of injection attacks",
        ],
        "A04_2021_Insecure_Design": [
            "What is insecure design?",
            "How to prevent insecure design?",
            "Examples of insecure design",
        ],
        "A05_2021_Security_Misconfiguration": [
            "What is security misconfiguration?",
            "How to prevent security misconfiguration?",
            "Examples of security misconfiguration",
        ],
        "A06_2021_Vulnerable_Components": [
            "What are vulnerable components?",
            "How to prevent vulnerable components?",
            "Examples of vulnerable components",
        ],
        "A07_2021_Identification_Failures": [
            "What are identification failures?",
            "How to prevent identification failures?",
            "Examples of identification failures",
        ],
        "A08_2021_Software_Integrity_Failures": [
            "What are software integrity failures?",
            "How to prevent software integrity failures?",
            "Examples of software integrity failures",
        ],
        "A09_2021_Logging_Monitoring_Failures": [
            "What are logging and monitoring failures?",
            "How to prevent logging and monitoring failures?",
            "Examples of logging and monitoring failures",
        ],
        "A10_2021_Server_Side_Request_Forgery": [
            "What is SSRF?",
            "How to prevent SSRF?",
            "Examples of SSRF attacks",
        ],
    }

    # Ask the index which namespaces it actually holds.
    try:
        index_stats = index.describe_index_stats()
        available = list(index_stats.get('namespaces', {}).keys())
        print(f"Found {len(available)} namespaces in the index")
    except Exception as stats_error:
        print(f"Error getting namespaces: {stats_error}")
        return

    banner = "=" * 80
    for namespace in sorted(OWASP_CATEGORY_MAP.values()):
        if namespace in available:
            print("\n" + banner)
            print(f"TESTING NAMESPACE: {namespace}")
            print(banner)

            # Use the curated questions when present, generic ones otherwise.
            if namespace in test_queries:
                queries = test_queries[namespace]
            else:
                queries = [
                    f"Tell me about {namespace}",
                    f"How to prevent {namespace}",
                    f"Examples of {namespace}",
                ]

            # Two queries per namespace keeps the output short.
            for query in queries[:2]:
                print(f"\nQuery: '{query}'")
                query_pinecone(query, namespace=namespace, top_k=2)
                print("\n" + "-" * 80)
        else:
            print(f"\n⚠️ Namespace '{namespace}' not found in the index")

def query_pinecone(query: str, namespace: str = None, top_k: int = 3, verbose: bool = True):
    """Query the Pinecone index and return (optionally also print) the best matches.

    Args:
        query: Natural-language question; embedded with the module-level ``model``.
        namespace: Namespace to search. When omitted, every namespace reported
            by ``index.describe_index_stats()`` is queried and the results are
            merged. A Pinecone query without a namespace only looks at the
            default namespace, so a single un-scoped query would not see data
            stored under named namespaces.
        top_k: Maximum number of matches to return.
        verbose: When True, print the search scope and each match.

    Returns:
        A list of match objects, best first; an empty list when nothing
        matched or when any error occurred (the error is printed, not raised).
    """
    try:
        # Generate query embedding
        query_embedding = model.encode(query, convert_to_tensor=True).cpu().numpy().tolist()

        # Decide which namespaces to search
        if namespace:
            target_namespaces = [namespace]
            if verbose:
                print(f"Searching in namespace: {namespace}")
        else:
            target_namespaces = list(index.describe_index_stats().get('namespaces', {}).keys())
            if verbose:
                print("Searching across all namespaces")

        # Execute one query per target namespace
        matches = []
        for target in target_namespaces:
            query_params = {
                'vector': query_embedding,
                'top_k': top_k,
                'include_metadata': True,
                'namespace': target
            }
            results = index.query(**query_params)
            if results:
                matches.extend(results.matches)

        # Merge per-namespace results. Sorting descending assumes a similarity
        # metric where higher is better (this notebook creates the index with cosine).
        if len(target_namespaces) > 1:
            matches = sorted(matches, key=lambda match: match.score, reverse=True)[:top_k]

        if verbose:
            print(f"\nTop {top_k} results for query: '{query}'\n")
            for i, match in enumerate(matches, 1):
                metadata = match.metadata or {}  # tolerate matches without metadata
                print(f"Result {i}:")
                print(f"  ID: {match.id}")
                print(f"  Score: {match.score:.4f}")
                print(f"  Question: {metadata.get('text', 'N/A')}")
                print(f"  Answer: {metadata.get('answer', 'N/A')[:200]}...")
                print(f"  OWASP Category: {metadata.get('owasp_category', 'N/A')}")
                print("-" * 80)

        return matches

    except Exception as e:
        # Deliberate best-effort: report and return an empty result set.
        print(f"Error querying Pinecone: {e}")
        return []

# Run the per-namespace retrieval checks; query_all_namespaces() prints its
# findings and uses whichever query_pinecone definition was executed last.
print("Starting tests for all namespaces...")
query_all_namespaces()
print("\nAll tests completed!")

Starting tests for all namespaces...
Found 10 namespaces in the index

TESTING NAMESPACE: A01_2021_Broken_Access_Control

Query: 'What is broken access control?'
Searching in namespace: A01_2021_Broken_Access_Control

Top 2 results for query: 'What is broken access control?'

Result 1:
  ID: A01-Q009
  Score: 0.6320
  Question: What does 'Broken Access Control' refer to in the context of secure web applications, and why is understanding it crucial for preventing unauthorized access in system design? (Example 9)
  Answer: Broken Access Control occurs when an application fails to enforce restrictions on what authenticated users are allowed to do. This can allow attackers to act as users or even administrators by exploit...
  OWASP Category: A01_2021_Broken_Access_Control
--------------------------------------------------------------------------------
Result 2:
  ID: A01-Q002
  Score: 0.6299
  Question: What does 'Broken Access Control' refer to in the context of secure web applications, 