In [1]:
import json
from pathlib import Path
import pandas as pd
from tqdm import tqdm

In [2]:
# Directory holding the QA-pair JSON files. This is a relative path, so it is
# resolved against the current working directory of the kernel.
qa_dir = "QA_Pairs/Owasp_Top10/"

In [3]:
# Function to load all QA pairs
def load_qa_pairs(directory):
    """Collect every question/answer record from the JSON files in a directory.

    Each ``*.json`` file directly inside ``directory`` is expected to contain a
    top-level JSON object that maps a category name to a list of records. A
    record is kept only when it is a dict containing both a ``'question'`` and
    an ``'answer'`` key. Top-level values that are not lists, and list entries
    that do not qualify, are skipped silently.

    Args:
        directory: Folder to scan (``str`` or ``Path``). Sub-folders are not
            searched.

    Returns:
        list[dict]: One new dict per kept record, holding the record's own
        keys plus ``'source_file'`` (the file name) and ``'category'`` (the
        top-level key the record was found under). If a record already has
        either key, its value is overwritten. Files are visited in sorted
        path order, so the result order is reproducible across runs.

    Files that cannot be read or parsed, or whose top-level value is not a JSON
    object, are reported with ``print`` and skipped; they never abort the load.
    """
    qa_pairs = []

    # Get all JSON files in the directory. glob() gives no ordering guarantee,
    # so sort to keep the row order of the resulting dataset stable.
    json_files = sorted(Path(directory).glob('*.json'))

    print(f"Found {len(json_files)} JSON files")

    for json_file in tqdm(json_files, desc="Processing files"):
        # Only reading/parsing is guarded: OSError covers file access problems,
        # ValueError covers both invalid JSON and invalid UTF-8.
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            print(f"Error processing {json_file}: {str(e)}")
            continue

        if not isinstance(data, dict):
            print(
                f"Error processing {json_file}: expected a JSON object at the "
                f"top level, got {type(data).__name__}"
            )
            continue

        # Extract all QA pairs from all categories
        for category, items in data.items():
            if not isinstance(items, list):
                continue
            for item in items:
                if isinstance(item, dict) and 'question' in item and 'answer' in item:
                    # Build a new dict instead of mutating the parsed record;
                    # key order matches appending the two fields to the record.
                    qa_pairs.append(
                        {**item, 'source_file': json_file.name, 'category': category}
                    )

    return qa_pairs

In [4]:
# Run the loader over the configured directory and keep the combined records
print("Loading QA pairs...")
all_qa_pairs = load_qa_pairs(directory=qa_dir)

Loading QA pairs...
Found 10 JSON files


Processing files: 100%|██████████| 10/10 [00:00<00:00, 1292.15it/s]


In [5]:
# Convert to DataFrame (one row per QA pair, one column per record key)
df = pd.DataFrame(all_qa_pairs)

# Fail early with a clear message: a frame built from an empty list has no
# 'category' / 'source_file' columns, so the summary below would otherwise
# stop with a bare KeyError.
if df.empty:
    raise ValueError(
        "No QA pairs were loaded; check the directory path and the JSON file contents."
    )

# Display basic info
print("\nDataset Overview:")
print(f"Total QA pairs: {len(df)}")
print(f"Categories: {df['category'].nunique()}")
print(f"Source files: {df['source_file'].nunique()}")
print("\nFirst few rows:")
display(df.head())


Dataset Overview:
Total QA pairs: 2251
Categories: 8
Source files: 10

First few rows:


Unnamed: 0,id,question,answer,intent,type,source_file,category
0,A01-Q001,What is broken access control?,Broken access control refers to a failure in e...,define_broken_access_control,basic_understanding,A01_2021.json,basic_understanding
1,A01-Q002,Why is broken access control important in cybe...,Broken access control is critical because it c...,importance_broken_access_control,basic_understanding,A01_2021.json,basic_understanding
2,A01-Q003,What are some examples of broken access contro...,"Examples include privilege escalation, insecur...",examples_broken_access_control,basic_understanding,A01_2021.json,basic_understanding
3,A01-Q017,How is access control different from authentic...,"Authentication verifies who a user is, while a...",difference_access_control_authentication,basic_understanding,A01_2021.json,basic_understanding
4,A01-Q018,What is the role of access control in the SDLC?,Access control should be designed and tested f...,access_control_sdlc_role,basic_understanding,A01_2021.json,basic_understanding


In [6]:
# Persist the combined table as a CSV file, leaving out the row index
output_csv = "owasp_qa_pairs.csv"
df.to_csv(
    output_csv,
    index=False,
)
print(f"\nSaved {len(df)} QA pairs to {output_csv}")


Saved 2251 QA pairs to owasp_qa_pairs.csv


In [7]:
# Count how many QA pairs fall under each category (most frequent first)
print("\nCategory Distribution:")
print(df.loc[:, 'category'].value_counts())


Category Distribution:
category
proactive                       320
prevention                      313
example_scenarios               290
vulnerability_identification    277
basic_understanding             270
technical                       265
statistics                      260
reference                       256
Name: count, dtype: int64


In [8]:
# Per-column count of missing (NaN/None) entries
print("\nMissing values per column:")
print(df.isna().sum())


Missing values per column:
id             0
question       0
answer         0
intent         0
type           0
source_file    0
category       0
dtype: int64
