In [6]:
#This file creates question file with question and expected answer
import json
import random

# def load_cve_data(cve_file):
#     """Loads CVE data from a JSON file."""
#     with open(cve_file, 'r', encoding='utf-8') as file:
#         data = json.load(file)
#     return data
def load_cve_data(file_paths):    
        all_data = []
        for file_path in file_paths:
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
                all_data.extend(data)
        return all_data

def find_value_by_key(nested_dict, target_key):
    if isinstance(nested_dict, dict):
        for key, value in nested_dict.items():
            if key == target_key:
                return value
            # Recursively check nested dictionaries
            found_value = find_value_by_key(value, target_key)
            if found_value is not None:
                return found_value
    elif isinstance(nested_dict, list):
        for item in nested_dict:
            found_value = find_value_by_key(item, target_key)
            if found_value is not None:
                return found_value
    return None

def get_nested_value(data, keys, default="No data available"):
    """Retrieves a nested value from a dictionary given a list of keys."""
    for key in keys:
        data = data.get(key, {})
    return data if data else default

def generate_question(cve, field):
    """Generates a question and expected answer for a given field of the CVE."""
    if field == "PublishedDate":
        question = f"What is the published date of {cve['CVE_ID']}"
        #expected_answer = cve.get('PublishedDate', 'No data available')
        expected_answer =find_value_by_key(cve, 'PublishedDate')
    elif field == "Description":
        question = f"What is the description of {cve['CVE_ID']}"
        #expected_answer = cve.get('Description', 'No data available')
        expected_answer =find_value_by_key(cve, 'Description')
    elif field == "ExploitabilityScore":
        # Retrieve ExploitabilityScore from the nested structure
        #expected_answer = get_nested_value(cve, ['Impact', 'baseMetricV3', 'exploitabilityScore'])
        expected_answer =find_value_by_key(cve, 'exploitabilityScore')
        question = f"What is the exploitability score of {cve['CVE_ID']}"
    elif field == "ImpactScore":
        # Retrieve ImpactScore from the nested structure
        #expected_answer = get_nested_value(cve, ['Impact', 'baseMetricV3', 'impactScore'])
        expected_answer =find_value_by_key(cve, 'impactScore')
        question = f"What is the impact score of {cve['CVE_ID']}"
    elif field == "BaseScore":
        # Retrieve BaseScore from the nested structure
        #expected_answer = get_nested_value(cve, ['Impact', 'baseMetricV3', 'cvssV3', 'baseScore'])
        expected_answer =find_value_by_key(cve, 'baseScore')
        question = f"What is the base score of {cve['CVE_ID']}"
    
    return {
        "question": question,
        "expected_answer": expected_answer
    }

def generate_questions(cve_data, num_questions=5):
    """Generates random questions for the specified number of CVEs."""
    selected_cves = random.sample(cve_data, num_questions)
    questions = []
    
    for cve in selected_cves:
        # Generate questions for each required field
        questions.append(generate_question(cve, "PublishedDate"))
        questions.append(generate_question(cve, "Description"))
        questions.append(generate_question(cve, "ExploitabilityScore"))
        questions.append(generate_question(cve, "ImpactScore"))
        questions.append(generate_question(cve, "BaseScore"))

    return questions

def save_questions_to_file(questions, output_file):
    """Saves the generated questions to a JSON file."""
    with open(output_file, 'w', encoding='utf-8') as file:
        json.dump(questions, file, indent=4)

# Load CVE data from a file
cve_file = [
      'nvdcve-1.1-2024_updated.json',
      'nvdcve-1.1-2023_updated.json',
 'nvdcve-1.1-2022_updated.json',
 'nvdcve-1.1-2021_updated.json',
 'nvdcve-1.1-2020_updated.json',
 'nvdcve-1.1-2019_updated.json',
 'nvdcve-1.1-2018_updated.json',
 'nvdcve-1.1-2017_updated.json',
 'nvdcve-1.1-2016_updated.json',
 'nvdcve-1.1-2015_updated.json',
 'nvdcve-1.1-2014_updated.json',
 'nvdcve-1.1-2013_updated.json',
 'nvdcve-1.1-2012_updated.json',
 'nvdcve-1.1-2011_updated.json',
 'nvdcve-1.1-2010_updated.json',
 'nvdcve-1.1-2009_updated.json',
 'nvdcve-1.1-2008_updated.json',
 'nvdcve-1.1-2007_updated.json',
 'nvdcve-1.1-2006_updated.json',
 'nvdcve-1.1-2005_updated.json',
 'nvdcve-1.1-2004_updated.json',
 'nvdcve-1.1-2003_updated.json',
 'nvdcve-1.1-2002_updated.json'
              
              ]
cve_data = load_cve_data(cve_file)

# Generate questions from random CVE IDs
questions = generate_questions(cve_data, num_questions=5)

# Save the generated questions to a JSON file
output_file = 'questions1.json'
save_questions_to_file(questions, output_file)

print(f"Questions have been generated and saved to {output_file}")


Questions have been generated and saved to questions1.json


In [3]:
#This file evaluated the output file from different LLMs
import json
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def load_json(file_path):
    """Loads the JSON file with questions, expected answers, and actual answers."""
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data

def preprocess_answer(answer):
    """Basic preprocessing to convert answers to lowercase and strip unnecessary characters."""
    return answer.lower().strip()

def compare_answers(expected, actual):
    """Token-based comparison for textual answers using cosine similarity."""
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform([expected, actual])
    similarity = cosine_similarity(vectors[0], vectors[1])[0][0]
    return similarity

def evaluate_answers(data):
    """Evaluates the actual answers against the expected answers using token overlap."""
    y_true = []
    y_pred = []
    thresholds = 0.75  # Cosine similarity threshold for text match
    
    for item in data:
        expected_answer = str(item['expected_answer']).lower()
        actual_answer = str(item['actual_answer']).lower()
        
        if isinstance(item['expected_answer'], (int, float)):  # Numeric comparison
            y_true.append(1)
            y_pred.append(1 if expected_answer == actual_answer else 0)
        else:  # Textual comparison
            similarity = compare_answers(preprocess_answer(expected_answer), preprocess_answer(actual_answer))
            y_true.append(1)  # Expected to match
            y_pred.append(1 if similarity >= thresholds else 0)
    
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    
    return precision, recall, f1

# Load the JSON file
file_path = 'gpt_output_results.json'  # Replace with your JSON file path
data = load_json(file_path)

# Evaluate the answers
precision, recall, f1 = evaluate_answers(data)

# Print results
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')


Precision: 1.00
Recall: 0.20
F1 Score: 0.33


In [5]:
#this approach shows all the details and uses tokenization for all questions
import json
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def load_json(file_path):
    """Loads the JSON file with questions, expected answers, and actual answers."""
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data

def preprocess_answer(answer):
    """Basic preprocessing to convert answers to lowercase and strip unnecessary characters."""
    return answer.lower().strip()

def compare_answers(expected, actual):
    """Token-based comparison for textual answers using cosine similarity."""
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform([expected, actual])
    similarity = cosine_similarity(vectors[0], vectors[1])[0][0]
    return similarity

def evaluate_answers(data, threshold=0.75):
    """Evaluates the actual answers against the expected answers and displays results for each question."""
    y_true = []
    y_pred = []
    
    print("Detailed Results:\n")
    
    for idx, item in enumerate(data):
        expected_answer = str(item['expected_answer']).lower()
        actual_answer = str(item['actual_answer']).lower()
        correct = False
        score = 0
        
        # Compare numeric answers directly
        if isinstance(item['expected_answer'], (int, float)):
            correct = expected_answer == actual_answer
            score = 1.0 if correct else 0.0
        else:
            # Use cosine similarity for text-based answers
            score = compare_answers(preprocess_answer(expected_answer), preprocess_answer(actual_answer))
            correct = score >= threshold
        
        y_true.append(1)  # Expected to match
        y_pred.append(1 if correct else 0)
        
        # Display the result for this question
        print(f"Question {idx + 1}: {item['question']}")
        print(f"Expected Answer: {item['expected_answer']}")
        print(f"Actual Answer: {item['actual_answer']}")
        print(f"Similarity Score: {score:.2f}")
        print(f"Correct: {'Yes' if correct else 'No'}\n")
    
    # Calculate precision, recall, and F1 score
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    
    return precision, recall, f1

# Load the JSON file
file_path = 'gpt_output_results.json'  # Replace with your JSON file path
data = load_json(file_path)

# Evaluate the answers and display detailed results
precision, recall, f1 = evaluate_answers(data, threshold=0.75)

# Print overall precision, recall, and F1 score
print("Overall Results:")
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')


Detailed Results:

Question 1: What is the published date of CVE-2023-49255
Expected Answer: 2024-01-12T15:15Z
Actual Answer: The published date of CVE-2023-49255 is 2024-01-12T15:15Z.
Similarity Score: 0.45
Correct: No

Question 2: What is the description of CVE-2023-49255
Expected Answer: The router console is accessible without authentication at "data" field, and while a user needs to be logged in in order to modify the configuration, the session state is shared. If any other user is currently logged in, the anonymous user can execute commands in the context of the authenticated one. If the logged in user has administrative privileges, it is possible to use webadmin service configuration commands to create a new admin user with a chosen password.
Actual Answer: The description of CVE-2023-49255 is: "The router console is accessible without authentication at 'data' field, and while a user needs to be logged in in order to modify the configuration, the session state is shared. If any 

In [38]:
import json
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

def load_json(file_path):
    """Loads the JSON file with questions, expected answers, and actual answers."""
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data

def preprocess_answer(answer):
    """Basic preprocessing to convert answers to lowercase and strip unnecessary characters."""
    return answer.lower().strip()

def compare_textual_answers(expected, actual):
    """Token-based comparison for textual answers using cosine similarity (for descriptions)."""
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform([expected, actual])
    similarity = cosine_similarity(vectors[0], vectors[1])[0][0]
    return similarity

def evaluate_answers(data, threshold=0.75):
    """Evaluates the actual answers against the expected answers and displays results for each question."""
    y_true = []
    y_pred = []
    
    print("Detailed Results:\n")
    
    for idx, item in enumerate(data):
        expected_answer = str(item['expected_answer']).lower()
        actual_answer = str(item['actual_answer']).lower()
        correct = False
        score = 0
        
        # Special case for description: Use token-based comparison
        if "description" in item['question'].lower():
            score = compare_textual_answers(preprocess_answer(expected_answer), preprocess_answer(actual_answer))
            correct = score >= threshold
        else:
            # For all other fields, check if the expected answer string is present in the actual answer
            correct = expected_answer in actual_answer
            score = 1.0 if correct else 0.0
        
        y_true.append(1)  # Expected to match
        y_pred.append(1 if correct else 0)
        
        # Display the result for this question
        print(f"Question {idx + 1}: {item['question']}")
        print(f"Expected Answer: {item['expected_answer']}")
        print(f"Actual Answer: {item['actual_answer']}")
        print(f"Score: {score:.2f}")
        print(f"Correct: {'Yes' if correct else 'No'}\n")
    
    # Calculate precision, recall, and F1 score
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    
    return precision, recall, f1

# Load the JSON file
file_path = 'gpt_output_results.json'  # Replace with your JSON file path
data = load_json(file_path)

# Evaluate the answers and display detailed results
precision, recall, f1 = evaluate_answers(data, threshold=0.75)

# Print overall precision, recall, and F1 score
print("Overall Results:")
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')


Detailed Results:

Question 1: What is the published date of CVE-2023-47196
Expected Answer: 2024-01-23T21:15Z
Actual Answer: The published date of CVE-2023-47196 is January 23, 2024.
Score: 0.00
Correct: No

Question 2: What is the description of CVE-2023-47196
Expected Answer: An origin validation vulnerability in the Trend Micro Apex One security agent could allow a local attacker to escalate privileges on affected installations.

Please note: an attacker must first obtain the ability to execute low-privileged code on the target system in order to exploit this vulnerability.

This vulnerability is similar to, but not identical to, CVE-2023-47197.
Actual Answer: The description of CVE-2023-47196 is: "An origin validation vulnerability in the Trend Micro Apex One security agent could allow a local attacker to escalate privileges on affected installations. Please note: an attacker must first obtain the ability to execute low-privileged code on the target system in order to exploit this

In [1]:
# in this date has been parsed into a common format to check it properly

import json
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime
import re

def load_json(file_path):
    """Loads the JSON file with questions, expected answers, and actual answers."""
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data

def preprocess_answer(answer):
    """Basic preprocessing to convert answers to lowercase and strip unnecessary characters."""
    return answer.lower().strip()

def compare_textual_answers(expected, actual):
    """Token-based comparison for textual answers using cosine similarity (for descriptions)."""
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform([expected, actual])
    similarity = cosine_similarity(vectors[0], vectors[1])[0][0]
    return similarity

def parse_date(date_str):
    """Attempts to parse the date string into a standard format for comparison."""
    date_formats = [
        "%Y-%m-%dT%H:%MZ", "%Y-%m-%d", "%Y/%m/%d", "%Y-%m-%d %H:%M:%S", 
        "%B %d, %Y", "%b %d, %Y"  # Adding long and short month formats (e.g., "October 25, 2023")
    ]
    for fmt in date_formats:
        try:
            #return datetime.strptime(date_str, fmt).date()  # Parse to date only
            return datetime.strptime(date_str, fmt).strftime("%Y-%m-%d")  # Parse to date only
        except ValueError:
            continue
    return None


def parse_date_2(date_str):

    # Regular expression to match the date pattern
    date_pattern = r"([A-Za-z]+\s\d{1,2},\s\d{4})"

    # Find the date in the sentence
    match = re.search(date_pattern, date_str)

    # Check if a match is found
    if match:
        extracted_date = match.group(0)
        # Convert the extracted date to a datetime object
        try:
            date_obj = datetime.strptime(extracted_date, "%B %d, %Y")
            # Convert to ISO format
            iso_date = date_obj.strftime("%Y-%m-%d")
            #print("Extracted date in ISO format:", iso_date)
            return iso_date
        except ValueError:
            print("Date format is incorrect.")
    else:
        print("No date found in the sentence.")

def compare_dates(expected, actual):
    """Compares dates by parsing them into a standard format and ignoring time."""
    expected_date = parse_date(expected)
    actual_date = parse_date_2(actual)
    print("Expected data: ")
    print(expected_date)
    print("\n Actual data: ")
    print(actual_date)
    if expected_date and actual_date:
        return expected_date == actual_date
    return False

def evaluate_answers(data, threshold=0.75):
    """Evaluates the actual answers against the expected answers and displays results for each question."""
    y_true = []
    y_pred = []
    
    print("Detailed Results:\n")
    
    for idx, item in enumerate(data):
        expected_answer = str(item['expected_answer']).lower()
        actual_answer = str(item['actual_answer']).lower()
        correct = False
        score = 0
        
        # Special case for description: Use token-based comparison
        if "description" in item['question'].lower():
            score = compare_textual_answers(preprocess_answer(expected_answer), preprocess_answer(actual_answer))
            correct = score >= threshold
        
        # Special case for date: Use date parsing and comparison
        elif "date" in item['question'].lower():
            correct = compare_dates(expected_answer, actual_answer)
            #score = 1.0 if correct else 0.0
            if(correct):
                score = 1.0
            else:
                correct = expected_answer in actual_answer
                score = 1.0 if correct else 0.0
        # For all other fields, check if the expected answer string is present in the actual answer
        else:
            correct = expected_answer in actual_answer
            score = 1.0 if correct else 0.0
        
        y_true.append(1)  # Expected to match
        y_pred.append(1 if correct else 0)
        
        # Display the result for this question
        print(f"Question {idx + 1}: {item['question']}")
        print(f"Expected Answer: {item['expected_answer']}")
        print(f"Actual Answer: {item['actual_answer']}")
        print(f"Score: {score:.2f}")
        print(f"Correct: {'Yes' if correct else 'No'}\n")
    
    # Calculate precision, recall, and F1 score
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    
    return precision, recall, f1

# Load the JSON file
file_path = 'gpt_output_results.json'  # Replace with your JSON file path
data = load_json(file_path)

# Evaluate the answers and display detailed results
precision, recall, f1 = evaluate_answers(data, threshold=0.75)

# Print overall precision, recall, and F1 score
print("Overall Results:")
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')


Detailed Results:

No date found in the sentence.
Expected data: 
2016-05-11

 Actual data: 
None
Question 1: What is the published date of CVE-2016-1044
Expected Answer: 2016-05-11T10:59Z
Actual Answer: The published date of CVE-2016-1044 is 2016-05-11T10:59Z.
Score: 1.00
Correct: Yes

Question 2: What is the description of CVE-2016-1044
Expected Answer: Adobe Reader and Acrobat before 11.0.16, Acrobat and Acrobat Reader DC Classic before 15.006.30172, and Acrobat and Acrobat Reader DC Continuous before 15.016.20039 on Windows and OS X allow attackers to bypass JavaScript API execution restrictions via unspecified vectors, a different vulnerability than CVE-2016-1038, CVE-2016-1039, CVE-2016-1040, CVE-2016-1041, CVE-2016-1042, CVE-2016-1062, and CVE-2016-1117.
Actual Answer: The description of CVE-2016-1044 is: "Adobe Reader and Acrobat before 11.0.16, Acrobat and Acrobat Reader DC Classic before 15.006.30172, and Acrobat and Acrobat Reader DC Continuous before 15.016.20039 on Windows

In [2]:
# in this date has been parsed into a common format to check it properly

import json
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime
import re

def load_json(file_path):
    """Loads the JSON file with questions, expected answers, and actual answers."""
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data

def preprocess_answer(answer):
    """Basic preprocessing to convert answers to lowercase and strip unnecessary characters."""
    return answer.lower().strip()

def compare_textual_answers(expected, actual):
    """Token-based comparison for textual answers using cosine similarity (for descriptions)."""
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform([expected, actual])
    similarity = cosine_similarity(vectors[0], vectors[1])[0][0]
    return similarity

def parse_date(date_str):
    """Attempts to parse the date string into a standard format for comparison."""
    date_formats = [
        "%Y-%m-%dT%H:%MZ", "%Y-%m-%d", "%Y/%m/%d", "%Y-%m-%d %H:%M:%S", 
        "%B %d, %Y", "%b %d, %Y"  # Adding long and short month formats (e.g., "October 25, 2023")
    ]
    for fmt in date_formats:
        try:
            #return datetime.strptime(date_str, fmt).date()  # Parse to date only
            return datetime.strptime(date_str, fmt).strftime("%Y-%m-%d")  # Parse to date only
        except ValueError:
            continue
    return None


def parse_date_2(date_str):

    # Regular expression to match the date pattern
    date_pattern = r"([A-Za-z]+\s\d{1,2},\s\d{4})"

    # Find the date in the sentence
    match = re.search(date_pattern, date_str)

    # Check if a match is found
    if match:
        extracted_date = match.group(0)
        # Convert the extracted date to a datetime object
        try:
            date_obj = datetime.strptime(extracted_date, "%B %d, %Y")
            # Convert to ISO format
            iso_date = date_obj.strftime("%Y-%m-%d")
            print("Extracted date in ISO format:", iso_date)
            return iso_date
        except ValueError:
            print("Date format is incorrect.")
    else:
        print("No date found in the sentence.")

def compare_dates(expected, actual):
    """Compares dates by parsing them into a standard format and ignoring time."""
    expected_date = parse_date(expected)
    actual_date = parse_date_2(actual)
    print("Expected data: ")
    print(expected_date)
    print("\n Actual data: ")
    print(actual_date)
    if expected_date and actual_date:
        return expected_date == actual_date
    return False

def evaluate_answers(data, threshold=0.75):
    """Evaluates the actual answers against the expected answers and displays results for each question."""
    y_true = []
    y_pred = []
    
    print("Detailed Results:\n")
    
    for idx, item in enumerate(data):
        expected_answer = str(item['expected_answer']).lower()
        actual_answer = str(item['actual_answer']).lower()
        correct = False
        score = 0
        
        # Special case for description: Use token-based comparison
        if "description" in item['question'].lower():
            score = compare_textual_answers(preprocess_answer(expected_answer), preprocess_answer(actual_answer))
            correct = score >= threshold
        
        # Special case for date: Use date parsing and comparison
        elif "date" in item['question'].lower():
            correct = compare_dates(expected_answer, actual_answer)
            #score = 1.0 if correct else 0.0
            if(correct):
                score = 1.0
            else:
                correct = expected_answer in actual_answer
                score = 1.0 if correct else 0.0
        
        # For all other fields, check if the expected answer string is present in the actual answer
        else:
            correct = expected_answer in actual_answer
            score = 1.0 if correct else 0.0
        
        y_true.append(1)  # Expected to match
        y_pred.append(1 if correct else 0)
        
        # Display the result for this question
        print(f"Question {idx + 1}: {item['question']}")
        print(f"Expected Answer: {item['expected_answer']}")
        print(f"Actual Answer: {item['actual_answer']}")
        print(f"Score: {score:.2f}")
        print(f"Correct: {'Yes' if correct else 'No'}\n")
    
    # Calculate precision, recall, and F1 score
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    
    return precision, recall, f1

# Load the JSON file
file_path = 'ollama_output_results_1.json'  # Replace with your JSON file path
data = load_json(file_path)

# Evaluate the answers and display detailed results
precision, recall, f1 = evaluate_answers(data, threshold=0.75)

# Print overall precision, recall, and F1 score
print("Overall Results:")
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')


Detailed Results:

No date found in the sentence.
Expected data: 
2016-05-11

 Actual data: 
None
Question 1: What is the published date of CVE-2016-1044
Expected Answer: 2016-05-11T10:59Z
Actual Answer: According to the context, the published date of CVE-2016-1044 is:

"PublishedDate": "2016-05-11T10:59Z"

Data not available for anything else.
Score: 1.00
Correct: Yes

Question 2: What is the description of CVE-2016-1044
Expected Answer: Adobe Reader and Acrobat before 11.0.16, Acrobat and Acrobat Reader DC Classic before 15.006.30172, and Acrobat and Acrobat Reader DC Continuous before 15.016.20039 on Windows and OS X allow attackers to bypass JavaScript API execution restrictions via unspecified vectors, a different vulnerability than CVE-2016-1038, CVE-2016-1039, CVE-2016-1040, CVE-2016-1041, CVE-2016-1042, CVE-2016-1062, and CVE-2016-1117.
Actual Answer: The description of CVE-2016-1044 is:

"Adobe Reader and Acrobat before 11.0.16, Acrobat and Acrobat Reader DC Classic before 15.

In [13]:
!pip3 install fuzzywuzzy



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.12 -m pip install --upgrade pip[0m


In [21]:
# in this date has been parsed into a common format to check it properly

import json
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime
import re
from fuzzywuzzy import fuzz

def load_json(file_path):
    """Loads the JSON file with questions, expected answers, and actual answers."""
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data

def preprocess_answer(answer):
    """Basic preprocessing to convert answers to lowercase and strip unnecessary characters."""
    return answer.lower().strip()

def compare_textual_answers(expected, actual):
    """Token-based comparison for textual answers using cosine similarity (for descriptions)."""
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform([expected, actual])
    similarity = cosine_similarity(vectors[0], vectors[1])[0][0]
    return similarity

def parse_date(date_str):
    """Attempts to parse the date string into a standard format for comparison."""
    date_formats = [
        "%Y-%m-%dT%H:%MZ", "%Y-%m-%d", "%Y/%m/%d", "%Y-%m-%d %H:%M:%S", 
        "%B %d, %Y", "%b %d, %Y"  # Adding long and short month formats (e.g., "October 25, 2023")
    ]
    for fmt in date_formats:
        try:
            #return datetime.strptime(date_str, fmt).date()  # Parse to date only
            return datetime.strptime(date_str, fmt).strftime("%Y-%m-%d")  # Parse to date only
        except ValueError:
            continue
    return None


def parse_date_2(date_str):

    # Regular expression to match the date pattern
    date_pattern = r"([A-Za-z]+\s\d{1,2},\s\d{4})"

    # Find the date in the sentence
    match = re.search(date_pattern, date_str)

    # Check if a match is found
    if match:
        extracted_date = match.group(0)
        # Convert the extracted date to a datetime object
        try:
            date_obj = datetime.strptime(extracted_date, "%B %d, %Y")
            # Convert to ISO format
            iso_date = date_obj.strftime("%Y-%m-%d")
            print("Extracted date in ISO format:", iso_date)
            return iso_date
        except ValueError:
            print("Date format is incorrect.")
    else:
        print("No date found in the sentence.")

def compare_dates(expected, actual):
    """Compares dates by parsing them into a standard format and ignoring time."""
    expected_date = parse_date(expected)
    actual_date = parse_date_2(actual)
    print("Expected data: ")
    print(expected_date)
    print("\n Actual data: ")
    print(actual_date)
    if expected_date and actual_date:
        return expected_date == actual_date
    return False



def evaluate_answers(data, threshold=0.75):
    """Evaluates the actual answers against the expected answers and displays results for each question."""
    y_true = []
    y_pred = []
    
    print("Detailed Results:\n")
    
    for idx, item in enumerate(data):
        expected_answer = str(item['expected_answer']).lower()
        actual_answer = str(item['actual_answer']).lower()
        correct = False
        score = 0
        
        # Special case for description: Use token-based comparison
        if "description" in item['question'].lower():
            score = compare_textual_answers(preprocess_answer(expected_answer), preprocess_answer(actual_answer))
            correct = score >= threshold
        
        # Special case for date: Use date parsing and comparison
        elif "date" in item['question'].lower():
            correct = compare_dates(expected_answer, actual_answer)
            if correct:
                score = 1.0
            else:
                correct = expected_answer in actual_answer
                score = 1.0 if correct else 0.0
        
        # Check if expected answer is "no data found" and if the actual answer contains similar phrases
        elif expected_answer == "no data found":
            # Define possible variations of "no data found"
            no_data_variants = [
                "no data found", "data not found", "no data available", 
                "information not found", "data unavailable", "no information available"
            ]
            
            # Fuzzy matching threshold
            fuzzy_threshold = 80  # Percentage similarity threshold for approximate matching
            
            # Check if any of the variations appear in the actual answer
            for variant in no_data_variants:
                if fuzz.partial_ratio(variant, actual_answer) >= fuzzy_threshold:
                    correct = True
                    score = 1.0
                    break
            else:
                correct = False
                score = 0.0
        
        # For all other fields, check if the expected answer string is present in the actual answer
        else:
            correct = expected_answer in actual_answer
            score = 1.0 if correct else 0.0
        
        y_true.append(1)  # Expected to match
        y_pred.append(1 if correct else 0)
        
        # Display the result for this question
        print(f"Question {idx + 1}: {item['question']}")
        print(f"Expected Answer: {item['expected_answer']}")
        print(f"Actual Answer: {item['actual_answer']}")
        print(f"Score: {score:.2f}")
        print(f"Correct: {'Yes' if correct else 'No'}\n")
    
    # Calculate precision, recall, and F1 score
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    
    return precision, recall, f1

# Load the JSON file
file_path = 'ollama_output_results.json'  # Replace with your JSON file path
data = load_json(file_path)

# Evaluate the answers and display detailed results
precision, recall, f1 = evaluate_answers(data, threshold=0.75)

# Print overall precision, recall, and F1 score
print("Overall Results:")
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')



ModuleNotFoundError: No module named 'fuzzywuzzy'