# In [2]:
import json
import string

# from langchain_community.llms import ChatGoogleGenerativeAI
from langchain_google_genai import ChatGoogleGenerativeAI
from sklearn.metrics import f1_score, recall_score, precision_score
from sklearn.preprocessing import MultiLabelBinarizer

# Chat model that each formatted question prompt is sent to.
model = ChatGoogleGenerativeAI(
    model="gemini-pro",
    temperature=0.3,
)

# Prompt skeleton; the `{question}` placeholder is filled in with str.format().
# Written line by line so the leading/trailing newlines are explicit.
prompt_template = (
    "\n"
    "You are an expert in cybersecurity. Please answer the following question as accurately as possible.\n"
    "\n"
    "Question:\n"
    "{question}\n"
    "\n"
    "Answer:\n"
)

# Evaluation set as (question, expected answer) pairs.
_QA_PAIRS = (
    (
        "What is the impact of CVE-2023-26320?",
        "Command injection vulnerability in Xiaomi routers allowing arbitrary code execution.",
    ),
    (
        "What is CVE-2023-26319?",
        "Cross-site scripting vulnerability in web applications.",
    ),
    # Add more questions and their expected answers here...
)

# One dict per pair, with the "question" / "expected_answer" keys.
questions_and_answers = [
    {"question": asked, "expected_answer": reference}
    for asked, reference in _QA_PAIRS
]

def _response_text(response):
    """Return the stripped text carried by a model response.

    A plain string is stripped directly; an object with a ``content``
    attribute has that attribute stripped. Anything else is reported on
    stdout and mapped to the empty string.
    """
    if isinstance(response, str):
        return response.strip()
    if hasattr(response, 'content'):
        return response.content.strip()
    print("Unexpected response format:", response)
    return ""


# Model answers and reference answers, kept in the same order.
llm_answers = []
expected_answers = []

for entry in questions_and_answers:
    # Read both fields up front so a malformed entry fails before the model call.
    asked, reference = entry["question"], entry["expected_answer"]

    # Fill the template with the question and query the model.
    reply = model.invoke(prompt_template.format(question=asked))

    llm_answers.append(_response_text(reply))
    expected_answers.append(reference)

# Evaluate the LLM's answers against the expected answers using bag-of-words
# overlap: each answer becomes a set of words, the sets are binarized over a
# shared vocabulary, and precision / recall / F1 are micro-averaged.


def _answer_tokens(text):
    """Return the set of lowercase words in *text* without edge punctuation.

    Punctuation is stripped from both ends of every whitespace-separated
    word so that e.g. "execution." matches "execution"; characters inside a
    word (such as the hyphens in "cve-2023-26320") are kept. Words made of
    punctuation only are dropped.
    """
    stripped = (word.strip(string.punctuation) for word in text.lower().split())
    return {word for word in stripped if word}


# Split answers into sets of words (this is a simplified approach)
y_true = [_answer_tokens(ans) for ans in expected_answers]
y_pred = [_answer_tokens(ans) for ans in llm_answers]

# Fit the vocabulary on BOTH the expected and the predicted words. Fitting on
# the expected answers alone makes transform() ignore every predicted word
# outside that vocabulary, so extra words in an LLM answer would never count
# as false positives and precision (and F1) would be inflated.
mlb = MultiLabelBinarizer()
mlb.fit(y_true + y_pred)
y_true = mlb.transform(y_true)
y_pred = mlb.transform(y_pred)

# Calculate F1 score, precision, and recall.
# zero_division=0 reports an undefined metric (e.g. every LLM answer empty)
# as 0 without emitting a warning.
f1 = f1_score(y_true, y_pred, average='micro', zero_division=0)
recall = recall_score(y_true, y_pred, average='micro', zero_division=0)
precision = precision_score(y_true, y_pred, average='micro', zero_division=0)

# Print the results
print(f"F1 Score: {f1:.2f}")
print(f"Recall: {recall:.2f}")
print(f"Precision: {precision:.2f}")


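# Cell output: ModuleNotFoundError: No module named 'langchain_google_genai'
# (the import fails until the `langchain-google-genai` package is installed,
# e.g. `pip install langchain-google-genai`).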